2024-02-04 16:02:50 +01:00
|
|
|
{
|
|
|
|
"cells": [
|
|
|
|
{
|
|
|
|
"cell_type": "markdown",
|
|
|
|
"id": "ad414c84-be46-4d2c-be8b-9fc4d24cc672",
|
|
|
|
"metadata": {},
|
|
|
|
"source": [
|
|
|
|
"# Business Data Challenge - Team 1"
|
|
|
|
]
|
|
|
|
},
|
|
|
|
{
|
|
|
|
"cell_type": "code",
|
2024-02-05 22:03:49 +01:00
|
|
|
"execution_count": 1,
|
2024-02-04 16:02:50 +01:00
|
|
|
"id": "15103481-8d74-404c-aa09-7601fe7730da",
|
|
|
|
"metadata": {},
|
|
|
|
"outputs": [],
|
|
|
|
"source": [
|
|
|
|
"import pandas as pd\n",
|
|
|
|
"import numpy as np\n",
|
|
|
|
"import os\n",
|
|
|
|
"import s3fs\n",
|
|
|
|
"import re"
|
|
|
|
]
|
|
|
|
},
|
|
|
|
{
|
|
|
|
"cell_type": "markdown",
|
|
|
|
"id": "ee97665c-39af-4c1c-a62b-c9c79feae18f",
|
|
|
|
"metadata": {},
|
|
|
|
"source": [
|
|
|
|
"Configuration de l'accès aux données"
|
|
|
|
]
|
|
|
|
},
|
|
|
|
{
|
|
|
|
"cell_type": "code",
|
|
|
|
"execution_count": 2,
|
|
|
|
"id": "5d83bb1a-d341-446e-91f6-1c428607f6d4",
|
|
|
|
"metadata": {},
|
|
|
|
"outputs": [],
|
|
|
|
"source": [
|
|
|
|
"# Create filesystem object\n",
|
|
|
|
"S3_ENDPOINT_URL = \"https://\" + os.environ[\"AWS_S3_ENDPOINT\"]\n",
|
|
|
|
"fs = s3fs.S3FileSystem(client_kwargs={'endpoint_url': S3_ENDPOINT_URL})"
|
|
|
|
]
|
|
|
|
},
|
|
|
|
{
|
|
|
|
"cell_type": "markdown",
|
|
|
|
"id": "9cbd72c5-6f8e-4366-ab66-96c32c6e963a",
|
|
|
|
"metadata": {},
|
|
|
|
"source": [
|
|
|
|
"# Exemple sur Company 1"
|
|
|
|
]
|
|
|
|
},
|
|
|
|
{
|
|
|
|
"cell_type": "markdown",
|
|
|
|
"id": "db26e59a-927c-407e-b54b-1815473b0b34",
|
|
|
|
"metadata": {},
|
|
|
|
"source": [
|
|
|
|
"## Chargement données"
|
|
|
|
]
|
|
|
|
},
|
|
|
|
{
|
|
|
|
"cell_type": "code",
|
|
|
|
"execution_count": 3,
|
|
|
|
"id": "699664b9-eee4-4f8d-a207-e524526560c5",
|
|
|
|
"metadata": {},
|
|
|
|
"outputs": [],
|
|
|
|
"source": [
|
|
|
|
"BUCKET = \"bdc2324-data/1\"\n",
|
|
|
|
"liste_database = fs.ls(BUCKET)"
|
|
|
|
]
|
|
|
|
},
|
|
|
|
{
|
|
|
|
"cell_type": "code",
|
|
|
|
"execution_count": 4,
|
|
|
|
"id": "dd6a3518-b752-4a1e-b77b-9e03e853c3ed",
|
|
|
|
"metadata": {},
|
2024-02-05 22:03:49 +01:00
|
|
|
"outputs": [
|
|
|
|
{
|
|
|
|
"name": "stderr",
|
|
|
|
"output_type": "stream",
|
|
|
|
"text": [
|
2024-02-05 22:10:07 +01:00
|
|
|
"/tmp/ipykernel_3658/4081512283.py:10: DtypeWarning: Columns (1) have mixed types. Specify dtype option on import or set low_memory=False.\n",
|
2024-02-05 22:03:49 +01:00
|
|
|
" df = pd.read_csv(file_in)\n"
|
|
|
|
]
|
|
|
|
}
|
|
|
|
],
|
2024-02-04 16:02:50 +01:00
|
|
|
"source": [
|
|
|
|
"# loop to create dataframes from liste\n",
|
|
|
|
"files_path = liste_database\n",
|
|
|
|
"\n",
|
|
|
|
"client_number = files_path[0].split(\"/\")[1]\n",
|
|
|
|
"df_prefix = \"df\" + str(client_number) + \"_\"\n",
|
|
|
|
"\n",
|
|
|
|
"for i in range(len(files_path)) :\n",
|
|
|
|
" current_path = files_path[i]\n",
|
|
|
|
" with fs.open(current_path, mode=\"rb\") as file_in:\n",
|
|
|
|
" df = pd.read_csv(file_in)\n",
|
|
|
|
" # the pattern of the name is df1xxx\n",
|
|
|
|
" nom_dataframe = df_prefix + re.search(r'\\/(\\d+)\\/(\\d+)([a-zA-Z_]+)\\.csv$', current_path).group(3)\n",
|
|
|
|
" globals()[nom_dataframe] = df"
|
|
|
|
]
|
|
|
|
},
|
|
|
|
{
|
|
|
|
"cell_type": "markdown",
|
|
|
|
"id": "4004c8bf-11d9-413d-bb42-2cb8ddde7716",
|
|
|
|
"metadata": {},
|
|
|
|
"source": [
|
|
|
|
"## Cleaning functions"
|
|
|
|
]
|
|
|
|
},
|
|
|
|
{
|
|
|
|
"cell_type": "code",
|
2024-02-05 22:10:07 +01:00
|
|
|
"execution_count": 5,
|
2024-02-04 16:02:50 +01:00
|
|
|
"id": "d237be96-8c86-4a91-b7a1-487e87a16c3d",
|
|
|
|
"metadata": {},
|
|
|
|
"outputs": [],
|
|
|
|
"source": [
|
|
|
|
"def cleaning_date(df, column_name):\n",
|
|
|
|
" \"\"\"\n",
|
|
|
|
" Nettoie la colonne spécifiée du DataFrame en convertissant les valeurs en datetime avec le format ISO8601.\n",
|
|
|
|
"\n",
|
|
|
|
" Parameters:\n",
|
|
|
|
" - df: DataFrame\n",
|
|
|
|
" Le DataFrame contenant la colonne à nettoyer.\n",
|
|
|
|
" - column_name: str\n",
|
|
|
|
" Le nom de la colonne à nettoyer.\n",
|
|
|
|
"\n",
|
|
|
|
" Returns:\n",
|
|
|
|
" - DataFrame\n",
|
|
|
|
" Le DataFrame modifié avec la colonne nettoyée.\n",
|
|
|
|
" \"\"\"\n",
|
|
|
|
" df[column_name] = pd.to_datetime(df[column_name], utc = True, format = 'ISO8601')\n",
|
|
|
|
" return df"
|
|
|
|
]
|
|
|
|
},
|
|
|
|
{
|
|
|
|
"cell_type": "markdown",
|
|
|
|
"id": "398804d8-2225-4fd3-bceb-75ab1588e359",
|
|
|
|
"metadata": {},
|
|
|
|
"source": [
|
|
|
|
"## Preprocessing"
|
|
|
|
]
|
|
|
|
},
|
|
|
|
{
|
|
|
|
"cell_type": "markdown",
|
|
|
|
"id": "568cb180-0dd9-4b27-aecb-05e4c3775ba6",
|
|
|
|
"metadata": {},
|
|
|
|
"source": [
|
|
|
|
"## customer_plus"
|
|
|
|
]
|
|
|
|
},
|
|
|
|
{
|
|
|
|
"cell_type": "code",
|
2024-02-05 22:10:07 +01:00
|
|
|
"execution_count": 6,
|
2024-02-04 16:02:50 +01:00
|
|
|
"id": "7e7b90ce-da54-4f00-bc34-64c543b0858f",
|
|
|
|
"metadata": {},
|
|
|
|
"outputs": [],
|
2024-02-05 22:03:49 +01:00
|
|
|
"source": [
|
|
|
|
"def preprocessing_customerplus(customerplus = None):\n",
|
|
|
|
"\n",
|
|
|
|
" customerplus_copy = customerplus.copy()\n",
|
|
|
|
" \n",
|
|
|
|
" # Passage en format date\n",
|
|
|
|
" cleaning_date(customerplus_copy, 'first_buying_date')\n",
|
|
|
|
" cleaning_date(customerplus_copy, 'last_visiting_date')\n",
|
|
|
|
" \n",
|
|
|
|
" # Selection des variables\n",
|
|
|
|
" customerplus_copy.drop(['lastname', 'firstname', 'email', 'civility', 'note', 'created_at', 'updated_at', 'deleted_at', 'extra', 'reference', 'extra_field', 'identifier', 'need_reload', 'preferred_category', 'preferred_supplier', 'preferred_formula', 'zipcode', 'last_visiting_date'], axis = 1, inplace=True)\n",
|
|
|
|
" customerplus_copy.rename(columns = {'id' : 'customer_id'}, inplace = True)\n",
|
|
|
|
"\n",
|
|
|
|
" return customerplus_copy\n"
|
|
|
|
]
|
|
|
|
},
|
|
|
|
{
|
|
|
|
"cell_type": "code",
|
2024-02-05 22:10:07 +01:00
|
|
|
"execution_count": 7,
|
2024-02-05 22:03:49 +01:00
|
|
|
"id": "03329e32-00a5-42c8-9470-75f7b6216ccd",
|
|
|
|
"metadata": {},
|
|
|
|
"outputs": [],
|
|
|
|
"source": [
|
|
|
|
"df1_customerplus_clean = preprocessing_customerplus(df1_customersplus)"
|
|
|
|
]
|
2024-02-04 16:02:50 +01:00
|
|
|
},
|
|
|
|
{
|
|
|
|
"cell_type": "markdown",
|
|
|
|
"id": "bade04b1-0cdf-4d10-bcca-7dc7e4831656",
|
|
|
|
"metadata": {},
|
|
|
|
"source": [
|
|
|
|
"## Ticket area"
|
|
|
|
]
|
|
|
|
},
|
|
|
|
{
|
|
|
|
"cell_type": "code",
|
2024-02-05 22:10:07 +01:00
|
|
|
"execution_count": 8,
|
2024-02-04 16:02:50 +01:00
|
|
|
"id": "b95464b1-26bc-4aac-84b4-45da83b92251",
|
|
|
|
"metadata": {},
|
|
|
|
"outputs": [],
|
|
|
|
"source": [
|
|
|
|
"# Fonction de nettoyage et selection\n",
|
|
|
|
"def preprocessing_tickets_area(tickets = None, purchases = None, suppliers = None, type_ofs = None):\n",
|
|
|
|
" # Base des tickets\n",
|
|
|
|
" tickets = tickets[['id', 'purchase_id', 'product_id', 'is_from_subscription', 'type_of', 'supplier_id']]\n",
|
|
|
|
" tickets.rename(columns = {'id' : 'ticket_id'}, inplace = True)\n",
|
|
|
|
"\n",
|
|
|
|
" # Base des fournisseurs\n",
|
|
|
|
" suppliers = suppliers[['id', 'name']]\n",
|
|
|
|
" suppliers.rename(columns = {'name' : 'supplier_name'}, inplace = True)\n",
|
|
|
|
"\n",
|
|
|
|
" # Base des types de billets\n",
|
2024-02-05 22:03:49 +01:00
|
|
|
" type_ofs = type_ofs[['id', 'name', 'children']]\n",
|
|
|
|
" type_ofs.rename(columns = {'name' : 'type_of_ticket_name'}, inplace = True)\n",
|
2024-02-04 16:02:50 +01:00
|
|
|
"\n",
|
|
|
|
" # Base des achats\n",
|
|
|
|
" # Nettoyage de la date d'achat\n",
|
|
|
|
" cleaning_date(purchases, 'purchase_date')\n",
|
|
|
|
" # Selection des variables\n",
|
|
|
|
" purchases = purchases[['id', 'purchase_date', 'customer_id']]\n",
|
|
|
|
"\n",
|
|
|
|
" # Fusions \n",
|
|
|
|
" # Fusion avec fournisseurs\n",
|
|
|
|
" ticket_information = pd.merge(tickets, suppliers, left_on = 'supplier_id', right_on = 'id', how = 'inner')\n",
|
|
|
|
" ticket_information.drop(['supplier_id', 'id'], axis = 1, inplace=True)\n",
|
|
|
|
" \n",
|
2024-02-05 22:03:49 +01:00
|
|
|
" # Fusion avec type de tickets\n",
|
|
|
|
" ticket_information = pd.merge(ticket_information, type_ofs, left_on = 'type_of', right_on = 'id', how = 'inner')\n",
|
|
|
|
" ticket_information.drop(['type_of', 'id'], axis = 1, inplace=True)\n",
|
2024-02-04 16:02:50 +01:00
|
|
|
" \n",
|
|
|
|
" # Fusion avec achats\n",
|
|
|
|
" ticket_information = pd.merge(ticket_information, purchases, left_on = 'purchase_id', right_on = 'id', how = 'inner')\n",
|
|
|
|
" ticket_information.drop(['purchase_id', 'id'], axis = 1, inplace=True)\n",
|
|
|
|
"\n",
|
|
|
|
" return ticket_information"
|
|
|
|
]
|
|
|
|
},
|
|
|
|
{
|
|
|
|
"cell_type": "code",
|
2024-02-05 22:10:07 +01:00
|
|
|
"execution_count": 9,
|
2024-02-04 16:02:50 +01:00
|
|
|
"id": "3e1d2ba7-ff4f-48eb-93a8-2bb648c70396",
|
|
|
|
"metadata": {},
|
2024-02-05 22:03:49 +01:00
|
|
|
"outputs": [
|
|
|
|
{
|
|
|
|
"name": "stderr",
|
|
|
|
"output_type": "stream",
|
|
|
|
"text": [
|
2024-02-05 22:10:07 +01:00
|
|
|
"/tmp/ipykernel_3658/1591303091.py:5: SettingWithCopyWarning: \n",
|
2024-02-05 22:03:49 +01:00
|
|
|
"A value is trying to be set on a copy of a slice from a DataFrame\n",
|
|
|
|
"\n",
|
|
|
|
"See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n",
|
|
|
|
" tickets.rename(columns = {'id' : 'ticket_id'}, inplace = True)\n",
|
2024-02-05 22:10:07 +01:00
|
|
|
"/tmp/ipykernel_3658/1591303091.py:9: SettingWithCopyWarning: \n",
|
2024-02-05 22:03:49 +01:00
|
|
|
"A value is trying to be set on a copy of a slice from a DataFrame\n",
|
|
|
|
"\n",
|
|
|
|
"See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n",
|
|
|
|
" suppliers.rename(columns = {'name' : 'supplier_name'}, inplace = True)\n",
|
2024-02-05 22:10:07 +01:00
|
|
|
"/tmp/ipykernel_3658/1591303091.py:13: SettingWithCopyWarning: \n",
|
2024-02-05 22:03:49 +01:00
|
|
|
"A value is trying to be set on a copy of a slice from a DataFrame\n",
|
|
|
|
"\n",
|
|
|
|
"See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n",
|
|
|
|
" type_ofs.rename(columns = {'name' : 'type_of_ticket_name'}, inplace = True)\n"
|
|
|
|
]
|
|
|
|
}
|
|
|
|
],
|
2024-02-04 16:02:50 +01:00
|
|
|
"source": [
|
|
|
|
"df1_ticket_information = preprocessing_tickets_area(tickets = df1_tickets, purchases = df1_purchases, suppliers = df1_suppliers, type_ofs = df1_type_ofs)"
|
|
|
|
]
|
|
|
|
},
|
|
|
|
{
|
|
|
|
"cell_type": "code",
|
2024-02-05 22:10:07 +01:00
|
|
|
"execution_count": 10,
|
2024-02-04 16:02:50 +01:00
|
|
|
"id": "4b18edfc-6450-4c6a-9e7b-ee5a5808c8c9",
|
|
|
|
"metadata": {},
|
2024-02-05 22:03:49 +01:00
|
|
|
"outputs": [
|
|
|
|
{
|
|
|
|
"data": {
|
|
|
|
"text/html": [
|
|
|
|
"<div>\n",
|
|
|
|
"<style scoped>\n",
|
|
|
|
" .dataframe tbody tr th:only-of-type {\n",
|
|
|
|
" vertical-align: middle;\n",
|
|
|
|
" }\n",
|
|
|
|
"\n",
|
|
|
|
" .dataframe tbody tr th {\n",
|
|
|
|
" vertical-align: top;\n",
|
|
|
|
" }\n",
|
|
|
|
"\n",
|
|
|
|
" .dataframe thead th {\n",
|
|
|
|
" text-align: right;\n",
|
|
|
|
" }\n",
|
|
|
|
"</style>\n",
|
|
|
|
"<table border=\"1\" class=\"dataframe\">\n",
|
|
|
|
" <thead>\n",
|
|
|
|
" <tr style=\"text-align: right;\">\n",
|
|
|
|
" <th></th>\n",
|
|
|
|
" <th>ticket_id</th>\n",
|
|
|
|
" <th>product_id</th>\n",
|
|
|
|
" <th>is_from_subscription</th>\n",
|
|
|
|
" <th>supplier_name</th>\n",
|
|
|
|
" <th>type_of_ticket_name</th>\n",
|
|
|
|
" <th>children</th>\n",
|
|
|
|
" <th>purchase_date</th>\n",
|
|
|
|
" <th>customer_id</th>\n",
|
|
|
|
" </tr>\n",
|
|
|
|
" </thead>\n",
|
|
|
|
" <tbody>\n",
|
|
|
|
" <tr>\n",
|
|
|
|
" <th>0</th>\n",
|
|
|
|
" <td>13070859</td>\n",
|
|
|
|
" <td>225251</td>\n",
|
|
|
|
" <td>False</td>\n",
|
|
|
|
" <td>vente en ligne</td>\n",
|
|
|
|
" <td>Atelier</td>\n",
|
|
|
|
" <td>pricing_formula</td>\n",
|
|
|
|
" <td>2018-12-28 14:47:50+00:00</td>\n",
|
|
|
|
" <td>48187</td>\n",
|
|
|
|
" </tr>\n",
|
|
|
|
" <tr>\n",
|
|
|
|
" <th>1</th>\n",
|
|
|
|
" <td>13070860</td>\n",
|
|
|
|
" <td>224914</td>\n",
|
|
|
|
" <td>False</td>\n",
|
|
|
|
" <td>vente en ligne</td>\n",
|
|
|
|
" <td>Atelier</td>\n",
|
|
|
|
" <td>pricing_formula</td>\n",
|
|
|
|
" <td>2018-12-28 14:47:50+00:00</td>\n",
|
|
|
|
" <td>48187</td>\n",
|
|
|
|
" </tr>\n",
|
|
|
|
" <tr>\n",
|
|
|
|
" <th>2</th>\n",
|
|
|
|
" <td>13070861</td>\n",
|
|
|
|
" <td>224914</td>\n",
|
|
|
|
" <td>False</td>\n",
|
|
|
|
" <td>vente en ligne</td>\n",
|
|
|
|
" <td>Atelier</td>\n",
|
|
|
|
" <td>pricing_formula</td>\n",
|
|
|
|
" <td>2018-12-28 14:47:50+00:00</td>\n",
|
|
|
|
" <td>48187</td>\n",
|
|
|
|
" </tr>\n",
|
|
|
|
" <tr>\n",
|
|
|
|
" <th>3</th>\n",
|
|
|
|
" <td>13070862</td>\n",
|
|
|
|
" <td>224914</td>\n",
|
|
|
|
" <td>False</td>\n",
|
|
|
|
" <td>vente en ligne</td>\n",
|
|
|
|
" <td>Atelier</td>\n",
|
|
|
|
" <td>pricing_formula</td>\n",
|
|
|
|
" <td>2018-12-28 14:47:50+00:00</td>\n",
|
|
|
|
" <td>48187</td>\n",
|
|
|
|
" </tr>\n",
|
|
|
|
" <tr>\n",
|
|
|
|
" <th>4</th>\n",
|
|
|
|
" <td>13070863</td>\n",
|
|
|
|
" <td>224914</td>\n",
|
|
|
|
" <td>False</td>\n",
|
|
|
|
" <td>vente en ligne</td>\n",
|
|
|
|
" <td>Atelier</td>\n",
|
|
|
|
" <td>pricing_formula</td>\n",
|
|
|
|
" <td>2018-12-28 14:47:50+00:00</td>\n",
|
|
|
|
" <td>48187</td>\n",
|
|
|
|
" </tr>\n",
|
|
|
|
" </tbody>\n",
|
|
|
|
"</table>\n",
|
|
|
|
"</div>"
|
|
|
|
],
|
|
|
|
"text/plain": [
|
2024-02-05 22:10:07 +01:00
|
|
|
" ticket_id product_id is_from_subscription supplier_name \\\n",
|
|
|
|
"0 13070859 225251 False vente en ligne \n",
|
|
|
|
"1 13070860 224914 False vente en ligne \n",
|
|
|
|
"2 13070861 224914 False vente en ligne \n",
|
|
|
|
"3 13070862 224914 False vente en ligne \n",
|
|
|
|
"4 13070863 224914 False vente en ligne \n",
|
2024-02-05 22:03:49 +01:00
|
|
|
"\n",
|
2024-02-05 22:10:07 +01:00
|
|
|
" type_of_ticket_name children purchase_date customer_id \n",
|
|
|
|
"0 Atelier pricing_formula 2018-12-28 14:47:50+00:00 48187 \n",
|
|
|
|
"1 Atelier pricing_formula 2018-12-28 14:47:50+00:00 48187 \n",
|
|
|
|
"2 Atelier pricing_formula 2018-12-28 14:47:50+00:00 48187 \n",
|
|
|
|
"3 Atelier pricing_formula 2018-12-28 14:47:50+00:00 48187 \n",
|
|
|
|
"4 Atelier pricing_formula 2018-12-28 14:47:50+00:00 48187 "
|
2024-02-05 22:03:49 +01:00
|
|
|
]
|
|
|
|
},
|
2024-02-05 22:10:07 +01:00
|
|
|
"execution_count": 10,
|
2024-02-05 22:03:49 +01:00
|
|
|
"metadata": {},
|
|
|
|
"output_type": "execute_result"
|
|
|
|
}
|
|
|
|
],
|
2024-02-04 16:02:50 +01:00
|
|
|
"source": [
|
2024-02-05 22:10:07 +01:00
|
|
|
"df1_ticket_information.head()"
|
2024-02-04 16:02:50 +01:00
|
|
|
]
|
|
|
|
},
|
2024-02-05 22:03:49 +01:00
|
|
|
{
|
|
|
|
"cell_type": "markdown",
|
|
|
|
"id": "37499eae-1a7f-4dce-83b0-ff942ccf7a9d",
|
|
|
|
"metadata": {},
|
|
|
|
"source": [
|
|
|
|
"### KPI tickets"
|
|
|
|
]
|
|
|
|
},
|
|
|
|
{
|
|
|
|
"cell_type": "code",
|
2024-02-05 22:10:07 +01:00
|
|
|
"execution_count": 11,
|
2024-02-05 22:03:49 +01:00
|
|
|
"id": "043303fe-e90f-4689-a2a9-5d690555a045",
|
|
|
|
"metadata": {},
|
|
|
|
"outputs": [],
|
|
|
|
"source": [
|
|
|
|
"def tickets_kpi_function(tickets_information = None):\n",
|
|
|
|
" tickets_information_copy = tickets_information.copy()\n",
|
|
|
|
" tickets_information_copy['purchase_date_max'] = tickets_information_copy['purchase_date']\n",
|
|
|
|
" tickets_kpi = (tickets_information_copy[['product_id', 'customer_id', 'ticket_id','supplier_name', 'purchase_date', 'purchase_date_max']]\n",
|
|
|
|
" .groupby(['product_id', 'customer_id'])\n",
|
|
|
|
" .agg({'ticket_id': 'count', \n",
|
|
|
|
" 'supplier_name': 'nunique',\n",
|
|
|
|
" 'purchase_date_max' : 'max',\n",
|
|
|
|
" 'purchase_date' : 'min'})\n",
|
|
|
|
" .reset_index()\n",
|
|
|
|
" )\n",
|
|
|
|
" \n",
|
|
|
|
" tickets_kpi.rename(columns = {'ticket_id' : 'nb_tickets', \n",
|
|
|
|
" 'supplier_name' : 'nb_suppliers', \n",
|
|
|
|
" 'purchase_date' : 'purchase_date_min'}, inplace = True)\n",
|
|
|
|
" \n",
|
|
|
|
" tickets_kpi['time_between_purchase'] = tickets_kpi['purchase_date_max'] - tickets_kpi['purchase_date_min']\n",
|
|
|
|
" \n",
|
|
|
|
" return tickets_kpi\n",
|
|
|
|
" "
|
|
|
|
]
|
|
|
|
},
|
|
|
|
{
|
|
|
|
"cell_type": "code",
|
2024-02-05 22:10:07 +01:00
|
|
|
"execution_count": 12,
|
2024-02-05 22:03:49 +01:00
|
|
|
"id": "5882234a-1ed5-4269-87a6-0d75613476e3",
|
|
|
|
"metadata": {},
|
|
|
|
"outputs": [],
|
|
|
|
"source": [
|
|
|
|
"df1_tickets_kpi = tickets_kpi_function(tickets_information = df1_ticket_information)"
|
|
|
|
]
|
|
|
|
},
|
|
|
|
{
|
|
|
|
"cell_type": "code",
|
2024-02-05 22:10:07 +01:00
|
|
|
"execution_count": 13,
|
2024-02-05 22:03:49 +01:00
|
|
|
"id": "a7a452a6-cd5e-4c8b-b250-8a7d26e48fad",
|
|
|
|
"metadata": {},
|
|
|
|
"outputs": [
|
|
|
|
{
|
|
|
|
"data": {
|
|
|
|
"text/html": [
|
|
|
|
"<div>\n",
|
|
|
|
"<style scoped>\n",
|
|
|
|
" .dataframe tbody tr th:only-of-type {\n",
|
|
|
|
" vertical-align: middle;\n",
|
|
|
|
" }\n",
|
|
|
|
"\n",
|
|
|
|
" .dataframe tbody tr th {\n",
|
|
|
|
" vertical-align: top;\n",
|
|
|
|
" }\n",
|
|
|
|
"\n",
|
|
|
|
" .dataframe thead th {\n",
|
|
|
|
" text-align: right;\n",
|
|
|
|
" }\n",
|
|
|
|
"</style>\n",
|
|
|
|
"<table border=\"1\" class=\"dataframe\">\n",
|
|
|
|
" <thead>\n",
|
|
|
|
" <tr style=\"text-align: right;\">\n",
|
|
|
|
" <th></th>\n",
|
|
|
|
" <th>product_id</th>\n",
|
|
|
|
" <th>customer_id</th>\n",
|
|
|
|
" <th>nb_tickets</th>\n",
|
|
|
|
" <th>nb_suppliers</th>\n",
|
|
|
|
" <th>purchase_date_max</th>\n",
|
|
|
|
" <th>purchase_date_min</th>\n",
|
|
|
|
" <th>time_between_purchase</th>\n",
|
|
|
|
" </tr>\n",
|
|
|
|
" </thead>\n",
|
|
|
|
" <tbody>\n",
|
|
|
|
" <tr>\n",
|
|
|
|
" <th>0</th>\n",
|
|
|
|
" <td>107310</td>\n",
|
|
|
|
" <td>2805</td>\n",
|
|
|
|
" <td>4</td>\n",
|
|
|
|
" <td>2</td>\n",
|
|
|
|
" <td>2019-06-05 14:37:13+00:00</td>\n",
|
|
|
|
" <td>2019-06-05 14:18:38+00:00</td>\n",
|
|
|
|
" <td>0 days 00:18:35</td>\n",
|
|
|
|
" </tr>\n",
|
|
|
|
" <tr>\n",
|
|
|
|
" <th>1</th>\n",
|
|
|
|
" <td>110089</td>\n",
|
|
|
|
" <td>54355</td>\n",
|
|
|
|
" <td>1</td>\n",
|
|
|
|
" <td>1</td>\n",
|
|
|
|
" <td>2017-02-17 13:32:51+00:00</td>\n",
|
|
|
|
" <td>2017-02-17 13:32:51+00:00</td>\n",
|
|
|
|
" <td>0 days 00:00:00</td>\n",
|
|
|
|
" </tr>\n",
|
|
|
|
" <tr>\n",
|
|
|
|
" <th>2</th>\n",
|
|
|
|
" <td>110089</td>\n",
|
|
|
|
" <td>54356</td>\n",
|
|
|
|
" <td>1</td>\n",
|
|
|
|
" <td>1</td>\n",
|
|
|
|
" <td>2017-03-02 14:36:16+00:00</td>\n",
|
|
|
|
" <td>2017-03-02 14:36:16+00:00</td>\n",
|
|
|
|
" <td>0 days 00:00:00</td>\n",
|
|
|
|
" </tr>\n",
|
|
|
|
" <tr>\n",
|
|
|
|
" <th>3</th>\n",
|
|
|
|
" <td>110089</td>\n",
|
|
|
|
" <td>54357</td>\n",
|
|
|
|
" <td>1</td>\n",
|
|
|
|
" <td>1</td>\n",
|
|
|
|
" <td>2017-03-06 15:16:41+00:00</td>\n",
|
|
|
|
" <td>2017-03-06 15:16:41+00:00</td>\n",
|
|
|
|
" <td>0 days 00:00:00</td>\n",
|
|
|
|
" </tr>\n",
|
|
|
|
" <tr>\n",
|
|
|
|
" <th>4</th>\n",
|
|
|
|
" <td>110089</td>\n",
|
|
|
|
" <td>54358</td>\n",
|
|
|
|
" <td>1</td>\n",
|
|
|
|
" <td>1</td>\n",
|
|
|
|
" <td>2017-03-13 16:07:27+00:00</td>\n",
|
|
|
|
" <td>2017-03-13 16:07:27+00:00</td>\n",
|
|
|
|
" <td>0 days 00:00:00</td>\n",
|
|
|
|
" </tr>\n",
|
|
|
|
" </tbody>\n",
|
|
|
|
"</table>\n",
|
|
|
|
"</div>"
|
|
|
|
],
|
|
|
|
"text/plain": [
|
2024-02-05 22:10:07 +01:00
|
|
|
" product_id customer_id nb_tickets nb_suppliers \\\n",
|
|
|
|
"0 107310 2805 4 2 \n",
|
|
|
|
"1 110089 54355 1 1 \n",
|
|
|
|
"2 110089 54356 1 1 \n",
|
|
|
|
"3 110089 54357 1 1 \n",
|
|
|
|
"4 110089 54358 1 1 \n",
|
2024-02-05 22:03:49 +01:00
|
|
|
"\n",
|
2024-02-05 22:10:07 +01:00
|
|
|
" purchase_date_max purchase_date_min time_between_purchase \n",
|
|
|
|
"0 2019-06-05 14:37:13+00:00 2019-06-05 14:18:38+00:00 0 days 00:18:35 \n",
|
|
|
|
"1 2017-02-17 13:32:51+00:00 2017-02-17 13:32:51+00:00 0 days 00:00:00 \n",
|
|
|
|
"2 2017-03-02 14:36:16+00:00 2017-03-02 14:36:16+00:00 0 days 00:00:00 \n",
|
|
|
|
"3 2017-03-06 15:16:41+00:00 2017-03-06 15:16:41+00:00 0 days 00:00:00 \n",
|
|
|
|
"4 2017-03-13 16:07:27+00:00 2017-03-13 16:07:27+00:00 0 days 00:00:00 "
|
2024-02-05 22:03:49 +01:00
|
|
|
]
|
|
|
|
},
|
2024-02-05 22:10:07 +01:00
|
|
|
"execution_count": 13,
|
2024-02-05 22:03:49 +01:00
|
|
|
"metadata": {},
|
|
|
|
"output_type": "execute_result"
|
|
|
|
}
|
|
|
|
],
|
|
|
|
"source": [
|
2024-02-05 22:10:07 +01:00
|
|
|
"df1_tickets_kpi.head()"
|
2024-02-05 22:03:49 +01:00
|
|
|
]
|
|
|
|
},
|
2024-02-04 16:02:50 +01:00
|
|
|
{
|
|
|
|
"cell_type": "markdown",
|
|
|
|
"id": "096e47f4-1d65-4575-989d-83227eedad2b",
|
|
|
|
"metadata": {},
|
|
|
|
"source": [
|
|
|
|
"## Target area"
|
|
|
|
]
|
|
|
|
},
|
|
|
|
{
|
|
|
|
"cell_type": "code",
|
2024-02-05 22:03:49 +01:00
|
|
|
"execution_count": 14,
|
2024-02-04 16:02:50 +01:00
|
|
|
"id": "baed146a-9d3a-4397-a812-3d50c9a2f038",
|
|
|
|
"metadata": {},
|
|
|
|
"outputs": [],
|
|
|
|
"source": [
|
|
|
|
"def preprocessing_target_area(targets = None, target_types = None, customer_target_mappings = None):\n",
|
|
|
|
" # Target.csv cleaning\n",
|
|
|
|
" targets = targets[[\"id\", \"target_type_id\", \"name\"]]\n",
|
|
|
|
" targets.rename(columns = {'id' : 'target_id' , 'name' : 'target_name'}, inplace = True)\n",
|
|
|
|
" \n",
|
|
|
|
" # target_type cleaning\n",
|
|
|
|
" target_types = target_types[[\"id\",\"is_import\",\"name\"]].add_prefix(\"target_type_\")\n",
|
|
|
|
" \n",
|
|
|
|
" #customer_target_mappings cleaning\n",
|
|
|
|
" customer_target_mappings = customer_target_mappings[[\"id\", \"customer_id\", \"target_id\"]]\n",
|
|
|
|
" \n",
|
|
|
|
" # Merge target et target_type\n",
|
|
|
|
" targets_full = pd.merge(targets, target_types, left_on='target_type_id', right_on='target_type_id', how='inner')\n",
|
|
|
|
" targets_full.drop(['target_type_id'], axis = 1, inplace=True)\n",
|
|
|
|
" \n",
|
|
|
|
" # Merge\n",
|
|
|
|
" targets_full = pd.merge(customer_target_mappings, targets_full, left_on='target_id', right_on='target_id', how='inner')\n",
|
|
|
|
" targets_full.drop(['target_id'], axis = 1, inplace=True)\n",
|
|
|
|
"\n",
|
|
|
|
" return targets_full"
|
|
|
|
]
|
|
|
|
},
|
|
|
|
{
|
|
|
|
"cell_type": "code",
|
2024-02-05 22:03:49 +01:00
|
|
|
"execution_count": 15,
|
2024-02-04 16:02:50 +01:00
|
|
|
"id": "5fbfd88b-b94c-489c-9201-670e96e453e7",
|
|
|
|
"metadata": {},
|
2024-02-05 22:03:49 +01:00
|
|
|
"outputs": [
|
|
|
|
{
|
|
|
|
"name": "stderr",
|
|
|
|
"output_type": "stream",
|
|
|
|
"text": [
|
2024-02-05 22:10:07 +01:00
|
|
|
"/tmp/ipykernel_3658/3848597476.py:4: SettingWithCopyWarning: \n",
|
2024-02-05 22:03:49 +01:00
|
|
|
"A value is trying to be set on a copy of a slice from a DataFrame\n",
|
|
|
|
"\n",
|
|
|
|
"See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n",
|
|
|
|
" targets.rename(columns = {'id' : 'target_id' , 'name' : 'target_name'}, inplace = True)\n"
|
|
|
|
]
|
|
|
|
}
|
|
|
|
],
|
2024-02-04 16:02:50 +01:00
|
|
|
"source": [
|
|
|
|
"df1_target_information = preprocessing_target_area(targets = df1_targets, target_types = df1_target_types, customer_target_mappings = df1_customer_target_mappings)"
|
|
|
|
]
|
|
|
|
},
|
|
|
|
{
|
|
|
|
"cell_type": "code",
|
2024-02-05 22:03:49 +01:00
|
|
|
"execution_count": 16,
|
2024-02-04 16:02:50 +01:00
|
|
|
"id": "b4f05142-2a22-42ef-a60d-f23cc4b5cb09",
|
|
|
|
"metadata": {},
|
2024-02-05 22:03:49 +01:00
|
|
|
"outputs": [
|
|
|
|
{
|
|
|
|
"data": {
|
|
|
|
"text/html": [
|
|
|
|
"<div>\n",
|
|
|
|
"<style scoped>\n",
|
|
|
|
" .dataframe tbody tr th:only-of-type {\n",
|
|
|
|
" vertical-align: middle;\n",
|
|
|
|
" }\n",
|
|
|
|
"\n",
|
|
|
|
" .dataframe tbody tr th {\n",
|
|
|
|
" vertical-align: top;\n",
|
|
|
|
" }\n",
|
|
|
|
"\n",
|
|
|
|
" .dataframe thead th {\n",
|
|
|
|
" text-align: right;\n",
|
|
|
|
" }\n",
|
|
|
|
"</style>\n",
|
|
|
|
"<table border=\"1\" class=\"dataframe\">\n",
|
|
|
|
" <thead>\n",
|
|
|
|
" <tr style=\"text-align: right;\">\n",
|
|
|
|
" <th></th>\n",
|
|
|
|
" <th>customer_id</th>\n",
|
|
|
|
" </tr>\n",
|
|
|
|
" <tr>\n",
|
|
|
|
" <th>target_name</th>\n",
|
|
|
|
" <th></th>\n",
|
|
|
|
" </tr>\n",
|
|
|
|
" </thead>\n",
|
|
|
|
" <tbody>\n",
|
|
|
|
" <tr>\n",
|
|
|
|
" <th>consentement optin mediation specialisee</th>\n",
|
|
|
|
" <td>150000</td>\n",
|
|
|
|
" </tr>\n",
|
|
|
|
" <tr>\n",
|
|
|
|
" <th>consentement optin jeune public</th>\n",
|
|
|
|
" <td>149979</td>\n",
|
|
|
|
" </tr>\n",
|
|
|
|
" <tr>\n",
|
|
|
|
" <th>consentement optin b2c</th>\n",
|
|
|
|
" <td>108909</td>\n",
|
|
|
|
" </tr>\n",
|
|
|
|
" <tr>\n",
|
|
|
|
" <th>Arenametrix_bascule tel vers sib</th>\n",
|
|
|
|
" <td>35216</td>\n",
|
|
|
|
" </tr>\n",
|
|
|
|
" <tr>\n",
|
|
|
|
" <th>consentement optout b2c</th>\n",
|
|
|
|
" <td>34523</td>\n",
|
|
|
|
" </tr>\n",
|
|
|
|
" </tbody>\n",
|
|
|
|
"</table>\n",
|
|
|
|
"</div>"
|
|
|
|
],
|
|
|
|
"text/plain": [
|
2024-02-05 22:10:07 +01:00
|
|
|
" customer_id\n",
|
|
|
|
"target_name \n",
|
|
|
|
"consentement optin mediation specialisee 150000\n",
|
|
|
|
"consentement optin jeune public 149979\n",
|
|
|
|
"consentement optin b2c 108909\n",
|
|
|
|
"Arenametrix_bascule tel vers sib 35216\n",
|
|
|
|
"consentement optout b2c 34523"
|
2024-02-05 22:03:49 +01:00
|
|
|
]
|
|
|
|
},
|
|
|
|
"execution_count": 16,
|
|
|
|
"metadata": {},
|
|
|
|
"output_type": "execute_result"
|
|
|
|
}
|
|
|
|
],
|
2024-02-04 16:02:50 +01:00
|
|
|
"source": [
|
2024-02-05 22:10:07 +01:00
|
|
|
"df1_target_information[['target_name', 'customer_id']].groupby('target_name').count().sort_values(by='customer_id', ascending=False).head()"
|
2024-02-04 16:02:50 +01:00
|
|
|
]
|
|
|
|
},
|
|
|
|
{
|
|
|
|
"cell_type": "code",
|
2024-02-05 22:03:49 +01:00
|
|
|
"execution_count": 17,
|
2024-02-04 16:02:50 +01:00
|
|
|
"id": "4417ff51-f501-4ab9-a192-4ab75764a8ed",
|
|
|
|
"metadata": {
|
|
|
|
"scrolled": true
|
|
|
|
},
|
2024-02-05 22:03:49 +01:00
|
|
|
"outputs": [
|
|
|
|
{
|
|
|
|
"data": {
|
|
|
|
"text/html": [
|
|
|
|
"<div>\n",
|
|
|
|
"<style scoped>\n",
|
|
|
|
" .dataframe tbody tr th:only-of-type {\n",
|
|
|
|
" vertical-align: middle;\n",
|
|
|
|
" }\n",
|
|
|
|
"\n",
|
|
|
|
" .dataframe tbody tr th {\n",
|
|
|
|
" vertical-align: top;\n",
|
|
|
|
" }\n",
|
|
|
|
"\n",
|
|
|
|
" .dataframe thead th {\n",
|
|
|
|
" text-align: right;\n",
|
|
|
|
" }\n",
|
|
|
|
"</style>\n",
|
|
|
|
"<table border=\"1\" class=\"dataframe\">\n",
|
|
|
|
" <thead>\n",
|
|
|
|
" <tr style=\"text-align: right;\">\n",
|
|
|
|
" <th></th>\n",
|
|
|
|
" <th>customer_id</th>\n",
|
|
|
|
" </tr>\n",
|
|
|
|
" <tr>\n",
|
|
|
|
" <th>target_name</th>\n",
|
|
|
|
" <th></th>\n",
|
|
|
|
" </tr>\n",
|
|
|
|
" </thead>\n",
|
|
|
|
" <tbody>\n",
|
|
|
|
" <tr>\n",
|
|
|
|
" <th>Arenametrix_bascule tel vers sib</th>\n",
|
|
|
|
" <td>35216</td>\n",
|
|
|
|
" </tr>\n",
|
|
|
|
" <tr>\n",
|
|
|
|
" <th>Autres_interet_exposition</th>\n",
|
|
|
|
" <td>1021</td>\n",
|
|
|
|
" </tr>\n",
|
|
|
|
" <tr>\n",
|
|
|
|
" <th>COM Inscrits NL générale (historique)</th>\n",
|
|
|
|
" <td>23005</td>\n",
|
|
|
|
" </tr>\n",
|
|
|
|
" <tr>\n",
|
|
|
|
" <th>Contacts_prenomsdoubles</th>\n",
|
|
|
|
" <td>11643</td>\n",
|
|
|
|
" </tr>\n",
|
|
|
|
" <tr>\n",
|
|
|
|
" <th>DDCP MD Procès du Siècle</th>\n",
|
|
|
|
" <td>1684</td>\n",
|
|
|
|
" </tr>\n",
|
|
|
|
" </tbody>\n",
|
|
|
|
"</table>\n",
|
|
|
|
"</div>"
|
|
|
|
],
|
|
|
|
"text/plain": [
|
2024-02-05 22:10:07 +01:00
|
|
|
" customer_id\n",
|
|
|
|
"target_name \n",
|
|
|
|
"Arenametrix_bascule tel vers sib 35216\n",
|
|
|
|
"Autres_interet_exposition 1021\n",
|
|
|
|
"COM Inscrits NL générale (historique) 23005\n",
|
|
|
|
"Contacts_prenomsdoubles 11643\n",
|
|
|
|
"DDCP MD Procès du Siècle 1684"
|
2024-02-05 22:03:49 +01:00
|
|
|
]
|
|
|
|
},
|
|
|
|
"execution_count": 17,
|
|
|
|
"metadata": {},
|
|
|
|
"output_type": "execute_result"
|
|
|
|
}
|
|
|
|
],
|
2024-02-04 16:02:50 +01:00
|
|
|
"source": [
|
|
|
|
"df1_target_information_reduced = df1_target_information[['target_name', 'customer_id']].groupby('target_name').count()\n",
|
2024-02-05 22:10:07 +01:00
|
|
|
"df1_target_information_reduced[df1_target_information_reduced['customer_id'] >= 1000].head()"
|
2024-02-04 16:02:50 +01:00
|
|
|
]
|
|
|
|
},
|
|
|
|
{
|
|
|
|
"cell_type": "markdown",
|
|
|
|
"id": "cdbb48b4-5e16-4ef4-8791-ed213d68d52f",
|
|
|
|
"metadata": {},
|
|
|
|
"source": [
|
|
|
|
"## Campaings area"
|
|
|
|
]
|
|
|
|
},
|
|
|
|
{
|
|
|
|
"cell_type": "code",
|
2024-02-05 22:03:49 +01:00
|
|
|
"execution_count": 18,
|
2024-02-04 16:02:50 +01:00
|
|
|
"id": "d883cc7b-ac43-4485-b86f-eaf595fbad85",
|
|
|
|
"metadata": {},
|
|
|
|
"outputs": [],
|
|
|
|
"source": [
|
|
|
|
"def preprocessing_campaigns_area(campaign_stats = None, campaigns = None):\n",
|
|
|
|
" # campaign_stats cleaning \n",
|
|
|
|
" campaign_stats = campaign_stats[[\"id\", \"campaign_id\", \"customer_id\", \"opened_at\", \"sent_at\", \"delivered_at\"]]\n",
|
|
|
|
" cleaning_date(campaign_stats, 'opened_at')\n",
|
|
|
|
" cleaning_date(campaign_stats, 'sent_at')\n",
|
|
|
|
" cleaning_date(campaign_stats, 'delivered_at')\n",
|
|
|
|
" \n",
|
|
|
|
" # campaigns cleaning\n",
|
|
|
|
" campaigns = campaigns[[\"id\", \"name\", \"service_id\", \"sent_at\"]].add_prefix(\"campaign_\")\n",
|
|
|
|
" cleaning_date(campaigns, 'campaign_sent_at')\n",
|
|
|
|
" \n",
|
|
|
|
" # Merge \n",
|
|
|
|
" campaigns_full = pd.merge(campaign_stats, campaigns, on = \"campaign_id\", how = \"left\")\n",
|
|
|
|
" campaigns_full.drop(['campaign_id'], axis = 1, inplace=True)\n",
|
|
|
|
"\n",
|
|
|
|
" return campaigns_full"
|
|
|
|
]
|
|
|
|
},
|
|
|
|
{
|
|
|
|
"cell_type": "code",
|
2024-02-05 22:03:49 +01:00
|
|
|
"execution_count": 19,
|
2024-02-04 16:02:50 +01:00
|
|
|
"id": "c8552dd6-52c5-4431-b43d-3cd6c578fd9f",
|
|
|
|
"metadata": {},
|
2024-02-05 22:03:49 +01:00
|
|
|
"outputs": [
|
|
|
|
{
|
|
|
|
"name": "stderr",
|
|
|
|
"output_type": "stream",
|
|
|
|
"text": [
|
2024-02-05 22:10:07 +01:00
|
|
|
"/tmp/ipykernel_3658/1967867975.py:15: SettingWithCopyWarning: \n",
|
2024-02-05 22:03:49 +01:00
|
|
|
"A value is trying to be set on a copy of a slice from a DataFrame.\n",
|
|
|
|
"Try using .loc[row_indexer,col_indexer] = value instead\n",
|
|
|
|
"\n",
|
|
|
|
"See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n",
|
|
|
|
" df[column_name] = pd.to_datetime(df[column_name], utc = True, format = 'ISO8601')\n",
|
2024-02-05 22:10:07 +01:00
|
|
|
"/tmp/ipykernel_3658/1967867975.py:15: SettingWithCopyWarning: \n",
|
2024-02-05 22:03:49 +01:00
|
|
|
"A value is trying to be set on a copy of a slice from a DataFrame.\n",
|
|
|
|
"Try using .loc[row_indexer,col_indexer] = value instead\n",
|
|
|
|
"\n",
|
|
|
|
"See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n",
|
|
|
|
" df[column_name] = pd.to_datetime(df[column_name], utc = True, format = 'ISO8601')\n",
|
2024-02-05 22:10:07 +01:00
|
|
|
"/tmp/ipykernel_3658/1967867975.py:15: SettingWithCopyWarning: \n",
|
2024-02-05 22:03:49 +01:00
|
|
|
"A value is trying to be set on a copy of a slice from a DataFrame.\n",
|
|
|
|
"Try using .loc[row_indexer,col_indexer] = value instead\n",
|
|
|
|
"\n",
|
|
|
|
"See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n",
|
|
|
|
" df[column_name] = pd.to_datetime(df[column_name], utc = True, format = 'ISO8601')\n"
|
|
|
|
]
|
|
|
|
}
|
|
|
|
],
|
2024-02-05 12:51:35 +01:00
|
|
|
"source": [
|
|
|
|
"df1_campaigns_information = preprocessing_campaigns_area(campaign_stats = df1_campaign_stats, campaigns = df1_campaigns)"
|
|
|
|
]
|
|
|
|
},
|
|
|
|
{
|
|
|
|
"cell_type": "code",
|
2024-02-05 22:03:49 +01:00
|
|
|
"execution_count": 20,
|
2024-02-05 12:51:35 +01:00
|
|
|
"id": "c24457e7-3cad-451a-a65b-7373b656bd6e",
|
2024-02-05 22:03:49 +01:00
|
|
|
"metadata": {
|
|
|
|
"scrolled": true
|
|
|
|
},
|
|
|
|
"outputs": [
|
|
|
|
{
|
|
|
|
"data": {
|
|
|
|
"text/html": [
|
|
|
|
"<div>\n",
|
|
|
|
"<style scoped>\n",
|
|
|
|
" .dataframe tbody tr th:only-of-type {\n",
|
|
|
|
" vertical-align: middle;\n",
|
|
|
|
" }\n",
|
|
|
|
"\n",
|
|
|
|
" .dataframe tbody tr th {\n",
|
|
|
|
" vertical-align: top;\n",
|
|
|
|
" }\n",
|
|
|
|
"\n",
|
|
|
|
" .dataframe thead th {\n",
|
|
|
|
" text-align: right;\n",
|
|
|
|
" }\n",
|
|
|
|
"</style>\n",
|
|
|
|
"<table border=\"1\" class=\"dataframe\">\n",
|
|
|
|
" <thead>\n",
|
|
|
|
" <tr style=\"text-align: right;\">\n",
|
|
|
|
" <th></th>\n",
|
|
|
|
" <th>id</th>\n",
|
|
|
|
" <th>customer_id</th>\n",
|
|
|
|
" <th>opened_at</th>\n",
|
|
|
|
" <th>sent_at</th>\n",
|
|
|
|
" <th>delivered_at</th>\n",
|
|
|
|
" <th>campaign_name</th>\n",
|
|
|
|
" <th>campaign_service_id</th>\n",
|
|
|
|
" <th>campaign_sent_at</th>\n",
|
|
|
|
" </tr>\n",
|
|
|
|
" </thead>\n",
|
|
|
|
" <tbody>\n",
|
|
|
|
" <tr>\n",
|
|
|
|
" <th>0</th>\n",
|
|
|
|
" <td>19793</td>\n",
|
|
|
|
" <td>112597</td>\n",
|
|
|
|
" <td>NaT</td>\n",
|
|
|
|
" <td>2021-03-28 16:01:09+00:00</td>\n",
|
|
|
|
" <td>2021-03-28 16:24:18+00:00</td>\n",
|
|
|
|
" <td>Le Mucem chez vous, gardons le lien #22</td>\n",
|
|
|
|
" <td>404</td>\n",
|
|
|
|
" <td>2021-03-27 23:00:00+00:00</td>\n",
|
|
|
|
" </tr>\n",
|
|
|
|
" <tr>\n",
|
|
|
|
" <th>1</th>\n",
|
|
|
|
" <td>14211</td>\n",
|
|
|
|
" <td>113666</td>\n",
|
|
|
|
" <td>NaT</td>\n",
|
|
|
|
" <td>2021-03-28 16:01:09+00:00</td>\n",
|
|
|
|
" <td>2021-03-28 16:21:02+00:00</td>\n",
|
|
|
|
" <td>Le Mucem chez vous, gardons le lien #22</td>\n",
|
|
|
|
" <td>404</td>\n",
|
|
|
|
" <td>2021-03-27 23:00:00+00:00</td>\n",
|
|
|
|
" </tr>\n",
|
|
|
|
" <tr>\n",
|
|
|
|
" <th>2</th>\n",
|
|
|
|
" <td>13150</td>\n",
|
|
|
|
" <td>280561</td>\n",
|
|
|
|
" <td>NaT</td>\n",
|
|
|
|
" <td>2021-03-28 16:00:59+00:00</td>\n",
|
|
|
|
" <td>2021-03-28 16:08:45+00:00</td>\n",
|
|
|
|
" <td>Le Mucem chez vous, gardons le lien #22</td>\n",
|
|
|
|
" <td>404</td>\n",
|
|
|
|
" <td>2021-03-27 23:00:00+00:00</td>\n",
|
|
|
|
" </tr>\n",
|
|
|
|
" <tr>\n",
|
|
|
|
" <th>3</th>\n",
|
|
|
|
" <td>7073</td>\n",
|
|
|
|
" <td>101007</td>\n",
|
|
|
|
" <td>2021-03-28 18:11:06+00:00</td>\n",
|
|
|
|
" <td>2021-03-28 16:00:59+00:00</td>\n",
|
|
|
|
" <td>2021-03-28 16:09:47+00:00</td>\n",
|
|
|
|
" <td>Le Mucem chez vous, gardons le lien #22</td>\n",
|
|
|
|
" <td>404</td>\n",
|
|
|
|
" <td>2021-03-27 23:00:00+00:00</td>\n",
|
|
|
|
" </tr>\n",
|
|
|
|
" <tr>\n",
|
|
|
|
" <th>4</th>\n",
|
|
|
|
" <td>5175</td>\n",
|
|
|
|
" <td>103972</td>\n",
|
|
|
|
" <td>NaT</td>\n",
|
|
|
|
" <td>2021-03-28 16:01:06+00:00</td>\n",
|
|
|
|
" <td>2021-03-28 16:05:03+00:00</td>\n",
|
|
|
|
" <td>Le Mucem chez vous, gardons le lien #22</td>\n",
|
|
|
|
" <td>404</td>\n",
|
|
|
|
" <td>2021-03-27 23:00:00+00:00</td>\n",
|
|
|
|
" </tr>\n",
|
|
|
|
" </tbody>\n",
|
|
|
|
"</table>\n",
|
|
|
|
"</div>"
|
|
|
|
],
|
|
|
|
"text/plain": [
|
2024-02-05 22:10:07 +01:00
|
|
|
" id customer_id opened_at sent_at \\\n",
|
|
|
|
"0 19793 112597 NaT 2021-03-28 16:01:09+00:00 \n",
|
|
|
|
"1 14211 113666 NaT 2021-03-28 16:01:09+00:00 \n",
|
|
|
|
"2 13150 280561 NaT 2021-03-28 16:00:59+00:00 \n",
|
|
|
|
"3 7073 101007 2021-03-28 18:11:06+00:00 2021-03-28 16:00:59+00:00 \n",
|
|
|
|
"4 5175 103972 NaT 2021-03-28 16:01:06+00:00 \n",
|
2024-02-05 22:03:49 +01:00
|
|
|
"\n",
|
2024-02-05 22:10:07 +01:00
|
|
|
" delivered_at campaign_name \\\n",
|
|
|
|
"0 2021-03-28 16:24:18+00:00 Le Mucem chez vous, gardons le lien #22 \n",
|
|
|
|
"1 2021-03-28 16:21:02+00:00 Le Mucem chez vous, gardons le lien #22 \n",
|
|
|
|
"2 2021-03-28 16:08:45+00:00 Le Mucem chez vous, gardons le lien #22 \n",
|
|
|
|
"3 2021-03-28 16:09:47+00:00 Le Mucem chez vous, gardons le lien #22 \n",
|
|
|
|
"4 2021-03-28 16:05:03+00:00 Le Mucem chez vous, gardons le lien #22 \n",
|
2024-02-05 22:03:49 +01:00
|
|
|
"\n",
|
2024-02-05 22:10:07 +01:00
|
|
|
" campaign_service_id campaign_sent_at \n",
|
|
|
|
"0 404 2021-03-27 23:00:00+00:00 \n",
|
|
|
|
"1 404 2021-03-27 23:00:00+00:00 \n",
|
|
|
|
"2 404 2021-03-27 23:00:00+00:00 \n",
|
|
|
|
"3 404 2021-03-27 23:00:00+00:00 \n",
|
|
|
|
"4 404 2021-03-27 23:00:00+00:00 "
|
2024-02-05 22:03:49 +01:00
|
|
|
]
|
|
|
|
},
|
|
|
|
"execution_count": 20,
|
|
|
|
"metadata": {},
|
|
|
|
"output_type": "execute_result"
|
|
|
|
}
|
|
|
|
],
|
2024-02-05 12:51:35 +01:00
|
|
|
"source": [
|
2024-02-05 22:10:07 +01:00
|
|
|
"df1_campaigns_information.head()"
|
2024-02-05 12:51:35 +01:00
|
|
|
]
|
|
|
|
},
|
|
|
|
{
|
|
|
|
"cell_type": "code",
|
2024-02-05 22:03:49 +01:00
|
|
|
"execution_count": 21,
|
2024-02-05 12:51:35 +01:00
|
|
|
"id": "e2c88552-b863-47a2-be23-8d2898fb28bc",
|
|
|
|
"metadata": {},
|
|
|
|
"outputs": [],
|
|
|
|
"source": [
|
2024-02-05 22:03:49 +01:00
|
|
|
"def campaigns_kpi_function(campaigns_information = None):\n",
|
2024-02-05 12:51:35 +01:00
|
|
|
" # Nombre de campagnes de mails\n",
|
|
|
|
" nb_campaigns = campaigns_information[['customer_id', 'campaign_name']].groupby('customer_id').count().reset_index()\n",
|
|
|
|
" nb_campaigns.rename(columns = {'campaign_name' : 'nb_campaigns'}, inplace = True)\n",
|
|
|
|
" # Temps d'ouverture en min moyen \n",
|
|
|
|
" campaigns_information['time_to_open'] = campaigns_information['opened_at'] - campaigns_information['delivered_at']\n",
|
|
|
|
" time_to_open = campaigns_information[['customer_id', 'time_to_open']].groupby('customer_id').mean().reset_index()\n",
|
|
|
|
"\n",
|
|
|
|
" # Nombre de mail ouvert \n",
|
|
|
|
" opened_campaign = campaigns_information[['customer_id', 'campaign_name', 'opened_at']]\n",
|
|
|
|
" opened_campaign.dropna(subset=['opened_at'], inplace=True)\n",
|
|
|
|
" opened_campaign = opened_campaign[['customer_id', 'campaign_name']].groupby('customer_id').count().reset_index()\n",
|
|
|
|
" opened_campaign.rename(columns = {'campaign_name' : 'nb_campaigns_opened' }, inplace = True)\n",
|
|
|
|
"\n",
|
|
|
|
" # Fusion des indicateurs\n",
|
|
|
|
" campaigns_reduced = pd.merge(nb_campaigns, opened_campaign, on = 'customer_id', how = 'left')\n",
|
|
|
|
" campaigns_reduced = pd.merge(campaigns_reduced, time_to_open, on = 'customer_id', how = 'left')\n",
|
|
|
|
"\n",
|
|
|
|
" # Remplir les NaN : nb_campaigns_opened\n",
|
|
|
|
" campaigns_reduced['nb_campaigns_opened'].fillna(0, inplace=True)\n",
|
|
|
|
"\n",
|
|
|
|
" # Remplir les NaT : time_to_open (??)\n",
|
|
|
|
"\n",
|
|
|
|
" return campaigns_reduced\n",
|
|
|
|
" "
|
|
|
|
]
|
|
|
|
},
|
|
|
|
{
|
|
|
|
"cell_type": "code",
|
2024-02-05 22:03:49 +01:00
|
|
|
"execution_count": 22,
|
2024-02-05 12:51:35 +01:00
|
|
|
"id": "24537647-bc29-4777-9848-ac4120a4aa60",
|
|
|
|
"metadata": {},
|
2024-02-05 22:03:49 +01:00
|
|
|
"outputs": [
|
|
|
|
{
|
|
|
|
"name": "stderr",
|
|
|
|
"output_type": "stream",
|
|
|
|
"text": [
|
2024-02-05 22:10:07 +01:00
|
|
|
"/tmp/ipykernel_3658/3700263836.py:11: SettingWithCopyWarning: \n",
|
2024-02-05 22:03:49 +01:00
|
|
|
"A value is trying to be set on a copy of a slice from a DataFrame\n",
|
|
|
|
"\n",
|
|
|
|
"See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n",
|
|
|
|
" opened_campaign.dropna(subset=['opened_at'], inplace=True)\n"
|
|
|
|
]
|
|
|
|
}
|
|
|
|
],
|
2024-02-05 12:51:35 +01:00
|
|
|
"source": [
|
2024-02-05 22:03:49 +01:00
|
|
|
"df1_campaigns_kpi = campaigns_kpi_function(campaigns_information = df1_campaigns_information) "
|
2024-02-05 12:51:35 +01:00
|
|
|
]
|
|
|
|
},
|
|
|
|
{
|
|
|
|
"cell_type": "code",
|
2024-02-05 22:03:49 +01:00
|
|
|
"execution_count": 23,
|
2024-02-05 12:51:35 +01:00
|
|
|
"id": "6be2a9a6-056b-4e19-8c26-a18ba3df36b3",
|
|
|
|
"metadata": {},
|
2024-02-05 22:03:49 +01:00
|
|
|
"outputs": [
|
|
|
|
{
|
|
|
|
"data": {
|
|
|
|
"text/html": [
|
|
|
|
"<div>\n",
|
|
|
|
"<style scoped>\n",
|
|
|
|
" .dataframe tbody tr th:only-of-type {\n",
|
|
|
|
" vertical-align: middle;\n",
|
|
|
|
" }\n",
|
|
|
|
"\n",
|
|
|
|
" .dataframe tbody tr th {\n",
|
|
|
|
" vertical-align: top;\n",
|
|
|
|
" }\n",
|
|
|
|
"\n",
|
|
|
|
" .dataframe thead th {\n",
|
|
|
|
" text-align: right;\n",
|
|
|
|
" }\n",
|
|
|
|
"</style>\n",
|
|
|
|
"<table border=\"1\" class=\"dataframe\">\n",
|
|
|
|
" <thead>\n",
|
|
|
|
" <tr style=\"text-align: right;\">\n",
|
|
|
|
" <th></th>\n",
|
|
|
|
" <th>customer_id</th>\n",
|
|
|
|
" <th>nb_campaigns</th>\n",
|
|
|
|
" <th>nb_campaigns_opened</th>\n",
|
|
|
|
" <th>time_to_open</th>\n",
|
|
|
|
" </tr>\n",
|
|
|
|
" </thead>\n",
|
|
|
|
" <tbody>\n",
|
|
|
|
" <tr>\n",
|
|
|
|
" <th>0</th>\n",
|
|
|
|
" <td>2</td>\n",
|
|
|
|
" <td>4</td>\n",
|
|
|
|
" <td>0.0</td>\n",
|
|
|
|
" <td>NaT</td>\n",
|
|
|
|
" </tr>\n",
|
|
|
|
" <tr>\n",
|
|
|
|
" <th>1</th>\n",
|
|
|
|
" <td>3</td>\n",
|
|
|
|
" <td>222</td>\n",
|
|
|
|
" <td>124.0</td>\n",
|
|
|
|
" <td>1 days 00:28:30.169354838</td>\n",
|
|
|
|
" </tr>\n",
|
|
|
|
" <tr>\n",
|
|
|
|
" <th>2</th>\n",
|
|
|
|
" <td>4</td>\n",
|
|
|
|
" <td>7</td>\n",
|
|
|
|
" <td>7.0</td>\n",
|
|
|
|
" <td>1 days 04:31:01.428571428</td>\n",
|
|
|
|
" </tr>\n",
|
|
|
|
" <tr>\n",
|
|
|
|
" <th>3</th>\n",
|
|
|
|
" <td>5</td>\n",
|
|
|
|
" <td>4</td>\n",
|
|
|
|
" <td>0.0</td>\n",
|
|
|
|
" <td>NaT</td>\n",
|
|
|
|
" </tr>\n",
|
|
|
|
" <tr>\n",
|
|
|
|
" <th>4</th>\n",
|
|
|
|
" <td>6</td>\n",
|
|
|
|
" <td>20</td>\n",
|
|
|
|
" <td>0.0</td>\n",
|
|
|
|
" <td>NaT</td>\n",
|
|
|
|
" </tr>\n",
|
|
|
|
" </tbody>\n",
|
|
|
|
"</table>\n",
|
|
|
|
"</div>"
|
|
|
|
],
|
|
|
|
"text/plain": [
|
2024-02-05 22:10:07 +01:00
|
|
|
" customer_id nb_campaigns nb_campaigns_opened time_to_open\n",
|
|
|
|
"0 2 4 0.0 NaT\n",
|
|
|
|
"1 3 222 124.0 1 days 00:28:30.169354838\n",
|
|
|
|
"2 4 7 7.0 1 days 04:31:01.428571428\n",
|
|
|
|
"3 5 4 0.0 NaT\n",
|
|
|
|
"4 6 20 0.0 NaT"
|
2024-02-05 22:03:49 +01:00
|
|
|
]
|
|
|
|
},
|
|
|
|
"execution_count": 23,
|
|
|
|
"metadata": {},
|
|
|
|
"output_type": "execute_result"
|
|
|
|
}
|
|
|
|
],
|
2024-02-05 12:51:35 +01:00
|
|
|
"source": [
|
2024-02-05 22:10:07 +01:00
|
|
|
"df1_campaigns_kpi.head()"
|
2024-02-05 12:51:35 +01:00
|
|
|
]
|
|
|
|
},
|
|
|
|
{
|
|
|
|
"cell_type": "markdown",
|
|
|
|
"id": "56520a97-ede8-4920-a211-3b5b136af33d",
|
|
|
|
"metadata": {},
|
|
|
|
"source": [
|
|
|
|
"## Create Products Table"
|
|
|
|
]
|
|
|
|
},
|
|
|
|
{
|
|
|
|
"cell_type": "markdown",
|
|
|
|
"id": "9782e9d3-ba20-46bf-8562-bd0969972ddc",
|
|
|
|
"metadata": {},
|
|
|
|
"source": [
|
|
|
|
"Some useful functions"
|
|
|
|
]
|
|
|
|
},
|
|
|
|
{
|
|
|
|
"cell_type": "code",
|
2024-02-05 22:03:49 +01:00
|
|
|
"execution_count": 24,
|
2024-02-05 12:51:35 +01:00
|
|
|
"id": "30488a40-1b38-4b9a-9d3b-26a0597c5e6d",
|
|
|
|
"metadata": {},
|
|
|
|
"outputs": [],
|
|
|
|
"source": [
|
|
|
|
"BUCKET = \"bdc2324-data\"\n",
|
|
|
|
"directory_path = '1'"
|
|
|
|
]
|
|
|
|
},
|
|
|
|
{
|
|
|
|
"cell_type": "code",
|
2024-02-05 22:03:49 +01:00
|
|
|
"execution_count": 25,
|
2024-02-05 12:51:35 +01:00
|
|
|
"id": "607eb4b4-eed9-4b50-b823-f75c116dd37c",
|
|
|
|
"metadata": {},
|
|
|
|
"outputs": [],
|
|
|
|
"source": [
|
|
|
|
"def display_databases(file_name):\n",
|
|
|
|
" \"\"\"\n",
|
|
|
|
" This function returns the file from s3 storage\n",
|
|
|
|
" \"\"\"\n",
|
|
|
|
" file_path = BUCKET + \"/\" + directory_path + \"/\" + file_name\n",
|
|
|
|
" print(\"File path : \", file_path)\n",
|
|
|
|
" with fs.open(file_path, mode=\"rb\") as file_in:\n",
|
|
|
|
" df = pd.read_csv(file_in, sep=\",\")\n",
|
|
|
|
" \n",
|
|
|
|
" print(\"Shape : \", df.shape)\n",
|
|
|
|
" return df\n",
|
|
|
|
"\n",
|
|
|
|
"\n",
|
|
|
|
"def remove_horodates(df):\n",
|
|
|
|
" \"\"\"\n",
|
|
|
|
" this function remove horodate columns like created_at and updated_at\n",
|
|
|
|
" \"\"\"\n",
|
|
|
|
" df = df.drop(columns = [\"created_at\", \"updated_at\"])\n",
|
|
|
|
" return df\n",
|
|
|
|
"\n",
|
|
|
|
"\n",
|
|
|
|
"def order_columns_id(df):\n",
|
|
|
|
" \"\"\"\n",
|
|
|
|
" this function puts all id columns at the beginning in order to read the dataset easier\n",
|
|
|
|
" \"\"\"\n",
|
|
|
|
" substring = 'id'\n",
|
|
|
|
" id_columns = [col for col in df.columns if substring in col]\n",
|
|
|
|
" remaining_col = [col for col in df.columns if substring not in col]\n",
|
|
|
|
" new_order = id_columns + remaining_col\n",
|
|
|
|
" return df[new_order]\n",
|
|
|
|
"\n",
|
|
|
|
"\n",
|
|
|
|
"def process_df_2(df):\n",
|
|
|
|
" \"\"\"\n",
|
|
|
|
" This function organizes dataframe\n",
|
|
|
|
" \"\"\"\n",
|
|
|
|
" df = remove_horodates(df)\n",
|
|
|
|
" print(\"Number of columns : \", len(df.columns))\n",
|
|
|
|
" df = order_columns_id(df)\n",
|
|
|
|
" print(\"Columns : \", df.columns)\n",
|
|
|
|
" return df\n",
|
|
|
|
"\n",
|
|
|
|
"def load_dataset(name):\n",
|
|
|
|
" \"\"\"\n",
|
|
|
|
" This function loads csv file\n",
|
|
|
|
" \"\"\"\n",
|
|
|
|
" df = display_databases(name)\n",
|
|
|
|
" df = process_df_2(df)\n",
|
|
|
|
" # drop na :\n",
|
|
|
|
" #df = df.dropna(axis=1, thresh=len(df))\n",
|
|
|
|
" # if identifier in table : delete it\n",
|
|
|
|
" if 'identifier' in df.columns:\n",
|
|
|
|
" df = df.drop(columns = 'identifier')\n",
|
|
|
|
" return df"
|
|
|
|
]
|
|
|
|
},
|
|
|
|
{
|
|
|
|
"cell_type": "markdown",
|
|
|
|
"id": "d23f28c0-bc95-438b-8d14-5b7bb6e267bd",
|
|
|
|
"metadata": {},
|
|
|
|
"source": [
|
|
|
|
"Create theme tables"
|
|
|
|
]
|
|
|
|
},
|
|
|
|
{
|
|
|
|
"cell_type": "code",
|
2024-02-05 22:03:49 +01:00
|
|
|
"execution_count": 26,
|
2024-02-05 12:51:35 +01:00
|
|
|
"id": "350b09b9-451f-4d47-81fe-f34b892db027",
|
|
|
|
"metadata": {},
|
|
|
|
"outputs": [],
|
|
|
|
"source": [
|
|
|
|
"def create_products_table():\n",
|
|
|
|
" # first merge products and categories\n",
|
|
|
|
" print(\"first merge products and categories\")\n",
|
|
|
|
" products = load_dataset(\"1products.csv\")\n",
|
|
|
|
" categories = load_dataset(\"1categories.csv\")\n",
|
|
|
|
" # Drop useless columns\n",
|
|
|
|
" products = products.drop(columns = ['apply_price', 'extra_field', 'amount_consumption'])\n",
|
|
|
|
" categories = categories.drop(columns = ['extra_field', 'quota'])\n",
|
|
|
|
"\n",
|
|
|
|
" #Merge\n",
|
|
|
|
" products_theme = products.merge(categories, how = 'left', left_on = 'category_id',\n",
|
|
|
|
" right_on = 'id', suffixes=('_products', '_categories'))\n",
|
|
|
|
" products_theme = products_theme.rename(columns = {\"name\" : \"name_categories\"})\n",
|
|
|
|
" \n",
|
|
|
|
" # Second merge products_theme and type of categories\n",
|
|
|
|
" print(\"Second merge products_theme and type of categories\")\n",
|
|
|
|
" type_of_categories = load_dataset(\"1type_of_categories.csv\")\n",
|
|
|
|
" type_of_categories = type_of_categories.drop(columns = 'id')\n",
|
|
|
|
" products_theme = products_theme.merge(type_of_categories, how = 'left', left_on = 'category_id',\n",
|
|
|
|
" right_on = 'category_id' )\n",
|
|
|
|
"\n",
|
|
|
|
" # Index cleaning\n",
|
|
|
|
" products_theme = products_theme.drop(columns = ['id_categories'])\n",
|
|
|
|
" products_theme = order_columns_id(products_theme)\n",
|
|
|
|
" return products_theme\n",
|
|
|
|
"\n",
|
|
|
|
"\n",
|
|
|
|
"def create_events_table():\n",
|
|
|
|
" # first merge events and seasons : \n",
|
|
|
|
" print(\"first merge events and seasons : \")\n",
|
|
|
|
" events = load_dataset(\"1events.csv\")\n",
|
|
|
|
" seasons = load_dataset(\"1seasons.csv\")\n",
|
|
|
|
"\n",
|
|
|
|
" # Drop useless columns\n",
|
|
|
|
" events = events.drop(columns = ['manual_added', 'is_display'])\n",
|
|
|
|
" seasons = seasons.drop(columns = ['start_date_time'])\n",
|
|
|
|
" \n",
|
|
|
|
" events_theme = events.merge(seasons, how = 'left', left_on = 'season_id', right_on = 'id', suffixes=('_events', '_seasons'))\n",
|
|
|
|
"\n",
|
|
|
|
" # Secondly merge events_theme and event_types\n",
|
|
|
|
" print(\"Secondly merge events_theme and event_types : \")\n",
|
|
|
|
" event_types = load_dataset(\"1event_types.csv\")\n",
|
|
|
|
" event_types = event_types.drop(columns = ['fidelity_delay'])\n",
|
|
|
|
" \n",
|
|
|
|
" events_theme = events_theme.merge(event_types, how = 'left', left_on = 'event_type_id', right_on = 'id', suffixes=('_events', '_event_type'))\n",
|
|
|
|
" events_theme = events_theme.rename(columns = {\"name\" : \"name_event_types\"})\n",
|
|
|
|
" events_theme = events_theme.drop(columns = 'id')\n",
|
|
|
|
"\n",
|
|
|
|
" # thirdly merge events_theme and facilities\n",
|
|
|
|
" print(\"thirdly merge events_theme and facilities : \")\n",
|
|
|
|
" facilities = load_dataset(\"1facilities.csv\")\n",
|
|
|
|
" facilities = facilities.drop(columns = ['fixed_capacity'])\n",
|
|
|
|
" \n",
|
|
|
|
" events_theme = events_theme.merge(facilities, how = 'left', left_on = 'facility_id', right_on = 'id', suffixes=('_events', '_facility'))\n",
|
|
|
|
" events_theme = events_theme.rename(columns = {\"name\" : \"name_facilities\", \"id_events\" : \"event_id\"})\n",
|
|
|
|
" events_theme = events_theme.drop(columns = 'id')\n",
|
|
|
|
"\n",
|
|
|
|
" # Index cleaning\n",
|
|
|
|
" events_theme = events_theme.drop(columns = ['id_seasons'])\n",
|
|
|
|
" events_theme = order_columns_id(events_theme)\n",
|
|
|
|
" return events_theme\n",
|
|
|
|
"\n",
|
|
|
|
"\n",
|
|
|
|
"def create_representations_table():\n",
|
|
|
|
" representations = load_dataset(\"1representations.csv\")\n",
|
|
|
|
" representations = representations.drop(columns = ['serial', 'open', 'satisfaction', 'is_display', 'expected_filling',\n",
|
|
|
|
" 'max_filling', 'extra_field', 'start_date_time', 'end_date_time', 'name',\n",
|
|
|
|
" 'representation_type_id'])\n",
|
|
|
|
" \n",
|
|
|
|
" representations_capacity = load_dataset(\"1representation_category_capacities.csv\")\n",
|
|
|
|
" representations_capacity = representations_capacity.drop(columns = ['expected_filling', 'max_filling'])\n",
|
|
|
|
"\n",
|
|
|
|
" representations_theme = representations.merge(representations_capacity, how='left',\n",
|
|
|
|
" left_on='id', right_on='representation_id',\n",
|
|
|
|
" suffixes=('_representation', '_representation_cap'))\n",
|
|
|
|
" # index cleaning\n",
|
|
|
|
" representations_theme = representations_theme.drop(columns = [\"id_representation\"])\n",
|
|
|
|
" representations_theme = order_columns_id(representations_theme)\n",
|
|
|
|
" return representations_theme"
|
|
|
|
]
|
|
|
|
},
|
|
|
|
{
|
|
|
|
"cell_type": "code",
|
2024-02-05 22:03:49 +01:00
|
|
|
"execution_count": 27,
|
2024-02-05 12:51:35 +01:00
|
|
|
"id": "0fccc8ef-e575-4857-a401-94a7274394df",
|
|
|
|
"metadata": {},
|
2024-02-04 16:02:50 +01:00
|
|
|
"outputs": [
|
|
|
|
{
|
2024-02-05 12:51:35 +01:00
|
|
|
"name": "stdout",
|
2024-02-04 16:02:50 +01:00
|
|
|
"output_type": "stream",
|
|
|
|
"text": [
|
2024-02-05 12:51:35 +01:00
|
|
|
"first merge products and categories\n",
|
|
|
|
"File path : bdc2324-data/1/1products.csv\n",
|
|
|
|
"Shape : (94803, 14)\n",
|
|
|
|
"Number of columns : 12\n",
|
|
|
|
"Columns : Index(['id', 'representation_id', 'pricing_formula_id', 'category_id',\n",
|
|
|
|
" 'products_group_id', 'product_pack_id', 'identifier', 'amount',\n",
|
|
|
|
" 'is_full_price', 'apply_price', 'extra_field', 'amount_consumption'],\n",
|
|
|
|
" dtype='object')\n",
|
|
|
|
"File path : bdc2324-data/1/1categories.csv\n",
|
|
|
|
"Shape : (27, 7)\n",
|
|
|
|
"Number of columns : 5\n",
|
|
|
|
"Columns : Index(['id', 'identifier', 'name', 'extra_field', 'quota'], dtype='object')\n",
|
|
|
|
"Second merge products_theme and type of categories\n",
|
|
|
|
"File path : bdc2324-data/1/1type_of_categories.csv\n",
|
|
|
|
"Shape : (5, 6)\n",
|
|
|
|
"Number of columns : 4\n",
|
|
|
|
"Columns : Index(['id', 'type_of_id', 'category_id', 'identifier'], dtype='object')\n"
|
2024-02-04 16:02:50 +01:00
|
|
|
]
|
2024-02-05 12:51:35 +01:00
|
|
|
},
|
|
|
|
{
|
|
|
|
"data": {
|
|
|
|
"text/html": [
|
|
|
|
"<div>\n",
|
|
|
|
"<style scoped>\n",
|
|
|
|
" .dataframe tbody tr th:only-of-type {\n",
|
|
|
|
" vertical-align: middle;\n",
|
|
|
|
" }\n",
|
|
|
|
"\n",
|
|
|
|
" .dataframe tbody tr th {\n",
|
|
|
|
" vertical-align: top;\n",
|
|
|
|
" }\n",
|
|
|
|
"\n",
|
|
|
|
" .dataframe thead th {\n",
|
|
|
|
" text-align: right;\n",
|
|
|
|
" }\n",
|
|
|
|
"</style>\n",
|
|
|
|
"<table border=\"1\" class=\"dataframe\">\n",
|
|
|
|
" <thead>\n",
|
|
|
|
" <tr style=\"text-align: right;\">\n",
|
|
|
|
" <th></th>\n",
|
|
|
|
" <th>id_products</th>\n",
|
|
|
|
" <th>representation_id</th>\n",
|
|
|
|
" <th>pricing_formula_id</th>\n",
|
|
|
|
" <th>category_id</th>\n",
|
|
|
|
" <th>products_group_id</th>\n",
|
|
|
|
" <th>product_pack_id</th>\n",
|
|
|
|
" <th>type_of_id</th>\n",
|
|
|
|
" <th>amount</th>\n",
|
|
|
|
" <th>is_full_price</th>\n",
|
|
|
|
" <th>name_categories</th>\n",
|
|
|
|
" </tr>\n",
|
|
|
|
" </thead>\n",
|
|
|
|
" <tbody>\n",
|
|
|
|
" <tr>\n",
|
|
|
|
" <th>0</th>\n",
|
|
|
|
" <td>10682</td>\n",
|
|
|
|
" <td>914</td>\n",
|
|
|
|
" <td>114</td>\n",
|
|
|
|
" <td>41</td>\n",
|
|
|
|
" <td>10655</td>\n",
|
|
|
|
" <td>1</td>\n",
|
|
|
|
" <td>NaN</td>\n",
|
|
|
|
" <td>9.0</td>\n",
|
|
|
|
" <td>False</td>\n",
|
|
|
|
" <td>indiv activité tr</td>\n",
|
|
|
|
" </tr>\n",
|
|
|
|
" <tr>\n",
|
|
|
|
" <th>1</th>\n",
|
|
|
|
" <td>478</td>\n",
|
|
|
|
" <td>273</td>\n",
|
|
|
|
" <td>131</td>\n",
|
|
|
|
" <td>1</td>\n",
|
|
|
|
" <td>471</td>\n",
|
|
|
|
" <td>1</td>\n",
|
|
|
|
" <td>12.0</td>\n",
|
|
|
|
" <td>9.5</td>\n",
|
|
|
|
" <td>False</td>\n",
|
|
|
|
" <td>indiv entrées tp</td>\n",
|
|
|
|
" </tr>\n",
|
|
|
|
" <tr>\n",
|
|
|
|
" <th>2</th>\n",
|
|
|
|
" <td>20873</td>\n",
|
|
|
|
" <td>275</td>\n",
|
|
|
|
" <td>137</td>\n",
|
|
|
|
" <td>1</td>\n",
|
|
|
|
" <td>20825</td>\n",
|
|
|
|
" <td>1</td>\n",
|
|
|
|
" <td>12.0</td>\n",
|
|
|
|
" <td>11.5</td>\n",
|
|
|
|
" <td>False</td>\n",
|
|
|
|
" <td>indiv entrées tp</td>\n",
|
|
|
|
" </tr>\n",
|
|
|
|
" <tr>\n",
|
|
|
|
" <th>3</th>\n",
|
|
|
|
" <td>157142</td>\n",
|
|
|
|
" <td>82519</td>\n",
|
|
|
|
" <td>9</td>\n",
|
|
|
|
" <td>5</td>\n",
|
|
|
|
" <td>156773</td>\n",
|
|
|
|
" <td>1</td>\n",
|
|
|
|
" <td>NaN</td>\n",
|
|
|
|
" <td>8.0</td>\n",
|
|
|
|
" <td>False</td>\n",
|
|
|
|
" <td>indiv entrées tr</td>\n",
|
|
|
|
" </tr>\n",
|
|
|
|
" <tr>\n",
|
|
|
|
" <th>4</th>\n",
|
|
|
|
" <td>1341</td>\n",
|
|
|
|
" <td>9</td>\n",
|
|
|
|
" <td>93</td>\n",
|
|
|
|
" <td>1</td>\n",
|
|
|
|
" <td>1175</td>\n",
|
|
|
|
" <td>1</td>\n",
|
|
|
|
" <td>12.0</td>\n",
|
|
|
|
" <td>8.5</td>\n",
|
|
|
|
" <td>False</td>\n",
|
|
|
|
" <td>indiv entrées tp</td>\n",
|
|
|
|
" </tr>\n",
|
|
|
|
" </tbody>\n",
|
|
|
|
"</table>\n",
|
|
|
|
"</div>"
|
|
|
|
],
|
|
|
|
"text/plain": [
|
|
|
|
" id_products representation_id pricing_formula_id category_id \\\n",
|
|
|
|
"0 10682 914 114 41 \n",
|
|
|
|
"1 478 273 131 1 \n",
|
|
|
|
"2 20873 275 137 1 \n",
|
|
|
|
"3 157142 82519 9 5 \n",
|
|
|
|
"4 1341 9 93 1 \n",
|
|
|
|
"\n",
|
|
|
|
" products_group_id product_pack_id type_of_id amount is_full_price \\\n",
|
|
|
|
"0 10655 1 NaN 9.0 False \n",
|
|
|
|
"1 471 1 12.0 9.5 False \n",
|
|
|
|
"2 20825 1 12.0 11.5 False \n",
|
|
|
|
"3 156773 1 NaN 8.0 False \n",
|
|
|
|
"4 1175 1 12.0 8.5 False \n",
|
|
|
|
"\n",
|
|
|
|
" name_categories \n",
|
|
|
|
"0 indiv activité tr \n",
|
|
|
|
"1 indiv entrées tp \n",
|
|
|
|
"2 indiv entrées tp \n",
|
|
|
|
"3 indiv entrées tr \n",
|
|
|
|
"4 indiv entrées tp "
|
|
|
|
]
|
|
|
|
},
|
2024-02-05 22:03:49 +01:00
|
|
|
"execution_count": 27,
|
2024-02-05 12:51:35 +01:00
|
|
|
"metadata": {},
|
|
|
|
"output_type": "execute_result"
|
2024-02-04 16:02:50 +01:00
|
|
|
}
|
|
|
|
],
|
|
|
|
"source": [
|
2024-02-05 12:51:35 +01:00
|
|
|
"products_theme = create_products_table()\n",
|
|
|
|
"products_theme.head()"
|
2024-02-04 16:02:50 +01:00
|
|
|
]
|
|
|
|
},
|
|
|
|
{
|
|
|
|
"cell_type": "code",
|
2024-02-05 22:03:49 +01:00
|
|
|
"execution_count": 28,
|
2024-02-05 12:51:35 +01:00
|
|
|
"id": "779d8aaf-6668-4f66-8852-847304407ea3",
|
2024-02-04 16:02:50 +01:00
|
|
|
"metadata": {},
|
|
|
|
"outputs": [
|
2024-02-05 12:51:35 +01:00
|
|
|
{
|
|
|
|
"name": "stdout",
|
|
|
|
"output_type": "stream",
|
|
|
|
"text": [
|
|
|
|
"first merge events and seasons : \n",
|
|
|
|
"File path : bdc2324-data/1/1events.csv\n",
|
|
|
|
"Shape : (1232, 12)\n",
|
|
|
|
"Number of columns : 10\n",
|
|
|
|
"Columns : Index(['id', 'season_id', 'facility_id', 'event_type_id', 'event_type_key_id',\n",
|
|
|
|
" 'facility_key_id', 'identifier', 'name', 'manual_added', 'is_display'],\n",
|
|
|
|
" dtype='object')\n",
|
|
|
|
"File path : bdc2324-data/1/1seasons.csv\n",
|
|
|
|
"Shape : (13, 6)\n",
|
|
|
|
"Number of columns : 4\n",
|
|
|
|
"Columns : Index(['id', 'identifier', 'name', 'start_date_time'], dtype='object')\n",
|
|
|
|
"Secondly merge events_theme and event_types : \n",
|
|
|
|
"File path : bdc2324-data/1/1event_types.csv\n",
|
|
|
|
"Shape : (9, 6)\n",
|
|
|
|
"Number of columns : 4\n",
|
|
|
|
"Columns : Index(['id', 'fidelity_delay', 'identifier', 'name'], dtype='object')\n",
|
|
|
|
"thirdly merge events_theme and facilities : \n",
|
|
|
|
"File path : bdc2324-data/1/1facilities.csv\n",
|
|
|
|
"Shape : (2, 7)\n",
|
|
|
|
"Number of columns : 5\n",
|
|
|
|
"Columns : Index(['id', 'street_id', 'identifier', 'name', 'fixed_capacity'], dtype='object')\n"
|
|
|
|
]
|
|
|
|
},
|
2024-02-04 16:02:50 +01:00
|
|
|
{
|
|
|
|
"data": {
|
|
|
|
"text/html": [
|
|
|
|
"<div>\n",
|
|
|
|
"<style scoped>\n",
|
|
|
|
" .dataframe tbody tr th:only-of-type {\n",
|
|
|
|
" vertical-align: middle;\n",
|
|
|
|
" }\n",
|
|
|
|
"\n",
|
|
|
|
" .dataframe tbody tr th {\n",
|
|
|
|
" vertical-align: top;\n",
|
|
|
|
" }\n",
|
|
|
|
"\n",
|
|
|
|
" .dataframe thead th {\n",
|
|
|
|
" text-align: right;\n",
|
|
|
|
" }\n",
|
|
|
|
"</style>\n",
|
|
|
|
"<table border=\"1\" class=\"dataframe\">\n",
|
|
|
|
" <thead>\n",
|
|
|
|
" <tr style=\"text-align: right;\">\n",
|
|
|
|
" <th></th>\n",
|
2024-02-05 12:51:35 +01:00
|
|
|
" <th>event_id</th>\n",
|
|
|
|
" <th>season_id</th>\n",
|
|
|
|
" <th>facility_id</th>\n",
|
|
|
|
" <th>event_type_id</th>\n",
|
|
|
|
" <th>event_type_key_id</th>\n",
|
|
|
|
" <th>facility_key_id</th>\n",
|
|
|
|
" <th>street_id</th>\n",
|
|
|
|
" <th>name_events</th>\n",
|
|
|
|
" <th>name_seasons</th>\n",
|
|
|
|
" <th>name_event_types</th>\n",
|
|
|
|
" <th>name_facilities</th>\n",
|
2024-02-04 16:02:50 +01:00
|
|
|
" </tr>\n",
|
|
|
|
" </thead>\n",
|
|
|
|
" <tbody>\n",
|
|
|
|
" <tr>\n",
|
|
|
|
" <th>0</th>\n",
|
2024-02-05 12:51:35 +01:00
|
|
|
" <td>192</td>\n",
|
|
|
|
" <td>16</td>\n",
|
|
|
|
" <td>1</td>\n",
|
|
|
|
" <td>4</td>\n",
|
|
|
|
" <td>4</td>\n",
|
|
|
|
" <td>1</td>\n",
|
|
|
|
" <td>1</td>\n",
|
|
|
|
" <td>frontières</td>\n",
|
|
|
|
" <td>2018</td>\n",
|
|
|
|
" <td>spectacle vivant</td>\n",
|
|
|
|
" <td>mucem</td>\n",
|
2024-02-04 16:02:50 +01:00
|
|
|
" </tr>\n",
|
|
|
|
" <tr>\n",
|
|
|
|
" <th>1</th>\n",
|
2024-02-05 12:51:35 +01:00
|
|
|
" <td>30329</td>\n",
|
|
|
|
" <td>2767</td>\n",
|
|
|
|
" <td>1</td>\n",
|
|
|
|
" <td>5</td>\n",
|
|
|
|
" <td>5</td>\n",
|
|
|
|
" <td>1</td>\n",
|
|
|
|
" <td>1</td>\n",
|
|
|
|
" <td>visite guidée une autre histoire du monde (1h00)</td>\n",
|
|
|
|
" <td>2023</td>\n",
|
|
|
|
" <td>offre muséale groupe</td>\n",
|
|
|
|
" <td>mucem</td>\n",
|
2024-02-04 16:02:50 +01:00
|
|
|
" </tr>\n",
|
|
|
|
" <tr>\n",
|
|
|
|
" <th>2</th>\n",
|
2024-02-05 12:51:35 +01:00
|
|
|
" <td>161</td>\n",
|
|
|
|
" <td>16</td>\n",
|
|
|
|
" <td>1</td>\n",
|
|
|
|
" <td>2</td>\n",
|
|
|
|
" <td>2</td>\n",
|
|
|
|
" <td>1</td>\n",
|
|
|
|
" <td>1</td>\n",
|
|
|
|
" <td>visite contée les chercheurs d'or indiv</td>\n",
|
|
|
|
" <td>2018</td>\n",
|
|
|
|
" <td>offre muséale individuel</td>\n",
|
|
|
|
" <td>mucem</td>\n",
|
2024-02-04 16:02:50 +01:00
|
|
|
" </tr>\n",
|
|
|
|
" <tr>\n",
|
|
|
|
" <th>3</th>\n",
|
2024-02-05 12:51:35 +01:00
|
|
|
" <td>5957</td>\n",
|
|
|
|
" <td>582</td>\n",
|
|
|
|
" <td>1</td>\n",
|
|
|
|
" <td>4</td>\n",
|
|
|
|
" <td>4</td>\n",
|
|
|
|
" <td>1</td>\n",
|
|
|
|
" <td>1</td>\n",
|
|
|
|
" <td>we dreamt of utopia and we woke up screaming.</td>\n",
|
|
|
|
" <td>2021</td>\n",
|
|
|
|
" <td>spectacle vivant</td>\n",
|
|
|
|
" <td>mucem</td>\n",
|
2024-02-04 16:02:50 +01:00
|
|
|
" </tr>\n",
|
|
|
|
" <tr>\n",
|
|
|
|
" <th>4</th>\n",
|
2024-02-05 12:51:35 +01:00
|
|
|
" <td>8337</td>\n",
|
|
|
|
" <td>582</td>\n",
|
|
|
|
" <td>1</td>\n",
|
|
|
|
" <td>4</td>\n",
|
|
|
|
" <td>4</td>\n",
|
|
|
|
" <td>1</td>\n",
|
|
|
|
" <td>1</td>\n",
|
|
|
|
" <td>jeff koons épisodes 4</td>\n",
|
|
|
|
" <td>2021</td>\n",
|
|
|
|
" <td>spectacle vivant</td>\n",
|
|
|
|
" <td>mucem</td>\n",
|
2024-02-04 16:02:50 +01:00
|
|
|
" </tr>\n",
|
2024-02-05 12:51:35 +01:00
|
|
|
" </tbody>\n",
|
|
|
|
"</table>\n",
|
|
|
|
"</div>"
|
|
|
|
],
|
|
|
|
"text/plain": [
|
|
|
|
" event_id season_id facility_id event_type_id event_type_key_id \\\n",
|
|
|
|
"0 192 16 1 4 4 \n",
|
|
|
|
"1 30329 2767 1 5 5 \n",
|
|
|
|
"2 161 16 1 2 2 \n",
|
|
|
|
"3 5957 582 1 4 4 \n",
|
|
|
|
"4 8337 582 1 4 4 \n",
|
|
|
|
"\n",
|
|
|
|
" facility_key_id street_id \\\n",
|
|
|
|
"0 1 1 \n",
|
|
|
|
"1 1 1 \n",
|
|
|
|
"2 1 1 \n",
|
|
|
|
"3 1 1 \n",
|
|
|
|
"4 1 1 \n",
|
|
|
|
"\n",
|
|
|
|
" name_events name_seasons \\\n",
|
|
|
|
"0 frontières 2018 \n",
|
|
|
|
"1 visite guidée une autre histoire du monde (1h00) 2023 \n",
|
|
|
|
"2 visite contée les chercheurs d'or indiv 2018 \n",
|
|
|
|
"3 we dreamt of utopia and we woke up screaming. 2021 \n",
|
|
|
|
"4 jeff koons épisodes 4 2021 \n",
|
|
|
|
"\n",
|
|
|
|
" name_event_types name_facilities \n",
|
|
|
|
"0 spectacle vivant mucem \n",
|
|
|
|
"1 offre muséale groupe mucem \n",
|
|
|
|
"2 offre muséale individuel mucem \n",
|
|
|
|
"3 spectacle vivant mucem \n",
|
|
|
|
"4 spectacle vivant mucem "
|
|
|
|
]
|
|
|
|
},
|
2024-02-05 22:03:49 +01:00
|
|
|
"execution_count": 28,
|
2024-02-05 12:51:35 +01:00
|
|
|
"metadata": {},
|
|
|
|
"output_type": "execute_result"
|
|
|
|
}
|
|
|
|
],
|
|
|
|
"source": [
|
|
|
|
"events_theme= create_events_table()\n",
|
|
|
|
"events_theme.head()"
|
|
|
|
]
|
|
|
|
},
|
|
|
|
{
|
|
|
|
"cell_type": "code",
|
2024-02-05 22:03:49 +01:00
|
|
|
"execution_count": 29,
|
2024-02-05 12:51:35 +01:00
|
|
|
"id": "7714fa32-303b-4ea7-b174-3fd0fcab5af0",
|
|
|
|
"metadata": {},
|
|
|
|
"outputs": [
|
|
|
|
{
|
|
|
|
"name": "stdout",
|
|
|
|
"output_type": "stream",
|
|
|
|
"text": [
|
|
|
|
"File path : bdc2324-data/1/1representations.csv\n",
|
|
|
|
"Shape : (36095, 16)\n",
|
|
|
|
"Number of columns : 14\n",
|
|
|
|
"Columns : Index(['id', 'event_id', 'representation_type_id', 'identifier', 'serial',\n",
|
|
|
|
" 'start_date_time', 'open', 'satisfaction', 'end_date_time', 'name',\n",
|
|
|
|
" 'is_display', 'expected_filling', 'max_filling', 'extra_field'],\n",
|
|
|
|
" dtype='object')\n",
|
|
|
|
"File path : bdc2324-data/1/1representation_category_capacities.csv\n",
|
|
|
|
"Shape : (65241, 7)\n",
|
|
|
|
"Number of columns : 5\n",
|
|
|
|
"Columns : Index(['id', 'representation_id', 'category_id', 'expected_filling',\n",
|
|
|
|
" 'max_filling'],\n",
|
|
|
|
" dtype='object')\n"
|
|
|
|
]
|
|
|
|
},
|
|
|
|
{
|
|
|
|
"data": {
|
|
|
|
"text/html": [
|
|
|
|
"<div>\n",
|
|
|
|
"<style scoped>\n",
|
|
|
|
" .dataframe tbody tr th:only-of-type {\n",
|
|
|
|
" vertical-align: middle;\n",
|
|
|
|
" }\n",
|
|
|
|
"\n",
|
|
|
|
" .dataframe tbody tr th {\n",
|
|
|
|
" vertical-align: top;\n",
|
|
|
|
" }\n",
|
|
|
|
"\n",
|
|
|
|
" .dataframe thead th {\n",
|
|
|
|
" text-align: right;\n",
|
|
|
|
" }\n",
|
|
|
|
"</style>\n",
|
|
|
|
"<table border=\"1\" class=\"dataframe\">\n",
|
|
|
|
" <thead>\n",
|
|
|
|
" <tr style=\"text-align: right;\">\n",
|
|
|
|
" <th></th>\n",
|
|
|
|
" <th>event_id</th>\n",
|
|
|
|
" <th>id_representation_cap</th>\n",
|
|
|
|
" <th>representation_id</th>\n",
|
|
|
|
" <th>category_id</th>\n",
|
2024-02-04 16:02:50 +01:00
|
|
|
" </tr>\n",
|
2024-02-05 12:51:35 +01:00
|
|
|
" </thead>\n",
|
|
|
|
" <tbody>\n",
|
2024-02-04 16:02:50 +01:00
|
|
|
" <tr>\n",
|
2024-02-05 12:51:35 +01:00
|
|
|
" <th>0</th>\n",
|
|
|
|
" <td>12384</td>\n",
|
|
|
|
" <td>123058</td>\n",
|
|
|
|
" <td>84820</td>\n",
|
|
|
|
" <td>2</td>\n",
|
2024-02-04 16:02:50 +01:00
|
|
|
" </tr>\n",
|
|
|
|
" <tr>\n",
|
2024-02-05 12:51:35 +01:00
|
|
|
" <th>1</th>\n",
|
|
|
|
" <td>37</td>\n",
|
|
|
|
" <td>2514</td>\n",
|
|
|
|
" <td>269</td>\n",
|
|
|
|
" <td>2</td>\n",
|
2024-02-04 16:02:50 +01:00
|
|
|
" </tr>\n",
|
|
|
|
" <tr>\n",
|
2024-02-05 12:51:35 +01:00
|
|
|
" <th>2</th>\n",
|
|
|
|
" <td>37</td>\n",
|
|
|
|
" <td>384</td>\n",
|
|
|
|
" <td>269</td>\n",
|
|
|
|
" <td>5</td>\n",
|
2024-02-04 16:02:50 +01:00
|
|
|
" </tr>\n",
|
|
|
|
" <tr>\n",
|
2024-02-05 12:51:35 +01:00
|
|
|
" <th>3</th>\n",
|
|
|
|
" <td>37</td>\n",
|
|
|
|
" <td>2515</td>\n",
|
|
|
|
" <td>269</td>\n",
|
|
|
|
" <td>10</td>\n",
|
2024-02-04 16:02:50 +01:00
|
|
|
" </tr>\n",
|
|
|
|
" <tr>\n",
|
2024-02-05 12:51:35 +01:00
|
|
|
" <th>4</th>\n",
|
|
|
|
" <td>37</td>\n",
|
|
|
|
" <td>383</td>\n",
|
|
|
|
" <td>269</td>\n",
|
|
|
|
" <td>1</td>\n",
|
2024-02-04 16:02:50 +01:00
|
|
|
" </tr>\n",
|
|
|
|
" </tbody>\n",
|
|
|
|
"</table>\n",
|
|
|
|
"</div>"
|
|
|
|
],
|
|
|
|
"text/plain": [
|
2024-02-05 12:51:35 +01:00
|
|
|
" event_id id_representation_cap representation_id category_id\n",
|
|
|
|
"0 12384 123058 84820 2\n",
|
|
|
|
"1 37 2514 269 2\n",
|
|
|
|
"2 37 384 269 5\n",
|
|
|
|
"3 37 2515 269 10\n",
|
|
|
|
"4 37 383 269 1"
|
2024-02-04 16:02:50 +01:00
|
|
|
]
|
|
|
|
},
|
2024-02-05 22:03:49 +01:00
|
|
|
"execution_count": 29,
|
2024-02-04 16:02:50 +01:00
|
|
|
"metadata": {},
|
|
|
|
"output_type": "execute_result"
|
|
|
|
}
|
|
|
|
],
|
|
|
|
"source": [
|
2024-02-05 12:51:35 +01:00
|
|
|
"representation_theme = create_representations_table()\n",
|
|
|
|
"representation_theme.head()"
|
|
|
|
]
|
|
|
|
},
|
|
|
|
{
|
|
|
|
"cell_type": "markdown",
|
|
|
|
"id": "8fa191d5-c867-4d4d-bbab-f29d7d91ce6a",
|
|
|
|
"metadata": {},
|
|
|
|
"source": [
|
|
|
|
"Create uniform product database "
|
2024-02-04 16:02:50 +01:00
|
|
|
]
|
|
|
|
},
|
|
|
|
{
|
|
|
|
"cell_type": "code",
|
2024-02-05 22:03:49 +01:00
|
|
|
"execution_count": 30,
|
2024-02-05 12:51:35 +01:00
|
|
|
"id": "15a62ed6-35e4-4abc-aeef-a7daeec0a4ba",
|
2024-02-04 16:02:50 +01:00
|
|
|
"metadata": {},
|
|
|
|
"outputs": [],
|
|
|
|
"source": [
|
2024-02-05 12:51:35 +01:00
|
|
|
"def uniform_product_df():\n",
|
|
|
|
" \"\"\"\n",
|
|
|
|
" This function returns the uniform product dataset\n",
|
|
|
|
" \"\"\"\n",
|
|
|
|
" print(\"Products theme columns : \", products_theme.columns)\n",
|
|
|
|
" print(\"\\n Representation theme columns : \", representation_theme.columns)\n",
|
|
|
|
" print(\"\\n Events theme columns : \", events_theme.columns)\n",
|
2024-02-04 16:02:50 +01:00
|
|
|
"\n",
|
2024-02-05 12:51:35 +01:00
|
|
|
" products_global = products_theme.merge(representation_theme, how='left',\n",
|
|
|
|
" on= [\"representation_id\", \"category_id\"])\n",
|
|
|
|
" \n",
|
|
|
|
" products_global = products_global.merge(events_theme, how='left', on='event_id',\n",
|
|
|
|
" suffixes = (\"_representation\", \"_event\"))\n",
|
|
|
|
" \n",
|
|
|
|
" products_global = order_columns_id(products_global)\n",
|
2024-02-04 16:02:50 +01:00
|
|
|
"\n",
|
2024-02-05 12:51:35 +01:00
|
|
|
" # remove useless columns \n",
|
|
|
|
" products_global = products_global.drop(columns = ['type_of_id', 'name_events', 'name_seasons', 'name_categories'])\n",
|
|
|
|
" return products_global"
|
2024-02-04 16:02:50 +01:00
|
|
|
]
|
|
|
|
},
|
|
|
|
{
|
|
|
|
"cell_type": "code",
|
2024-02-05 22:03:49 +01:00
|
|
|
"execution_count": 31,
|
2024-02-05 12:51:35 +01:00
|
|
|
"id": "89dc9685-1de9-4ce3-a6c0-8d7f1931a951",
|
2024-02-04 16:02:50 +01:00
|
|
|
"metadata": {},
|
|
|
|
"outputs": [
|
|
|
|
{
|
2024-02-05 12:51:35 +01:00
|
|
|
"name": "stdout",
|
2024-02-04 16:02:50 +01:00
|
|
|
"output_type": "stream",
|
|
|
|
"text": [
|
2024-02-05 12:51:35 +01:00
|
|
|
"Products theme columns : Index(['id_products', 'representation_id', 'pricing_formula_id', 'category_id',\n",
|
|
|
|
" 'products_group_id', 'product_pack_id', 'type_of_id', 'amount',\n",
|
|
|
|
" 'is_full_price', 'name_categories'],\n",
|
|
|
|
" dtype='object')\n",
|
2024-02-04 16:02:50 +01:00
|
|
|
"\n",
|
2024-02-05 12:51:35 +01:00
|
|
|
" Representation theme columns : Index(['event_id', 'id_representation_cap', 'representation_id',\n",
|
|
|
|
" 'category_id'],\n",
|
|
|
|
" dtype='object')\n",
|
2024-02-04 16:02:50 +01:00
|
|
|
"\n",
|
2024-02-05 12:51:35 +01:00
|
|
|
" Events theme columns : Index(['event_id', 'season_id', 'facility_id', 'event_type_id',\n",
|
|
|
|
" 'event_type_key_id', 'facility_key_id', 'street_id', 'name_events',\n",
|
|
|
|
" 'name_seasons', 'name_event_types', 'name_facilities'],\n",
|
|
|
|
" dtype='object')\n"
|
2024-02-04 16:02:50 +01:00
|
|
|
]
|
2024-02-05 12:51:35 +01:00
|
|
|
},
|
2024-02-04 16:02:50 +01:00
|
|
|
{
|
|
|
|
"data": {
|
|
|
|
"text/html": [
|
|
|
|
"<div>\n",
|
|
|
|
"<style scoped>\n",
|
|
|
|
" .dataframe tbody tr th:only-of-type {\n",
|
|
|
|
" vertical-align: middle;\n",
|
|
|
|
" }\n",
|
|
|
|
"\n",
|
|
|
|
" .dataframe tbody tr th {\n",
|
|
|
|
" vertical-align: top;\n",
|
|
|
|
" }\n",
|
|
|
|
"\n",
|
|
|
|
" .dataframe thead th {\n",
|
|
|
|
" text-align: right;\n",
|
|
|
|
" }\n",
|
|
|
|
"</style>\n",
|
|
|
|
"<table border=\"1\" class=\"dataframe\">\n",
|
|
|
|
" <thead>\n",
|
|
|
|
" <tr style=\"text-align: right;\">\n",
|
|
|
|
" <th></th>\n",
|
2024-02-05 12:51:35 +01:00
|
|
|
" <th>id_products</th>\n",
|
|
|
|
" <th>representation_id</th>\n",
|
|
|
|
" <th>pricing_formula_id</th>\n",
|
|
|
|
" <th>category_id</th>\n",
|
|
|
|
" <th>products_group_id</th>\n",
|
|
|
|
" <th>product_pack_id</th>\n",
|
|
|
|
" <th>event_id</th>\n",
|
|
|
|
" <th>id_representation_cap</th>\n",
|
|
|
|
" <th>season_id</th>\n",
|
|
|
|
" <th>facility_id</th>\n",
|
|
|
|
" <th>event_type_id</th>\n",
|
|
|
|
" <th>event_type_key_id</th>\n",
|
|
|
|
" <th>facility_key_id</th>\n",
|
|
|
|
" <th>street_id</th>\n",
|
|
|
|
" <th>amount</th>\n",
|
|
|
|
" <th>is_full_price</th>\n",
|
|
|
|
" <th>name_event_types</th>\n",
|
|
|
|
" <th>name_facilities</th>\n",
|
2024-02-04 16:02:50 +01:00
|
|
|
" </tr>\n",
|
|
|
|
" </thead>\n",
|
|
|
|
" <tbody>\n",
|
|
|
|
" <tr>\n",
|
|
|
|
" <th>0</th>\n",
|
2024-02-05 12:51:35 +01:00
|
|
|
" <td>10682</td>\n",
|
|
|
|
" <td>914</td>\n",
|
|
|
|
" <td>114</td>\n",
|
|
|
|
" <td>41</td>\n",
|
|
|
|
" <td>10655</td>\n",
|
|
|
|
" <td>1</td>\n",
|
|
|
|
" <td>132</td>\n",
|
|
|
|
" <td>8789</td>\n",
|
2024-02-04 16:02:50 +01:00
|
|
|
" <td>4</td>\n",
|
2024-02-05 12:51:35 +01:00
|
|
|
" <td>1</td>\n",
|
|
|
|
" <td>2</td>\n",
|
|
|
|
" <td>5</td>\n",
|
|
|
|
" <td>1</td>\n",
|
|
|
|
" <td>1</td>\n",
|
|
|
|
" <td>9.0</td>\n",
|
|
|
|
" <td>False</td>\n",
|
|
|
|
" <td>offre muséale individuel</td>\n",
|
|
|
|
" <td>mucem</td>\n",
|
2024-02-04 16:02:50 +01:00
|
|
|
" </tr>\n",
|
|
|
|
" <tr>\n",
|
|
|
|
" <th>1</th>\n",
|
2024-02-05 12:51:35 +01:00
|
|
|
" <td>478</td>\n",
|
|
|
|
" <td>273</td>\n",
|
|
|
|
" <td>131</td>\n",
|
|
|
|
" <td>1</td>\n",
|
|
|
|
" <td>471</td>\n",
|
|
|
|
" <td>1</td>\n",
|
|
|
|
" <td>37</td>\n",
|
|
|
|
" <td>390</td>\n",
|
|
|
|
" <td>2</td>\n",
|
|
|
|
" <td>1</td>\n",
|
|
|
|
" <td>2</td>\n",
|
|
|
|
" <td>2</td>\n",
|
|
|
|
" <td>1</td>\n",
|
|
|
|
" <td>1</td>\n",
|
|
|
|
" <td>9.5</td>\n",
|
|
|
|
" <td>False</td>\n",
|
|
|
|
" <td>offre muséale individuel</td>\n",
|
|
|
|
" <td>mucem</td>\n",
|
2024-02-04 16:02:50 +01:00
|
|
|
" </tr>\n",
|
|
|
|
" <tr>\n",
|
|
|
|
" <th>2</th>\n",
|
2024-02-05 12:51:35 +01:00
|
|
|
" <td>20873</td>\n",
|
|
|
|
" <td>275</td>\n",
|
|
|
|
" <td>137</td>\n",
|
|
|
|
" <td>1</td>\n",
|
|
|
|
" <td>20825</td>\n",
|
|
|
|
" <td>1</td>\n",
|
|
|
|
" <td>37</td>\n",
|
|
|
|
" <td>395</td>\n",
|
|
|
|
" <td>2</td>\n",
|
|
|
|
" <td>1</td>\n",
|
|
|
|
" <td>2</td>\n",
|
|
|
|
" <td>2</td>\n",
|
|
|
|
" <td>1</td>\n",
|
|
|
|
" <td>1</td>\n",
|
|
|
|
" <td>11.5</td>\n",
|
|
|
|
" <td>False</td>\n",
|
|
|
|
" <td>offre muséale individuel</td>\n",
|
|
|
|
" <td>mucem</td>\n",
|
2024-02-04 16:02:50 +01:00
|
|
|
" </tr>\n",
|
|
|
|
" <tr>\n",
|
|
|
|
" <th>3</th>\n",
|
2024-02-05 12:51:35 +01:00
|
|
|
" <td>157142</td>\n",
|
|
|
|
" <td>82519</td>\n",
|
|
|
|
" <td>9</td>\n",
|
2024-02-04 16:02:50 +01:00
|
|
|
" <td>5</td>\n",
|
2024-02-05 12:51:35 +01:00
|
|
|
" <td>156773</td>\n",
|
|
|
|
" <td>1</td>\n",
|
|
|
|
" <td>12365</td>\n",
|
|
|
|
" <td>120199</td>\n",
|
|
|
|
" <td>1754</td>\n",
|
|
|
|
" <td>1</td>\n",
|
|
|
|
" <td>2</td>\n",
|
2024-02-04 16:02:50 +01:00
|
|
|
" <td>4</td>\n",
|
2024-02-05 12:51:35 +01:00
|
|
|
" <td>1</td>\n",
|
|
|
|
" <td>1</td>\n",
|
|
|
|
" <td>8.0</td>\n",
|
|
|
|
" <td>False</td>\n",
|
|
|
|
" <td>offre muséale individuel</td>\n",
|
|
|
|
" <td>mucem</td>\n",
|
2024-02-04 16:02:50 +01:00
|
|
|
" </tr>\n",
|
|
|
|
" <tr>\n",
|
|
|
|
" <th>4</th>\n",
|
2024-02-05 12:51:35 +01:00
|
|
|
" <td>1341</td>\n",
|
|
|
|
" <td>9</td>\n",
|
|
|
|
" <td>93</td>\n",
|
2024-02-04 16:02:50 +01:00
|
|
|
" <td>1</td>\n",
|
2024-02-05 12:51:35 +01:00
|
|
|
" <td>1175</td>\n",
|
2024-02-04 16:02:50 +01:00
|
|
|
" <td>1</td>\n",
|
2024-02-05 12:51:35 +01:00
|
|
|
" <td>8</td>\n",
|
|
|
|
" <td>21</td>\n",
|
|
|
|
" <td>4</td>\n",
|
2024-02-04 16:02:50 +01:00
|
|
|
" <td>1</td>\n",
|
2024-02-05 12:51:35 +01:00
|
|
|
" <td>3</td>\n",
|
|
|
|
" <td>6</td>\n",
|
2024-02-04 16:02:50 +01:00
|
|
|
" <td>1</td>\n",
|
|
|
|
" <td>1</td>\n",
|
2024-02-05 12:51:35 +01:00
|
|
|
" <td>8.5</td>\n",
|
|
|
|
" <td>False</td>\n",
|
|
|
|
" <td>non défini</td>\n",
|
|
|
|
" <td>mucem</td>\n",
|
2024-02-04 16:02:50 +01:00
|
|
|
" </tr>\n",
|
|
|
|
" </tbody>\n",
|
|
|
|
"</table>\n",
|
|
|
|
"</div>"
|
|
|
|
],
|
|
|
|
"text/plain": [
|
2024-02-05 12:51:35 +01:00
|
|
|
" id_products representation_id pricing_formula_id category_id \\\n",
|
|
|
|
"0 10682 914 114 41 \n",
|
|
|
|
"1 478 273 131 1 \n",
|
|
|
|
"2 20873 275 137 1 \n",
|
|
|
|
"3 157142 82519 9 5 \n",
|
|
|
|
"4 1341 9 93 1 \n",
|
2024-02-04 16:02:50 +01:00
|
|
|
"\n",
|
2024-02-05 12:51:35 +01:00
|
|
|
" products_group_id product_pack_id event_id id_representation_cap \\\n",
|
|
|
|
"0 10655 1 132 8789 \n",
|
|
|
|
"1 471 1 37 390 \n",
|
|
|
|
"2 20825 1 37 395 \n",
|
|
|
|
"3 156773 1 12365 120199 \n",
|
|
|
|
"4 1175 1 8 21 \n",
|
2024-02-04 16:02:50 +01:00
|
|
|
"\n",
|
2024-02-05 12:51:35 +01:00
|
|
|
" season_id facility_id event_type_id event_type_key_id facility_key_id \\\n",
|
|
|
|
"0 4 1 2 5 1 \n",
|
|
|
|
"1 2 1 2 2 1 \n",
|
|
|
|
"2 2 1 2 2 1 \n",
|
|
|
|
"3 1754 1 2 4 1 \n",
|
|
|
|
"4 4 1 3 6 1 \n",
|
|
|
|
"\n",
|
|
|
|
" street_id amount is_full_price name_event_types name_facilities \n",
|
|
|
|
"0 1 9.0 False offre muséale individuel mucem \n",
|
|
|
|
"1 1 9.5 False offre muséale individuel mucem \n",
|
|
|
|
"2 1 11.5 False offre muséale individuel mucem \n",
|
|
|
|
"3 1 8.0 False offre muséale individuel mucem \n",
|
|
|
|
"4 1 8.5 False non défini mucem "
|
2024-02-04 16:02:50 +01:00
|
|
|
]
|
|
|
|
},
|
2024-02-05 22:03:49 +01:00
|
|
|
"execution_count": 31,
|
2024-02-04 16:02:50 +01:00
|
|
|
"metadata": {},
|
|
|
|
"output_type": "execute_result"
|
|
|
|
}
|
|
|
|
],
|
|
|
|
"source": [
|
2024-02-05 12:51:35 +01:00
|
|
|
"products_global = uniform_product_df()\n",
|
|
|
|
"products_global.head()"
|
2024-02-04 16:02:50 +01:00
|
|
|
]
|
2024-02-05 12:51:35 +01:00
|
|
|
},
|
2024-02-05 22:03:49 +01:00
|
|
|
{
|
|
|
|
"cell_type": "markdown",
|
|
|
|
"id": "7c3211a5-a851-43bc-a1f0-b39d51857fb7",
|
|
|
|
"metadata": {},
|
|
|
|
"source": [
|
|
|
|
"# Fusion des bases locales"
|
|
|
|
]
|
|
|
|
},
|
|
|
|
{
|
|
|
|
"cell_type": "code",
|
2024-02-05 22:10:07 +01:00
|
|
|
"execution_count": 32,
|
2024-02-05 22:03:49 +01:00
|
|
|
"id": "46de1912-4a66-46e5-8b9e-7768b2d2723b",
|
|
|
|
"metadata": {},
|
|
|
|
"outputs": [],
|
|
|
|
"source": [
|
|
|
|
"# Fusion liée au product\n",
|
2024-02-05 22:10:07 +01:00
|
|
|
"df1_products_purchased = pd.merge(df1_tickets_kpi, products_global, left_on = 'product_id', right_on = 'id_products', how = 'inner')\n",
|
2024-02-05 22:03:49 +01:00
|
|
|
"\n",
|
|
|
|
"# Fusion liée au customer\n",
|
|
|
|
"df1_customer = pd.merge(df1_customerplus_clean, df1_campaigns_kpi, on = 'customer_id', how = 'left')\n",
|
|
|
|
"\n",
|
|
|
|
"# Fusion product et customer\n",
|
2024-02-05 22:10:07 +01:00
|
|
|
"df1_customer_product = pd.merge(df1_customer, df1_products_purchased, on = 'customer_id', how = 'left')"
|
2024-02-05 22:03:49 +01:00
|
|
|
]
|
|
|
|
},
|
2024-02-05 12:51:35 +01:00
|
|
|
{
|
|
|
|
"cell_type": "code",
|
|
|
|
"execution_count": null,
|
2024-02-05 22:03:49 +01:00
|
|
|
"id": "1e42a790-b215-4107-a969-85005da06ebd",
|
2024-02-05 12:51:35 +01:00
|
|
|
"metadata": {},
|
|
|
|
"outputs": [],
|
|
|
|
"source": []
|
2024-02-04 16:02:50 +01:00
|
|
|
}
|
|
|
|
],
|
|
|
|
"metadata": {
|
|
|
|
"kernelspec": {
|
|
|
|
"display_name": "Python 3 (ipykernel)",
|
|
|
|
"language": "python",
|
|
|
|
"name": "python3"
|
|
|
|
},
|
|
|
|
"language_info": {
|
|
|
|
"codemirror_mode": {
|
|
|
|
"name": "ipython",
|
|
|
|
"version": 3
|
|
|
|
},
|
|
|
|
"file_extension": ".py",
|
|
|
|
"mimetype": "text/x-python",
|
|
|
|
"name": "python",
|
|
|
|
"nbconvert_exporter": "python",
|
|
|
|
"pygments_lexer": "ipython3",
|
|
|
|
"version": "3.10.13"
|
|
|
|
}
|
|
|
|
},
|
|
|
|
"nbformat": 4,
|
|
|
|
"nbformat_minor": 5
|
|
|
|
}
|