3179 lines
114 KiB
Plaintext
3179 lines
114 KiB
Plaintext
{
|
||
"cells": [
|
||
{
|
||
"cell_type": "markdown",
|
||
"id": "ad414c84-be46-4d2c-be8b-9fc4d24cc672",
|
||
"metadata": {},
|
||
"source": [
|
||
"# Business Data Challenge - Team 1"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 1,
|
||
"id": "15103481-8d74-404c-aa09-7601fe7730da",
|
||
"metadata": {},
|
||
"outputs": [],
|
||
"source": [
|
||
"import pandas as pd\n",
|
||
"import numpy as np\n",
|
||
"import os\n",
|
||
"import s3fs\n",
|
||
"import re"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "markdown",
|
||
"id": "ee97665c-39af-4c1c-a62b-c9c79feae18f",
|
||
"metadata": {},
|
||
"source": [
|
||
"Configuration de l'accès aux données"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 2,
|
||
"id": "5d83bb1a-d341-446e-91f6-1c428607f6d4",
|
||
"metadata": {},
|
||
"outputs": [],
|
||
"source": [
|
||
"# Create filesystem object\n",
|
||
"S3_ENDPOINT_URL = \"https://\" + os.environ[\"AWS_S3_ENDPOINT\"]\n",
|
||
"fs = s3fs.S3FileSystem(client_kwargs={'endpoint_url': S3_ENDPOINT_URL})"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "markdown",
|
||
"id": "9cbd72c5-6f8e-4366-ab66-96c32c6e963a",
|
||
"metadata": {},
|
||
"source": [
|
||
"# Exemple sur Company 1"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "markdown",
|
||
"id": "db26e59a-927c-407e-b54b-1815473b0b34",
|
||
"metadata": {},
|
||
"source": [
|
||
"## Chargement données"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 3,
|
||
"id": "699664b9-eee4-4f8d-a207-e524526560c5",
|
||
"metadata": {},
|
||
"outputs": [],
|
||
"source": [
|
||
"BUCKET = \"bdc2324-data/1\"\n",
|
||
"liste_database = fs.ls(BUCKET)"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 4,
|
||
"id": "dd6a3518-b752-4a1e-b77b-9e03e853c3ed",
|
||
"metadata": {},
|
||
"outputs": [
|
||
{
|
||
"name": "stderr",
|
||
"output_type": "stream",
|
||
"text": [
|
||
"/tmp/ipykernel_15815/4081512283.py:10: DtypeWarning: Columns (1) have mixed types. Specify dtype option on import or set low_memory=False.\n",
|
||
" df = pd.read_csv(file_in)\n"
|
||
]
|
||
}
|
||
],
|
||
"source": [
|
||
"# loop to create dataframes from liste\n",
|
||
"files_path = liste_database\n",
|
||
"\n",
|
||
"client_number = files_path[0].split(\"/\")[1]\n",
|
||
"df_prefix = \"df\" + str(client_number) + \"_\"\n",
|
||
"\n",
|
||
"for i in range(len(files_path)) :\n",
|
||
" current_path = files_path[i]\n",
|
||
" with fs.open(current_path, mode=\"rb\") as file_in:\n",
|
||
" df = pd.read_csv(file_in)\n",
|
||
" # the pattern of the name is df1xxx\n",
|
||
" nom_dataframe = df_prefix + re.search(r'\\/(\\d+)\\/(\\d+)([a-zA-Z_]+)\\.csv$', current_path).group(3)\n",
|
||
" globals()[nom_dataframe] = df"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "markdown",
|
||
"id": "4004c8bf-11d9-413d-bb42-2cb8ddde7716",
|
||
"metadata": {},
|
||
"source": [
|
||
"## Cleaning functions"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 5,
|
||
"id": "d237be96-8c86-4a91-b7a1-487e87a16c3d",
|
||
"metadata": {},
|
||
"outputs": [],
|
||
"source": [
|
||
"def cleaning_date(df, column_name):\n",
|
||
" \"\"\"\n",
|
||
" Nettoie la colonne spécifiée du DataFrame en convertissant les valeurs en datetime avec le format ISO8601.\n",
|
||
"\n",
|
||
" Parameters:\n",
|
||
" - df: DataFrame\n",
|
||
" Le DataFrame contenant la colonne à nettoyer.\n",
|
||
" - column_name: str\n",
|
||
" Le nom de la colonne à nettoyer.\n",
|
||
"\n",
|
||
" Returns:\n",
|
||
" - DataFrame\n",
|
||
" Le DataFrame modifié avec la colonne nettoyée.\n",
|
||
" \"\"\"\n",
|
||
" df[column_name] = pd.to_datetime(df[column_name], utc = True, format = 'ISO8601')\n",
|
||
" return df"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "markdown",
|
||
"id": "398804d8-2225-4fd3-bceb-75ab1588e359",
|
||
"metadata": {},
|
||
"source": [
|
||
"## Preprocessing"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "markdown",
|
||
"id": "568cb180-0dd9-4b27-aecb-05e4c3775ba6",
|
||
"metadata": {},
|
||
"source": [
|
||
"## customer_plus"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 6,
|
||
"id": "7e7b90ce-da54-4f00-bc34-64c543b0858f",
|
||
"metadata": {},
|
||
"outputs": [],
|
||
"source": [
|
||
"def preprocessing_customerplus(customerplus = None):\n",
|
||
"\n",
|
||
" customerplus_copy = customerplus.copy()\n",
|
||
" \n",
|
||
" # Passage en format date\n",
|
||
" cleaning_date(customerplus_copy, 'first_buying_date')\n",
|
||
" cleaning_date(customerplus_copy, 'last_visiting_date')\n",
|
||
" \n",
|
||
" # Selection des variables\n",
|
||
" customerplus_copy.drop(['lastname', 'firstname', 'email', 'civility', 'note', 'created_at', 'updated_at', 'deleted_at', 'extra', 'reference', 'extra_field', 'identifier', 'need_reload', 'preferred_category', 'preferred_supplier', 'preferred_formula', 'zipcode', 'last_visiting_date'], axis = 1, inplace=True)\n",
|
||
" customerplus_copy.rename(columns = {'id' : 'customer_id'}, inplace = True)\n",
|
||
"\n",
|
||
" return customerplus_copy\n"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 7,
|
||
"id": "03329e32-00a5-42c8-9470-75f7b6216ccd",
|
||
"metadata": {},
|
||
"outputs": [],
|
||
"source": [
|
||
"df1_customerplus_clean = preprocessing_customerplus(df1_customersplus)"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "markdown",
|
||
"id": "bade04b1-0cdf-4d10-bcca-7dc7e4831656",
|
||
"metadata": {},
|
||
"source": [
|
||
"## Ticket area"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 8,
|
||
"id": "b95464b1-26bc-4aac-84b4-45da83b92251",
|
||
"metadata": {},
|
||
"outputs": [],
|
||
"source": [
|
||
"# Fonction de nettoyage et selection\n",
|
||
"def preprocessing_tickets_area(tickets = None, purchases = None, suppliers = None, type_ofs = None):\n",
|
||
" # Base des tickets\n",
|
||
" tickets = tickets[['id', 'purchase_id', 'product_id', 'is_from_subscription', 'type_of', 'supplier_id']]\n",
|
||
" tickets.rename(columns = {'id' : 'ticket_id'}, inplace = True)\n",
|
||
"\n",
|
||
" # Base des fournisseurs\n",
|
||
" suppliers = suppliers[['id', 'name']]\n",
|
||
" suppliers.rename(columns = {'name' : 'supplier_name'}, inplace = True)\n",
|
||
"\n",
|
||
" # Base des types de billets\n",
|
||
" type_ofs = type_ofs[['id', 'name', 'children']]\n",
|
||
" type_ofs.rename(columns = {'name' : 'type_of_ticket_name'}, inplace = True)\n",
|
||
"\n",
|
||
" # Base des achats\n",
|
||
" # Nettoyage de la date d'achat\n",
|
||
" cleaning_date(purchases, 'purchase_date')\n",
|
||
" # Selection des variables\n",
|
||
" purchases = purchases[['id', 'purchase_date', 'customer_id']]\n",
|
||
"\n",
|
||
" # Fusions \n",
|
||
" # Fusion avec fournisseurs\n",
|
||
" ticket_information = pd.merge(tickets, suppliers, left_on = 'supplier_id', right_on = 'id', how = 'inner')\n",
|
||
" ticket_information.drop(['supplier_id', 'id'], axis = 1, inplace=True)\n",
|
||
" \n",
|
||
" # Fusion avec type de tickets\n",
|
||
" ticket_information = pd.merge(ticket_information, type_ofs, left_on = 'type_of', right_on = 'id', how = 'inner')\n",
|
||
" ticket_information.drop(['type_of', 'id'], axis = 1, inplace=True)\n",
|
||
" \n",
|
||
" # Fusion avec achats\n",
|
||
" ticket_information = pd.merge(ticket_information, purchases, left_on = 'purchase_id', right_on = 'id', how = 'inner')\n",
|
||
" ticket_information.drop(['purchase_id', 'id'], axis = 1, inplace=True)\n",
|
||
"\n",
|
||
" return ticket_information"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 9,
|
||
"id": "3e1d2ba7-ff4f-48eb-93a8-2bb648c70396",
|
||
"metadata": {},
|
||
"outputs": [
|
||
{
|
||
"name": "stderr",
|
||
"output_type": "stream",
|
||
"text": [
|
||
"/tmp/ipykernel_15815/1591303091.py:5: SettingWithCopyWarning: \n",
|
||
"A value is trying to be set on a copy of a slice from a DataFrame\n",
|
||
"\n",
|
||
"See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n",
|
||
" tickets.rename(columns = {'id' : 'ticket_id'}, inplace = True)\n",
|
||
"/tmp/ipykernel_15815/1591303091.py:9: SettingWithCopyWarning: \n",
|
||
"A value is trying to be set on a copy of a slice from a DataFrame\n",
|
||
"\n",
|
||
"See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n",
|
||
" suppliers.rename(columns = {'name' : 'supplier_name'}, inplace = True)\n",
|
||
"/tmp/ipykernel_15815/1591303091.py:13: SettingWithCopyWarning: \n",
|
||
"A value is trying to be set on a copy of a slice from a DataFrame\n",
|
||
"\n",
|
||
"See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n",
|
||
" type_ofs.rename(columns = {'name' : 'type_of_ticket_name'}, inplace = True)\n"
|
||
]
|
||
}
|
||
],
|
||
"source": [
|
||
"df1_ticket_information = preprocessing_tickets_area(tickets = df1_tickets, purchases = df1_purchases, suppliers = df1_suppliers, type_ofs = df1_type_ofs)"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 10,
|
||
"id": "4b18edfc-6450-4c6a-9e7b-ee5a5808c8c9",
|
||
"metadata": {},
|
||
"outputs": [
|
||
{
|
||
"data": {
|
||
"text/html": [
|
||
"<div>\n",
|
||
"<style scoped>\n",
|
||
" .dataframe tbody tr th:only-of-type {\n",
|
||
" vertical-align: middle;\n",
|
||
" }\n",
|
||
"\n",
|
||
" .dataframe tbody tr th {\n",
|
||
" vertical-align: top;\n",
|
||
" }\n",
|
||
"\n",
|
||
" .dataframe thead th {\n",
|
||
" text-align: right;\n",
|
||
" }\n",
|
||
"</style>\n",
|
||
"<table border=\"1\" class=\"dataframe\">\n",
|
||
" <thead>\n",
|
||
" <tr style=\"text-align: right;\">\n",
|
||
" <th></th>\n",
|
||
" <th>ticket_id</th>\n",
|
||
" <th>product_id</th>\n",
|
||
" <th>is_from_subscription</th>\n",
|
||
" <th>supplier_name</th>\n",
|
||
" <th>type_of_ticket_name</th>\n",
|
||
" <th>children</th>\n",
|
||
" <th>purchase_date</th>\n",
|
||
" <th>customer_id</th>\n",
|
||
" </tr>\n",
|
||
" </thead>\n",
|
||
" <tbody>\n",
|
||
" <tr>\n",
|
||
" <th>0</th>\n",
|
||
" <td>13070859</td>\n",
|
||
" <td>225251</td>\n",
|
||
" <td>False</td>\n",
|
||
" <td>vente en ligne</td>\n",
|
||
" <td>Atelier</td>\n",
|
||
" <td>pricing_formula</td>\n",
|
||
" <td>2018-12-28 14:47:50+00:00</td>\n",
|
||
" <td>48187</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>1</th>\n",
|
||
" <td>13070860</td>\n",
|
||
" <td>224914</td>\n",
|
||
" <td>False</td>\n",
|
||
" <td>vente en ligne</td>\n",
|
||
" <td>Atelier</td>\n",
|
||
" <td>pricing_formula</td>\n",
|
||
" <td>2018-12-28 14:47:50+00:00</td>\n",
|
||
" <td>48187</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>2</th>\n",
|
||
" <td>13070861</td>\n",
|
||
" <td>224914</td>\n",
|
||
" <td>False</td>\n",
|
||
" <td>vente en ligne</td>\n",
|
||
" <td>Atelier</td>\n",
|
||
" <td>pricing_formula</td>\n",
|
||
" <td>2018-12-28 14:47:50+00:00</td>\n",
|
||
" <td>48187</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>3</th>\n",
|
||
" <td>13070862</td>\n",
|
||
" <td>224914</td>\n",
|
||
" <td>False</td>\n",
|
||
" <td>vente en ligne</td>\n",
|
||
" <td>Atelier</td>\n",
|
||
" <td>pricing_formula</td>\n",
|
||
" <td>2018-12-28 14:47:50+00:00</td>\n",
|
||
" <td>48187</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>4</th>\n",
|
||
" <td>13070863</td>\n",
|
||
" <td>224914</td>\n",
|
||
" <td>False</td>\n",
|
||
" <td>vente en ligne</td>\n",
|
||
" <td>Atelier</td>\n",
|
||
" <td>pricing_formula</td>\n",
|
||
" <td>2018-12-28 14:47:50+00:00</td>\n",
|
||
" <td>48187</td>\n",
|
||
" </tr>\n",
|
||
" </tbody>\n",
|
||
"</table>\n",
|
||
"</div>"
|
||
],
|
||
"text/plain": [
|
||
" ticket_id product_id is_from_subscription supplier_name \\\n",
|
||
"0 13070859 225251 False vente en ligne \n",
|
||
"1 13070860 224914 False vente en ligne \n",
|
||
"2 13070861 224914 False vente en ligne \n",
|
||
"3 13070862 224914 False vente en ligne \n",
|
||
"4 13070863 224914 False vente en ligne \n",
|
||
"\n",
|
||
" type_of_ticket_name children purchase_date customer_id \n",
|
||
"0 Atelier pricing_formula 2018-12-28 14:47:50+00:00 48187 \n",
|
||
"1 Atelier pricing_formula 2018-12-28 14:47:50+00:00 48187 \n",
|
||
"2 Atelier pricing_formula 2018-12-28 14:47:50+00:00 48187 \n",
|
||
"3 Atelier pricing_formula 2018-12-28 14:47:50+00:00 48187 \n",
|
||
"4 Atelier pricing_formula 2018-12-28 14:47:50+00:00 48187 "
|
||
]
|
||
},
|
||
"execution_count": 10,
|
||
"metadata": {},
|
||
"output_type": "execute_result"
|
||
}
|
||
],
|
||
"source": [
|
||
"df1_ticket_information.head()"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "markdown",
|
||
"id": "096e47f4-1d65-4575-989d-83227eedad2b",
|
||
"metadata": {},
|
||
"source": [
|
||
"## Target area"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 11,
|
||
"id": "baed146a-9d3a-4397-a812-3d50c9a2f038",
|
||
"metadata": {},
|
||
"outputs": [],
|
||
"source": [
|
||
"def preprocessing_target_area(targets = None, target_types = None, customer_target_mappings = None):\n",
|
||
" # Target.csv cleaning\n",
|
||
" targets = targets[[\"id\", \"target_type_id\", \"name\"]]\n",
|
||
" targets.rename(columns = {'id' : 'target_id' , 'name' : 'target_name'}, inplace = True)\n",
|
||
" \n",
|
||
" # target_type cleaning\n",
|
||
" target_types = target_types[[\"id\",\"is_import\",\"name\"]].add_prefix(\"target_type_\")\n",
|
||
" \n",
|
||
" #customer_target_mappings cleaning\n",
|
||
" customer_target_mappings = customer_target_mappings[[\"id\", \"customer_id\", \"target_id\"]]\n",
|
||
" \n",
|
||
" # Merge target et target_type\n",
|
||
" targets_full = pd.merge(targets, target_types, left_on='target_type_id', right_on='target_type_id', how='inner')\n",
|
||
" targets_full.drop(['target_type_id'], axis = 1, inplace=True)\n",
|
||
" \n",
|
||
" # Merge\n",
|
||
" targets_full = pd.merge(customer_target_mappings, targets_full, left_on='target_id', right_on='target_id', how='inner')\n",
|
||
" targets_full.drop(['target_id'], axis = 1, inplace=True)\n",
|
||
"\n",
|
||
" return targets_full"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 12,
|
||
"id": "5fbfd88b-b94c-489c-9201-670e96e453e7",
|
||
"metadata": {},
|
||
"outputs": [
|
||
{
|
||
"name": "stderr",
|
||
"output_type": "stream",
|
||
"text": [
|
||
"/tmp/ipykernel_15815/3848597476.py:4: SettingWithCopyWarning: \n",
|
||
"A value is trying to be set on a copy of a slice from a DataFrame\n",
|
||
"\n",
|
||
"See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n",
|
||
" targets.rename(columns = {'id' : 'target_id' , 'name' : 'target_name'}, inplace = True)\n"
|
||
]
|
||
}
|
||
],
|
||
"source": [
|
||
"df1_target_information = preprocessing_target_area(targets = df1_targets, target_types = df1_target_types, customer_target_mappings = df1_customer_target_mappings)"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 13,
|
||
"id": "b4f05142-2a22-42ef-a60d-f23cc4b5cb09",
|
||
"metadata": {},
|
||
"outputs": [
|
||
{
|
||
"data": {
|
||
"text/html": [
|
||
"<div>\n",
|
||
"<style scoped>\n",
|
||
" .dataframe tbody tr th:only-of-type {\n",
|
||
" vertical-align: middle;\n",
|
||
" }\n",
|
||
"\n",
|
||
" .dataframe tbody tr th {\n",
|
||
" vertical-align: top;\n",
|
||
" }\n",
|
||
"\n",
|
||
" .dataframe thead th {\n",
|
||
" text-align: right;\n",
|
||
" }\n",
|
||
"</style>\n",
|
||
"<table border=\"1\" class=\"dataframe\">\n",
|
||
" <thead>\n",
|
||
" <tr style=\"text-align: right;\">\n",
|
||
" <th></th>\n",
|
||
" <th>customer_id</th>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>target_name</th>\n",
|
||
" <th></th>\n",
|
||
" </tr>\n",
|
||
" </thead>\n",
|
||
" <tbody>\n",
|
||
" <tr>\n",
|
||
" <th>consentement optin mediation specialisee</th>\n",
|
||
" <td>150000</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>consentement optin jeune public</th>\n",
|
||
" <td>149979</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>consentement optin b2c</th>\n",
|
||
" <td>108909</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>Arenametrix_bascule tel vers sib</th>\n",
|
||
" <td>35216</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>consentement optout b2c</th>\n",
|
||
" <td>34523</td>\n",
|
||
" </tr>\n",
|
||
" </tbody>\n",
|
||
"</table>\n",
|
||
"</div>"
|
||
],
|
||
"text/plain": [
|
||
" customer_id\n",
|
||
"target_name \n",
|
||
"consentement optin mediation specialisee 150000\n",
|
||
"consentement optin jeune public 149979\n",
|
||
"consentement optin b2c 108909\n",
|
||
"Arenametrix_bascule tel vers sib 35216\n",
|
||
"consentement optout b2c 34523"
|
||
]
|
||
},
|
||
"execution_count": 13,
|
||
"metadata": {},
|
||
"output_type": "execute_result"
|
||
}
|
||
],
|
||
"source": [
|
||
"df1_target_information[['target_name', 'customer_id']].groupby('target_name').count().sort_values(by='customer_id', ascending=False).head()"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 14,
|
||
"id": "4417ff51-f501-4ab9-a192-4ab75764a8ed",
|
||
"metadata": {
|
||
"scrolled": true
|
||
},
|
||
"outputs": [
|
||
{
|
||
"data": {
|
||
"text/html": [
|
||
"<div>\n",
|
||
"<style scoped>\n",
|
||
" .dataframe tbody tr th:only-of-type {\n",
|
||
" vertical-align: middle;\n",
|
||
" }\n",
|
||
"\n",
|
||
" .dataframe tbody tr th {\n",
|
||
" vertical-align: top;\n",
|
||
" }\n",
|
||
"\n",
|
||
" .dataframe thead th {\n",
|
||
" text-align: right;\n",
|
||
" }\n",
|
||
"</style>\n",
|
||
"<table border=\"1\" class=\"dataframe\">\n",
|
||
" <thead>\n",
|
||
" <tr style=\"text-align: right;\">\n",
|
||
" <th></th>\n",
|
||
" <th>customer_id</th>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>target_name</th>\n",
|
||
" <th></th>\n",
|
||
" </tr>\n",
|
||
" </thead>\n",
|
||
" <tbody>\n",
|
||
" <tr>\n",
|
||
" <th>Arenametrix_bascule tel vers sib</th>\n",
|
||
" <td>35216</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>Autres_interet_exposition</th>\n",
|
||
" <td>1021</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>COM Inscrits NL générale (historique)</th>\n",
|
||
" <td>23005</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>Contacts_prenomsdoubles</th>\n",
|
||
" <td>11643</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>DDCP MD Procès du Siècle</th>\n",
|
||
" <td>1684</td>\n",
|
||
" </tr>\n",
|
||
" </tbody>\n",
|
||
"</table>\n",
|
||
"</div>"
|
||
],
|
||
"text/plain": [
|
||
" customer_id\n",
|
||
"target_name \n",
|
||
"Arenametrix_bascule tel vers sib 35216\n",
|
||
"Autres_interet_exposition 1021\n",
|
||
"COM Inscrits NL générale (historique) 23005\n",
|
||
"Contacts_prenomsdoubles 11643\n",
|
||
"DDCP MD Procès du Siècle 1684"
|
||
]
|
||
},
|
||
"execution_count": 14,
|
||
"metadata": {},
|
||
"output_type": "execute_result"
|
||
}
|
||
],
|
||
"source": [
|
||
"df1_target_information_reduced = df1_target_information[['target_name', 'customer_id']].groupby('target_name').count()\n",
|
||
"df1_target_information_reduced[df1_target_information_reduced['customer_id'] >= 1000].head()"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "markdown",
|
||
"id": "cdbb48b4-5e16-4ef4-8791-ed213d68d52f",
|
||
"metadata": {},
|
||
"source": [
|
||
"## Campaings area"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 15,
|
||
"id": "d883cc7b-ac43-4485-b86f-eaf595fbad85",
|
||
"metadata": {},
|
||
"outputs": [],
|
||
"source": [
|
||
"def preprocessing_campaigns_area(campaign_stats = None, campaigns = None):\n",
|
||
" # campaign_stats cleaning \n",
|
||
" campaign_stats = campaign_stats[[\"id\", \"campaign_id\", \"customer_id\", \"opened_at\", \"sent_at\", \"delivered_at\"]]\n",
|
||
" cleaning_date(campaign_stats, 'opened_at')\n",
|
||
" cleaning_date(campaign_stats, 'sent_at')\n",
|
||
" cleaning_date(campaign_stats, 'delivered_at')\n",
|
||
" \n",
|
||
" # campaigns cleaning\n",
|
||
" campaigns = campaigns[[\"id\", \"name\", \"service_id\", \"sent_at\"]].add_prefix(\"campaign_\")\n",
|
||
" cleaning_date(campaigns, 'campaign_sent_at')\n",
|
||
" \n",
|
||
" # Merge \n",
|
||
" campaigns_full = pd.merge(campaign_stats, campaigns, on = \"campaign_id\", how = \"left\")\n",
|
||
" campaigns_full.drop(['campaign_id'], axis = 1, inplace=True)\n",
|
||
"\n",
|
||
" return campaigns_full"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 16,
|
||
"id": "c8552dd6-52c5-4431-b43d-3cd6c578fd9f",
|
||
"metadata": {},
|
||
"outputs": [
|
||
{
|
||
"name": "stderr",
|
||
"output_type": "stream",
|
||
"text": [
|
||
"/tmp/ipykernel_15815/1967867975.py:15: SettingWithCopyWarning: \n",
|
||
"A value is trying to be set on a copy of a slice from a DataFrame.\n",
|
||
"Try using .loc[row_indexer,col_indexer] = value instead\n",
|
||
"\n",
|
||
"See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n",
|
||
" df[column_name] = pd.to_datetime(df[column_name], utc = True, format = 'ISO8601')\n",
|
||
"/tmp/ipykernel_15815/1967867975.py:15: SettingWithCopyWarning: \n",
|
||
"A value is trying to be set on a copy of a slice from a DataFrame.\n",
|
||
"Try using .loc[row_indexer,col_indexer] = value instead\n",
|
||
"\n",
|
||
"See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n",
|
||
" df[column_name] = pd.to_datetime(df[column_name], utc = True, format = 'ISO8601')\n",
|
||
"/tmp/ipykernel_15815/1967867975.py:15: SettingWithCopyWarning: \n",
|
||
"A value is trying to be set on a copy of a slice from a DataFrame.\n",
|
||
"Try using .loc[row_indexer,col_indexer] = value instead\n",
|
||
"\n",
|
||
"See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n",
|
||
" df[column_name] = pd.to_datetime(df[column_name], utc = True, format = 'ISO8601')\n"
|
||
]
|
||
}
|
||
],
|
||
"source": [
|
||
"df1_campaigns_information = preprocessing_campaigns_area(campaign_stats = df1_campaign_stats, campaigns = df1_campaigns)"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 17,
|
||
"id": "c24457e7-3cad-451a-a65b-7373b656bd6e",
|
||
"metadata": {
|
||
"scrolled": true
|
||
},
|
||
"outputs": [
|
||
{
|
||
"data": {
|
||
"text/html": [
|
||
"<div>\n",
|
||
"<style scoped>\n",
|
||
" .dataframe tbody tr th:only-of-type {\n",
|
||
" vertical-align: middle;\n",
|
||
" }\n",
|
||
"\n",
|
||
" .dataframe tbody tr th {\n",
|
||
" vertical-align: top;\n",
|
||
" }\n",
|
||
"\n",
|
||
" .dataframe thead th {\n",
|
||
" text-align: right;\n",
|
||
" }\n",
|
||
"</style>\n",
|
||
"<table border=\"1\" class=\"dataframe\">\n",
|
||
" <thead>\n",
|
||
" <tr style=\"text-align: right;\">\n",
|
||
" <th></th>\n",
|
||
" <th>id</th>\n",
|
||
" <th>customer_id</th>\n",
|
||
" <th>opened_at</th>\n",
|
||
" <th>sent_at</th>\n",
|
||
" <th>delivered_at</th>\n",
|
||
" <th>campaign_name</th>\n",
|
||
" <th>campaign_service_id</th>\n",
|
||
" <th>campaign_sent_at</th>\n",
|
||
" </tr>\n",
|
||
" </thead>\n",
|
||
" <tbody>\n",
|
||
" <tr>\n",
|
||
" <th>0</th>\n",
|
||
" <td>19793</td>\n",
|
||
" <td>112597</td>\n",
|
||
" <td>NaT</td>\n",
|
||
" <td>2021-03-28 16:01:09+00:00</td>\n",
|
||
" <td>2021-03-28 16:24:18+00:00</td>\n",
|
||
" <td>Le Mucem chez vous, gardons le lien #22</td>\n",
|
||
" <td>404</td>\n",
|
||
" <td>2021-03-27 23:00:00+00:00</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>1</th>\n",
|
||
" <td>14211</td>\n",
|
||
" <td>113666</td>\n",
|
||
" <td>NaT</td>\n",
|
||
" <td>2021-03-28 16:01:09+00:00</td>\n",
|
||
" <td>2021-03-28 16:21:02+00:00</td>\n",
|
||
" <td>Le Mucem chez vous, gardons le lien #22</td>\n",
|
||
" <td>404</td>\n",
|
||
" <td>2021-03-27 23:00:00+00:00</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>2</th>\n",
|
||
" <td>13150</td>\n",
|
||
" <td>280561</td>\n",
|
||
" <td>NaT</td>\n",
|
||
" <td>2021-03-28 16:00:59+00:00</td>\n",
|
||
" <td>2021-03-28 16:08:45+00:00</td>\n",
|
||
" <td>Le Mucem chez vous, gardons le lien #22</td>\n",
|
||
" <td>404</td>\n",
|
||
" <td>2021-03-27 23:00:00+00:00</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>3</th>\n",
|
||
" <td>7073</td>\n",
|
||
" <td>101007</td>\n",
|
||
" <td>2021-03-28 18:11:06+00:00</td>\n",
|
||
" <td>2021-03-28 16:00:59+00:00</td>\n",
|
||
" <td>2021-03-28 16:09:47+00:00</td>\n",
|
||
" <td>Le Mucem chez vous, gardons le lien #22</td>\n",
|
||
" <td>404</td>\n",
|
||
" <td>2021-03-27 23:00:00+00:00</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>4</th>\n",
|
||
" <td>5175</td>\n",
|
||
" <td>103972</td>\n",
|
||
" <td>NaT</td>\n",
|
||
" <td>2021-03-28 16:01:06+00:00</td>\n",
|
||
" <td>2021-03-28 16:05:03+00:00</td>\n",
|
||
" <td>Le Mucem chez vous, gardons le lien #22</td>\n",
|
||
" <td>404</td>\n",
|
||
" <td>2021-03-27 23:00:00+00:00</td>\n",
|
||
" </tr>\n",
|
||
" </tbody>\n",
|
||
"</table>\n",
|
||
"</div>"
|
||
],
|
||
"text/plain": [
|
||
" id customer_id opened_at sent_at \\\n",
|
||
"0 19793 112597 NaT 2021-03-28 16:01:09+00:00 \n",
|
||
"1 14211 113666 NaT 2021-03-28 16:01:09+00:00 \n",
|
||
"2 13150 280561 NaT 2021-03-28 16:00:59+00:00 \n",
|
||
"3 7073 101007 2021-03-28 18:11:06+00:00 2021-03-28 16:00:59+00:00 \n",
|
||
"4 5175 103972 NaT 2021-03-28 16:01:06+00:00 \n",
|
||
"\n",
|
||
" delivered_at campaign_name \\\n",
|
||
"0 2021-03-28 16:24:18+00:00 Le Mucem chez vous, gardons le lien #22 \n",
|
||
"1 2021-03-28 16:21:02+00:00 Le Mucem chez vous, gardons le lien #22 \n",
|
||
"2 2021-03-28 16:08:45+00:00 Le Mucem chez vous, gardons le lien #22 \n",
|
||
"3 2021-03-28 16:09:47+00:00 Le Mucem chez vous, gardons le lien #22 \n",
|
||
"4 2021-03-28 16:05:03+00:00 Le Mucem chez vous, gardons le lien #22 \n",
|
||
"\n",
|
||
" campaign_service_id campaign_sent_at \n",
|
||
"0 404 2021-03-27 23:00:00+00:00 \n",
|
||
"1 404 2021-03-27 23:00:00+00:00 \n",
|
||
"2 404 2021-03-27 23:00:00+00:00 \n",
|
||
"3 404 2021-03-27 23:00:00+00:00 \n",
|
||
"4 404 2021-03-27 23:00:00+00:00 "
|
||
]
|
||
},
|
||
"execution_count": 17,
|
||
"metadata": {},
|
||
"output_type": "execute_result"
|
||
}
|
||
],
|
||
"source": [
|
||
"df1_campaigns_information.head()"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 18,
|
||
"id": "e2c88552-b863-47a2-be23-8d2898fb28bc",
|
||
"metadata": {},
|
||
"outputs": [],
|
||
"source": [
|
||
"def campaigns_kpi_function(campaigns_information = None):\n",
|
||
" # Nombre de campagnes de mails\n",
|
||
" nb_campaigns = campaigns_information[['customer_id', 'campaign_name']].groupby('customer_id').count().reset_index()\n",
|
||
" nb_campaigns.rename(columns = {'campaign_name' : 'nb_campaigns'}, inplace = True)\n",
|
||
" # Temps d'ouverture en min moyen \n",
|
||
" campaigns_information['time_to_open'] = campaigns_information['opened_at'] - campaigns_information['delivered_at']\n",
|
||
" time_to_open = campaigns_information[['customer_id', 'time_to_open']].groupby('customer_id').mean().reset_index()\n",
|
||
"\n",
|
||
" # Nombre de mail ouvert \n",
|
||
" opened_campaign = campaigns_information[['customer_id', 'campaign_name', 'opened_at']]\n",
|
||
" opened_campaign.dropna(subset=['opened_at'], inplace=True)\n",
|
||
" opened_campaign = opened_campaign[['customer_id', 'campaign_name']].groupby('customer_id').count().reset_index()\n",
|
||
" opened_campaign.rename(columns = {'campaign_name' : 'nb_campaigns_opened' }, inplace = True)\n",
|
||
"\n",
|
||
" # Fusion des indicateurs\n",
|
||
" campaigns_reduced = pd.merge(nb_campaigns, opened_campaign, on = 'customer_id', how = 'left')\n",
|
||
" campaigns_reduced = pd.merge(campaigns_reduced, time_to_open, on = 'customer_id', how = 'left')\n",
|
||
"\n",
|
||
" # Remplir les NaN : nb_campaigns_opened\n",
|
||
" campaigns_reduced['nb_campaigns_opened'].fillna(0, inplace=True)\n",
|
||
"\n",
|
||
" # Remplir les NaT : time_to_open (??)\n",
|
||
"\n",
|
||
" return campaigns_reduced\n",
|
||
" "
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 19,
|
||
"id": "24537647-bc29-4777-9848-ac4120a4aa60",
|
||
"metadata": {},
|
||
"outputs": [
|
||
{
|
||
"name": "stderr",
|
||
"output_type": "stream",
|
||
"text": [
|
||
"/tmp/ipykernel_15815/3700263836.py:11: SettingWithCopyWarning: \n",
|
||
"A value is trying to be set on a copy of a slice from a DataFrame\n",
|
||
"\n",
|
||
"See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n",
|
||
" opened_campaign.dropna(subset=['opened_at'], inplace=True)\n"
|
||
]
|
||
}
|
||
],
|
||
"source": [
|
||
"df1_campaigns_kpi = campaigns_kpi_function(campaigns_information = df1_campaigns_information) "
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 20,
|
||
"id": "6be2a9a6-056b-4e19-8c26-a18ba3df36b3",
|
||
"metadata": {},
|
||
"outputs": [
|
||
{
|
||
"data": {
|
||
"text/html": [
|
||
"<div>\n",
|
||
"<style scoped>\n",
|
||
" .dataframe tbody tr th:only-of-type {\n",
|
||
" vertical-align: middle;\n",
|
||
" }\n",
|
||
"\n",
|
||
" .dataframe tbody tr th {\n",
|
||
" vertical-align: top;\n",
|
||
" }\n",
|
||
"\n",
|
||
" .dataframe thead th {\n",
|
||
" text-align: right;\n",
|
||
" }\n",
|
||
"</style>\n",
|
||
"<table border=\"1\" class=\"dataframe\">\n",
|
||
" <thead>\n",
|
||
" <tr style=\"text-align: right;\">\n",
|
||
" <th></th>\n",
|
||
" <th>customer_id</th>\n",
|
||
" <th>nb_campaigns</th>\n",
|
||
" <th>nb_campaigns_opened</th>\n",
|
||
" <th>time_to_open</th>\n",
|
||
" </tr>\n",
|
||
" </thead>\n",
|
||
" <tbody>\n",
|
||
" <tr>\n",
|
||
" <th>0</th>\n",
|
||
" <td>2</td>\n",
|
||
" <td>4</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>NaT</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>1</th>\n",
|
||
" <td>3</td>\n",
|
||
" <td>222</td>\n",
|
||
" <td>124.0</td>\n",
|
||
" <td>1 days 00:28:30.169354838</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>2</th>\n",
|
||
" <td>4</td>\n",
|
||
" <td>7</td>\n",
|
||
" <td>7.0</td>\n",
|
||
" <td>1 days 04:31:01.428571428</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>3</th>\n",
|
||
" <td>5</td>\n",
|
||
" <td>4</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>NaT</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>4</th>\n",
|
||
" <td>6</td>\n",
|
||
" <td>20</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>NaT</td>\n",
|
||
" </tr>\n",
|
||
" </tbody>\n",
|
||
"</table>\n",
|
||
"</div>"
|
||
],
|
||
"text/plain": [
|
||
" customer_id nb_campaigns nb_campaigns_opened time_to_open\n",
|
||
"0 2 4 0.0 NaT\n",
|
||
"1 3 222 124.0 1 days 00:28:30.169354838\n",
|
||
"2 4 7 7.0 1 days 04:31:01.428571428\n",
|
||
"3 5 4 0.0 NaT\n",
|
||
"4 6 20 0.0 NaT"
|
||
]
|
||
},
|
||
"execution_count": 20,
|
||
"metadata": {},
|
||
"output_type": "execute_result"
|
||
}
|
||
],
|
||
"source": [
|
||
"df1_campaigns_kpi.head()"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "markdown",
|
||
"id": "56520a97-ede8-4920-a211-3b5b136af33d",
|
||
"metadata": {},
|
||
"source": [
|
||
"## Create Products Table"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "markdown",
|
||
"id": "9782e9d3-ba20-46bf-8562-bd0969972ddc",
|
||
"metadata": {},
|
||
"source": [
|
||
"Some useful functions"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 21,
|
||
"id": "30488a40-1b38-4b9a-9d3b-26a0597c5e6d",
|
||
"metadata": {},
|
||
"outputs": [],
|
||
"source": [
|
||
"BUCKET = \"bdc2324-data\"\n",
|
||
"directory_path = '1'"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 22,
|
||
"id": "607eb4b4-eed9-4b50-b823-f75c116dd37c",
|
||
"metadata": {},
|
||
"outputs": [],
|
||
"source": [
|
||
"def display_databases(file_name):\n",
|
||
" \"\"\"\n",
|
||
" This function returns the file from s3 storage\n",
|
||
" \"\"\"\n",
|
||
" file_path = BUCKET + \"/\" + directory_path + \"/\" + file_name\n",
|
||
" print(\"File path : \", file_path)\n",
|
||
" with fs.open(file_path, mode=\"rb\") as file_in:\n",
|
||
" df = pd.read_csv(file_in, sep=\",\")\n",
|
||
" \n",
|
||
" print(\"Shape : \", df.shape)\n",
|
||
" return df\n",
|
||
"\n",
|
||
"\n",
|
||
"def remove_horodates(df):\n",
|
||
" \"\"\"\n",
|
||
" this function remove horodate columns like created_at and updated_at\n",
|
||
" \"\"\"\n",
|
||
" df = df.drop(columns = [\"created_at\", \"updated_at\"])\n",
|
||
" return df\n",
|
||
"\n",
|
||
"\n",
|
||
"def order_columns_id(df):\n",
|
||
" \"\"\"\n",
|
||
" this function puts all id columns at the beginning in order to read the dataset easier\n",
|
||
" \"\"\"\n",
|
||
" substring = 'id'\n",
|
||
" id_columns = [col for col in df.columns if substring in col]\n",
|
||
" remaining_col = [col for col in df.columns if substring not in col]\n",
|
||
" new_order = id_columns + remaining_col\n",
|
||
" return df[new_order]\n",
|
||
"\n",
|
||
"\n",
|
||
"def process_df_2(df):\n",
|
||
" \"\"\"\n",
|
||
" This function organizes dataframe\n",
|
||
" \"\"\"\n",
|
||
" df = remove_horodates(df)\n",
|
||
" print(\"Number of columns : \", len(df.columns))\n",
|
||
" df = order_columns_id(df)\n",
|
||
" print(\"Columns : \", df.columns)\n",
|
||
" return df\n",
|
||
"\n",
|
||
"def load_dataset(name):\n",
|
||
" \"\"\"\n",
|
||
" This function loads csv file\n",
|
||
" \"\"\"\n",
|
||
" df = display_databases(name)\n",
|
||
" df = process_df_2(df)\n",
|
||
" # drop na :\n",
|
||
" #df = df.dropna(axis=1, thresh=len(df))\n",
|
||
" # if identifier in table : delete it\n",
|
||
" if 'identifier' in df.columns:\n",
|
||
" df = df.drop(columns = 'identifier')\n",
|
||
" return df"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "markdown",
|
||
"id": "d23f28c0-bc95-438b-8d14-5b7bb6e267bd",
|
||
"metadata": {},
|
||
"source": [
|
||
"Create theme tables"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 23,
|
||
"id": "350b09b9-451f-4d47-81fe-f34b892db027",
|
||
"metadata": {},
|
||
"outputs": [],
|
||
"source": [
|
||
"def create_products_table():\n",
|
||
" # first merge products and categories\n",
|
||
" print(\"first merge products and categories\")\n",
|
||
" products = load_dataset(\"1products.csv\")\n",
|
||
" categories = load_dataset(\"1categories.csv\")\n",
|
||
" # Drop useless columns\n",
|
||
" products = products.drop(columns = ['apply_price', 'extra_field', 'amount_consumption'])\n",
|
||
" categories = categories.drop(columns = ['extra_field', 'quota'])\n",
|
||
"\n",
|
||
" #Merge\n",
|
||
" products_theme = products.merge(categories, how = 'left', left_on = 'category_id',\n",
|
||
" right_on = 'id', suffixes=('_products', '_categories'))\n",
|
||
" products_theme = products_theme.rename(columns = {\"name\" : \"name_categories\"})\n",
|
||
" \n",
|
||
" # Second merge products_theme and type of categories\n",
|
||
" print(\"Second merge products_theme and type of categories\")\n",
|
||
" type_of_categories = load_dataset(\"1type_of_categories.csv\")\n",
|
||
" type_of_categories = type_of_categories.drop(columns = 'id')\n",
|
||
" products_theme = products_theme.merge(type_of_categories, how = 'left', left_on = 'category_id',\n",
|
||
" right_on = 'category_id' )\n",
|
||
"\n",
|
||
" # Index cleaning\n",
|
||
" products_theme = products_theme.drop(columns = ['id_categories'])\n",
|
||
" products_theme = order_columns_id(products_theme)\n",
|
||
" return products_theme\n",
|
||
"\n",
|
||
"\n",
|
||
"def create_events_table():\n",
|
||
" # first merge events and seasons : \n",
|
||
" print(\"first merge events and seasons : \")\n",
|
||
" events = load_dataset(\"1events.csv\")\n",
|
||
" seasons = load_dataset(\"1seasons.csv\")\n",
|
||
"\n",
|
||
" # Drop useless columns\n",
|
||
" events = events.drop(columns = ['manual_added', 'is_display'])\n",
|
||
" seasons = seasons.drop(columns = ['start_date_time'])\n",
|
||
" \n",
|
||
" events_theme = events.merge(seasons, how = 'left', left_on = 'season_id', right_on = 'id', suffixes=('_events', '_seasons'))\n",
|
||
"\n",
|
||
" # Secondly merge events_theme and event_types\n",
|
||
" print(\"Secondly merge events_theme and event_types : \")\n",
|
||
" event_types = load_dataset(\"1event_types.csv\")\n",
|
||
" event_types = event_types.drop(columns = ['fidelity_delay'])\n",
|
||
" \n",
|
||
" events_theme = events_theme.merge(event_types, how = 'left', left_on = 'event_type_id', right_on = 'id', suffixes=('_events', '_event_type'))\n",
|
||
" events_theme = events_theme.rename(columns = {\"name\" : \"name_event_types\"})\n",
|
||
" events_theme = events_theme.drop(columns = 'id')\n",
|
||
"\n",
|
||
" # thirdly merge events_theme and facilities\n",
|
||
" print(\"thirdly merge events_theme and facilities : \")\n",
|
||
" facilities = load_dataset(\"1facilities.csv\")\n",
|
||
" facilities = facilities.drop(columns = ['fixed_capacity'])\n",
|
||
" \n",
|
||
" events_theme = events_theme.merge(facilities, how = 'left', left_on = 'facility_id', right_on = 'id', suffixes=('_events', '_facility'))\n",
|
||
" events_theme = events_theme.rename(columns = {\"name\" : \"name_facilities\", \"id_events\" : \"event_id\"})\n",
|
||
" events_theme = events_theme.drop(columns = 'id')\n",
|
||
"\n",
|
||
" # Index cleaning\n",
|
||
" events_theme = events_theme.drop(columns = ['id_seasons'])\n",
|
||
" events_theme = order_columns_id(events_theme)\n",
|
||
" return events_theme\n",
|
||
"\n",
|
||
"\n",
|
||
"def create_representations_table():\n",
|
||
" representations = load_dataset(\"1representations.csv\")\n",
|
||
" representations = representations.drop(columns = ['serial', 'open', 'satisfaction', 'is_display', 'expected_filling',\n",
|
||
" 'max_filling', 'extra_field', 'start_date_time', 'end_date_time', 'name',\n",
|
||
" 'representation_type_id'])\n",
|
||
" \n",
|
||
" representations_capacity = load_dataset(\"1representation_category_capacities.csv\")\n",
|
||
" representations_capacity = representations_capacity.drop(columns = ['expected_filling', 'max_filling'])\n",
|
||
"\n",
|
||
" representations_theme = representations.merge(representations_capacity, how='left',\n",
|
||
" left_on='id', right_on='representation_id',\n",
|
||
" suffixes=('_representation', '_representation_cap'))\n",
|
||
" # index cleaning\n",
|
||
" representations_theme = representations_theme.drop(columns = [\"id_representation\"])\n",
|
||
" representations_theme = order_columns_id(representations_theme)\n",
|
||
" return representations_theme"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 24,
|
||
"id": "0fccc8ef-e575-4857-a401-94a7274394df",
|
||
"metadata": {},
|
||
"outputs": [
|
||
{
|
||
"name": "stdout",
|
||
"output_type": "stream",
|
||
"text": [
|
||
"first merge products and categories\n",
|
||
"File path : bdc2324-data/1/1products.csv\n",
|
||
"Shape : (94803, 14)\n",
|
||
"Number of columns : 12\n",
|
||
"Columns : Index(['id', 'representation_id', 'pricing_formula_id', 'category_id',\n",
|
||
" 'products_group_id', 'product_pack_id', 'identifier', 'amount',\n",
|
||
" 'is_full_price', 'apply_price', 'extra_field', 'amount_consumption'],\n",
|
||
" dtype='object')\n",
|
||
"File path : bdc2324-data/1/1categories.csv\n",
|
||
"Shape : (27, 7)\n",
|
||
"Number of columns : 5\n",
|
||
"Columns : Index(['id', 'identifier', 'name', 'extra_field', 'quota'], dtype='object')\n",
|
||
"Second merge products_theme and type of categories\n",
|
||
"File path : bdc2324-data/1/1type_of_categories.csv\n",
|
||
"Shape : (5, 6)\n",
|
||
"Number of columns : 4\n",
|
||
"Columns : Index(['id', 'type_of_id', 'category_id', 'identifier'], dtype='object')\n"
|
||
]
|
||
},
|
||
{
|
||
"data": {
|
||
"text/html": [
|
||
"<div>\n",
|
||
"<style scoped>\n",
|
||
" .dataframe tbody tr th:only-of-type {\n",
|
||
" vertical-align: middle;\n",
|
||
" }\n",
|
||
"\n",
|
||
" .dataframe tbody tr th {\n",
|
||
" vertical-align: top;\n",
|
||
" }\n",
|
||
"\n",
|
||
" .dataframe thead th {\n",
|
||
" text-align: right;\n",
|
||
" }\n",
|
||
"</style>\n",
|
||
"<table border=\"1\" class=\"dataframe\">\n",
|
||
" <thead>\n",
|
||
" <tr style=\"text-align: right;\">\n",
|
||
" <th></th>\n",
|
||
" <th>id_products</th>\n",
|
||
" <th>representation_id</th>\n",
|
||
" <th>pricing_formula_id</th>\n",
|
||
" <th>category_id</th>\n",
|
||
" <th>products_group_id</th>\n",
|
||
" <th>product_pack_id</th>\n",
|
||
" <th>type_of_id</th>\n",
|
||
" <th>amount</th>\n",
|
||
" <th>is_full_price</th>\n",
|
||
" <th>name_categories</th>\n",
|
||
" </tr>\n",
|
||
" </thead>\n",
|
||
" <tbody>\n",
|
||
" <tr>\n",
|
||
" <th>0</th>\n",
|
||
" <td>10682</td>\n",
|
||
" <td>914</td>\n",
|
||
" <td>114</td>\n",
|
||
" <td>41</td>\n",
|
||
" <td>10655</td>\n",
|
||
" <td>1</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>9.0</td>\n",
|
||
" <td>False</td>\n",
|
||
" <td>indiv activité tr</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>1</th>\n",
|
||
" <td>478</td>\n",
|
||
" <td>273</td>\n",
|
||
" <td>131</td>\n",
|
||
" <td>1</td>\n",
|
||
" <td>471</td>\n",
|
||
" <td>1</td>\n",
|
||
" <td>12.0</td>\n",
|
||
" <td>9.5</td>\n",
|
||
" <td>False</td>\n",
|
||
" <td>indiv entrées tp</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>2</th>\n",
|
||
" <td>20873</td>\n",
|
||
" <td>275</td>\n",
|
||
" <td>137</td>\n",
|
||
" <td>1</td>\n",
|
||
" <td>20825</td>\n",
|
||
" <td>1</td>\n",
|
||
" <td>12.0</td>\n",
|
||
" <td>11.5</td>\n",
|
||
" <td>False</td>\n",
|
||
" <td>indiv entrées tp</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>3</th>\n",
|
||
" <td>157142</td>\n",
|
||
" <td>82519</td>\n",
|
||
" <td>9</td>\n",
|
||
" <td>5</td>\n",
|
||
" <td>156773</td>\n",
|
||
" <td>1</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>8.0</td>\n",
|
||
" <td>False</td>\n",
|
||
" <td>indiv entrées tr</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>4</th>\n",
|
||
" <td>1341</td>\n",
|
||
" <td>9</td>\n",
|
||
" <td>93</td>\n",
|
||
" <td>1</td>\n",
|
||
" <td>1175</td>\n",
|
||
" <td>1</td>\n",
|
||
" <td>12.0</td>\n",
|
||
" <td>8.5</td>\n",
|
||
" <td>False</td>\n",
|
||
" <td>indiv entrées tp</td>\n",
|
||
" </tr>\n",
|
||
" </tbody>\n",
|
||
"</table>\n",
|
||
"</div>"
|
||
],
|
||
"text/plain": [
|
||
" id_products representation_id pricing_formula_id category_id \\\n",
|
||
"0 10682 914 114 41 \n",
|
||
"1 478 273 131 1 \n",
|
||
"2 20873 275 137 1 \n",
|
||
"3 157142 82519 9 5 \n",
|
||
"4 1341 9 93 1 \n",
|
||
"\n",
|
||
" products_group_id product_pack_id type_of_id amount is_full_price \\\n",
|
||
"0 10655 1 NaN 9.0 False \n",
|
||
"1 471 1 12.0 9.5 False \n",
|
||
"2 20825 1 12.0 11.5 False \n",
|
||
"3 156773 1 NaN 8.0 False \n",
|
||
"4 1175 1 12.0 8.5 False \n",
|
||
"\n",
|
||
" name_categories \n",
|
||
"0 indiv activité tr \n",
|
||
"1 indiv entrées tp \n",
|
||
"2 indiv entrées tp \n",
|
||
"3 indiv entrées tr \n",
|
||
"4 indiv entrées tp "
|
||
]
|
||
},
|
||
"execution_count": 24,
|
||
"metadata": {},
|
||
"output_type": "execute_result"
|
||
}
|
||
],
|
||
"source": [
|
||
"products_theme = create_products_table()\n",
|
||
"products_theme.head()"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 25,
|
||
"id": "779d8aaf-6668-4f66-8852-847304407ea3",
|
||
"metadata": {},
|
||
"outputs": [
|
||
{
|
||
"name": "stdout",
|
||
"output_type": "stream",
|
||
"text": [
|
||
"first merge events and seasons : \n",
|
||
"File path : bdc2324-data/1/1events.csv\n",
|
||
"Shape : (1232, 12)\n",
|
||
"Number of columns : 10\n",
|
||
"Columns : Index(['id', 'season_id', 'facility_id', 'event_type_id', 'event_type_key_id',\n",
|
||
" 'facility_key_id', 'identifier', 'name', 'manual_added', 'is_display'],\n",
|
||
" dtype='object')\n",
|
||
"File path : bdc2324-data/1/1seasons.csv\n",
|
||
"Shape : (13, 6)\n",
|
||
"Number of columns : 4\n",
|
||
"Columns : Index(['id', 'identifier', 'name', 'start_date_time'], dtype='object')\n",
|
||
"Secondly merge events_theme and event_types : \n",
|
||
"File path : bdc2324-data/1/1event_types.csv\n",
|
||
"Shape : (9, 6)\n",
|
||
"Number of columns : 4\n",
|
||
"Columns : Index(['id', 'fidelity_delay', 'identifier', 'name'], dtype='object')\n",
|
||
"thirdly merge events_theme and facilities : \n",
|
||
"File path : bdc2324-data/1/1facilities.csv\n",
|
||
"Shape : (2, 7)\n",
|
||
"Number of columns : 5\n",
|
||
"Columns : Index(['id', 'street_id', 'identifier', 'name', 'fixed_capacity'], dtype='object')\n"
|
||
]
|
||
},
|
||
{
|
||
"data": {
|
||
"text/html": [
|
||
"<div>\n",
|
||
"<style scoped>\n",
|
||
" .dataframe tbody tr th:only-of-type {\n",
|
||
" vertical-align: middle;\n",
|
||
" }\n",
|
||
"\n",
|
||
" .dataframe tbody tr th {\n",
|
||
" vertical-align: top;\n",
|
||
" }\n",
|
||
"\n",
|
||
" .dataframe thead th {\n",
|
||
" text-align: right;\n",
|
||
" }\n",
|
||
"</style>\n",
|
||
"<table border=\"1\" class=\"dataframe\">\n",
|
||
" <thead>\n",
|
||
" <tr style=\"text-align: right;\">\n",
|
||
" <th></th>\n",
|
||
" <th>event_id</th>\n",
|
||
" <th>season_id</th>\n",
|
||
" <th>facility_id</th>\n",
|
||
" <th>event_type_id</th>\n",
|
||
" <th>event_type_key_id</th>\n",
|
||
" <th>facility_key_id</th>\n",
|
||
" <th>street_id</th>\n",
|
||
" <th>name_events</th>\n",
|
||
" <th>name_seasons</th>\n",
|
||
" <th>name_event_types</th>\n",
|
||
" <th>name_facilities</th>\n",
|
||
" </tr>\n",
|
||
" </thead>\n",
|
||
" <tbody>\n",
|
||
" <tr>\n",
|
||
" <th>0</th>\n",
|
||
" <td>192</td>\n",
|
||
" <td>16</td>\n",
|
||
" <td>1</td>\n",
|
||
" <td>4</td>\n",
|
||
" <td>4</td>\n",
|
||
" <td>1</td>\n",
|
||
" <td>1</td>\n",
|
||
" <td>frontières</td>\n",
|
||
" <td>2018</td>\n",
|
||
" <td>spectacle vivant</td>\n",
|
||
" <td>mucem</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>1</th>\n",
|
||
" <td>30329</td>\n",
|
||
" <td>2767</td>\n",
|
||
" <td>1</td>\n",
|
||
" <td>5</td>\n",
|
||
" <td>5</td>\n",
|
||
" <td>1</td>\n",
|
||
" <td>1</td>\n",
|
||
" <td>visite guidée une autre histoire du monde (1h00)</td>\n",
|
||
" <td>2023</td>\n",
|
||
" <td>offre muséale groupe</td>\n",
|
||
" <td>mucem</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>2</th>\n",
|
||
" <td>161</td>\n",
|
||
" <td>16</td>\n",
|
||
" <td>1</td>\n",
|
||
" <td>2</td>\n",
|
||
" <td>2</td>\n",
|
||
" <td>1</td>\n",
|
||
" <td>1</td>\n",
|
||
" <td>visite contée les chercheurs d'or indiv</td>\n",
|
||
" <td>2018</td>\n",
|
||
" <td>offre muséale individuel</td>\n",
|
||
" <td>mucem</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>3</th>\n",
|
||
" <td>5957</td>\n",
|
||
" <td>582</td>\n",
|
||
" <td>1</td>\n",
|
||
" <td>4</td>\n",
|
||
" <td>4</td>\n",
|
||
" <td>1</td>\n",
|
||
" <td>1</td>\n",
|
||
" <td>we dreamt of utopia and we woke up screaming.</td>\n",
|
||
" <td>2021</td>\n",
|
||
" <td>spectacle vivant</td>\n",
|
||
" <td>mucem</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>4</th>\n",
|
||
" <td>8337</td>\n",
|
||
" <td>582</td>\n",
|
||
" <td>1</td>\n",
|
||
" <td>4</td>\n",
|
||
" <td>4</td>\n",
|
||
" <td>1</td>\n",
|
||
" <td>1</td>\n",
|
||
" <td>jeff koons épisodes 4</td>\n",
|
||
" <td>2021</td>\n",
|
||
" <td>spectacle vivant</td>\n",
|
||
" <td>mucem</td>\n",
|
||
" </tr>\n",
|
||
" </tbody>\n",
|
||
"</table>\n",
|
||
"</div>"
|
||
],
|
||
"text/plain": [
|
||
" event_id season_id facility_id event_type_id event_type_key_id \\\n",
|
||
"0 192 16 1 4 4 \n",
|
||
"1 30329 2767 1 5 5 \n",
|
||
"2 161 16 1 2 2 \n",
|
||
"3 5957 582 1 4 4 \n",
|
||
"4 8337 582 1 4 4 \n",
|
||
"\n",
|
||
" facility_key_id street_id \\\n",
|
||
"0 1 1 \n",
|
||
"1 1 1 \n",
|
||
"2 1 1 \n",
|
||
"3 1 1 \n",
|
||
"4 1 1 \n",
|
||
"\n",
|
||
" name_events name_seasons \\\n",
|
||
"0 frontières 2018 \n",
|
||
"1 visite guidée une autre histoire du monde (1h00) 2023 \n",
|
||
"2 visite contée les chercheurs d'or indiv 2018 \n",
|
||
"3 we dreamt of utopia and we woke up screaming. 2021 \n",
|
||
"4 jeff koons épisodes 4 2021 \n",
|
||
"\n",
|
||
" name_event_types name_facilities \n",
|
||
"0 spectacle vivant mucem \n",
|
||
"1 offre muséale groupe mucem \n",
|
||
"2 offre muséale individuel mucem \n",
|
||
"3 spectacle vivant mucem \n",
|
||
"4 spectacle vivant mucem "
|
||
]
|
||
},
|
||
"execution_count": 25,
|
||
"metadata": {},
|
||
"output_type": "execute_result"
|
||
}
|
||
],
|
||
"source": [
|
||
"events_theme= create_events_table()\n",
|
||
"events_theme.head()"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 26,
|
||
"id": "7714fa32-303b-4ea7-b174-3fd0fcab5af0",
|
||
"metadata": {},
|
||
"outputs": [
|
||
{
|
||
"name": "stdout",
|
||
"output_type": "stream",
|
||
"text": [
|
||
"File path : bdc2324-data/1/1representations.csv\n",
|
||
"Shape : (36095, 16)\n",
|
||
"Number of columns : 14\n",
|
||
"Columns : Index(['id', 'event_id', 'representation_type_id', 'identifier', 'serial',\n",
|
||
" 'start_date_time', 'open', 'satisfaction', 'end_date_time', 'name',\n",
|
||
" 'is_display', 'expected_filling', 'max_filling', 'extra_field'],\n",
|
||
" dtype='object')\n",
|
||
"File path : bdc2324-data/1/1representation_category_capacities.csv\n",
|
||
"Shape : (65241, 7)\n",
|
||
"Number of columns : 5\n",
|
||
"Columns : Index(['id', 'representation_id', 'category_id', 'expected_filling',\n",
|
||
" 'max_filling'],\n",
|
||
" dtype='object')\n"
|
||
]
|
||
},
|
||
{
|
||
"data": {
|
||
"text/html": [
|
||
"<div>\n",
|
||
"<style scoped>\n",
|
||
" .dataframe tbody tr th:only-of-type {\n",
|
||
" vertical-align: middle;\n",
|
||
" }\n",
|
||
"\n",
|
||
" .dataframe tbody tr th {\n",
|
||
" vertical-align: top;\n",
|
||
" }\n",
|
||
"\n",
|
||
" .dataframe thead th {\n",
|
||
" text-align: right;\n",
|
||
" }\n",
|
||
"</style>\n",
|
||
"<table border=\"1\" class=\"dataframe\">\n",
|
||
" <thead>\n",
|
||
" <tr style=\"text-align: right;\">\n",
|
||
" <th></th>\n",
|
||
" <th>event_id</th>\n",
|
||
" <th>id_representation_cap</th>\n",
|
||
" <th>representation_id</th>\n",
|
||
" <th>category_id</th>\n",
|
||
" </tr>\n",
|
||
" </thead>\n",
|
||
" <tbody>\n",
|
||
" <tr>\n",
|
||
" <th>0</th>\n",
|
||
" <td>12384</td>\n",
|
||
" <td>123058</td>\n",
|
||
" <td>84820</td>\n",
|
||
" <td>2</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>1</th>\n",
|
||
" <td>37</td>\n",
|
||
" <td>2514</td>\n",
|
||
" <td>269</td>\n",
|
||
" <td>2</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>2</th>\n",
|
||
" <td>37</td>\n",
|
||
" <td>384</td>\n",
|
||
" <td>269</td>\n",
|
||
" <td>5</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>3</th>\n",
|
||
" <td>37</td>\n",
|
||
" <td>2515</td>\n",
|
||
" <td>269</td>\n",
|
||
" <td>10</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>4</th>\n",
|
||
" <td>37</td>\n",
|
||
" <td>383</td>\n",
|
||
" <td>269</td>\n",
|
||
" <td>1</td>\n",
|
||
" </tr>\n",
|
||
" </tbody>\n",
|
||
"</table>\n",
|
||
"</div>"
|
||
],
|
||
"text/plain": [
|
||
" event_id id_representation_cap representation_id category_id\n",
|
||
"0 12384 123058 84820 2\n",
|
||
"1 37 2514 269 2\n",
|
||
"2 37 384 269 5\n",
|
||
"3 37 2515 269 10\n",
|
||
"4 37 383 269 1"
|
||
]
|
||
},
|
||
"execution_count": 26,
|
||
"metadata": {},
|
||
"output_type": "execute_result"
|
||
}
|
||
],
|
||
"source": [
|
||
"representation_theme = create_representations_table()\n",
|
||
"representation_theme.head()"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "markdown",
|
||
"id": "8fa191d5-c867-4d4d-bbab-f29d7d91ce6a",
|
||
"metadata": {},
|
||
"source": [
|
||
"Create uniform product database "
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 27,
|
||
"id": "15a62ed6-35e4-4abc-aeef-a7daeec0a4ba",
|
||
"metadata": {},
|
||
"outputs": [],
|
||
"source": [
|
||
"def uniform_product_df():\n",
|
||
" \"\"\"\n",
|
||
" This function returns the uniform product dataset\n",
|
||
" \"\"\"\n",
|
||
" print(\"Products theme columns : \", products_theme.columns)\n",
|
||
" print(\"\\n Representation theme columns : \", representation_theme.columns)\n",
|
||
" print(\"\\n Events theme columns : \", events_theme.columns)\n",
|
||
"\n",
|
||
" products_global = products_theme.merge(representation_theme, how='left',\n",
|
||
" on= [\"representation_id\", \"category_id\"])\n",
|
||
" \n",
|
||
" products_global = products_global.merge(events_theme, how='left', on='event_id',\n",
|
||
" suffixes = (\"_representation\", \"_event\"))\n",
|
||
" \n",
|
||
" products_global = order_columns_id(products_global)\n",
|
||
"\n",
|
||
" # remove useless columns \n",
|
||
" products_global = products_global.drop(columns = ['type_of_id']) # 'name_events', 'name_seasons', 'name_categories'\n",
|
||
" return products_global"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 28,
|
||
"id": "89dc9685-1de9-4ce3-a6c0-8d7f1931a951",
|
||
"metadata": {},
|
||
"outputs": [
|
||
{
|
||
"name": "stdout",
|
||
"output_type": "stream",
|
||
"text": [
|
||
"Products theme columns : Index(['id_products', 'representation_id', 'pricing_formula_id', 'category_id',\n",
|
||
" 'products_group_id', 'product_pack_id', 'type_of_id', 'amount',\n",
|
||
" 'is_full_price', 'name_categories'],\n",
|
||
" dtype='object')\n",
|
||
"\n",
|
||
" Representation theme columns : Index(['event_id', 'id_representation_cap', 'representation_id',\n",
|
||
" 'category_id'],\n",
|
||
" dtype='object')\n",
|
||
"\n",
|
||
" Events theme columns : Index(['event_id', 'season_id', 'facility_id', 'event_type_id',\n",
|
||
" 'event_type_key_id', 'facility_key_id', 'street_id', 'name_events',\n",
|
||
" 'name_seasons', 'name_event_types', 'name_facilities'],\n",
|
||
" dtype='object')\n"
|
||
]
|
||
},
|
||
{
|
||
"data": {
|
||
"text/html": [
|
||
"<div>\n",
|
||
"<style scoped>\n",
|
||
" .dataframe tbody tr th:only-of-type {\n",
|
||
" vertical-align: middle;\n",
|
||
" }\n",
|
||
"\n",
|
||
" .dataframe tbody tr th {\n",
|
||
" vertical-align: top;\n",
|
||
" }\n",
|
||
"\n",
|
||
" .dataframe thead th {\n",
|
||
" text-align: right;\n",
|
||
" }\n",
|
||
"</style>\n",
|
||
"<table border=\"1\" class=\"dataframe\">\n",
|
||
" <thead>\n",
|
||
" <tr style=\"text-align: right;\">\n",
|
||
" <th></th>\n",
|
||
" <th>id_products</th>\n",
|
||
" <th>representation_id</th>\n",
|
||
" <th>pricing_formula_id</th>\n",
|
||
" <th>category_id</th>\n",
|
||
" <th>products_group_id</th>\n",
|
||
" <th>product_pack_id</th>\n",
|
||
" <th>event_id</th>\n",
|
||
" <th>id_representation_cap</th>\n",
|
||
" <th>season_id</th>\n",
|
||
" <th>facility_id</th>\n",
|
||
" <th>...</th>\n",
|
||
" <th>event_type_key_id</th>\n",
|
||
" <th>facility_key_id</th>\n",
|
||
" <th>street_id</th>\n",
|
||
" <th>amount</th>\n",
|
||
" <th>is_full_price</th>\n",
|
||
" <th>name_categories</th>\n",
|
||
" <th>name_events</th>\n",
|
||
" <th>name_seasons</th>\n",
|
||
" <th>name_event_types</th>\n",
|
||
" <th>name_facilities</th>\n",
|
||
" </tr>\n",
|
||
" </thead>\n",
|
||
" <tbody>\n",
|
||
" <tr>\n",
|
||
" <th>0</th>\n",
|
||
" <td>10682</td>\n",
|
||
" <td>914</td>\n",
|
||
" <td>114</td>\n",
|
||
" <td>41</td>\n",
|
||
" <td>10655</td>\n",
|
||
" <td>1</td>\n",
|
||
" <td>132</td>\n",
|
||
" <td>8789</td>\n",
|
||
" <td>4</td>\n",
|
||
" <td>1</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>5</td>\n",
|
||
" <td>1</td>\n",
|
||
" <td>1</td>\n",
|
||
" <td>9.0</td>\n",
|
||
" <td>False</td>\n",
|
||
" <td>indiv activité tr</td>\n",
|
||
" <td>visite-jeu \"le classico des minots\" (1h30)</td>\n",
|
||
" <td>2017</td>\n",
|
||
" <td>offre muséale individuel</td>\n",
|
||
" <td>mucem</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>1</th>\n",
|
||
" <td>478</td>\n",
|
||
" <td>273</td>\n",
|
||
" <td>131</td>\n",
|
||
" <td>1</td>\n",
|
||
" <td>471</td>\n",
|
||
" <td>1</td>\n",
|
||
" <td>37</td>\n",
|
||
" <td>390</td>\n",
|
||
" <td>2</td>\n",
|
||
" <td>1</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>2</td>\n",
|
||
" <td>1</td>\n",
|
||
" <td>1</td>\n",
|
||
" <td>9.5</td>\n",
|
||
" <td>False</td>\n",
|
||
" <td>indiv entrées tp</td>\n",
|
||
" <td>billet mucem picasso</td>\n",
|
||
" <td>2016</td>\n",
|
||
" <td>offre muséale individuel</td>\n",
|
||
" <td>mucem</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>2</th>\n",
|
||
" <td>20873</td>\n",
|
||
" <td>275</td>\n",
|
||
" <td>137</td>\n",
|
||
" <td>1</td>\n",
|
||
" <td>20825</td>\n",
|
||
" <td>1</td>\n",
|
||
" <td>37</td>\n",
|
||
" <td>395</td>\n",
|
||
" <td>2</td>\n",
|
||
" <td>1</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>2</td>\n",
|
||
" <td>1</td>\n",
|
||
" <td>1</td>\n",
|
||
" <td>11.5</td>\n",
|
||
" <td>False</td>\n",
|
||
" <td>indiv entrées tp</td>\n",
|
||
" <td>billet mucem picasso</td>\n",
|
||
" <td>2016</td>\n",
|
||
" <td>offre muséale individuel</td>\n",
|
||
" <td>mucem</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>3</th>\n",
|
||
" <td>157142</td>\n",
|
||
" <td>82519</td>\n",
|
||
" <td>9</td>\n",
|
||
" <td>5</td>\n",
|
||
" <td>156773</td>\n",
|
||
" <td>1</td>\n",
|
||
" <td>12365</td>\n",
|
||
" <td>120199</td>\n",
|
||
" <td>1754</td>\n",
|
||
" <td>1</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>4</td>\n",
|
||
" <td>1</td>\n",
|
||
" <td>1</td>\n",
|
||
" <td>8.0</td>\n",
|
||
" <td>False</td>\n",
|
||
" <td>indiv entrées tr</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>offre muséale individuel</td>\n",
|
||
" <td>mucem</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>4</th>\n",
|
||
" <td>1341</td>\n",
|
||
" <td>9</td>\n",
|
||
" <td>93</td>\n",
|
||
" <td>1</td>\n",
|
||
" <td>1175</td>\n",
|
||
" <td>1</td>\n",
|
||
" <td>8</td>\n",
|
||
" <td>21</td>\n",
|
||
" <td>4</td>\n",
|
||
" <td>1</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>6</td>\n",
|
||
" <td>1</td>\n",
|
||
" <td>1</td>\n",
|
||
" <td>8.5</td>\n",
|
||
" <td>False</td>\n",
|
||
" <td>indiv entrées tp</td>\n",
|
||
" <td>non défini</td>\n",
|
||
" <td>2017</td>\n",
|
||
" <td>non défini</td>\n",
|
||
" <td>mucem</td>\n",
|
||
" </tr>\n",
|
||
" </tbody>\n",
|
||
"</table>\n",
|
||
"<p>5 rows × 21 columns</p>\n",
|
||
"</div>"
|
||
],
|
||
"text/plain": [
|
||
" id_products representation_id pricing_formula_id category_id \\\n",
|
||
"0 10682 914 114 41 \n",
|
||
"1 478 273 131 1 \n",
|
||
"2 20873 275 137 1 \n",
|
||
"3 157142 82519 9 5 \n",
|
||
"4 1341 9 93 1 \n",
|
||
"\n",
|
||
" products_group_id product_pack_id event_id id_representation_cap \\\n",
|
||
"0 10655 1 132 8789 \n",
|
||
"1 471 1 37 390 \n",
|
||
"2 20825 1 37 395 \n",
|
||
"3 156773 1 12365 120199 \n",
|
||
"4 1175 1 8 21 \n",
|
||
"\n",
|
||
" season_id facility_id ... event_type_key_id facility_key_id street_id \\\n",
|
||
"0 4 1 ... 5 1 1 \n",
|
||
"1 2 1 ... 2 1 1 \n",
|
||
"2 2 1 ... 2 1 1 \n",
|
||
"3 1754 1 ... 4 1 1 \n",
|
||
"4 4 1 ... 6 1 1 \n",
|
||
"\n",
|
||
" amount is_full_price name_categories \\\n",
|
||
"0 9.0 False indiv activité tr \n",
|
||
"1 9.5 False indiv entrées tp \n",
|
||
"2 11.5 False indiv entrées tp \n",
|
||
"3 8.0 False indiv entrées tr \n",
|
||
"4 8.5 False indiv entrées tp \n",
|
||
"\n",
|
||
" name_events name_seasons \\\n",
|
||
"0 visite-jeu \"le classico des minots\" (1h30) 2017 \n",
|
||
"1 billet mucem picasso 2016 \n",
|
||
"2 billet mucem picasso 2016 \n",
|
||
"3 NaN NaN \n",
|
||
"4 non défini 2017 \n",
|
||
"\n",
|
||
" name_event_types name_facilities \n",
|
||
"0 offre muséale individuel mucem \n",
|
||
"1 offre muséale individuel mucem \n",
|
||
"2 offre muséale individuel mucem \n",
|
||
"3 offre muséale individuel mucem \n",
|
||
"4 non défini mucem \n",
|
||
"\n",
|
||
"[5 rows x 21 columns]"
|
||
]
|
||
},
|
||
"execution_count": 28,
|
||
"metadata": {},
|
||
"output_type": "execute_result"
|
||
}
|
||
],
|
||
"source": [
|
||
"products_global = uniform_product_df()\n",
|
||
"products_global.head()"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 29,
|
||
"id": "98f78cd5-b694-4cc6-b033-20170aa13e8d",
|
||
"metadata": {},
|
||
"outputs": [],
|
||
"source": [
|
||
"# Fusion liée au product\n",
|
||
"df1_products_purchased = pd.merge(df1_ticket_information, products_global, left_on = 'product_id', right_on = 'id_products', how = 'inner')"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": null,
|
||
"id": "52db7bcb-3fb7-48e5-b612-4e22bdab4a94",
|
||
"metadata": {},
|
||
"outputs": [],
|
||
"source": []
|
||
},
|
||
{
|
||
"cell_type": "markdown",
|
||
"id": "d4dcfbe0-c6ce-497e-b75e-dc9e938801b2",
|
||
"metadata": {},
|
||
"source": [
|
||
"### KPI tickets"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 30,
|
||
"id": "665a5925-9c0e-425a-8f11-c33a0a9ec444",
|
||
"metadata": {},
|
||
"outputs": [
|
||
{
|
||
"data": {
|
||
"text/plain": [
|
||
"Index(['ticket_id', 'product_id', 'is_from_subscription', 'supplier_name',\n",
|
||
" 'type_of_ticket_name', 'children', 'purchase_date', 'customer_id',\n",
|
||
" 'id_products', 'representation_id', 'pricing_formula_id', 'category_id',\n",
|
||
" 'products_group_id', 'product_pack_id', 'event_id',\n",
|
||
" 'id_representation_cap', 'season_id', 'facility_id', 'event_type_id',\n",
|
||
" 'event_type_key_id', 'facility_key_id', 'street_id', 'amount',\n",
|
||
" 'is_full_price', 'name_categories', 'name_events', 'name_seasons',\n",
|
||
" 'name_event_types', 'name_facilities'],\n",
|
||
" dtype='object')"
|
||
]
|
||
},
|
||
"execution_count": 30,
|
||
"metadata": {},
|
||
"output_type": "execute_result"
|
||
}
|
||
],
|
||
"source": [
|
||
"df1_products_purchased.columns"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 31,
|
||
"id": "b913a69e-3146-4919-b5f6-a6108532bffa",
|
||
"metadata": {},
|
||
"outputs": [
|
||
{
|
||
"data": {
|
||
"text/plain": [
|
||
"array(['spectacle vivant', 'offre muséale individuel', 'formule adhésion',\n",
|
||
" 'offre muséale groupe'], dtype=object)"
|
||
]
|
||
},
|
||
"execution_count": 31,
|
||
"metadata": {},
|
||
"output_type": "execute_result"
|
||
}
|
||
],
|
||
"source": [
|
||
"df1_products_purchased['name_event_types'].unique()"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 32,
|
||
"id": "e01e8cf9-1187-4a4b-993d-b7b4321cd8f0",
|
||
"metadata": {},
|
||
"outputs": [],
|
||
"source": [
|
||
"df1_products_purchased_reduced = df1_products_purchased[['ticket_id', 'customer_id', 'event_type_id', 'supplier_name', 'purchase_date', 'type_of_ticket_name', 'amount', 'children', 'is_full_price', 'name_event_types', 'name_facilities', 'name_categories', 'name_events', 'name_seasons']]"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 53,
|
||
"id": "3d8b0875-b409-44ce-b688-d9d6758782d3",
|
||
"metadata": {},
|
||
"outputs": [
|
||
{
|
||
"data": {
|
||
"text/html": [
|
||
"<div>\n",
|
||
"<style scoped>\n",
|
||
" .dataframe tbody tr th:only-of-type {\n",
|
||
" vertical-align: middle;\n",
|
||
" }\n",
|
||
"\n",
|
||
" .dataframe tbody tr th {\n",
|
||
" vertical-align: top;\n",
|
||
" }\n",
|
||
"\n",
|
||
" .dataframe thead th {\n",
|
||
" text-align: right;\n",
|
||
" }\n",
|
||
"</style>\n",
|
||
"<table border=\"1\" class=\"dataframe\">\n",
|
||
" <thead>\n",
|
||
" <tr style=\"text-align: right;\">\n",
|
||
" <th></th>\n",
|
||
" <th>ticket_id</th>\n",
|
||
" <th>customer_id</th>\n",
|
||
" <th>event_type_id</th>\n",
|
||
" <th>supplier_name</th>\n",
|
||
" <th>purchase_date</th>\n",
|
||
" <th>type_of_ticket_name</th>\n",
|
||
" <th>amount</th>\n",
|
||
" <th>children</th>\n",
|
||
" <th>is_full_price</th>\n",
|
||
" <th>name_event_types</th>\n",
|
||
" <th>name_facilities</th>\n",
|
||
" <th>name_categories</th>\n",
|
||
" <th>name_events</th>\n",
|
||
" <th>name_seasons</th>\n",
|
||
" </tr>\n",
|
||
" </thead>\n",
|
||
" <tbody>\n",
|
||
" <tr>\n",
|
||
" <th>0</th>\n",
|
||
" <td>13070859</td>\n",
|
||
" <td>48187</td>\n",
|
||
" <td>4</td>\n",
|
||
" <td>vente en ligne</td>\n",
|
||
" <td>2018-12-28 14:47:50+00:00</td>\n",
|
||
" <td>Atelier</td>\n",
|
||
" <td>8.0</td>\n",
|
||
" <td>pricing_formula</td>\n",
|
||
" <td>False</td>\n",
|
||
" <td>spectacle vivant</td>\n",
|
||
" <td>mucem</td>\n",
|
||
" <td>indiv prog enfant</td>\n",
|
||
" <td>l'école des magiciens</td>\n",
|
||
" <td>2018</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>1</th>\n",
|
||
" <td>13070855</td>\n",
|
||
" <td>48187</td>\n",
|
||
" <td>4</td>\n",
|
||
" <td>vente en ligne</td>\n",
|
||
" <td>2018-12-28 14:47:50+00:00</td>\n",
|
||
" <td>Atelier</td>\n",
|
||
" <td>8.0</td>\n",
|
||
" <td>pricing_formula</td>\n",
|
||
" <td>False</td>\n",
|
||
" <td>spectacle vivant</td>\n",
|
||
" <td>mucem</td>\n",
|
||
" <td>indiv prog enfant</td>\n",
|
||
" <td>l'école des magiciens</td>\n",
|
||
" <td>2018</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>2</th>\n",
|
||
" <td>13070856</td>\n",
|
||
" <td>48187</td>\n",
|
||
" <td>4</td>\n",
|
||
" <td>vente en ligne</td>\n",
|
||
" <td>2018-12-28 14:47:50+00:00</td>\n",
|
||
" <td>Atelier</td>\n",
|
||
" <td>8.0</td>\n",
|
||
" <td>pricing_formula</td>\n",
|
||
" <td>False</td>\n",
|
||
" <td>spectacle vivant</td>\n",
|
||
" <td>mucem</td>\n",
|
||
" <td>indiv prog enfant</td>\n",
|
||
" <td>l'école des magiciens</td>\n",
|
||
" <td>2018</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>3</th>\n",
|
||
" <td>13070857</td>\n",
|
||
" <td>48187</td>\n",
|
||
" <td>4</td>\n",
|
||
" <td>vente en ligne</td>\n",
|
||
" <td>2018-12-28 14:47:50+00:00</td>\n",
|
||
" <td>Atelier</td>\n",
|
||
" <td>8.0</td>\n",
|
||
" <td>pricing_formula</td>\n",
|
||
" <td>False</td>\n",
|
||
" <td>spectacle vivant</td>\n",
|
||
" <td>mucem</td>\n",
|
||
" <td>indiv prog enfant</td>\n",
|
||
" <td>l'école des magiciens</td>\n",
|
||
" <td>2018</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>4</th>\n",
|
||
" <td>13070858</td>\n",
|
||
" <td>48187</td>\n",
|
||
" <td>4</td>\n",
|
||
" <td>vente en ligne</td>\n",
|
||
" <td>2018-12-28 14:47:50+00:00</td>\n",
|
||
" <td>Atelier</td>\n",
|
||
" <td>8.0</td>\n",
|
||
" <td>pricing_formula</td>\n",
|
||
" <td>False</td>\n",
|
||
" <td>spectacle vivant</td>\n",
|
||
" <td>mucem</td>\n",
|
||
" <td>indiv prog enfant</td>\n",
|
||
" <td>l'école des magiciens</td>\n",
|
||
" <td>2018</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>...</th>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>1826667</th>\n",
|
||
" <td>18643494</td>\n",
|
||
" <td>81</td>\n",
|
||
" <td>4</td>\n",
|
||
" <td>vad</td>\n",
|
||
" <td>2022-08-02 12:18:16+00:00</td>\n",
|
||
" <td>Billet en nombre</td>\n",
|
||
" <td>11.0</td>\n",
|
||
" <td>pricing_formula</td>\n",
|
||
" <td>False</td>\n",
|
||
" <td>spectacle vivant</td>\n",
|
||
" <td>mucem</td>\n",
|
||
" <td>en nb entrées tr</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>2022</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>1826668</th>\n",
|
||
" <td>18643495</td>\n",
|
||
" <td>81</td>\n",
|
||
" <td>4</td>\n",
|
||
" <td>vad</td>\n",
|
||
" <td>2022-08-02 12:18:16+00:00</td>\n",
|
||
" <td>Billet en nombre</td>\n",
|
||
" <td>11.0</td>\n",
|
||
" <td>pricing_formula</td>\n",
|
||
" <td>False</td>\n",
|
||
" <td>spectacle vivant</td>\n",
|
||
" <td>mucem</td>\n",
|
||
" <td>en nb entrées tr</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>2022</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>1826669</th>\n",
|
||
" <td>18643496</td>\n",
|
||
" <td>81</td>\n",
|
||
" <td>4</td>\n",
|
||
" <td>vad</td>\n",
|
||
" <td>2022-08-02 12:18:16+00:00</td>\n",
|
||
" <td>Billet en nombre</td>\n",
|
||
" <td>11.0</td>\n",
|
||
" <td>pricing_formula</td>\n",
|
||
" <td>False</td>\n",
|
||
" <td>spectacle vivant</td>\n",
|
||
" <td>mucem</td>\n",
|
||
" <td>en nb entrées tr</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>2022</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>1826670</th>\n",
|
||
" <td>18643497</td>\n",
|
||
" <td>81</td>\n",
|
||
" <td>4</td>\n",
|
||
" <td>vad</td>\n",
|
||
" <td>2022-08-02 12:18:16+00:00</td>\n",
|
||
" <td>Billet en nombre</td>\n",
|
||
" <td>11.0</td>\n",
|
||
" <td>pricing_formula</td>\n",
|
||
" <td>False</td>\n",
|
||
" <td>spectacle vivant</td>\n",
|
||
" <td>mucem</td>\n",
|
||
" <td>en nb entrées tr</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>2022</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>1826671</th>\n",
|
||
" <td>19853111</td>\n",
|
||
" <td>62763</td>\n",
|
||
" <td>4</td>\n",
|
||
" <td>vad</td>\n",
|
||
" <td>2022-11-04 14:25:42+00:00</td>\n",
|
||
" <td>Billet en nombre</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>pricing_formula</td>\n",
|
||
" <td>False</td>\n",
|
||
" <td>spectacle vivant</td>\n",
|
||
" <td>mucem</td>\n",
|
||
" <td>indiv entrées gr</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>2022</td>\n",
|
||
" </tr>\n",
|
||
" </tbody>\n",
|
||
"</table>\n",
|
||
"<p>1826672 rows × 14 columns</p>\n",
|
||
"</div>"
|
||
],
|
||
"text/plain": [
|
||
" ticket_id customer_id event_type_id supplier_name \\\n",
|
||
"0 13070859 48187 4 vente en ligne \n",
|
||
"1 13070855 48187 4 vente en ligne \n",
|
||
"2 13070856 48187 4 vente en ligne \n",
|
||
"3 13070857 48187 4 vente en ligne \n",
|
||
"4 13070858 48187 4 vente en ligne \n",
|
||
"... ... ... ... ... \n",
|
||
"1826667 18643494 81 4 vad \n",
|
||
"1826668 18643495 81 4 vad \n",
|
||
"1826669 18643496 81 4 vad \n",
|
||
"1826670 18643497 81 4 vad \n",
|
||
"1826671 19853111 62763 4 vad \n",
|
||
"\n",
|
||
" purchase_date type_of_ticket_name amount \\\n",
|
||
"0 2018-12-28 14:47:50+00:00 Atelier 8.0 \n",
|
||
"1 2018-12-28 14:47:50+00:00 Atelier 8.0 \n",
|
||
"2 2018-12-28 14:47:50+00:00 Atelier 8.0 \n",
|
||
"3 2018-12-28 14:47:50+00:00 Atelier 8.0 \n",
|
||
"4 2018-12-28 14:47:50+00:00 Atelier 8.0 \n",
|
||
"... ... ... ... \n",
|
||
"1826667 2022-08-02 12:18:16+00:00 Billet en nombre 11.0 \n",
|
||
"1826668 2022-08-02 12:18:16+00:00 Billet en nombre 11.0 \n",
|
||
"1826669 2022-08-02 12:18:16+00:00 Billet en nombre 11.0 \n",
|
||
"1826670 2022-08-02 12:18:16+00:00 Billet en nombre 11.0 \n",
|
||
"1826671 2022-11-04 14:25:42+00:00 Billet en nombre 0.0 \n",
|
||
"\n",
|
||
" children is_full_price name_event_types name_facilities \\\n",
|
||
"0 pricing_formula False spectacle vivant mucem \n",
|
||
"1 pricing_formula False spectacle vivant mucem \n",
|
||
"2 pricing_formula False spectacle vivant mucem \n",
|
||
"3 pricing_formula False spectacle vivant mucem \n",
|
||
"4 pricing_formula False spectacle vivant mucem \n",
|
||
"... ... ... ... ... \n",
|
||
"1826667 pricing_formula False spectacle vivant mucem \n",
|
||
"1826668 pricing_formula False spectacle vivant mucem \n",
|
||
"1826669 pricing_formula False spectacle vivant mucem \n",
|
||
"1826670 pricing_formula False spectacle vivant mucem \n",
|
||
"1826671 pricing_formula False spectacle vivant mucem \n",
|
||
"\n",
|
||
" name_categories name_events name_seasons \n",
|
||
"0 indiv prog enfant l'école des magiciens 2018 \n",
|
||
"1 indiv prog enfant l'école des magiciens 2018 \n",
|
||
"2 indiv prog enfant l'école des magiciens 2018 \n",
|
||
"3 indiv prog enfant l'école des magiciens 2018 \n",
|
||
"4 indiv prog enfant l'école des magiciens 2018 \n",
|
||
"... ... ... ... \n",
|
||
"1826667 en nb entrées tr NaN 2022 \n",
|
||
"1826668 en nb entrées tr NaN 2022 \n",
|
||
"1826669 en nb entrées tr NaN 2022 \n",
|
||
"1826670 en nb entrées tr NaN 2022 \n",
|
||
"1826671 indiv entrées gr NaN 2022 \n",
|
||
"\n",
|
||
"[1826672 rows x 14 columns]"
|
||
]
|
||
},
|
||
"execution_count": 53,
|
||
"metadata": {},
|
||
"output_type": "execute_result"
|
||
}
|
||
],
|
||
"source": [
|
||
"# Importance des suppliers\n",
|
||
"df1_products_purchased_reduced"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 35,
|
||
"id": "2bda0b97-b28b-4070-a57d-aeab0e2f7dfe",
|
||
"metadata": {},
|
||
"outputs": [],
|
||
"source": [
|
||
"# Number of distinct event types per customer (first step towards counting\n",
"# the customers who attend more than 2 types of event).\n",
"nb_event_types = (\n",
"    df1_products_purchased_reduced\n",
"    .loc[:, ['customer_id', 'name_event_types']]\n",
"    .groupby('customer_id')\n",
"    .nunique()\n",
")\n"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 50,
|
||
"id": "043303fe-e90f-4689-a2a9-5d690555a045",
|
||
"metadata": {},
|
||
"outputs": [],
|
||
"source": [
|
||
"def tickets_kpi_function(tickets_information = None):\n",
"    \"\"\"Aggregate ticket-level purchases into one row of KPIs per customer.\n",
"\n",
"    Parameters\n",
"    ----------\n",
"    tickets_information : pandas.DataFrame\n",
"        One row per ticket. Must contain the columns 'customer_id',\n",
"        'ticket_id', 'supplier_name', 'purchase_date' and 'amount'.\n",
"        The frame is not modified.\n",
"\n",
"    Returns\n",
"    -------\n",
"    pandas.DataFrame\n",
"        One row per customer_id with the columns nb_tickets (count of\n",
"        non-null ticket_id), total_amount (sum of amount), nb_suppliers\n",
"        (number of distinct supplier_name), purchase_date_max,\n",
"        purchase_date_min and time_between_purchase\n",
"        (purchase_date_max - purchase_date_min).\n",
"\n",
"    Raises\n",
"    ------\n",
"    ValueError\n",
"        If tickets_information is None (the signature default).\n",
"    \"\"\"\n",
"    if tickets_information is None:\n",
"        raise ValueError('tickets_information must be a DataFrame, got None')\n",
"\n",
"    # Named aggregation lets purchase_date feed both the max and the min,\n",
"    # so the input no longer has to be copied just to duplicate that column.\n",
"    tickets_kpi = (\n",
"        tickets_information\n",
"        .groupby('customer_id')\n",
"        .agg(nb_tickets=('ticket_id', 'count'),\n",
"             total_amount=('amount', 'sum'),\n",
"             nb_suppliers=('supplier_name', 'nunique'),\n",
"             purchase_date_max=('purchase_date', 'max'),\n",
"             purchase_date_min=('purchase_date', 'min'))\n",
"        .reset_index()\n",
"    )\n",
"\n",
"    # Span between a customer's first and last purchase.\n",
"    tickets_kpi['time_between_purchase'] = (\n",
"        tickets_kpi['purchase_date_max'] - tickets_kpi['purchase_date_min']\n",
"    )\n",
"\n",
"    return tickets_kpi"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 51,
|
||
"id": "5882234a-1ed5-4269-87a6-0d75613476e3",
|
||
"metadata": {},
|
||
"outputs": [],
|
||
"source": [
|
||
"df1_tickets_kpi = tickets_kpi_function(tickets_information = df1_products_purchased_reduced)"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 52,
|
||
"id": "a7a452a6-cd5e-4c8b-b250-8a7d26e48fad",
|
||
"metadata": {},
|
||
"outputs": [
|
||
{
|
||
"data": {
|
||
"text/html": [
|
||
"<div>\n",
|
||
"<style scoped>\n",
|
||
" .dataframe tbody tr th:only-of-type {\n",
|
||
" vertical-align: middle;\n",
|
||
" }\n",
|
||
"\n",
|
||
" .dataframe tbody tr th {\n",
|
||
" vertical-align: top;\n",
|
||
" }\n",
|
||
"\n",
|
||
" .dataframe thead th {\n",
|
||
" text-align: right;\n",
|
||
" }\n",
|
||
"</style>\n",
|
||
"<table border=\"1\" class=\"dataframe\">\n",
|
||
" <thead>\n",
|
||
" <tr style=\"text-align: right;\">\n",
|
||
" <th></th>\n",
|
||
" <th>customer_id</th>\n",
|
||
" <th>nb_tickets</th>\n",
|
||
" <th>total_amount</th>\n",
|
||
" <th>nb_suppliers</th>\n",
|
||
" <th>purchase_date_max</th>\n",
|
||
" <th>purchase_date_min</th>\n",
|
||
" <th>time_between_purchase</th>\n",
|
||
" </tr>\n",
|
||
" </thead>\n",
|
||
" <tbody>\n",
|
||
" <tr>\n",
|
||
" <th>0</th>\n",
|
||
" <td>1</td>\n",
|
||
" <td>1256574</td>\n",
|
||
" <td>8830567.5</td>\n",
|
||
" <td>7</td>\n",
|
||
" <td>2023-11-08 15:59:45+00:00</td>\n",
|
||
" <td>2013-06-10 10:37:58+00:00</td>\n",
|
||
" <td>3803 days 05:21:47</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>3615</th>\n",
|
||
" <td>6733</td>\n",
|
||
" <td>35527</td>\n",
|
||
" <td>1188.0</td>\n",
|
||
" <td>4</td>\n",
|
||
" <td>2023-11-03 09:42:40+00:00</td>\n",
|
||
" <td>2015-09-09 13:48:38+00:00</td>\n",
|
||
" <td>2976 days 19:54:02</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>39</th>\n",
|
||
" <td>41</td>\n",
|
||
" <td>16263</td>\n",
|
||
" <td>37642.0</td>\n",
|
||
" <td>6</td>\n",
|
||
" <td>2023-10-25 09:13:16+00:00</td>\n",
|
||
" <td>2014-01-23 16:56:57+00:00</td>\n",
|
||
" <td>3561 days 16:16:19</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>11</th>\n",
|
||
" <td>12</td>\n",
|
||
" <td>5871</td>\n",
|
||
" <td>38767.0</td>\n",
|
||
" <td>2</td>\n",
|
||
" <td>2023-11-04 13:46:59+00:00</td>\n",
|
||
" <td>2018-04-04 07:46:31+00:00</td>\n",
|
||
" <td>2040 days 06:00:28</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>32809</th>\n",
|
||
" <td>63488</td>\n",
|
||
" <td>5851</td>\n",
|
||
" <td>64350.0</td>\n",
|
||
" <td>1</td>\n",
|
||
" <td>2022-08-25 13:08:38+00:00</td>\n",
|
||
" <td>2020-08-18 08:32:57+00:00</td>\n",
|
||
" <td>737 days 04:35:41</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>3708</th>\n",
|
||
" <td>6916</td>\n",
|
||
" <td>5482</td>\n",
|
||
" <td>51489.5</td>\n",
|
||
" <td>2</td>\n",
|
||
" <td>2021-08-26 12:49:17+00:00</td>\n",
|
||
" <td>2018-03-26 11:13:43+00:00</td>\n",
|
||
" <td>1249 days 01:35:34</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>32616</th>\n",
|
||
" <td>63194</td>\n",
|
||
" <td>4507</td>\n",
|
||
" <td>13232.0</td>\n",
|
||
" <td>3</td>\n",
|
||
" <td>2022-09-07 12:55:33+00:00</td>\n",
|
||
" <td>2017-11-28 13:52:15+00:00</td>\n",
|
||
" <td>1743 days 23:03:18</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>78</th>\n",
|
||
" <td>81</td>\n",
|
||
" <td>3562</td>\n",
|
||
" <td>38746.0</td>\n",
|
||
" <td>1</td>\n",
|
||
" <td>2022-08-30 11:51:34+00:00</td>\n",
|
||
" <td>2017-01-05 13:04:58+00:00</td>\n",
|
||
" <td>2062 days 22:46:36</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>35295</th>\n",
|
||
" <td>84002</td>\n",
|
||
" <td>3403</td>\n",
|
||
" <td>19830.0</td>\n",
|
||
" <td>4</td>\n",
|
||
" <td>2023-11-06 15:59:22+00:00</td>\n",
|
||
" <td>2021-05-28 10:22:33+00:00</td>\n",
|
||
" <td>892 days 05:36:49</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>3377</th>\n",
|
||
" <td>5618</td>\n",
|
||
" <td>3294</td>\n",
|
||
" <td>31684.5</td>\n",
|
||
" <td>1</td>\n",
|
||
" <td>2022-02-24 07:47:20+00:00</td>\n",
|
||
" <td>2018-10-25 11:04:24+00:00</td>\n",
|
||
" <td>1217 days 20:42:56</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>30011</th>\n",
|
||
" <td>59259</td>\n",
|
||
" <td>2591</td>\n",
|
||
" <td>4350.0</td>\n",
|
||
" <td>3</td>\n",
|
||
" <td>2023-06-12 14:05:19+00:00</td>\n",
|
||
" <td>2019-11-25 08:52:48+00:00</td>\n",
|
||
" <td>1295 days 05:12:31</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>34937</th>\n",
|
||
" <td>74876</td>\n",
|
||
" <td>2571</td>\n",
|
||
" <td>2600.0</td>\n",
|
||
" <td>2</td>\n",
|
||
" <td>2023-10-02 08:13:05+00:00</td>\n",
|
||
" <td>2018-02-08 12:54:01+00:00</td>\n",
|
||
" <td>2061 days 19:19:04</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>270</th>\n",
|
||
" <td>295</td>\n",
|
||
" <td>2570</td>\n",
|
||
" <td>17678.5</td>\n",
|
||
" <td>6</td>\n",
|
||
" <td>2023-10-16 10:19:22+00:00</td>\n",
|
||
" <td>2014-01-24 15:16:17+00:00</td>\n",
|
||
" <td>3551 days 19:03:05</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>866</th>\n",
|
||
" <td>1221</td>\n",
|
||
" <td>2320</td>\n",
|
||
" <td>9652.0</td>\n",
|
||
" <td>2</td>\n",
|
||
" <td>2022-09-19 12:55:15+00:00</td>\n",
|
||
" <td>2017-03-29 08:00:09+00:00</td>\n",
|
||
" <td>2000 days 04:55:06</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>1022</th>\n",
|
||
" <td>1429</td>\n",
|
||
" <td>2249</td>\n",
|
||
" <td>3500.0</td>\n",
|
||
" <td>4</td>\n",
|
||
" <td>2023-11-06 08:30:37+00:00</td>\n",
|
||
" <td>2014-12-03 14:56:38+00:00</td>\n",
|
||
" <td>3259 days 17:33:59</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>3922</th>\n",
|
||
" <td>7249</td>\n",
|
||
" <td>1827</td>\n",
|
||
" <td>13385.0</td>\n",
|
||
" <td>1</td>\n",
|
||
" <td>2021-10-26 12:28:40+00:00</td>\n",
|
||
" <td>2019-05-07 12:34:56+00:00</td>\n",
|
||
" <td>902 days 23:53:44</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>54425</th>\n",
|
||
" <td>1070539</td>\n",
|
||
" <td>1800</td>\n",
|
||
" <td>19800.0</td>\n",
|
||
" <td>1</td>\n",
|
||
" <td>2022-07-25 12:49:27+00:00</td>\n",
|
||
" <td>2022-05-02 16:09:03+00:00</td>\n",
|
||
" <td>83 days 20:40:24</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>69520</th>\n",
|
||
" <td>1216801</td>\n",
|
||
" <td>1623</td>\n",
|
||
" <td>12562.0</td>\n",
|
||
" <td>2</td>\n",
|
||
" <td>2023-09-29 16:34:38+00:00</td>\n",
|
||
" <td>2023-06-16 14:16:04+00:00</td>\n",
|
||
" <td>105 days 02:18:34</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>30056</th>\n",
|
||
" <td>59330</td>\n",
|
||
" <td>1551</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>1</td>\n",
|
||
" <td>2023-11-06 10:22:14+00:00</td>\n",
|
||
" <td>2018-02-02 08:53:51+00:00</td>\n",
|
||
" <td>2103 days 01:28:23</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>3243</th>\n",
|
||
" <td>5441</td>\n",
|
||
" <td>1544</td>\n",
|
||
" <td>14133.0</td>\n",
|
||
" <td>2</td>\n",
|
||
" <td>2022-09-22 08:21:47+00:00</td>\n",
|
||
" <td>2017-12-14 12:50:23+00:00</td>\n",
|
||
" <td>1742 days 19:31:24</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>55195</th>\n",
|
||
" <td>1084435</td>\n",
|
||
" <td>1500</td>\n",
|
||
" <td>16500.0</td>\n",
|
||
" <td>1</td>\n",
|
||
" <td>2022-09-27 14:32:13+00:00</td>\n",
|
||
" <td>2022-05-18 08:04:41+00:00</td>\n",
|
||
" <td>132 days 06:27:32</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>28983</th>\n",
|
||
" <td>57816</td>\n",
|
||
" <td>1485</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>2</td>\n",
|
||
" <td>2023-05-22 07:30:55+00:00</td>\n",
|
||
" <td>2019-01-21 14:19:18+00:00</td>\n",
|
||
" <td>1581 days 17:11:37</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>2231</th>\n",
|
||
" <td>2942</td>\n",
|
||
" <td>1307</td>\n",
|
||
" <td>100.0</td>\n",
|
||
" <td>2</td>\n",
|
||
" <td>2023-06-29 09:33:58+00:00</td>\n",
|
||
" <td>2017-10-25 15:06:58+00:00</td>\n",
|
||
" <td>2072 days 18:27:00</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>23</th>\n",
|
||
" <td>24</td>\n",
|
||
" <td>1266</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>2</td>\n",
|
||
" <td>2023-10-19 07:20:48+00:00</td>\n",
|
||
" <td>2015-09-30 16:07:52+00:00</td>\n",
|
||
" <td>2940 days 15:12:56</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>4513</th>\n",
|
||
" <td>9592</td>\n",
|
||
" <td>1211</td>\n",
|
||
" <td>62.0</td>\n",
|
||
" <td>4</td>\n",
|
||
" <td>2023-10-17 09:39:40+00:00</td>\n",
|
||
" <td>2018-02-25 07:17:19+00:00</td>\n",
|
||
" <td>2060 days 02:22:21</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>2936</th>\n",
|
||
" <td>5059</td>\n",
|
||
" <td>1186</td>\n",
|
||
" <td>6308.0</td>\n",
|
||
" <td>3</td>\n",
|
||
" <td>2023-05-22 13:41:22+00:00</td>\n",
|
||
" <td>2018-02-01 11:16:51+00:00</td>\n",
|
||
" <td>1936 days 02:24:31</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>11484</th>\n",
|
||
" <td>25100</td>\n",
|
||
" <td>1123</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>1</td>\n",
|
||
" <td>2021-07-13 07:39:57+00:00</td>\n",
|
||
" <td>2015-12-21 15:38:05+00:00</td>\n",
|
||
" <td>2030 days 16:01:52</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>934</th>\n",
|
||
" <td>1326</td>\n",
|
||
" <td>1098</td>\n",
|
||
" <td>798.0</td>\n",
|
||
" <td>3</td>\n",
|
||
" <td>2023-02-01 08:39:45+00:00</td>\n",
|
||
" <td>2018-02-13 13:13:48+00:00</td>\n",
|
||
" <td>1813 days 19:25:57</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>30156</th>\n",
|
||
" <td>59490</td>\n",
|
||
" <td>1088</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>1</td>\n",
|
||
" <td>2023-10-05 08:23:50+00:00</td>\n",
|
||
" <td>2019-12-06 12:59:20+00:00</td>\n",
|
||
" <td>1398 days 19:24:30</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>36478</th>\n",
|
||
" <td>251268</td>\n",
|
||
" <td>1086</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>2</td>\n",
|
||
" <td>2023-06-30 07:22:46+00:00</td>\n",
|
||
" <td>2018-02-02 09:06:22+00:00</td>\n",
|
||
" <td>1973 days 22:16:24</td>\n",
|
||
" </tr>\n",
|
||
" </tbody>\n",
|
||
"</table>\n",
|
||
"</div>"
|
||
],
|
||
"text/plain": [
|
||
" customer_id nb_tickets total_amount nb_suppliers \\\n",
|
||
"0 1 1256574 8830567.5 7 \n",
|
||
"3615 6733 35527 1188.0 4 \n",
|
||
"39 41 16263 37642.0 6 \n",
|
||
"11 12 5871 38767.0 2 \n",
|
||
"32809 63488 5851 64350.0 1 \n",
|
||
"3708 6916 5482 51489.5 2 \n",
|
||
"32616 63194 4507 13232.0 3 \n",
|
||
"78 81 3562 38746.0 1 \n",
|
||
"35295 84002 3403 19830.0 4 \n",
|
||
"3377 5618 3294 31684.5 1 \n",
|
||
"30011 59259 2591 4350.0 3 \n",
|
||
"34937 74876 2571 2600.0 2 \n",
|
||
"270 295 2570 17678.5 6 \n",
|
||
"866 1221 2320 9652.0 2 \n",
|
||
"1022 1429 2249 3500.0 4 \n",
|
||
"3922 7249 1827 13385.0 1 \n",
|
||
"54425 1070539 1800 19800.0 1 \n",
|
||
"69520 1216801 1623 12562.0 2 \n",
|
||
"30056 59330 1551 0.0 1 \n",
|
||
"3243 5441 1544 14133.0 2 \n",
|
||
"55195 1084435 1500 16500.0 1 \n",
|
||
"28983 57816 1485 0.0 2 \n",
|
||
"2231 2942 1307 100.0 2 \n",
|
||
"23 24 1266 0.0 2 \n",
|
||
"4513 9592 1211 62.0 4 \n",
|
||
"2936 5059 1186 6308.0 3 \n",
|
||
"11484 25100 1123 0.0 1 \n",
|
||
"934 1326 1098 798.0 3 \n",
|
||
"30156 59490 1088 0.0 1 \n",
|
||
"36478 251268 1086 0.0 2 \n",
|
||
"\n",
|
||
" purchase_date_max purchase_date_min \\\n",
|
||
"0 2023-11-08 15:59:45+00:00 2013-06-10 10:37:58+00:00 \n",
|
||
"3615 2023-11-03 09:42:40+00:00 2015-09-09 13:48:38+00:00 \n",
|
||
"39 2023-10-25 09:13:16+00:00 2014-01-23 16:56:57+00:00 \n",
|
||
"11 2023-11-04 13:46:59+00:00 2018-04-04 07:46:31+00:00 \n",
|
||
"32809 2022-08-25 13:08:38+00:00 2020-08-18 08:32:57+00:00 \n",
|
||
"3708 2021-08-26 12:49:17+00:00 2018-03-26 11:13:43+00:00 \n",
|
||
"32616 2022-09-07 12:55:33+00:00 2017-11-28 13:52:15+00:00 \n",
|
||
"78 2022-08-30 11:51:34+00:00 2017-01-05 13:04:58+00:00 \n",
|
||
"35295 2023-11-06 15:59:22+00:00 2021-05-28 10:22:33+00:00 \n",
|
||
"3377 2022-02-24 07:47:20+00:00 2018-10-25 11:04:24+00:00 \n",
|
||
"30011 2023-06-12 14:05:19+00:00 2019-11-25 08:52:48+00:00 \n",
|
||
"34937 2023-10-02 08:13:05+00:00 2018-02-08 12:54:01+00:00 \n",
|
||
"270 2023-10-16 10:19:22+00:00 2014-01-24 15:16:17+00:00 \n",
|
||
"866 2022-09-19 12:55:15+00:00 2017-03-29 08:00:09+00:00 \n",
|
||
"1022 2023-11-06 08:30:37+00:00 2014-12-03 14:56:38+00:00 \n",
|
||
"3922 2021-10-26 12:28:40+00:00 2019-05-07 12:34:56+00:00 \n",
|
||
"54425 2022-07-25 12:49:27+00:00 2022-05-02 16:09:03+00:00 \n",
|
||
"69520 2023-09-29 16:34:38+00:00 2023-06-16 14:16:04+00:00 \n",
|
||
"30056 2023-11-06 10:22:14+00:00 2018-02-02 08:53:51+00:00 \n",
|
||
"3243 2022-09-22 08:21:47+00:00 2017-12-14 12:50:23+00:00 \n",
|
||
"55195 2022-09-27 14:32:13+00:00 2022-05-18 08:04:41+00:00 \n",
|
||
"28983 2023-05-22 07:30:55+00:00 2019-01-21 14:19:18+00:00 \n",
|
||
"2231 2023-06-29 09:33:58+00:00 2017-10-25 15:06:58+00:00 \n",
|
||
"23 2023-10-19 07:20:48+00:00 2015-09-30 16:07:52+00:00 \n",
|
||
"4513 2023-10-17 09:39:40+00:00 2018-02-25 07:17:19+00:00 \n",
|
||
"2936 2023-05-22 13:41:22+00:00 2018-02-01 11:16:51+00:00 \n",
|
||
"11484 2021-07-13 07:39:57+00:00 2015-12-21 15:38:05+00:00 \n",
|
||
"934 2023-02-01 08:39:45+00:00 2018-02-13 13:13:48+00:00 \n",
|
||
"30156 2023-10-05 08:23:50+00:00 2019-12-06 12:59:20+00:00 \n",
|
||
"36478 2023-06-30 07:22:46+00:00 2018-02-02 09:06:22+00:00 \n",
|
||
"\n",
|
||
" time_between_purchase \n",
|
||
"0 3803 days 05:21:47 \n",
|
||
"3615 2976 days 19:54:02 \n",
|
||
"39 3561 days 16:16:19 \n",
|
||
"11 2040 days 06:00:28 \n",
|
||
"32809 737 days 04:35:41 \n",
|
||
"3708 1249 days 01:35:34 \n",
|
||
"32616 1743 days 23:03:18 \n",
|
||
"78 2062 days 22:46:36 \n",
|
||
"35295 892 days 05:36:49 \n",
|
||
"3377 1217 days 20:42:56 \n",
|
||
"30011 1295 days 05:12:31 \n",
|
||
"34937 2061 days 19:19:04 \n",
|
||
"270 3551 days 19:03:05 \n",
|
||
"866 2000 days 04:55:06 \n",
|
||
"1022 3259 days 17:33:59 \n",
|
||
"3922 902 days 23:53:44 \n",
|
||
"54425 83 days 20:40:24 \n",
|
||
"69520 105 days 02:18:34 \n",
|
||
"30056 2103 days 01:28:23 \n",
|
||
"3243 1742 days 19:31:24 \n",
|
||
"55195 132 days 06:27:32 \n",
|
||
"28983 1581 days 17:11:37 \n",
|
||
"2231 2072 days 18:27:00 \n",
|
||
"23 2940 days 15:12:56 \n",
|
||
"4513 2060 days 02:22:21 \n",
|
||
"2936 1936 days 02:24:31 \n",
|
||
"11484 2030 days 16:01:52 \n",
|
||
"934 1813 days 19:25:57 \n",
|
||
"30156 1398 days 19:24:30 \n",
|
||
"36478 1973 days 22:16:24 "
|
||
]
|
||
},
|
||
"execution_count": 52,
|
||
"metadata": {},
|
||
"output_type": "execute_result"
|
||
}
|
||
],
|
||
"source": [
|
||
"df1_tickets_kpi.sort_values(by='nb_tickets', ascending=False).head(30)"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "markdown",
|
||
"id": "7c3211a5-a851-43bc-a1f0-b39d51857fb7",
|
||
"metadata": {},
|
||
"source": [
|
||
"# Fusion des bases locales"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 39,
|
||
"id": "46de1912-4a66-46e5-8b9e-7768b2d2723b",
|
||
"metadata": {},
|
||
"outputs": [],
|
||
"source": [
|
||
"# Attach the campaign KPIs to the customer table. The left join keeps every\n",
"# row of df1_customerplus_clean, even when it has no match in df1_campaigns_kpi.\n",
"df1_customer = df1_customerplus_clean.merge(\n",
"    df1_campaigns_kpi,\n",
"    how='left',\n",
"    on='customer_id',\n",
")"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 40,
|
||
"id": "9740d64a-e5eb-4967-a534-ca6177546465",
|
||
"metadata": {},
|
||
"outputs": [
|
||
{
|
||
"data": {
|
||
"text/html": [
|
||
"<div>\n",
|
||
"<style scoped>\n",
|
||
" .dataframe tbody tr th:only-of-type {\n",
|
||
" vertical-align: middle;\n",
|
||
" }\n",
|
||
"\n",
|
||
" .dataframe tbody tr th {\n",
|
||
" vertical-align: top;\n",
|
||
" }\n",
|
||
"\n",
|
||
" .dataframe thead th {\n",
|
||
" text-align: right;\n",
|
||
" }\n",
|
||
"</style>\n",
|
||
"<table border=\"1\" class=\"dataframe\">\n",
|
||
" <thead>\n",
|
||
" <tr style=\"text-align: right;\">\n",
|
||
" <th></th>\n",
|
||
" <th>customer_id</th>\n",
|
||
" <th>birthdate</th>\n",
|
||
" <th>street_id</th>\n",
|
||
" <th>is_partner</th>\n",
|
||
" <th>gender</th>\n",
|
||
" <th>is_email_true</th>\n",
|
||
" <th>opt_in</th>\n",
|
||
" <th>structure_id</th>\n",
|
||
" <th>profession</th>\n",
|
||
" <th>language</th>\n",
|
||
" <th>...</th>\n",
|
||
" <th>average_ticket_basket</th>\n",
|
||
" <th>total_price</th>\n",
|
||
" <th>purchase_count</th>\n",
|
||
" <th>first_buying_date</th>\n",
|
||
" <th>country</th>\n",
|
||
" <th>age</th>\n",
|
||
" <th>tenant_id</th>\n",
|
||
" <th>nb_campaigns</th>\n",
|
||
" <th>nb_campaigns_opened</th>\n",
|
||
" <th>time_to_open</th>\n",
|
||
" </tr>\n",
|
||
" </thead>\n",
|
||
" <tbody>\n",
|
||
" <tr>\n",
|
||
" <th>0</th>\n",
|
||
" <td>12751</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>2</td>\n",
|
||
" <td>False</td>\n",
|
||
" <td>1</td>\n",
|
||
" <td>True</td>\n",
|
||
" <td>True</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>NaT</td>\n",
|
||
" <td>fr</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>1311</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>NaT</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>1</th>\n",
|
||
" <td>12825</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>2</td>\n",
|
||
" <td>False</td>\n",
|
||
" <td>2</td>\n",
|
||
" <td>True</td>\n",
|
||
" <td>True</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>NaT</td>\n",
|
||
" <td>fr</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>1311</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>NaT</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>2</th>\n",
|
||
" <td>11261</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>2</td>\n",
|
||
" <td>False</td>\n",
|
||
" <td>1</td>\n",
|
||
" <td>True</td>\n",
|
||
" <td>True</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>NaT</td>\n",
|
||
" <td>fr</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>1311</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>NaT</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>3</th>\n",
|
||
" <td>13071</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>2</td>\n",
|
||
" <td>False</td>\n",
|
||
" <td>2</td>\n",
|
||
" <td>True</td>\n",
|
||
" <td>True</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>NaT</td>\n",
|
||
" <td>fr</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>1311</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>NaT</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>4</th>\n",
|
||
" <td>653061</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>10</td>\n",
|
||
" <td>False</td>\n",
|
||
" <td>2</td>\n",
|
||
" <td>True</td>\n",
|
||
" <td>False</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>NaT</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>1311</td>\n",
|
||
" <td>80.0</td>\n",
|
||
" <td>2.0</td>\n",
|
||
" <td>0 days 19:53:02.500000</td>\n",
|
||
" </tr>\n",
|
||
" </tbody>\n",
|
||
"</table>\n",
|
||
"<p>5 rows × 28 columns</p>\n",
|
||
"</div>"
|
||
],
|
||
"text/plain": [
|
||
" customer_id birthdate street_id is_partner gender is_email_true \\\n",
|
||
"0 12751 NaN 2 False 1 True \n",
|
||
"1 12825 NaN 2 False 2 True \n",
|
||
"2 11261 NaN 2 False 1 True \n",
|
||
"3 13071 NaN 2 False 2 True \n",
|
||
"4 653061 NaN 10 False 2 True \n",
|
||
"\n",
|
||
" opt_in structure_id profession language ... average_ticket_basket \\\n",
|
||
"0 True NaN NaN NaN ... NaN \n",
|
||
"1 True NaN NaN NaN ... NaN \n",
|
||
"2 True NaN NaN NaN ... NaN \n",
|
||
"3 True NaN NaN NaN ... NaN \n",
|
||
"4 False NaN NaN NaN ... NaN \n",
|
||
"\n",
|
||
" total_price purchase_count first_buying_date country age tenant_id \\\n",
|
||
"0 NaN 0 NaT fr NaN 1311 \n",
|
||
"1 NaN 0 NaT fr NaN 1311 \n",
|
||
"2 NaN 0 NaT fr NaN 1311 \n",
|
||
"3 NaN 0 NaT fr NaN 1311 \n",
|
||
"4 NaN 0 NaT NaN NaN 1311 \n",
|
||
"\n",
|
||
" nb_campaigns nb_campaigns_opened time_to_open \n",
|
||
"0 NaN NaN NaT \n",
|
||
"1 NaN NaN NaT \n",
|
||
"2 NaN NaN NaT \n",
|
||
"3 NaN NaN NaT \n",
|
||
"4 80.0 2.0 0 days 19:53:02.500000 \n",
|
||
"\n",
|
||
"[5 rows x 28 columns]"
|
||
]
|
||
},
|
||
"execution_count": 40,
|
||
"metadata": {},
|
||
"output_type": "execute_result"
|
||
}
|
||
],
|
||
"source": [
|
||
"df1_customer.head()"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 49,
|
||
"id": "b5c4418c-ad2e-4bb9-bd5c-3b769e9c87d4",
|
||
"metadata": {},
|
||
"outputs": [
|
||
{
|
||
"data": {
|
||
"text/html": [
|
||
"<div>\n",
|
||
"<style scoped>\n",
|
||
" .dataframe tbody tr th:only-of-type {\n",
|
||
" vertical-align: middle;\n",
|
||
" }\n",
|
||
"\n",
|
||
" .dataframe tbody tr th {\n",
|
||
" vertical-align: top;\n",
|
||
" }\n",
|
||
"\n",
|
||
" .dataframe thead th {\n",
|
||
" text-align: right;\n",
|
||
" }\n",
|
||
"</style>\n",
|
||
"<table border=\"1\" class=\"dataframe\">\n",
|
||
" <thead>\n",
|
||
" <tr style=\"text-align: right;\">\n",
|
||
" <th></th>\n",
|
||
" <th>customer_id</th>\n",
|
||
" <th>birthdate</th>\n",
|
||
" <th>street_id</th>\n",
|
||
" <th>is_partner</th>\n",
|
||
" <th>gender</th>\n",
|
||
" <th>is_email_true</th>\n",
|
||
" <th>opt_in</th>\n",
|
||
" <th>structure_id</th>\n",
|
||
" <th>profession</th>\n",
|
||
" <th>language</th>\n",
|
||
" <th>mcp_contact_id</th>\n",
|
||
" <th>last_buying_date</th>\n",
|
||
" <th>max_price</th>\n",
|
||
" <th>ticket_sum</th>\n",
|
||
" <th>average_price</th>\n",
|
||
" <th>fidelity</th>\n",
|
||
" <th>average_purchase_delay</th>\n",
|
||
" <th>average_price_basket</th>\n",
|
||
" <th>average_ticket_basket</th>\n",
|
||
" <th>total_price</th>\n",
|
||
" <th>purchase_count</th>\n",
|
||
" <th>first_buying_date</th>\n",
|
||
" <th>country</th>\n",
|
||
" <th>age</th>\n",
|
||
" <th>tenant_id</th>\n",
|
||
" <th>nb_campaigns</th>\n",
|
||
" <th>nb_campaigns_opened</th>\n",
|
||
" <th>time_to_open</th>\n",
|
||
" </tr>\n",
|
||
" </thead>\n",
|
||
" <tbody>\n",
|
||
" <tr>\n",
|
||
" <th>58201</th>\n",
|
||
" <td>1</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>2</td>\n",
|
||
" <td>False</td>\n",
|
||
" <td>2</td>\n",
|
||
" <td>True</td>\n",
|
||
" <td>False</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>2023-11-08 03:20:07</td>\n",
|
||
" <td>45.0</td>\n",
|
||
" <td>1254775</td>\n",
|
||
" <td>7.030122</td>\n",
|
||
" <td>330831</td>\n",
|
||
" <td>-67.790969</td>\n",
|
||
" <td>13.75153</td>\n",
|
||
" <td>1.956087</td>\n",
|
||
" <td>8821221.5</td>\n",
|
||
" <td>641472</td>\n",
|
||
" <td>2013-06-10 10:37:58+00:00</td>\n",
|
||
" <td>fr</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>1311</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>NaT</td>\n",
|
||
" </tr>\n",
|
||
" </tbody>\n",
|
||
"</table>\n",
|
||
"</div>"
|
||
],
|
||
"text/plain": [
|
||
" customer_id birthdate street_id is_partner gender is_email_true \\\n",
|
||
"58201 1 NaN 2 False 2 True \n",
|
||
"\n",
|
||
" opt_in structure_id profession language mcp_contact_id \\\n",
|
||
"58201 False NaN NaN NaN NaN \n",
|
||
"\n",
|
||
" last_buying_date max_price ticket_sum average_price fidelity \\\n",
|
||
"58201 2023-11-08 03:20:07 45.0 1254775 7.030122 330831 \n",
|
||
"\n",
|
||
" average_purchase_delay average_price_basket average_ticket_basket \\\n",
|
||
"58201 -67.790969 13.75153 1.956087 \n",
|
||
"\n",
|
||
" total_price purchase_count first_buying_date country age \\\n",
|
||
"58201 8821221.5 641472 2013-06-10 10:37:58+00:00 fr NaN \n",
|
||
"\n",
|
||
" tenant_id nb_campaigns nb_campaigns_opened time_to_open \n",
|
||
"58201 1311 NaN NaN NaT "
|
||
]
|
||
},
|
||
"execution_count": 49,
|
||
"metadata": {},
|
||
"output_type": "execute_result"
|
||
}
|
||
],
|
||
"source": [
|
||
"pd.set_option('display.max_columns', None)\n",
|
||
"\n",
|
||
"\n",
|
||
"df1_customer[df1_customer['customer_id'] == 1]"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 41,
|
||
"id": "1e42a790-b215-4107-a969-85005da06ebd",
|
||
"metadata": {},
|
||
"outputs": [],
|
||
"source": [
|
||
"# Fusion avec KPI liés au comportement d'achat\n",
|
||
"# df1_customer_product = pd.merge(df1_products_purchased_reduced, df1_products_purchased, on = 'customer_id', how = 'outer')"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 42,
|
||
"id": "d950f24d-a5d1-4f1e-aeaa-ca826470365f",
|
||
"metadata": {},
|
||
"outputs": [],
|
||
"source": [
|
||
"# df1_customer_product"
|
||
]
|
||
}
|
||
],
|
||
"metadata": {
|
||
"kernelspec": {
|
||
"display_name": "Python 3 (ipykernel)",
|
||
"language": "python",
|
||
"name": "python3"
|
||
},
|
||
"language_info": {
|
||
"codemirror_mode": {
|
||
"name": "ipython",
|
||
"version": 3
|
||
},
|
||
"file_extension": ".py",
|
||
"mimetype": "text/x-python",
|
||
"name": "python",
|
||
"nbconvert_exporter": "python",
|
||
"pygments_lexer": "ipython3",
|
||
"version": "3.10.13"
|
||
}
|
||
},
|
||
"nbformat": 4,
|
||
"nbformat_minor": 5
|
||
}
|