{
"cells": [
{
"cell_type": "markdown",
"id": "ad414c84-be46-4d2c-be8b-9fc4d24cc672",
"metadata": {},
"source": [
"# Business Data Challenge - Team 1"
]
},
{
"cell_type": "code",
"execution_count": 1,
"id": "15103481-8d74-404c-aa09-7601fe7730da",
"metadata": {},
"outputs": [],
"source": [
"import pandas as pd\n",
"import numpy as np\n",
"import os\n",
"import s3fs\n",
"import re"
]
},
{
"cell_type": "markdown",
"id": "ee97665c-39af-4c1c-a62b-c9c79feae18f",
"metadata": {},
"source": [
"Configuration de l'accès aux données"
]
},
{
"cell_type": "code",
"execution_count": 2,
"id": "5d83bb1a-d341-446e-91f6-1c428607f6d4",
"metadata": {},
"outputs": [],
"source": [
"# Create filesystem object\n",
"S3_ENDPOINT_URL = \"https://\" + os.environ[\"AWS_S3_ENDPOINT\"]\n",
"fs = s3fs.S3FileSystem(client_kwargs={'endpoint_url': S3_ENDPOINT_URL})"
]
},
{
"cell_type": "markdown",
"id": "9cbd72c5-6f8e-4366-ab66-96c32c6e963a",
"metadata": {},
"source": [
"# Exemple sur Company 1"
]
},
{
"cell_type": "markdown",
"id": "db26e59a-927c-407e-b54b-1815473b0b34",
"metadata": {},
"source": [
"## Chargement données"
]
},
{
"cell_type": "code",
"execution_count": 3,
"id": "699664b9-eee4-4f8d-a207-e524526560c5",
"metadata": {},
"outputs": [],
"source": [
"BUCKET = \"bdc2324-data/1\"\n",
"liste_database = fs.ls(BUCKET)"
]
},
{
"cell_type": "code",
"execution_count": 4,
"id": "dd6a3518-b752-4a1e-b77b-9e03e853c3ed",
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"/tmp/ipykernel_15815/4081512283.py:10: DtypeWarning: Columns (1) have mixed types. Specify dtype option on import or set low_memory=False.\n",
" df = pd.read_csv(file_in)\n"
]
}
],
"source": [
"# loop to create dataframes from liste\n",
"files_path = liste_database\n",
"\n",
"client_number = files_path[0].split(\"/\")[1]\n",
"df_prefix = \"df\" + str(client_number) + \"_\"\n",
"\n",
"for i in range(len(files_path)) :\n",
" current_path = files_path[i]\n",
" with fs.open(current_path, mode=\"rb\") as file_in:\n",
" df = pd.read_csv(file_in)\n",
" # the pattern of the name is df1xxx\n",
" nom_dataframe = df_prefix + re.search(r'\\/(\\d+)\\/(\\d+)([a-zA-Z_]+)\\.csv$', current_path).group(3)\n",
" globals()[nom_dataframe] = df"
]
},
{
"cell_type": "markdown",
"id": "4004c8bf-11d9-413d-bb42-2cb8ddde7716",
"metadata": {},
"source": [
"## Cleaning functions"
]
},
{
"cell_type": "code",
"execution_count": 5,
"id": "d237be96-8c86-4a91-b7a1-487e87a16c3d",
"metadata": {},
"outputs": [],
"source": [
"def cleaning_date(df, column_name):\n",
" \"\"\"\n",
" Nettoie la colonne spécifiée du DataFrame en convertissant les valeurs en datetime avec le format ISO8601.\n",
"\n",
" Parameters:\n",
" - df: DataFrame\n",
" Le DataFrame contenant la colonne à nettoyer.\n",
" - column_name: str\n",
" Le nom de la colonne à nettoyer.\n",
"\n",
" Returns:\n",
" - DataFrame\n",
" Le DataFrame modifié avec la colonne nettoyée.\n",
" \"\"\"\n",
" df[column_name] = pd.to_datetime(df[column_name], utc = True, format = 'ISO8601')\n",
" return df"
]
},
{
"cell_type": "markdown",
"id": "398804d8-2225-4fd3-bceb-75ab1588e359",
"metadata": {},
"source": [
"## Preprocessing"
]
},
{
"cell_type": "markdown",
"id": "568cb180-0dd9-4b27-aecb-05e4c3775ba6",
"metadata": {},
"source": [
"## customer_plus"
]
},
{
"cell_type": "code",
"execution_count": 6,
"id": "7e7b90ce-da54-4f00-bc34-64c543b0858f",
"metadata": {},
"outputs": [],
"source": [
"def preprocessing_customerplus(customerplus = None):\n",
"\n",
" customerplus_copy = customerplus.copy()\n",
" \n",
" # Passage en format date\n",
" cleaning_date(customerplus_copy, 'first_buying_date')\n",
" cleaning_date(customerplus_copy, 'last_visiting_date')\n",
" \n",
" # Selection des variables\n",
" customerplus_copy.drop(['lastname', 'firstname', 'email', 'civility', 'note', 'created_at', 'updated_at', 'deleted_at', 'extra', 'reference', 'extra_field', 'identifier', 'need_reload', 'preferred_category', 'preferred_supplier', 'preferred_formula', 'zipcode', 'last_visiting_date'], axis = 1, inplace=True)\n",
" customerplus_copy.rename(columns = {'id' : 'customer_id'}, inplace = True)\n",
"\n",
" return customerplus_copy\n"
]
},
{
"cell_type": "code",
"execution_count": 7,
"id": "03329e32-00a5-42c8-9470-75f7b6216ccd",
"metadata": {},
"outputs": [],
"source": [
"df1_customerplus_clean = preprocessing_customerplus(df1_customersplus)"
]
},
{
"cell_type": "markdown",
"id": "bade04b1-0cdf-4d10-bcca-7dc7e4831656",
"metadata": {},
"source": [
"## Ticket area"
]
},
{
"cell_type": "code",
"execution_count": 8,
"id": "b95464b1-26bc-4aac-84b4-45da83b92251",
"metadata": {},
"outputs": [],
"source": [
"# Fonction de nettoyage et selection\n",
"def preprocessing_tickets_area(tickets = None, purchases = None, suppliers = None, type_ofs = None):\n",
" # Base des tickets\n",
" tickets = tickets[['id', 'purchase_id', 'product_id', 'is_from_subscription', 'type_of', 'supplier_id']]\n",
" tickets.rename(columns = {'id' : 'ticket_id'}, inplace = True)\n",
"\n",
" # Base des fournisseurs\n",
" suppliers = suppliers[['id', 'name']]\n",
" suppliers.rename(columns = {'name' : 'supplier_name'}, inplace = True)\n",
"\n",
" # Base des types de billets\n",
" type_ofs = type_ofs[['id', 'name', 'children']]\n",
" type_ofs.rename(columns = {'name' : 'type_of_ticket_name'}, inplace = True)\n",
"\n",
" # Base des achats\n",
" # Nettoyage de la date d'achat\n",
" cleaning_date(purchases, 'purchase_date')\n",
" # Selection des variables\n",
" purchases = purchases[['id', 'purchase_date', 'customer_id']]\n",
"\n",
" # Fusions \n",
" # Fusion avec fournisseurs\n",
" ticket_information = pd.merge(tickets, suppliers, left_on = 'supplier_id', right_on = 'id', how = 'inner')\n",
" ticket_information.drop(['supplier_id', 'id'], axis = 1, inplace=True)\n",
" \n",
" # Fusion avec type de tickets\n",
" ticket_information = pd.merge(ticket_information, type_ofs, left_on = 'type_of', right_on = 'id', how = 'inner')\n",
" ticket_information.drop(['type_of', 'id'], axis = 1, inplace=True)\n",
" \n",
" # Fusion avec achats\n",
" ticket_information = pd.merge(ticket_information, purchases, left_on = 'purchase_id', right_on = 'id', how = 'inner')\n",
" ticket_information.drop(['purchase_id', 'id'], axis = 1, inplace=True)\n",
"\n",
" return ticket_information"
]
},
{
"cell_type": "code",
"execution_count": 9,
"id": "3e1d2ba7-ff4f-48eb-93a8-2bb648c70396",
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"/tmp/ipykernel_15815/1591303091.py:5: SettingWithCopyWarning: \n",
"A value is trying to be set on a copy of a slice from a DataFrame\n",
"\n",
"See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n",
" tickets.rename(columns = {'id' : 'ticket_id'}, inplace = True)\n",
"/tmp/ipykernel_15815/1591303091.py:9: SettingWithCopyWarning: \n",
"A value is trying to be set on a copy of a slice from a DataFrame\n",
"\n",
"See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n",
" suppliers.rename(columns = {'name' : 'supplier_name'}, inplace = True)\n",
"/tmp/ipykernel_15815/1591303091.py:13: SettingWithCopyWarning: \n",
"A value is trying to be set on a copy of a slice from a DataFrame\n",
"\n",
"See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n",
" type_ofs.rename(columns = {'name' : 'type_of_ticket_name'}, inplace = True)\n"
]
}
],
"source": [
"df1_ticket_information = preprocessing_tickets_area(tickets = df1_tickets, purchases = df1_purchases, suppliers = df1_suppliers, type_ofs = df1_type_ofs)"
]
},
{
"cell_type": "code",
"execution_count": 10,
"id": "4b18edfc-6450-4c6a-9e7b-ee5a5808c8c9",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"
\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" ticket_id | \n",
" product_id | \n",
" is_from_subscription | \n",
" supplier_name | \n",
" type_of_ticket_name | \n",
" children | \n",
" purchase_date | \n",
" customer_id | \n",
"
\n",
" \n",
" \n",
" \n",
" 0 | \n",
" 13070859 | \n",
" 225251 | \n",
" False | \n",
" vente en ligne | \n",
" Atelier | \n",
" pricing_formula | \n",
" 2018-12-28 14:47:50+00:00 | \n",
" 48187 | \n",
"
\n",
" \n",
" 1 | \n",
" 13070860 | \n",
" 224914 | \n",
" False | \n",
" vente en ligne | \n",
" Atelier | \n",
" pricing_formula | \n",
" 2018-12-28 14:47:50+00:00 | \n",
" 48187 | \n",
"
\n",
" \n",
" 2 | \n",
" 13070861 | \n",
" 224914 | \n",
" False | \n",
" vente en ligne | \n",
" Atelier | \n",
" pricing_formula | \n",
" 2018-12-28 14:47:50+00:00 | \n",
" 48187 | \n",
"
\n",
" \n",
" 3 | \n",
" 13070862 | \n",
" 224914 | \n",
" False | \n",
" vente en ligne | \n",
" Atelier | \n",
" pricing_formula | \n",
" 2018-12-28 14:47:50+00:00 | \n",
" 48187 | \n",
"
\n",
" \n",
" 4 | \n",
" 13070863 | \n",
" 224914 | \n",
" False | \n",
" vente en ligne | \n",
" Atelier | \n",
" pricing_formula | \n",
" 2018-12-28 14:47:50+00:00 | \n",
" 48187 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" ticket_id product_id is_from_subscription supplier_name \\\n",
"0 13070859 225251 False vente en ligne \n",
"1 13070860 224914 False vente en ligne \n",
"2 13070861 224914 False vente en ligne \n",
"3 13070862 224914 False vente en ligne \n",
"4 13070863 224914 False vente en ligne \n",
"\n",
" type_of_ticket_name children purchase_date customer_id \n",
"0 Atelier pricing_formula 2018-12-28 14:47:50+00:00 48187 \n",
"1 Atelier pricing_formula 2018-12-28 14:47:50+00:00 48187 \n",
"2 Atelier pricing_formula 2018-12-28 14:47:50+00:00 48187 \n",
"3 Atelier pricing_formula 2018-12-28 14:47:50+00:00 48187 \n",
"4 Atelier pricing_formula 2018-12-28 14:47:50+00:00 48187 "
]
},
"execution_count": 10,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df1_ticket_information.head()"
]
},
{
"cell_type": "markdown",
"id": "096e47f4-1d65-4575-989d-83227eedad2b",
"metadata": {},
"source": [
"## Target area"
]
},
{
"cell_type": "code",
"execution_count": 11,
"id": "baed146a-9d3a-4397-a812-3d50c9a2f038",
"metadata": {},
"outputs": [],
"source": [
"def preprocessing_target_area(targets = None, target_types = None, customer_target_mappings = None):\n",
" # Target.csv cleaning\n",
" targets = targets[[\"id\", \"target_type_id\", \"name\"]]\n",
" targets.rename(columns = {'id' : 'target_id' , 'name' : 'target_name'}, inplace = True)\n",
" \n",
" # target_type cleaning\n",
" target_types = target_types[[\"id\",\"is_import\",\"name\"]].add_prefix(\"target_type_\")\n",
" \n",
" #customer_target_mappings cleaning\n",
" customer_target_mappings = customer_target_mappings[[\"id\", \"customer_id\", \"target_id\"]]\n",
" \n",
" # Merge target et target_type\n",
" targets_full = pd.merge(targets, target_types, left_on='target_type_id', right_on='target_type_id', how='inner')\n",
" targets_full.drop(['target_type_id'], axis = 1, inplace=True)\n",
" \n",
" # Merge\n",
" targets_full = pd.merge(customer_target_mappings, targets_full, left_on='target_id', right_on='target_id', how='inner')\n",
" targets_full.drop(['target_id'], axis = 1, inplace=True)\n",
"\n",
" return targets_full"
]
},
{
"cell_type": "code",
"execution_count": 12,
"id": "5fbfd88b-b94c-489c-9201-670e96e453e7",
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"/tmp/ipykernel_15815/3848597476.py:4: SettingWithCopyWarning: \n",
"A value is trying to be set on a copy of a slice from a DataFrame\n",
"\n",
"See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n",
" targets.rename(columns = {'id' : 'target_id' , 'name' : 'target_name'}, inplace = True)\n"
]
}
],
"source": [
"df1_target_information = preprocessing_target_area(targets = df1_targets, target_types = df1_target_types, customer_target_mappings = df1_customer_target_mappings)"
]
},
{
"cell_type": "code",
"execution_count": 13,
"id": "b4f05142-2a22-42ef-a60d-f23cc4b5cb09",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" customer_id | \n",
"
\n",
" \n",
" target_name | \n",
" | \n",
"
\n",
" \n",
" \n",
" \n",
" consentement optin mediation specialisee | \n",
" 150000 | \n",
"
\n",
" \n",
" consentement optin jeune public | \n",
" 149979 | \n",
"
\n",
" \n",
" consentement optin b2c | \n",
" 108909 | \n",
"
\n",
" \n",
" Arenametrix_bascule tel vers sib | \n",
" 35216 | \n",
"
\n",
" \n",
" consentement optout b2c | \n",
" 34523 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" customer_id\n",
"target_name \n",
"consentement optin mediation specialisee 150000\n",
"consentement optin jeune public 149979\n",
"consentement optin b2c 108909\n",
"Arenametrix_bascule tel vers sib 35216\n",
"consentement optout b2c 34523"
]
},
"execution_count": 13,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df1_target_information[['target_name', 'customer_id']].groupby('target_name').count().sort_values(by='customer_id', ascending=False).head()"
]
},
{
"cell_type": "code",
"execution_count": 14,
"id": "4417ff51-f501-4ab9-a192-4ab75764a8ed",
"metadata": {
"scrolled": true
},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" customer_id | \n",
"
\n",
" \n",
" target_name | \n",
" | \n",
"
\n",
" \n",
" \n",
" \n",
" Arenametrix_bascule tel vers sib | \n",
" 35216 | \n",
"
\n",
" \n",
" Autres_interet_exposition | \n",
" 1021 | \n",
"
\n",
" \n",
" COM Inscrits NL générale (historique) | \n",
" 23005 | \n",
"
\n",
" \n",
" Contacts_prenomsdoubles | \n",
" 11643 | \n",
"
\n",
" \n",
" DDCP MD Procès du Siècle | \n",
" 1684 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" customer_id\n",
"target_name \n",
"Arenametrix_bascule tel vers sib 35216\n",
"Autres_interet_exposition 1021\n",
"COM Inscrits NL générale (historique) 23005\n",
"Contacts_prenomsdoubles 11643\n",
"DDCP MD Procès du Siècle 1684"
]
},
"execution_count": 14,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df1_target_information_reduced = df1_target_information[['target_name', 'customer_id']].groupby('target_name').count()\n",
"df1_target_information_reduced[df1_target_information_reduced['customer_id'] >= 1000].head()"
]
},
{
"cell_type": "markdown",
"id": "cdbb48b4-5e16-4ef4-8791-ed213d68d52f",
"metadata": {},
"source": [
"## Campaings area"
]
},
{
"cell_type": "code",
"execution_count": 15,
"id": "d883cc7b-ac43-4485-b86f-eaf595fbad85",
"metadata": {},
"outputs": [],
"source": [
"def preprocessing_campaigns_area(campaign_stats = None, campaigns = None):\n",
" # campaign_stats cleaning \n",
" campaign_stats = campaign_stats[[\"id\", \"campaign_id\", \"customer_id\", \"opened_at\", \"sent_at\", \"delivered_at\"]]\n",
" cleaning_date(campaign_stats, 'opened_at')\n",
" cleaning_date(campaign_stats, 'sent_at')\n",
" cleaning_date(campaign_stats, 'delivered_at')\n",
" \n",
" # campaigns cleaning\n",
" campaigns = campaigns[[\"id\", \"name\", \"service_id\", \"sent_at\"]].add_prefix(\"campaign_\")\n",
" cleaning_date(campaigns, 'campaign_sent_at')\n",
" \n",
" # Merge \n",
" campaigns_full = pd.merge(campaign_stats, campaigns, on = \"campaign_id\", how = \"left\")\n",
" campaigns_full.drop(['campaign_id'], axis = 1, inplace=True)\n",
"\n",
" return campaigns_full"
]
},
{
"cell_type": "code",
"execution_count": 16,
"id": "c8552dd6-52c5-4431-b43d-3cd6c578fd9f",
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"/tmp/ipykernel_15815/1967867975.py:15: SettingWithCopyWarning: \n",
"A value is trying to be set on a copy of a slice from a DataFrame.\n",
"Try using .loc[row_indexer,col_indexer] = value instead\n",
"\n",
"See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n",
" df[column_name] = pd.to_datetime(df[column_name], utc = True, format = 'ISO8601')\n",
"/tmp/ipykernel_15815/1967867975.py:15: SettingWithCopyWarning: \n",
"A value is trying to be set on a copy of a slice from a DataFrame.\n",
"Try using .loc[row_indexer,col_indexer] = value instead\n",
"\n",
"See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n",
" df[column_name] = pd.to_datetime(df[column_name], utc = True, format = 'ISO8601')\n",
"/tmp/ipykernel_15815/1967867975.py:15: SettingWithCopyWarning: \n",
"A value is trying to be set on a copy of a slice from a DataFrame.\n",
"Try using .loc[row_indexer,col_indexer] = value instead\n",
"\n",
"See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n",
" df[column_name] = pd.to_datetime(df[column_name], utc = True, format = 'ISO8601')\n"
]
}
],
"source": [
"df1_campaigns_information = preprocessing_campaigns_area(campaign_stats = df1_campaign_stats, campaigns = df1_campaigns)"
]
},
{
"cell_type": "code",
"execution_count": 17,
"id": "c24457e7-3cad-451a-a65b-7373b656bd6e",
"metadata": {
"scrolled": true
},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" id | \n",
" customer_id | \n",
" opened_at | \n",
" sent_at | \n",
" delivered_at | \n",
" campaign_name | \n",
" campaign_service_id | \n",
" campaign_sent_at | \n",
"
\n",
" \n",
" \n",
" \n",
" 0 | \n",
" 19793 | \n",
" 112597 | \n",
" NaT | \n",
" 2021-03-28 16:01:09+00:00 | \n",
" 2021-03-28 16:24:18+00:00 | \n",
" Le Mucem chez vous, gardons le lien #22 | \n",
" 404 | \n",
" 2021-03-27 23:00:00+00:00 | \n",
"
\n",
" \n",
" 1 | \n",
" 14211 | \n",
" 113666 | \n",
" NaT | \n",
" 2021-03-28 16:01:09+00:00 | \n",
" 2021-03-28 16:21:02+00:00 | \n",
" Le Mucem chez vous, gardons le lien #22 | \n",
" 404 | \n",
" 2021-03-27 23:00:00+00:00 | \n",
"
\n",
" \n",
" 2 | \n",
" 13150 | \n",
" 280561 | \n",
" NaT | \n",
" 2021-03-28 16:00:59+00:00 | \n",
" 2021-03-28 16:08:45+00:00 | \n",
" Le Mucem chez vous, gardons le lien #22 | \n",
" 404 | \n",
" 2021-03-27 23:00:00+00:00 | \n",
"
\n",
" \n",
" 3 | \n",
" 7073 | \n",
" 101007 | \n",
" 2021-03-28 18:11:06+00:00 | \n",
" 2021-03-28 16:00:59+00:00 | \n",
" 2021-03-28 16:09:47+00:00 | \n",
" Le Mucem chez vous, gardons le lien #22 | \n",
" 404 | \n",
" 2021-03-27 23:00:00+00:00 | \n",
"
\n",
" \n",
" 4 | \n",
" 5175 | \n",
" 103972 | \n",
" NaT | \n",
" 2021-03-28 16:01:06+00:00 | \n",
" 2021-03-28 16:05:03+00:00 | \n",
" Le Mucem chez vous, gardons le lien #22 | \n",
" 404 | \n",
" 2021-03-27 23:00:00+00:00 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" id customer_id opened_at sent_at \\\n",
"0 19793 112597 NaT 2021-03-28 16:01:09+00:00 \n",
"1 14211 113666 NaT 2021-03-28 16:01:09+00:00 \n",
"2 13150 280561 NaT 2021-03-28 16:00:59+00:00 \n",
"3 7073 101007 2021-03-28 18:11:06+00:00 2021-03-28 16:00:59+00:00 \n",
"4 5175 103972 NaT 2021-03-28 16:01:06+00:00 \n",
"\n",
" delivered_at campaign_name \\\n",
"0 2021-03-28 16:24:18+00:00 Le Mucem chez vous, gardons le lien #22 \n",
"1 2021-03-28 16:21:02+00:00 Le Mucem chez vous, gardons le lien #22 \n",
"2 2021-03-28 16:08:45+00:00 Le Mucem chez vous, gardons le lien #22 \n",
"3 2021-03-28 16:09:47+00:00 Le Mucem chez vous, gardons le lien #22 \n",
"4 2021-03-28 16:05:03+00:00 Le Mucem chez vous, gardons le lien #22 \n",
"\n",
" campaign_service_id campaign_sent_at \n",
"0 404 2021-03-27 23:00:00+00:00 \n",
"1 404 2021-03-27 23:00:00+00:00 \n",
"2 404 2021-03-27 23:00:00+00:00 \n",
"3 404 2021-03-27 23:00:00+00:00 \n",
"4 404 2021-03-27 23:00:00+00:00 "
]
},
"execution_count": 17,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df1_campaigns_information.head()"
]
},
{
"cell_type": "code",
"execution_count": 18,
"id": "e2c88552-b863-47a2-be23-8d2898fb28bc",
"metadata": {},
"outputs": [],
"source": [
"def campaigns_kpi_function(campaigns_information = None):\n",
" # Nombre de campagnes de mails\n",
" nb_campaigns = campaigns_information[['customer_id', 'campaign_name']].groupby('customer_id').count().reset_index()\n",
" nb_campaigns.rename(columns = {'campaign_name' : 'nb_campaigns'}, inplace = True)\n",
" # Temps d'ouverture en min moyen \n",
" campaigns_information['time_to_open'] = campaigns_information['opened_at'] - campaigns_information['delivered_at']\n",
" time_to_open = campaigns_information[['customer_id', 'time_to_open']].groupby('customer_id').mean().reset_index()\n",
"\n",
" # Nombre de mail ouvert \n",
" opened_campaign = campaigns_information[['customer_id', 'campaign_name', 'opened_at']]\n",
" opened_campaign.dropna(subset=['opened_at'], inplace=True)\n",
" opened_campaign = opened_campaign[['customer_id', 'campaign_name']].groupby('customer_id').count().reset_index()\n",
" opened_campaign.rename(columns = {'campaign_name' : 'nb_campaigns_opened' }, inplace = True)\n",
"\n",
" # Fusion des indicateurs\n",
" campaigns_reduced = pd.merge(nb_campaigns, opened_campaign, on = 'customer_id', how = 'left')\n",
" campaigns_reduced = pd.merge(campaigns_reduced, time_to_open, on = 'customer_id', how = 'left')\n",
"\n",
" # Remplir les NaN : nb_campaigns_opened\n",
" campaigns_reduced['nb_campaigns_opened'].fillna(0, inplace=True)\n",
"\n",
" # Remplir les NaT : time_to_open (??)\n",
"\n",
" return campaigns_reduced\n",
" "
]
},
{
"cell_type": "code",
"execution_count": 19,
"id": "24537647-bc29-4777-9848-ac4120a4aa60",
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"/tmp/ipykernel_15815/3700263836.py:11: SettingWithCopyWarning: \n",
"A value is trying to be set on a copy of a slice from a DataFrame\n",
"\n",
"See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n",
" opened_campaign.dropna(subset=['opened_at'], inplace=True)\n"
]
}
],
"source": [
"df1_campaigns_kpi = campaigns_kpi_function(campaigns_information = df1_campaigns_information) "
]
},
{
"cell_type": "code",
"execution_count": 20,
"id": "6be2a9a6-056b-4e19-8c26-a18ba3df36b3",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" customer_id | \n",
" nb_campaigns | \n",
" nb_campaigns_opened | \n",
" time_to_open | \n",
"
\n",
" \n",
" \n",
" \n",
" 0 | \n",
" 2 | \n",
" 4 | \n",
" 0.0 | \n",
" NaT | \n",
"
\n",
" \n",
" 1 | \n",
" 3 | \n",
" 222 | \n",
" 124.0 | \n",
" 1 days 00:28:30.169354838 | \n",
"
\n",
" \n",
" 2 | \n",
" 4 | \n",
" 7 | \n",
" 7.0 | \n",
" 1 days 04:31:01.428571428 | \n",
"
\n",
" \n",
" 3 | \n",
" 5 | \n",
" 4 | \n",
" 0.0 | \n",
" NaT | \n",
"
\n",
" \n",
" 4 | \n",
" 6 | \n",
" 20 | \n",
" 0.0 | \n",
" NaT | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" customer_id nb_campaigns nb_campaigns_opened time_to_open\n",
"0 2 4 0.0 NaT\n",
"1 3 222 124.0 1 days 00:28:30.169354838\n",
"2 4 7 7.0 1 days 04:31:01.428571428\n",
"3 5 4 0.0 NaT\n",
"4 6 20 0.0 NaT"
]
},
"execution_count": 20,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df1_campaigns_kpi.head()"
]
},
{
"cell_type": "markdown",
"id": "56520a97-ede8-4920-a211-3b5b136af33d",
"metadata": {},
"source": [
"## Create Products Table"
]
},
{
"cell_type": "markdown",
"id": "9782e9d3-ba20-46bf-8562-bd0969972ddc",
"metadata": {},
"source": [
"Some useful functions"
]
},
{
"cell_type": "code",
"execution_count": 21,
"id": "30488a40-1b38-4b9a-9d3b-26a0597c5e6d",
"metadata": {},
"outputs": [],
"source": [
"BUCKET = \"bdc2324-data\"\n",
"directory_path = '1'"
]
},
{
"cell_type": "code",
"execution_count": 22,
"id": "607eb4b4-eed9-4b50-b823-f75c116dd37c",
"metadata": {},
"outputs": [],
"source": [
"def display_databases(file_name):\n",
" \"\"\"\n",
" This function returns the file from s3 storage\n",
" \"\"\"\n",
" file_path = BUCKET + \"/\" + directory_path + \"/\" + file_name\n",
" print(\"File path : \", file_path)\n",
" with fs.open(file_path, mode=\"rb\") as file_in:\n",
" df = pd.read_csv(file_in, sep=\",\")\n",
" \n",
" print(\"Shape : \", df.shape)\n",
" return df\n",
"\n",
"\n",
"def remove_horodates(df):\n",
" \"\"\"\n",
" this function remove horodate columns like created_at and updated_at\n",
" \"\"\"\n",
" df = df.drop(columns = [\"created_at\", \"updated_at\"])\n",
" return df\n",
"\n",
"\n",
"def order_columns_id(df):\n",
" \"\"\"\n",
" this function puts all id columns at the beginning in order to read the dataset easier\n",
" \"\"\"\n",
" substring = 'id'\n",
" id_columns = [col for col in df.columns if substring in col]\n",
" remaining_col = [col for col in df.columns if substring not in col]\n",
" new_order = id_columns + remaining_col\n",
" return df[new_order]\n",
"\n",
"\n",
"def process_df_2(df):\n",
" \"\"\"\n",
" This function organizes dataframe\n",
" \"\"\"\n",
" df = remove_horodates(df)\n",
" print(\"Number of columns : \", len(df.columns))\n",
" df = order_columns_id(df)\n",
" print(\"Columns : \", df.columns)\n",
" return df\n",
"\n",
"def load_dataset(name):\n",
" \"\"\"\n",
" This function loads csv file\n",
" \"\"\"\n",
" df = display_databases(name)\n",
" df = process_df_2(df)\n",
" # drop na :\n",
" #df = df.dropna(axis=1, thresh=len(df))\n",
" # if identifier in table : delete it\n",
" if 'identifier' in df.columns:\n",
" df = df.drop(columns = 'identifier')\n",
" return df"
]
},
{
"cell_type": "markdown",
"id": "d23f28c0-bc95-438b-8d14-5b7bb6e267bd",
"metadata": {},
"source": [
"Create theme tables"
]
},
{
"cell_type": "code",
"execution_count": 23,
"id": "350b09b9-451f-4d47-81fe-f34b892db027",
"metadata": {},
"outputs": [],
"source": [
"def create_products_table():\n",
" # first merge products and categories\n",
" print(\"first merge products and categories\")\n",
" products = load_dataset(\"1products.csv\")\n",
" categories = load_dataset(\"1categories.csv\")\n",
" # Drop useless columns\n",
" products = products.drop(columns = ['apply_price', 'extra_field', 'amount_consumption'])\n",
" categories = categories.drop(columns = ['extra_field', 'quota'])\n",
"\n",
" #Merge\n",
" products_theme = products.merge(categories, how = 'left', left_on = 'category_id',\n",
" right_on = 'id', suffixes=('_products', '_categories'))\n",
" products_theme = products_theme.rename(columns = {\"name\" : \"name_categories\"})\n",
" \n",
" # Second merge products_theme and type of categories\n",
" print(\"Second merge products_theme and type of categories\")\n",
" type_of_categories = load_dataset(\"1type_of_categories.csv\")\n",
" type_of_categories = type_of_categories.drop(columns = 'id')\n",
" products_theme = products_theme.merge(type_of_categories, how = 'left', left_on = 'category_id',\n",
" right_on = 'category_id' )\n",
"\n",
" # Index cleaning\n",
" products_theme = products_theme.drop(columns = ['id_categories'])\n",
" products_theme = order_columns_id(products_theme)\n",
" return products_theme\n",
"\n",
"\n",
"def create_events_table():\n",
" # first merge events and seasons : \n",
" print(\"first merge events and seasons : \")\n",
" events = load_dataset(\"1events.csv\")\n",
" seasons = load_dataset(\"1seasons.csv\")\n",
"\n",
" # Drop useless columns\n",
" events = events.drop(columns = ['manual_added', 'is_display'])\n",
" seasons = seasons.drop(columns = ['start_date_time'])\n",
" \n",
" events_theme = events.merge(seasons, how = 'left', left_on = 'season_id', right_on = 'id', suffixes=('_events', '_seasons'))\n",
"\n",
" # Secondly merge events_theme and event_types\n",
" print(\"Secondly merge events_theme and event_types : \")\n",
" event_types = load_dataset(\"1event_types.csv\")\n",
" event_types = event_types.drop(columns = ['fidelity_delay'])\n",
" \n",
" events_theme = events_theme.merge(event_types, how = 'left', left_on = 'event_type_id', right_on = 'id', suffixes=('_events', '_event_type'))\n",
" events_theme = events_theme.rename(columns = {\"name\" : \"name_event_types\"})\n",
" events_theme = events_theme.drop(columns = 'id')\n",
"\n",
" # thirdly merge events_theme and facilities\n",
" print(\"thirdly merge events_theme and facilities : \")\n",
" facilities = load_dataset(\"1facilities.csv\")\n",
" facilities = facilities.drop(columns = ['fixed_capacity'])\n",
" \n",
" events_theme = events_theme.merge(facilities, how = 'left', left_on = 'facility_id', right_on = 'id', suffixes=('_events', '_facility'))\n",
" events_theme = events_theme.rename(columns = {\"name\" : \"name_facilities\", \"id_events\" : \"event_id\"})\n",
" events_theme = events_theme.drop(columns = 'id')\n",
"\n",
" # Index cleaning\n",
" events_theme = events_theme.drop(columns = ['id_seasons'])\n",
" events_theme = order_columns_id(events_theme)\n",
" return events_theme\n",
"\n",
"\n",
"def create_representations_table():\n",
" representations = load_dataset(\"1representations.csv\")\n",
" representations = representations.drop(columns = ['serial', 'open', 'satisfaction', 'is_display', 'expected_filling',\n",
" 'max_filling', 'extra_field', 'start_date_time', 'end_date_time', 'name',\n",
" 'representation_type_id'])\n",
" \n",
" representations_capacity = load_dataset(\"1representation_category_capacities.csv\")\n",
" representations_capacity = representations_capacity.drop(columns = ['expected_filling', 'max_filling'])\n",
"\n",
" representations_theme = representations.merge(representations_capacity, how='left',\n",
" left_on='id', right_on='representation_id',\n",
" suffixes=('_representation', '_representation_cap'))\n",
" # index cleaning\n",
" representations_theme = representations_theme.drop(columns = [\"id_representation\"])\n",
" representations_theme = order_columns_id(representations_theme)\n",
" return representations_theme"
]
},
{
"cell_type": "code",
"execution_count": 24,
"id": "0fccc8ef-e575-4857-a401-94a7274394df",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"first merge products and categories\n",
"File path : bdc2324-data/1/1products.csv\n",
"Shape : (94803, 14)\n",
"Number of columns : 12\n",
"Columns : Index(['id', 'representation_id', 'pricing_formula_id', 'category_id',\n",
" 'products_group_id', 'product_pack_id', 'identifier', 'amount',\n",
" 'is_full_price', 'apply_price', 'extra_field', 'amount_consumption'],\n",
" dtype='object')\n",
"File path : bdc2324-data/1/1categories.csv\n",
"Shape : (27, 7)\n",
"Number of columns : 5\n",
"Columns : Index(['id', 'identifier', 'name', 'extra_field', 'quota'], dtype='object')\n",
"Second merge products_theme and type of categories\n",
"File path : bdc2324-data/1/1type_of_categories.csv\n",
"Shape : (5, 6)\n",
"Number of columns : 4\n",
"Columns : Index(['id', 'type_of_id', 'category_id', 'identifier'], dtype='object')\n"
]
},
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" id_products | \n",
" representation_id | \n",
" pricing_formula_id | \n",
" category_id | \n",
" products_group_id | \n",
" product_pack_id | \n",
" type_of_id | \n",
" amount | \n",
" is_full_price | \n",
" name_categories | \n",
"
\n",
" \n",
" \n",
" \n",
" 0 | \n",
" 10682 | \n",
" 914 | \n",
" 114 | \n",
" 41 | \n",
" 10655 | \n",
" 1 | \n",
" NaN | \n",
" 9.0 | \n",
" False | \n",
" indiv activité tr | \n",
"
\n",
" \n",
" 1 | \n",
" 478 | \n",
" 273 | \n",
" 131 | \n",
" 1 | \n",
" 471 | \n",
" 1 | \n",
" 12.0 | \n",
" 9.5 | \n",
" False | \n",
" indiv entrées tp | \n",
"
\n",
" \n",
" 2 | \n",
" 20873 | \n",
" 275 | \n",
" 137 | \n",
" 1 | \n",
" 20825 | \n",
" 1 | \n",
" 12.0 | \n",
" 11.5 | \n",
" False | \n",
" indiv entrées tp | \n",
"
\n",
" \n",
" 3 | \n",
" 157142 | \n",
" 82519 | \n",
" 9 | \n",
" 5 | \n",
" 156773 | \n",
" 1 | \n",
" NaN | \n",
" 8.0 | \n",
" False | \n",
" indiv entrées tr | \n",
"
\n",
" \n",
" 4 | \n",
" 1341 | \n",
" 9 | \n",
" 93 | \n",
" 1 | \n",
" 1175 | \n",
" 1 | \n",
" 12.0 | \n",
" 8.5 | \n",
" False | \n",
" indiv entrées tp | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" id_products representation_id pricing_formula_id category_id \\\n",
"0 10682 914 114 41 \n",
"1 478 273 131 1 \n",
"2 20873 275 137 1 \n",
"3 157142 82519 9 5 \n",
"4 1341 9 93 1 \n",
"\n",
" products_group_id product_pack_id type_of_id amount is_full_price \\\n",
"0 10655 1 NaN 9.0 False \n",
"1 471 1 12.0 9.5 False \n",
"2 20825 1 12.0 11.5 False \n",
"3 156773 1 NaN 8.0 False \n",
"4 1175 1 12.0 8.5 False \n",
"\n",
" name_categories \n",
"0 indiv activité tr \n",
"1 indiv entrées tp \n",
"2 indiv entrées tp \n",
"3 indiv entrées tr \n",
"4 indiv entrées tp "
]
},
"execution_count": 24,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"products_theme = create_products_table()\n",
"products_theme.head()"
]
},
{
"cell_type": "code",
"execution_count": 25,
"id": "779d8aaf-6668-4f66-8852-847304407ea3",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"first merge events and seasons : \n",
"File path : bdc2324-data/1/1events.csv\n",
"Shape : (1232, 12)\n",
"Number of columns : 10\n",
"Columns : Index(['id', 'season_id', 'facility_id', 'event_type_id', 'event_type_key_id',\n",
" 'facility_key_id', 'identifier', 'name', 'manual_added', 'is_display'],\n",
" dtype='object')\n",
"File path : bdc2324-data/1/1seasons.csv\n",
"Shape : (13, 6)\n",
"Number of columns : 4\n",
"Columns : Index(['id', 'identifier', 'name', 'start_date_time'], dtype='object')\n",
"Secondly merge events_theme and event_types : \n",
"File path : bdc2324-data/1/1event_types.csv\n",
"Shape : (9, 6)\n",
"Number of columns : 4\n",
"Columns : Index(['id', 'fidelity_delay', 'identifier', 'name'], dtype='object')\n",
"thirdly merge events_theme and facilities : \n",
"File path : bdc2324-data/1/1facilities.csv\n",
"Shape : (2, 7)\n",
"Number of columns : 5\n",
"Columns : Index(['id', 'street_id', 'identifier', 'name', 'fixed_capacity'], dtype='object')\n"
]
},
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" event_id | \n",
" season_id | \n",
" facility_id | \n",
" event_type_id | \n",
" event_type_key_id | \n",
" facility_key_id | \n",
" street_id | \n",
" name_events | \n",
" name_seasons | \n",
" name_event_types | \n",
" name_facilities | \n",
"
\n",
" \n",
" \n",
" \n",
" 0 | \n",
" 192 | \n",
" 16 | \n",
" 1 | \n",
" 4 | \n",
" 4 | \n",
" 1 | \n",
" 1 | \n",
" frontières | \n",
" 2018 | \n",
" spectacle vivant | \n",
" mucem | \n",
"
\n",
" \n",
" 1 | \n",
" 30329 | \n",
" 2767 | \n",
" 1 | \n",
" 5 | \n",
" 5 | \n",
" 1 | \n",
" 1 | \n",
" visite guidée une autre histoire du monde (1h00) | \n",
" 2023 | \n",
" offre muséale groupe | \n",
" mucem | \n",
"
\n",
" \n",
" 2 | \n",
" 161 | \n",
" 16 | \n",
" 1 | \n",
" 2 | \n",
" 2 | \n",
" 1 | \n",
" 1 | \n",
" visite contée les chercheurs d'or indiv | \n",
" 2018 | \n",
" offre muséale individuel | \n",
" mucem | \n",
"
\n",
" \n",
" 3 | \n",
" 5957 | \n",
" 582 | \n",
" 1 | \n",
" 4 | \n",
" 4 | \n",
" 1 | \n",
" 1 | \n",
" we dreamt of utopia and we woke up screaming. | \n",
" 2021 | \n",
" spectacle vivant | \n",
" mucem | \n",
"
\n",
" \n",
" 4 | \n",
" 8337 | \n",
" 582 | \n",
" 1 | \n",
" 4 | \n",
" 4 | \n",
" 1 | \n",
" 1 | \n",
" jeff koons épisodes 4 | \n",
" 2021 | \n",
" spectacle vivant | \n",
" mucem | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" event_id season_id facility_id event_type_id event_type_key_id \\\n",
"0 192 16 1 4 4 \n",
"1 30329 2767 1 5 5 \n",
"2 161 16 1 2 2 \n",
"3 5957 582 1 4 4 \n",
"4 8337 582 1 4 4 \n",
"\n",
" facility_key_id street_id \\\n",
"0 1 1 \n",
"1 1 1 \n",
"2 1 1 \n",
"3 1 1 \n",
"4 1 1 \n",
"\n",
" name_events name_seasons \\\n",
"0 frontières 2018 \n",
"1 visite guidée une autre histoire du monde (1h00) 2023 \n",
"2 visite contée les chercheurs d'or indiv 2018 \n",
"3 we dreamt of utopia and we woke up screaming. 2021 \n",
"4 jeff koons épisodes 4 2021 \n",
"\n",
" name_event_types name_facilities \n",
"0 spectacle vivant mucem \n",
"1 offre muséale groupe mucem \n",
"2 offre muséale individuel mucem \n",
"3 spectacle vivant mucem \n",
"4 spectacle vivant mucem "
]
},
"execution_count": 25,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"events_theme= create_events_table()\n",
"events_theme.head()"
]
},
{
"cell_type": "code",
"execution_count": 26,
"id": "7714fa32-303b-4ea7-b174-3fd0fcab5af0",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"File path : bdc2324-data/1/1representations.csv\n",
"Shape : (36095, 16)\n",
"Number of columns : 14\n",
"Columns : Index(['id', 'event_id', 'representation_type_id', 'identifier', 'serial',\n",
" 'start_date_time', 'open', 'satisfaction', 'end_date_time', 'name',\n",
" 'is_display', 'expected_filling', 'max_filling', 'extra_field'],\n",
" dtype='object')\n",
"File path : bdc2324-data/1/1representation_category_capacities.csv\n",
"Shape : (65241, 7)\n",
"Number of columns : 5\n",
"Columns : Index(['id', 'representation_id', 'category_id', 'expected_filling',\n",
" 'max_filling'],\n",
" dtype='object')\n"
]
},
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" event_id | \n",
" id_representation_cap | \n",
" representation_id | \n",
" category_id | \n",
"
\n",
" \n",
" \n",
" \n",
" 0 | \n",
" 12384 | \n",
" 123058 | \n",
" 84820 | \n",
" 2 | \n",
"
\n",
" \n",
" 1 | \n",
" 37 | \n",
" 2514 | \n",
" 269 | \n",
" 2 | \n",
"
\n",
" \n",
" 2 | \n",
" 37 | \n",
" 384 | \n",
" 269 | \n",
" 5 | \n",
"
\n",
" \n",
" 3 | \n",
" 37 | \n",
" 2515 | \n",
" 269 | \n",
" 10 | \n",
"
\n",
" \n",
" 4 | \n",
" 37 | \n",
" 383 | \n",
" 269 | \n",
" 1 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" event_id id_representation_cap representation_id category_id\n",
"0 12384 123058 84820 2\n",
"1 37 2514 269 2\n",
"2 37 384 269 5\n",
"3 37 2515 269 10\n",
"4 37 383 269 1"
]
},
"execution_count": 26,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"representation_theme = create_representations_table()\n",
"representation_theme.head()"
]
},
{
"cell_type": "markdown",
"id": "8fa191d5-c867-4d4d-bbab-f29d7d91ce6a",
"metadata": {},
"source": [
"Create uniform product database "
]
},
{
"cell_type": "code",
"execution_count": 27,
"id": "15a62ed6-35e4-4abc-aeef-a7daeec0a4ba",
"metadata": {},
"outputs": [],
"source": [
"def uniform_product_df():\n",
" \"\"\"\n",
" This function returns the uniform product dataset\n",
" \"\"\"\n",
" print(\"Products theme columns : \", products_theme.columns)\n",
" print(\"\\n Representation theme columns : \", representation_theme.columns)\n",
" print(\"\\n Events theme columns : \", events_theme.columns)\n",
"\n",
" products_global = products_theme.merge(representation_theme, how='left',\n",
" on= [\"representation_id\", \"category_id\"])\n",
" \n",
" products_global = products_global.merge(events_theme, how='left', on='event_id',\n",
" suffixes = (\"_representation\", \"_event\"))\n",
" \n",
" products_global = order_columns_id(products_global)\n",
"\n",
" # remove useless columns \n",
" products_global = products_global.drop(columns = ['type_of_id']) # 'name_events', 'name_seasons', 'name_categories'\n",
" return products_global"
]
},
{
"cell_type": "code",
"execution_count": 28,
"id": "89dc9685-1de9-4ce3-a6c0-8d7f1931a951",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Products theme columns : Index(['id_products', 'representation_id', 'pricing_formula_id', 'category_id',\n",
" 'products_group_id', 'product_pack_id', 'type_of_id', 'amount',\n",
" 'is_full_price', 'name_categories'],\n",
" dtype='object')\n",
"\n",
" Representation theme columns : Index(['event_id', 'id_representation_cap', 'representation_id',\n",
" 'category_id'],\n",
" dtype='object')\n",
"\n",
" Events theme columns : Index(['event_id', 'season_id', 'facility_id', 'event_type_id',\n",
" 'event_type_key_id', 'facility_key_id', 'street_id', 'name_events',\n",
" 'name_seasons', 'name_event_types', 'name_facilities'],\n",
" dtype='object')\n"
]
},
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" id_products | \n",
" representation_id | \n",
" pricing_formula_id | \n",
" category_id | \n",
" products_group_id | \n",
" product_pack_id | \n",
" event_id | \n",
" id_representation_cap | \n",
" season_id | \n",
" facility_id | \n",
" ... | \n",
" event_type_key_id | \n",
" facility_key_id | \n",
" street_id | \n",
" amount | \n",
" is_full_price | \n",
" name_categories | \n",
" name_events | \n",
" name_seasons | \n",
" name_event_types | \n",
" name_facilities | \n",
"
\n",
" \n",
" \n",
" \n",
" 0 | \n",
" 10682 | \n",
" 914 | \n",
" 114 | \n",
" 41 | \n",
" 10655 | \n",
" 1 | \n",
" 132 | \n",
" 8789 | \n",
" 4 | \n",
" 1 | \n",
" ... | \n",
" 5 | \n",
" 1 | \n",
" 1 | \n",
" 9.0 | \n",
" False | \n",
" indiv activité tr | \n",
" visite-jeu \"le classico des minots\" (1h30) | \n",
" 2017 | \n",
" offre muséale individuel | \n",
" mucem | \n",
"
\n",
" \n",
" 1 | \n",
" 478 | \n",
" 273 | \n",
" 131 | \n",
" 1 | \n",
" 471 | \n",
" 1 | \n",
" 37 | \n",
" 390 | \n",
" 2 | \n",
" 1 | \n",
" ... | \n",
" 2 | \n",
" 1 | \n",
" 1 | \n",
" 9.5 | \n",
" False | \n",
" indiv entrées tp | \n",
" billet mucem picasso | \n",
" 2016 | \n",
" offre muséale individuel | \n",
" mucem | \n",
"
\n",
" \n",
" 2 | \n",
" 20873 | \n",
" 275 | \n",
" 137 | \n",
" 1 | \n",
" 20825 | \n",
" 1 | \n",
" 37 | \n",
" 395 | \n",
" 2 | \n",
" 1 | \n",
" ... | \n",
" 2 | \n",
" 1 | \n",
" 1 | \n",
" 11.5 | \n",
" False | \n",
" indiv entrées tp | \n",
" billet mucem picasso | \n",
" 2016 | \n",
" offre muséale individuel | \n",
" mucem | \n",
"
\n",
" \n",
" 3 | \n",
" 157142 | \n",
" 82519 | \n",
" 9 | \n",
" 5 | \n",
" 156773 | \n",
" 1 | \n",
" 12365 | \n",
" 120199 | \n",
" 1754 | \n",
" 1 | \n",
" ... | \n",
" 4 | \n",
" 1 | \n",
" 1 | \n",
" 8.0 | \n",
" False | \n",
" indiv entrées tr | \n",
" NaN | \n",
" NaN | \n",
" offre muséale individuel | \n",
" mucem | \n",
"
\n",
" \n",
" 4 | \n",
" 1341 | \n",
" 9 | \n",
" 93 | \n",
" 1 | \n",
" 1175 | \n",
" 1 | \n",
" 8 | \n",
" 21 | \n",
" 4 | \n",
" 1 | \n",
" ... | \n",
" 6 | \n",
" 1 | \n",
" 1 | \n",
" 8.5 | \n",
" False | \n",
" indiv entrées tp | \n",
" non défini | \n",
" 2017 | \n",
" non défini | \n",
" mucem | \n",
"
\n",
" \n",
"
\n",
"
5 rows × 21 columns
\n",
"
"
],
"text/plain": [
" id_products representation_id pricing_formula_id category_id \\\n",
"0 10682 914 114 41 \n",
"1 478 273 131 1 \n",
"2 20873 275 137 1 \n",
"3 157142 82519 9 5 \n",
"4 1341 9 93 1 \n",
"\n",
" products_group_id product_pack_id event_id id_representation_cap \\\n",
"0 10655 1 132 8789 \n",
"1 471 1 37 390 \n",
"2 20825 1 37 395 \n",
"3 156773 1 12365 120199 \n",
"4 1175 1 8 21 \n",
"\n",
" season_id facility_id ... event_type_key_id facility_key_id street_id \\\n",
"0 4 1 ... 5 1 1 \n",
"1 2 1 ... 2 1 1 \n",
"2 2 1 ... 2 1 1 \n",
"3 1754 1 ... 4 1 1 \n",
"4 4 1 ... 6 1 1 \n",
"\n",
" amount is_full_price name_categories \\\n",
"0 9.0 False indiv activité tr \n",
"1 9.5 False indiv entrées tp \n",
"2 11.5 False indiv entrées tp \n",
"3 8.0 False indiv entrées tr \n",
"4 8.5 False indiv entrées tp \n",
"\n",
" name_events name_seasons \\\n",
"0 visite-jeu \"le classico des minots\" (1h30) 2017 \n",
"1 billet mucem picasso 2016 \n",
"2 billet mucem picasso 2016 \n",
"3 NaN NaN \n",
"4 non défini 2017 \n",
"\n",
" name_event_types name_facilities \n",
"0 offre muséale individuel mucem \n",
"1 offre muséale individuel mucem \n",
"2 offre muséale individuel mucem \n",
"3 offre muséale individuel mucem \n",
"4 non défini mucem \n",
"\n",
"[5 rows x 21 columns]"
]
},
"execution_count": 28,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"products_global = uniform_product_df()\n",
"products_global.head()"
]
},
{
"cell_type": "code",
"execution_count": 29,
"id": "98f78cd5-b694-4cc6-b033-20170aa13e8d",
"metadata": {},
"outputs": [],
"source": [
"# Fusion liée au product\n",
"df1_products_purchased = pd.merge(df1_ticket_information, products_global, left_on = 'product_id', right_on = 'id_products', how = 'inner')"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "52db7bcb-3fb7-48e5-b612-4e22bdab4a94",
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "markdown",
"id": "d4dcfbe0-c6ce-497e-b75e-dc9e938801b2",
"metadata": {},
"source": [
"### KPI tickets"
]
},
{
"cell_type": "code",
"execution_count": 30,
"id": "665a5925-9c0e-425a-8f11-c33a0a9ec444",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"Index(['ticket_id', 'product_id', 'is_from_subscription', 'supplier_name',\n",
" 'type_of_ticket_name', 'children', 'purchase_date', 'customer_id',\n",
" 'id_products', 'representation_id', 'pricing_formula_id', 'category_id',\n",
" 'products_group_id', 'product_pack_id', 'event_id',\n",
" 'id_representation_cap', 'season_id', 'facility_id', 'event_type_id',\n",
" 'event_type_key_id', 'facility_key_id', 'street_id', 'amount',\n",
" 'is_full_price', 'name_categories', 'name_events', 'name_seasons',\n",
" 'name_event_types', 'name_facilities'],\n",
" dtype='object')"
]
},
"execution_count": 30,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df1_products_purchased.columns"
]
},
{
"cell_type": "code",
"execution_count": 31,
"id": "b913a69e-3146-4919-b5f6-a6108532bffa",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"array(['spectacle vivant', 'offre muséale individuel', 'formule adhésion',\n",
" 'offre muséale groupe'], dtype=object)"
]
},
"execution_count": 31,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df1_products_purchased['name_event_types'].unique()"
]
},
{
"cell_type": "code",
"execution_count": 32,
"id": "e01e8cf9-1187-4a4b-993d-b7b4321cd8f0",
"metadata": {},
"outputs": [],
"source": [
"df1_products_purchased_reduced = df1_products_purchased[['ticket_id', 'customer_id', 'event_type_id', 'supplier_name', 'purchase_date', 'type_of_ticket_name', 'amount', 'children', 'is_full_price', 'name_event_types', 'name_facilities', 'name_categories', 'name_events', 'name_seasons']]"
]
},
{
"cell_type": "code",
"execution_count": 53,
"id": "3d8b0875-b409-44ce-b688-d9d6758782d3",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" ticket_id | \n",
" customer_id | \n",
" event_type_id | \n",
" supplier_name | \n",
" purchase_date | \n",
" type_of_ticket_name | \n",
" amount | \n",
" children | \n",
" is_full_price | \n",
" name_event_types | \n",
" name_facilities | \n",
" name_categories | \n",
" name_events | \n",
" name_seasons | \n",
"
\n",
" \n",
" \n",
" \n",
" 0 | \n",
" 13070859 | \n",
" 48187 | \n",
" 4 | \n",
" vente en ligne | \n",
" 2018-12-28 14:47:50+00:00 | \n",
" Atelier | \n",
" 8.0 | \n",
" pricing_formula | \n",
" False | \n",
" spectacle vivant | \n",
" mucem | \n",
" indiv prog enfant | \n",
" l'école des magiciens | \n",
" 2018 | \n",
"
\n",
" \n",
" 1 | \n",
" 13070855 | \n",
" 48187 | \n",
" 4 | \n",
" vente en ligne | \n",
" 2018-12-28 14:47:50+00:00 | \n",
" Atelier | \n",
" 8.0 | \n",
" pricing_formula | \n",
" False | \n",
" spectacle vivant | \n",
" mucem | \n",
" indiv prog enfant | \n",
" l'école des magiciens | \n",
" 2018 | \n",
"
\n",
" \n",
" 2 | \n",
" 13070856 | \n",
" 48187 | \n",
" 4 | \n",
" vente en ligne | \n",
" 2018-12-28 14:47:50+00:00 | \n",
" Atelier | \n",
" 8.0 | \n",
" pricing_formula | \n",
" False | \n",
" spectacle vivant | \n",
" mucem | \n",
" indiv prog enfant | \n",
" l'école des magiciens | \n",
" 2018 | \n",
"
\n",
" \n",
" 3 | \n",
" 13070857 | \n",
" 48187 | \n",
" 4 | \n",
" vente en ligne | \n",
" 2018-12-28 14:47:50+00:00 | \n",
" Atelier | \n",
" 8.0 | \n",
" pricing_formula | \n",
" False | \n",
" spectacle vivant | \n",
" mucem | \n",
" indiv prog enfant | \n",
" l'école des magiciens | \n",
" 2018 | \n",
"
\n",
" \n",
" 4 | \n",
" 13070858 | \n",
" 48187 | \n",
" 4 | \n",
" vente en ligne | \n",
" 2018-12-28 14:47:50+00:00 | \n",
" Atelier | \n",
" 8.0 | \n",
" pricing_formula | \n",
" False | \n",
" spectacle vivant | \n",
" mucem | \n",
" indiv prog enfant | \n",
" l'école des magiciens | \n",
" 2018 | \n",
"
\n",
" \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
"
\n",
" \n",
" 1826667 | \n",
" 18643494 | \n",
" 81 | \n",
" 4 | \n",
" vad | \n",
" 2022-08-02 12:18:16+00:00 | \n",
" Billet en nombre | \n",
" 11.0 | \n",
" pricing_formula | \n",
" False | \n",
" spectacle vivant | \n",
" mucem | \n",
" en nb entrées tr | \n",
" NaN | \n",
" 2022 | \n",
"
\n",
" \n",
" 1826668 | \n",
" 18643495 | \n",
" 81 | \n",
" 4 | \n",
" vad | \n",
" 2022-08-02 12:18:16+00:00 | \n",
" Billet en nombre | \n",
" 11.0 | \n",
" pricing_formula | \n",
" False | \n",
" spectacle vivant | \n",
" mucem | \n",
" en nb entrées tr | \n",
" NaN | \n",
" 2022 | \n",
"
\n",
" \n",
" 1826669 | \n",
" 18643496 | \n",
" 81 | \n",
" 4 | \n",
" vad | \n",
" 2022-08-02 12:18:16+00:00 | \n",
" Billet en nombre | \n",
" 11.0 | \n",
" pricing_formula | \n",
" False | \n",
" spectacle vivant | \n",
" mucem | \n",
" en nb entrées tr | \n",
" NaN | \n",
" 2022 | \n",
"
\n",
" \n",
" 1826670 | \n",
" 18643497 | \n",
" 81 | \n",
" 4 | \n",
" vad | \n",
" 2022-08-02 12:18:16+00:00 | \n",
" Billet en nombre | \n",
" 11.0 | \n",
" pricing_formula | \n",
" False | \n",
" spectacle vivant | \n",
" mucem | \n",
" en nb entrées tr | \n",
" NaN | \n",
" 2022 | \n",
"
\n",
" \n",
" 1826671 | \n",
" 19853111 | \n",
" 62763 | \n",
" 4 | \n",
" vad | \n",
" 2022-11-04 14:25:42+00:00 | \n",
" Billet en nombre | \n",
" 0.0 | \n",
" pricing_formula | \n",
" False | \n",
" spectacle vivant | \n",
" mucem | \n",
" indiv entrées gr | \n",
" NaN | \n",
" 2022 | \n",
"
\n",
" \n",
"
\n",
"
1826672 rows × 14 columns
\n",
"
"
],
"text/plain": [
" ticket_id customer_id event_type_id supplier_name \\\n",
"0 13070859 48187 4 vente en ligne \n",
"1 13070855 48187 4 vente en ligne \n",
"2 13070856 48187 4 vente en ligne \n",
"3 13070857 48187 4 vente en ligne \n",
"4 13070858 48187 4 vente en ligne \n",
"... ... ... ... ... \n",
"1826667 18643494 81 4 vad \n",
"1826668 18643495 81 4 vad \n",
"1826669 18643496 81 4 vad \n",
"1826670 18643497 81 4 vad \n",
"1826671 19853111 62763 4 vad \n",
"\n",
" purchase_date type_of_ticket_name amount \\\n",
"0 2018-12-28 14:47:50+00:00 Atelier 8.0 \n",
"1 2018-12-28 14:47:50+00:00 Atelier 8.0 \n",
"2 2018-12-28 14:47:50+00:00 Atelier 8.0 \n",
"3 2018-12-28 14:47:50+00:00 Atelier 8.0 \n",
"4 2018-12-28 14:47:50+00:00 Atelier 8.0 \n",
"... ... ... ... \n",
"1826667 2022-08-02 12:18:16+00:00 Billet en nombre 11.0 \n",
"1826668 2022-08-02 12:18:16+00:00 Billet en nombre 11.0 \n",
"1826669 2022-08-02 12:18:16+00:00 Billet en nombre 11.0 \n",
"1826670 2022-08-02 12:18:16+00:00 Billet en nombre 11.0 \n",
"1826671 2022-11-04 14:25:42+00:00 Billet en nombre 0.0 \n",
"\n",
" children is_full_price name_event_types name_facilities \\\n",
"0 pricing_formula False spectacle vivant mucem \n",
"1 pricing_formula False spectacle vivant mucem \n",
"2 pricing_formula False spectacle vivant mucem \n",
"3 pricing_formula False spectacle vivant mucem \n",
"4 pricing_formula False spectacle vivant mucem \n",
"... ... ... ... ... \n",
"1826667 pricing_formula False spectacle vivant mucem \n",
"1826668 pricing_formula False spectacle vivant mucem \n",
"1826669 pricing_formula False spectacle vivant mucem \n",
"1826670 pricing_formula False spectacle vivant mucem \n",
"1826671 pricing_formula False spectacle vivant mucem \n",
"\n",
" name_categories name_events name_seasons \n",
"0 indiv prog enfant l'école des magiciens 2018 \n",
"1 indiv prog enfant l'école des magiciens 2018 \n",
"2 indiv prog enfant l'école des magiciens 2018 \n",
"3 indiv prog enfant l'école des magiciens 2018 \n",
"4 indiv prog enfant l'école des magiciens 2018 \n",
"... ... ... ... \n",
"1826667 en nb entrées tr NaN 2022 \n",
"1826668 en nb entrées tr NaN 2022 \n",
"1826669 en nb entrées tr NaN 2022 \n",
"1826670 en nb entrées tr NaN 2022 \n",
"1826671 indiv entrées gr NaN 2022 \n",
"\n",
"[1826672 rows x 14 columns]"
]
},
"execution_count": 53,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Importance des suppliers\n",
"df1_products_purchased_reduced"
]
},
{
"cell_type": "code",
"execution_count": 35,
"id": "2bda0b97-b28b-4070-a57d-aeab0e2f7dfe",
"metadata": {},
"outputs": [],
"source": [
"# Nombre de client assistant à plus de 2 type d'événement\n",
"nb_event_types = df1_products_purchased_reduced[['customer_id', 'name_event_types']].groupby('customer_id').nunique()\n"
]
},
{
"cell_type": "code",
"execution_count": 50,
"id": "043303fe-e90f-4689-a2a9-5d690555a045",
"metadata": {},
"outputs": [],
"source": [
"def tickets_kpi_function(tickets_information = None):\n",
" tickets_information_copy = tickets_information.copy()\n",
" tickets_information_copy['purchase_date_max'] = tickets_information_copy['purchase_date']\n",
" tickets_kpi = (tickets_information_copy[['event_type_id', 'customer_id', 'ticket_id','supplier_name', 'purchase_date', 'purchase_date_max', 'amount']]\n",
" .groupby([ 'customer_id']) # 'event_type_id',\n",
" .agg({'ticket_id': 'count', \n",
" 'amount' : 'sum',\n",
" 'supplier_name': 'nunique',\n",
" 'purchase_date_max' : 'max',\n",
" 'purchase_date' : 'min'})\n",
" .reset_index()\n",
" )\n",
" \n",
" tickets_kpi.rename(columns = {'ticket_id' : 'nb_tickets', \n",
" 'amount' : 'total_amount',\n",
" 'supplier_name' : 'nb_suppliers', \n",
" 'purchase_date' : 'purchase_date_min'}, inplace = True)\n",
" \n",
" tickets_kpi['time_between_purchase'] = tickets_kpi['purchase_date_max'] - tickets_kpi['purchase_date_min']\n",
" \n",
" return tickets_kpi\n",
" "
]
},
{
"cell_type": "code",
"execution_count": 51,
"id": "5882234a-1ed5-4269-87a6-0d75613476e3",
"metadata": {},
"outputs": [],
"source": [
"df1_tickets_kpi = tickets_kpi_function(tickets_information = df1_products_purchased_reduced)"
]
},
{
"cell_type": "code",
"execution_count": 52,
"id": "a7a452a6-cd5e-4c8b-b250-8a7d26e48fad",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" customer_id | \n",
" nb_tickets | \n",
" total_amount | \n",
" nb_suppliers | \n",
" purchase_date_max | \n",
" purchase_date_min | \n",
" time_between_purchase | \n",
"
\n",
" \n",
" \n",
" \n",
" 0 | \n",
" 1 | \n",
" 1256574 | \n",
" 8830567.5 | \n",
" 7 | \n",
" 2023-11-08 15:59:45+00:00 | \n",
" 2013-06-10 10:37:58+00:00 | \n",
" 3803 days 05:21:47 | \n",
"
\n",
" \n",
" 3615 | \n",
" 6733 | \n",
" 35527 | \n",
" 1188.0 | \n",
" 4 | \n",
" 2023-11-03 09:42:40+00:00 | \n",
" 2015-09-09 13:48:38+00:00 | \n",
" 2976 days 19:54:02 | \n",
"
\n",
" \n",
" 39 | \n",
" 41 | \n",
" 16263 | \n",
" 37642.0 | \n",
" 6 | \n",
" 2023-10-25 09:13:16+00:00 | \n",
" 2014-01-23 16:56:57+00:00 | \n",
" 3561 days 16:16:19 | \n",
"
\n",
" \n",
" 11 | \n",
" 12 | \n",
" 5871 | \n",
" 38767.0 | \n",
" 2 | \n",
" 2023-11-04 13:46:59+00:00 | \n",
" 2018-04-04 07:46:31+00:00 | \n",
" 2040 days 06:00:28 | \n",
"
\n",
" \n",
" 32809 | \n",
" 63488 | \n",
" 5851 | \n",
" 64350.0 | \n",
" 1 | \n",
" 2022-08-25 13:08:38+00:00 | \n",
" 2020-08-18 08:32:57+00:00 | \n",
" 737 days 04:35:41 | \n",
"
\n",
" \n",
" 3708 | \n",
" 6916 | \n",
" 5482 | \n",
" 51489.5 | \n",
" 2 | \n",
" 2021-08-26 12:49:17+00:00 | \n",
" 2018-03-26 11:13:43+00:00 | \n",
" 1249 days 01:35:34 | \n",
"
\n",
" \n",
" 32616 | \n",
" 63194 | \n",
" 4507 | \n",
" 13232.0 | \n",
" 3 | \n",
" 2022-09-07 12:55:33+00:00 | \n",
" 2017-11-28 13:52:15+00:00 | \n",
" 1743 days 23:03:18 | \n",
"
\n",
" \n",
" 78 | \n",
" 81 | \n",
" 3562 | \n",
" 38746.0 | \n",
" 1 | \n",
" 2022-08-30 11:51:34+00:00 | \n",
" 2017-01-05 13:04:58+00:00 | \n",
" 2062 days 22:46:36 | \n",
"
\n",
" \n",
" 35295 | \n",
" 84002 | \n",
" 3403 | \n",
" 19830.0 | \n",
" 4 | \n",
" 2023-11-06 15:59:22+00:00 | \n",
" 2021-05-28 10:22:33+00:00 | \n",
" 892 days 05:36:49 | \n",
"
\n",
" \n",
" 3377 | \n",
" 5618 | \n",
" 3294 | \n",
" 31684.5 | \n",
" 1 | \n",
" 2022-02-24 07:47:20+00:00 | \n",
" 2018-10-25 11:04:24+00:00 | \n",
" 1217 days 20:42:56 | \n",
"
\n",
" \n",
" 30011 | \n",
" 59259 | \n",
" 2591 | \n",
" 4350.0 | \n",
" 3 | \n",
" 2023-06-12 14:05:19+00:00 | \n",
" 2019-11-25 08:52:48+00:00 | \n",
" 1295 days 05:12:31 | \n",
"
\n",
" \n",
" 34937 | \n",
" 74876 | \n",
" 2571 | \n",
" 2600.0 | \n",
" 2 | \n",
" 2023-10-02 08:13:05+00:00 | \n",
" 2018-02-08 12:54:01+00:00 | \n",
" 2061 days 19:19:04 | \n",
"
\n",
" \n",
" 270 | \n",
" 295 | \n",
" 2570 | \n",
" 17678.5 | \n",
" 6 | \n",
" 2023-10-16 10:19:22+00:00 | \n",
" 2014-01-24 15:16:17+00:00 | \n",
" 3551 days 19:03:05 | \n",
"
\n",
" \n",
" 866 | \n",
" 1221 | \n",
" 2320 | \n",
" 9652.0 | \n",
" 2 | \n",
" 2022-09-19 12:55:15+00:00 | \n",
" 2017-03-29 08:00:09+00:00 | \n",
" 2000 days 04:55:06 | \n",
"
\n",
" \n",
" 1022 | \n",
" 1429 | \n",
" 2249 | \n",
" 3500.0 | \n",
" 4 | \n",
" 2023-11-06 08:30:37+00:00 | \n",
" 2014-12-03 14:56:38+00:00 | \n",
" 3259 days 17:33:59 | \n",
"
\n",
" \n",
" 3922 | \n",
" 7249 | \n",
" 1827 | \n",
" 13385.0 | \n",
" 1 | \n",
" 2021-10-26 12:28:40+00:00 | \n",
" 2019-05-07 12:34:56+00:00 | \n",
" 902 days 23:53:44 | \n",
"
\n",
" \n",
" 54425 | \n",
" 1070539 | \n",
" 1800 | \n",
" 19800.0 | \n",
" 1 | \n",
" 2022-07-25 12:49:27+00:00 | \n",
" 2022-05-02 16:09:03+00:00 | \n",
" 83 days 20:40:24 | \n",
"
\n",
" \n",
" 69520 | \n",
" 1216801 | \n",
" 1623 | \n",
" 12562.0 | \n",
" 2 | \n",
" 2023-09-29 16:34:38+00:00 | \n",
" 2023-06-16 14:16:04+00:00 | \n",
" 105 days 02:18:34 | \n",
"
\n",
" \n",
" 30056 | \n",
" 59330 | \n",
" 1551 | \n",
" 0.0 | \n",
" 1 | \n",
" 2023-11-06 10:22:14+00:00 | \n",
" 2018-02-02 08:53:51+00:00 | \n",
" 2103 days 01:28:23 | \n",
"
\n",
" \n",
" 3243 | \n",
" 5441 | \n",
" 1544 | \n",
" 14133.0 | \n",
" 2 | \n",
" 2022-09-22 08:21:47+00:00 | \n",
" 2017-12-14 12:50:23+00:00 | \n",
" 1742 days 19:31:24 | \n",
"
\n",
" \n",
" 55195 | \n",
" 1084435 | \n",
" 1500 | \n",
" 16500.0 | \n",
" 1 | \n",
" 2022-09-27 14:32:13+00:00 | \n",
" 2022-05-18 08:04:41+00:00 | \n",
" 132 days 06:27:32 | \n",
"
\n",
" \n",
" 28983 | \n",
" 57816 | \n",
" 1485 | \n",
" 0.0 | \n",
" 2 | \n",
" 2023-05-22 07:30:55+00:00 | \n",
" 2019-01-21 14:19:18+00:00 | \n",
" 1581 days 17:11:37 | \n",
"
\n",
" \n",
" 2231 | \n",
" 2942 | \n",
" 1307 | \n",
" 100.0 | \n",
" 2 | \n",
" 2023-06-29 09:33:58+00:00 | \n",
" 2017-10-25 15:06:58+00:00 | \n",
" 2072 days 18:27:00 | \n",
"
\n",
" \n",
" 23 | \n",
" 24 | \n",
" 1266 | \n",
" 0.0 | \n",
" 2 | \n",
" 2023-10-19 07:20:48+00:00 | \n",
" 2015-09-30 16:07:52+00:00 | \n",
" 2940 days 15:12:56 | \n",
"
\n",
" \n",
" 4513 | \n",
" 9592 | \n",
" 1211 | \n",
" 62.0 | \n",
" 4 | \n",
" 2023-10-17 09:39:40+00:00 | \n",
" 2018-02-25 07:17:19+00:00 | \n",
" 2060 days 02:22:21 | \n",
"
\n",
" \n",
" 2936 | \n",
" 5059 | \n",
" 1186 | \n",
" 6308.0 | \n",
" 3 | \n",
" 2023-05-22 13:41:22+00:00 | \n",
" 2018-02-01 11:16:51+00:00 | \n",
" 1936 days 02:24:31 | \n",
"
\n",
" \n",
" 11484 | \n",
" 25100 | \n",
" 1123 | \n",
" 0.0 | \n",
" 1 | \n",
" 2021-07-13 07:39:57+00:00 | \n",
" 2015-12-21 15:38:05+00:00 | \n",
" 2030 days 16:01:52 | \n",
"
\n",
" \n",
" 934 | \n",
" 1326 | \n",
" 1098 | \n",
" 798.0 | \n",
" 3 | \n",
" 2023-02-01 08:39:45+00:00 | \n",
" 2018-02-13 13:13:48+00:00 | \n",
" 1813 days 19:25:57 | \n",
"
\n",
" \n",
" 30156 | \n",
" 59490 | \n",
" 1088 | \n",
" 0.0 | \n",
" 1 | \n",
" 2023-10-05 08:23:50+00:00 | \n",
" 2019-12-06 12:59:20+00:00 | \n",
" 1398 days 19:24:30 | \n",
"
\n",
" \n",
" 36478 | \n",
" 251268 | \n",
" 1086 | \n",
" 0.0 | \n",
" 2 | \n",
" 2023-06-30 07:22:46+00:00 | \n",
" 2018-02-02 09:06:22+00:00 | \n",
" 1973 days 22:16:24 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" customer_id nb_tickets total_amount nb_suppliers \\\n",
"0 1 1256574 8830567.5 7 \n",
"3615 6733 35527 1188.0 4 \n",
"39 41 16263 37642.0 6 \n",
"11 12 5871 38767.0 2 \n",
"32809 63488 5851 64350.0 1 \n",
"3708 6916 5482 51489.5 2 \n",
"32616 63194 4507 13232.0 3 \n",
"78 81 3562 38746.0 1 \n",
"35295 84002 3403 19830.0 4 \n",
"3377 5618 3294 31684.5 1 \n",
"30011 59259 2591 4350.0 3 \n",
"34937 74876 2571 2600.0 2 \n",
"270 295 2570 17678.5 6 \n",
"866 1221 2320 9652.0 2 \n",
"1022 1429 2249 3500.0 4 \n",
"3922 7249 1827 13385.0 1 \n",
"54425 1070539 1800 19800.0 1 \n",
"69520 1216801 1623 12562.0 2 \n",
"30056 59330 1551 0.0 1 \n",
"3243 5441 1544 14133.0 2 \n",
"55195 1084435 1500 16500.0 1 \n",
"28983 57816 1485 0.0 2 \n",
"2231 2942 1307 100.0 2 \n",
"23 24 1266 0.0 2 \n",
"4513 9592 1211 62.0 4 \n",
"2936 5059 1186 6308.0 3 \n",
"11484 25100 1123 0.0 1 \n",
"934 1326 1098 798.0 3 \n",
"30156 59490 1088 0.0 1 \n",
"36478 251268 1086 0.0 2 \n",
"\n",
" purchase_date_max purchase_date_min \\\n",
"0 2023-11-08 15:59:45+00:00 2013-06-10 10:37:58+00:00 \n",
"3615 2023-11-03 09:42:40+00:00 2015-09-09 13:48:38+00:00 \n",
"39 2023-10-25 09:13:16+00:00 2014-01-23 16:56:57+00:00 \n",
"11 2023-11-04 13:46:59+00:00 2018-04-04 07:46:31+00:00 \n",
"32809 2022-08-25 13:08:38+00:00 2020-08-18 08:32:57+00:00 \n",
"3708 2021-08-26 12:49:17+00:00 2018-03-26 11:13:43+00:00 \n",
"32616 2022-09-07 12:55:33+00:00 2017-11-28 13:52:15+00:00 \n",
"78 2022-08-30 11:51:34+00:00 2017-01-05 13:04:58+00:00 \n",
"35295 2023-11-06 15:59:22+00:00 2021-05-28 10:22:33+00:00 \n",
"3377 2022-02-24 07:47:20+00:00 2018-10-25 11:04:24+00:00 \n",
"30011 2023-06-12 14:05:19+00:00 2019-11-25 08:52:48+00:00 \n",
"34937 2023-10-02 08:13:05+00:00 2018-02-08 12:54:01+00:00 \n",
"270 2023-10-16 10:19:22+00:00 2014-01-24 15:16:17+00:00 \n",
"866 2022-09-19 12:55:15+00:00 2017-03-29 08:00:09+00:00 \n",
"1022 2023-11-06 08:30:37+00:00 2014-12-03 14:56:38+00:00 \n",
"3922 2021-10-26 12:28:40+00:00 2019-05-07 12:34:56+00:00 \n",
"54425 2022-07-25 12:49:27+00:00 2022-05-02 16:09:03+00:00 \n",
"69520 2023-09-29 16:34:38+00:00 2023-06-16 14:16:04+00:00 \n",
"30056 2023-11-06 10:22:14+00:00 2018-02-02 08:53:51+00:00 \n",
"3243 2022-09-22 08:21:47+00:00 2017-12-14 12:50:23+00:00 \n",
"55195 2022-09-27 14:32:13+00:00 2022-05-18 08:04:41+00:00 \n",
"28983 2023-05-22 07:30:55+00:00 2019-01-21 14:19:18+00:00 \n",
"2231 2023-06-29 09:33:58+00:00 2017-10-25 15:06:58+00:00 \n",
"23 2023-10-19 07:20:48+00:00 2015-09-30 16:07:52+00:00 \n",
"4513 2023-10-17 09:39:40+00:00 2018-02-25 07:17:19+00:00 \n",
"2936 2023-05-22 13:41:22+00:00 2018-02-01 11:16:51+00:00 \n",
"11484 2021-07-13 07:39:57+00:00 2015-12-21 15:38:05+00:00 \n",
"934 2023-02-01 08:39:45+00:00 2018-02-13 13:13:48+00:00 \n",
"30156 2023-10-05 08:23:50+00:00 2019-12-06 12:59:20+00:00 \n",
"36478 2023-06-30 07:22:46+00:00 2018-02-02 09:06:22+00:00 \n",
"\n",
" time_between_purchase \n",
"0 3803 days 05:21:47 \n",
"3615 2976 days 19:54:02 \n",
"39 3561 days 16:16:19 \n",
"11 2040 days 06:00:28 \n",
"32809 737 days 04:35:41 \n",
"3708 1249 days 01:35:34 \n",
"32616 1743 days 23:03:18 \n",
"78 2062 days 22:46:36 \n",
"35295 892 days 05:36:49 \n",
"3377 1217 days 20:42:56 \n",
"30011 1295 days 05:12:31 \n",
"34937 2061 days 19:19:04 \n",
"270 3551 days 19:03:05 \n",
"866 2000 days 04:55:06 \n",
"1022 3259 days 17:33:59 \n",
"3922 902 days 23:53:44 \n",
"54425 83 days 20:40:24 \n",
"69520 105 days 02:18:34 \n",
"30056 2103 days 01:28:23 \n",
"3243 1742 days 19:31:24 \n",
"55195 132 days 06:27:32 \n",
"28983 1581 days 17:11:37 \n",
"2231 2072 days 18:27:00 \n",
"23 2940 days 15:12:56 \n",
"4513 2060 days 02:22:21 \n",
"2936 1936 days 02:24:31 \n",
"11484 2030 days 16:01:52 \n",
"934 1813 days 19:25:57 \n",
"30156 1398 days 19:24:30 \n",
"36478 1973 days 22:16:24 "
]
},
"execution_count": 52,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df1_tickets_kpi.sort_values(by='nb_tickets', ascending=False).head(30)"
]
},
{
"cell_type": "markdown",
"id": "7c3211a5-a851-43bc-a1f0-b39d51857fb7",
"metadata": {},
"source": [
"# Fusion des bases locales"
]
},
{
"cell_type": "code",
"execution_count": 39,
"id": "46de1912-4a66-46e5-8b9e-7768b2d2723b",
"metadata": {},
"outputs": [],
"source": [
"# Fusion avec KPI liés au customer\n",
"df1_customer = pd.merge(df1_customerplus_clean, df1_campaigns_kpi, on = 'customer_id', how = 'left')"
]
},
{
"cell_type": "code",
"execution_count": 40,
"id": "9740d64a-e5eb-4967-a534-ca6177546465",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" customer_id | \n",
" birthdate | \n",
" street_id | \n",
" is_partner | \n",
" gender | \n",
" is_email_true | \n",
" opt_in | \n",
" structure_id | \n",
" profession | \n",
" language | \n",
" ... | \n",
" average_ticket_basket | \n",
" total_price | \n",
" purchase_count | \n",
" first_buying_date | \n",
" country | \n",
" age | \n",
" tenant_id | \n",
" nb_campaigns | \n",
" nb_campaigns_opened | \n",
" time_to_open | \n",
"
\n",
" \n",
" \n",
" \n",
" 0 | \n",
" 12751 | \n",
" NaN | \n",
" 2 | \n",
" False | \n",
" 1 | \n",
" True | \n",
" True | \n",
" NaN | \n",
" NaN | \n",
" NaN | \n",
" ... | \n",
" NaN | \n",
" NaN | \n",
" 0 | \n",
" NaT | \n",
" fr | \n",
" NaN | \n",
" 1311 | \n",
" NaN | \n",
" NaN | \n",
" NaT | \n",
"
\n",
" \n",
" 1 | \n",
" 12825 | \n",
" NaN | \n",
" 2 | \n",
" False | \n",
" 2 | \n",
" True | \n",
" True | \n",
" NaN | \n",
" NaN | \n",
" NaN | \n",
" ... | \n",
" NaN | \n",
" NaN | \n",
" 0 | \n",
" NaT | \n",
" fr | \n",
" NaN | \n",
" 1311 | \n",
" NaN | \n",
" NaN | \n",
" NaT | \n",
"
\n",
" \n",
" 2 | \n",
" 11261 | \n",
" NaN | \n",
" 2 | \n",
" False | \n",
" 1 | \n",
" True | \n",
" True | \n",
" NaN | \n",
" NaN | \n",
" NaN | \n",
" ... | \n",
" NaN | \n",
" NaN | \n",
" 0 | \n",
" NaT | \n",
" fr | \n",
" NaN | \n",
" 1311 | \n",
" NaN | \n",
" NaN | \n",
" NaT | \n",
"
\n",
" \n",
" 3 | \n",
" 13071 | \n",
" NaN | \n",
" 2 | \n",
" False | \n",
" 2 | \n",
" True | \n",
" True | \n",
" NaN | \n",
" NaN | \n",
" NaN | \n",
" ... | \n",
" NaN | \n",
" NaN | \n",
" 0 | \n",
" NaT | \n",
" fr | \n",
" NaN | \n",
" 1311 | \n",
" NaN | \n",
" NaN | \n",
" NaT | \n",
"
\n",
" \n",
" 4 | \n",
" 653061 | \n",
" NaN | \n",
" 10 | \n",
" False | \n",
" 2 | \n",
" True | \n",
" False | \n",
" NaN | \n",
" NaN | \n",
" NaN | \n",
" ... | \n",
" NaN | \n",
" NaN | \n",
" 0 | \n",
" NaT | \n",
" NaN | \n",
" NaN | \n",
" 1311 | \n",
" 80.0 | \n",
" 2.0 | \n",
" 0 days 19:53:02.500000 | \n",
"
\n",
" \n",
"
\n",
"
5 rows × 28 columns
\n",
"
"
],
"text/plain": [
" customer_id birthdate street_id is_partner gender is_email_true \\\n",
"0 12751 NaN 2 False 1 True \n",
"1 12825 NaN 2 False 2 True \n",
"2 11261 NaN 2 False 1 True \n",
"3 13071 NaN 2 False 2 True \n",
"4 653061 NaN 10 False 2 True \n",
"\n",
" opt_in structure_id profession language ... average_ticket_basket \\\n",
"0 True NaN NaN NaN ... NaN \n",
"1 True NaN NaN NaN ... NaN \n",
"2 True NaN NaN NaN ... NaN \n",
"3 True NaN NaN NaN ... NaN \n",
"4 False NaN NaN NaN ... NaN \n",
"\n",
" total_price purchase_count first_buying_date country age tenant_id \\\n",
"0 NaN 0 NaT fr NaN 1311 \n",
"1 NaN 0 NaT fr NaN 1311 \n",
"2 NaN 0 NaT fr NaN 1311 \n",
"3 NaN 0 NaT fr NaN 1311 \n",
"4 NaN 0 NaT NaN NaN 1311 \n",
"\n",
" nb_campaigns nb_campaigns_opened time_to_open \n",
"0 NaN NaN NaT \n",
"1 NaN NaN NaT \n",
"2 NaN NaN NaT \n",
"3 NaN NaN NaT \n",
"4 80.0 2.0 0 days 19:53:02.500000 \n",
"\n",
"[5 rows x 28 columns]"
]
},
"execution_count": 40,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df1_customer.head()"
]
},
{
"cell_type": "code",
"execution_count": 49,
"id": "b5c4418c-ad2e-4bb9-bd5c-3b769e9c87d4",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" customer_id | \n",
" birthdate | \n",
" street_id | \n",
" is_partner | \n",
" gender | \n",
" is_email_true | \n",
" opt_in | \n",
" structure_id | \n",
" profession | \n",
" language | \n",
" mcp_contact_id | \n",
" last_buying_date | \n",
" max_price | \n",
" ticket_sum | \n",
" average_price | \n",
" fidelity | \n",
" average_purchase_delay | \n",
" average_price_basket | \n",
" average_ticket_basket | \n",
" total_price | \n",
" purchase_count | \n",
" first_buying_date | \n",
" country | \n",
" age | \n",
" tenant_id | \n",
" nb_campaigns | \n",
" nb_campaigns_opened | \n",
" time_to_open | \n",
"
\n",
" \n",
" \n",
" \n",
" 58201 | \n",
" 1 | \n",
" NaN | \n",
" 2 | \n",
" False | \n",
" 2 | \n",
" True | \n",
" False | \n",
" NaN | \n",
" NaN | \n",
" NaN | \n",
" NaN | \n",
" 2023-11-08 03:20:07 | \n",
" 45.0 | \n",
" 1254775 | \n",
" 7.030122 | \n",
" 330831 | \n",
" -67.790969 | \n",
" 13.75153 | \n",
" 1.956087 | \n",
" 8821221.5 | \n",
" 641472 | \n",
" 2013-06-10 10:37:58+00:00 | \n",
" fr | \n",
" NaN | \n",
" 1311 | \n",
" NaN | \n",
" NaN | \n",
" NaT | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" customer_id birthdate street_id is_partner gender is_email_true \\\n",
"58201 1 NaN 2 False 2 True \n",
"\n",
" opt_in structure_id profession language mcp_contact_id \\\n",
"58201 False NaN NaN NaN NaN \n",
"\n",
" last_buying_date max_price ticket_sum average_price fidelity \\\n",
"58201 2023-11-08 03:20:07 45.0 1254775 7.030122 330831 \n",
"\n",
" average_purchase_delay average_price_basket average_ticket_basket \\\n",
"58201 -67.790969 13.75153 1.956087 \n",
"\n",
" total_price purchase_count first_buying_date country age \\\n",
"58201 8821221.5 641472 2013-06-10 10:37:58+00:00 fr NaN \n",
"\n",
" tenant_id nb_campaigns nb_campaigns_opened time_to_open \n",
"58201 1311 NaN NaN NaT "
]
},
"execution_count": 49,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"pd.set_option('display.max_columns', None)\n",
"\n",
"\n",
"df1_customer[df1_customer['customer_id'] == 1]"
]
},
{
"cell_type": "code",
"execution_count": 41,
"id": "1e42a790-b215-4107-a969-85005da06ebd",
"metadata": {},
"outputs": [],
"source": [
"# Fusion avec KPI liés au comportement d'achat\n",
"# df1_customer_product = pd.merge(df1_products_purchased_reduced, df1_products_purchased, on = 'customer_id', how = 'outer')"
]
},
{
"cell_type": "code",
"execution_count": 42,
"id": "d950f24d-a5d1-4f1e-aeaa-ca826470365f",
"metadata": {},
"outputs": [],
"source": [
"# df1_customer_product"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.10.13"
}
},
"nbformat": 4,
"nbformat_minor": 5
}