BDC-team-1/0_Cleaning_and_merge.ipynb

3277 lines
117 KiB
Plaintext
Raw Permalink Normal View History

{
"cells": [
{
"cell_type": "markdown",
"id": "ad414c84-be46-4d2c-be8b-9fc4d24cc672",
"metadata": {},
"source": [
"# Business Data Challenge - Team 1"
]
},
{
"cell_type": "code",
2024-02-12 11:08:24 +01:00
"execution_count": 208,
"id": "15103481-8d74-404c-aa09-7601fe7730da",
"metadata": {},
"outputs": [],
"source": [
"import pandas as pd\n",
"import numpy as np\n",
"import os\n",
"import s3fs\n",
2024-02-10 23:19:35 +01:00
"import re\n",
"import warnings"
]
},
{
"cell_type": "markdown",
"id": "ee97665c-39af-4c1c-a62b-c9c79feae18f",
"metadata": {},
"source": [
"Configuration de l'accès aux données"
]
},
{
"cell_type": "code",
2024-02-12 11:08:24 +01:00
"execution_count": 209,
"id": "5d83bb1a-d341-446e-91f6-1c428607f6d4",
"metadata": {},
"outputs": [],
"source": [
"# Create filesystem object\n",
2024-02-10 23:19:35 +01:00
"S3_ENDPOINT_URL = \"https://\" + os.environ[\"AWS_S3_ENDPOINT\"]\n",
"fs = s3fs.S3FileSystem(client_kwargs={'endpoint_url': S3_ENDPOINT_URL})"
]
},
{
"cell_type": "code",
2024-02-12 11:08:24 +01:00
"execution_count": 210,
2024-02-10 23:19:35 +01:00
"id": "a9b84234-d5df-4c43-a9cd-80cfe2f1e34d",
"metadata": {},
"outputs": [],
"source": [
"# Ignore warning\n",
"warnings.filterwarnings('ignore')"
]
},
{
"cell_type": "markdown",
"id": "9cbd72c5-6f8e-4366-ab66-96c32c6e963a",
"metadata": {},
"source": [
"# Exemple sur Company 1"
]
},
{
"cell_type": "markdown",
"id": "db26e59a-927c-407e-b54b-1815473b0b34",
"metadata": {},
"source": [
"## Chargement données"
]
},
{
"cell_type": "code",
2024-02-12 11:08:24 +01:00
"execution_count": 211,
"id": "699664b9-eee4-4f8d-a207-e524526560c5",
"metadata": {},
"outputs": [],
"source": [
"BUCKET = \"bdc2324-data/1\"\n",
"liste_database = fs.ls(BUCKET)"
]
},
{
"cell_type": "code",
2024-02-12 11:08:24 +01:00
"execution_count": 212,
"id": "dd6a3518-b752-4a1e-b77b-9e03e853c3ed",
"metadata": {},
2024-02-10 23:19:35 +01:00
"outputs": [],
"source": [
"# loop to create dataframes from liste\n",
2024-02-10 23:19:35 +01:00
"\n",
"files_path = liste_database\n",
"\n",
"client_number = files_path[0].split(\"/\")[1]\n",
"df_prefix = \"df\" + str(client_number) + \"_\"\n",
"\n",
"for i in range(len(files_path)) :\n",
" current_path = files_path[i]\n",
" with fs.open(current_path, mode=\"rb\") as file_in:\n",
" df = pd.read_csv(file_in)\n",
" # the pattern of the name is df1xxx\n",
" nom_dataframe = df_prefix + re.search(r'\\/(\\d+)\\/(\\d+)([a-zA-Z_]+)\\.csv$', current_path).group(3)\n",
" globals()[nom_dataframe] = df"
]
},
{
"cell_type": "markdown",
"id": "4004c8bf-11d9-413d-bb42-2cb8ddde7716",
"metadata": {},
"source": [
"## Cleaning functions"
]
},
{
"cell_type": "code",
2024-02-12 11:08:24 +01:00
"execution_count": 213,
"id": "d237be96-8c86-4a91-b7a1-487e87a16c3d",
"metadata": {},
"outputs": [],
"source": [
"def cleaning_date(df, column_name):\n",
" \"\"\"\n",
" Nettoie la colonne spécifiée du DataFrame en convertissant les valeurs en datetime avec le format ISO8601.\n",
"\n",
" Parameters:\n",
" - df: DataFrame\n",
" Le DataFrame contenant la colonne à nettoyer.\n",
" - column_name: str\n",
" Le nom de la colonne à nettoyer.\n",
"\n",
" Returns:\n",
" - DataFrame\n",
" Le DataFrame modifié avec la colonne nettoyée.\n",
" \"\"\"\n",
" df[column_name] = pd.to_datetime(df[column_name], utc = True, format = 'ISO8601')\n",
" return df"
]
},
{
"cell_type": "markdown",
"id": "398804d8-2225-4fd3-bceb-75ab1588e359",
"metadata": {},
"source": [
"## Preprocessing"
]
},
{
"cell_type": "markdown",
"id": "568cb180-0dd9-4b27-aecb-05e4c3775ba6",
"metadata": {},
"source": [
"## customer_plus"
]
},
{
"cell_type": "code",
2024-02-12 11:08:24 +01:00
"execution_count": 214,
"id": "7e7b90ce-da54-4f00-bc34-64c543b0858f",
"metadata": {},
"outputs": [],
"source": [
"def preprocessing_customerplus(customerplus = None):\n",
"\n",
" customerplus_copy = customerplus.copy()\n",
" \n",
" # Passage en format date\n",
" cleaning_date(customerplus_copy, 'first_buying_date')\n",
" cleaning_date(customerplus_copy, 'last_visiting_date')\n",
" \n",
" # Selection des variables\n",
" customerplus_copy.drop(['lastname', 'firstname', 'email', 'civility', 'note', 'created_at', 'updated_at', 'deleted_at', 'extra', 'reference', 'extra_field', 'identifier', 'need_reload', 'preferred_category', 'preferred_supplier', 'preferred_formula', 'zipcode', 'last_visiting_date'], axis = 1, inplace=True)\n",
" customerplus_copy.rename(columns = {'id' : 'customer_id'}, inplace = True)\n",
"\n",
" return customerplus_copy\n"
]
},
{
"cell_type": "code",
2024-02-12 11:08:24 +01:00
"execution_count": 215,
"id": "03329e32-00a5-42c8-9470-75f7b6216ccd",
"metadata": {},
"outputs": [],
"source": [
"df1_customerplus_clean = preprocessing_customerplus(df1_customersplus)"
]
},
{
"cell_type": "markdown",
"id": "bade04b1-0cdf-4d10-bcca-7dc7e4831656",
"metadata": {},
"source": [
"## Ticket area"
]
},
{
"cell_type": "code",
2024-02-12 11:08:24 +01:00
"execution_count": 216,
"id": "b95464b1-26bc-4aac-84b4-45da83b92251",
"metadata": {},
"outputs": [],
"source": [
"# Fonction de nettoyage et selection\n",
"def preprocessing_tickets_area(tickets = None, purchases = None, suppliers = None, type_ofs = None):\n",
" # Base des tickets\n",
" tickets = tickets[['id', 'purchase_id', 'product_id', 'is_from_subscription', 'type_of', 'supplier_id']]\n",
" tickets.rename(columns = {'id' : 'ticket_id'}, inplace = True)\n",
"\n",
" # Base des fournisseurs\n",
" suppliers = suppliers[['id', 'name']]\n",
" suppliers.rename(columns = {'name' : 'supplier_name'}, inplace = True)\n",
" suppliers['supplier_name'] = suppliers['supplier_name'].fillna('')\n",
"\n",
" # Base des types de billets\n",
" type_ofs = type_ofs[['id', 'name', 'children']]\n",
" type_ofs.rename(columns = {'name' : 'type_of_ticket_name'}, inplace = True)\n",
"\n",
" # Base des achats\n",
" # Nettoyage de la date d'achat\n",
" cleaning_date(purchases, 'purchase_date')\n",
" # Selection des variables\n",
" purchases = purchases[['id', 'purchase_date', 'customer_id']]\n",
"\n",
" # Fusions \n",
" # Fusion avec fournisseurs\n",
" ticket_information = pd.merge(tickets, suppliers, left_on = 'supplier_id', right_on = 'id', how = 'inner')\n",
" ticket_information.drop(['supplier_id', 'id'], axis = 1, inplace=True)\n",
" \n",
" # Fusion avec type de tickets\n",
" ticket_information = pd.merge(ticket_information, type_ofs, left_on = 'type_of', right_on = 'id', how = 'inner')\n",
" ticket_information.drop(['type_of', 'id'], axis = 1, inplace=True)\n",
" \n",
" # Fusion avec achats\n",
" ticket_information = pd.merge(ticket_information, purchases, left_on = 'purchase_id', right_on = 'id', how = 'inner')\n",
2024-02-11 17:59:45 +01:00
" ticket_information.drop(['id'], axis = 1, inplace=True)\n",
"\n",
" return ticket_information"
]
},
{
"cell_type": "code",
2024-02-12 11:08:24 +01:00
"execution_count": 217,
"id": "3e1d2ba7-ff4f-48eb-93a8-2bb648c70396",
"metadata": {},
2024-02-10 23:19:35 +01:00
"outputs": [],
"source": [
"df1_ticket_information = preprocessing_tickets_area(tickets = df1_tickets, purchases = df1_purchases, suppliers = df1_suppliers, type_ofs = df1_type_ofs)"
]
},
{
"cell_type": "code",
2024-02-12 11:08:24 +01:00
"execution_count": 218,
"id": "4b18edfc-6450-4c6a-9e7b-ee5a5808c8c9",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>ticket_id</th>\n",
2024-02-11 17:59:45 +01:00
" <th>purchase_id</th>\n",
" <th>product_id</th>\n",
" <th>is_from_subscription</th>\n",
" <th>supplier_name</th>\n",
" <th>type_of_ticket_name</th>\n",
" <th>children</th>\n",
" <th>purchase_date</th>\n",
" <th>customer_id</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>13070859</td>\n",
2024-02-11 17:59:45 +01:00
" <td>5107462</td>\n",
" <td>225251</td>\n",
" <td>False</td>\n",
" <td>vente en ligne</td>\n",
" <td>Atelier</td>\n",
" <td>pricing_formula</td>\n",
" <td>2018-12-28 14:47:50+00:00</td>\n",
" <td>48187</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>13070860</td>\n",
2024-02-11 17:59:45 +01:00
" <td>5107462</td>\n",
" <td>224914</td>\n",
" <td>False</td>\n",
" <td>vente en ligne</td>\n",
" <td>Atelier</td>\n",
" <td>pricing_formula</td>\n",
" <td>2018-12-28 14:47:50+00:00</td>\n",
" <td>48187</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>13070861</td>\n",
2024-02-11 17:59:45 +01:00
" <td>5107462</td>\n",
" <td>224914</td>\n",
" <td>False</td>\n",
" <td>vente en ligne</td>\n",
" <td>Atelier</td>\n",
" <td>pricing_formula</td>\n",
" <td>2018-12-28 14:47:50+00:00</td>\n",
" <td>48187</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>13070862</td>\n",
2024-02-11 17:59:45 +01:00
" <td>5107462</td>\n",
" <td>224914</td>\n",
" <td>False</td>\n",
" <td>vente en ligne</td>\n",
" <td>Atelier</td>\n",
" <td>pricing_formula</td>\n",
" <td>2018-12-28 14:47:50+00:00</td>\n",
" <td>48187</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>13070863</td>\n",
2024-02-11 17:59:45 +01:00
" <td>5107462</td>\n",
" <td>224914</td>\n",
" <td>False</td>\n",
" <td>vente en ligne</td>\n",
" <td>Atelier</td>\n",
" <td>pricing_formula</td>\n",
" <td>2018-12-28 14:47:50+00:00</td>\n",
" <td>48187</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
2024-02-11 17:59:45 +01:00
" ticket_id purchase_id product_id is_from_subscription supplier_name \\\n",
"0 13070859 5107462 225251 False vente en ligne \n",
"1 13070860 5107462 224914 False vente en ligne \n",
"2 13070861 5107462 224914 False vente en ligne \n",
"3 13070862 5107462 224914 False vente en ligne \n",
"4 13070863 5107462 224914 False vente en ligne \n",
"\n",
2024-02-05 22:10:07 +01:00
" type_of_ticket_name children purchase_date customer_id \n",
"0 Atelier pricing_formula 2018-12-28 14:47:50+00:00 48187 \n",
"1 Atelier pricing_formula 2018-12-28 14:47:50+00:00 48187 \n",
"2 Atelier pricing_formula 2018-12-28 14:47:50+00:00 48187 \n",
"3 Atelier pricing_formula 2018-12-28 14:47:50+00:00 48187 \n",
"4 Atelier pricing_formula 2018-12-28 14:47:50+00:00 48187 "
]
},
2024-02-12 11:08:24 +01:00
"execution_count": 218,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
2024-02-05 22:10:07 +01:00
"df1_ticket_information.head()"
]
},
{
"cell_type": "markdown",
"id": "096e47f4-1d65-4575-989d-83227eedad2b",
"metadata": {},
"source": [
"## Target area"
]
},
{
"cell_type": "code",
2024-02-12 11:08:24 +01:00
"execution_count": 219,
"id": "baed146a-9d3a-4397-a812-3d50c9a2f038",
"metadata": {},
"outputs": [],
"source": [
"def preprocessing_target_area(targets = None, target_types = None, customer_target_mappings = None):\n",
" # Target.csv cleaning\n",
" targets = targets[[\"id\", \"target_type_id\", \"name\"]]\n",
" targets.rename(columns = {'id' : 'target_id' , 'name' : 'target_name'}, inplace = True)\n",
" \n",
" # target_type cleaning\n",
" target_types = target_types[[\"id\",\"is_import\",\"name\"]].add_prefix(\"target_type_\")\n",
" \n",
" #customer_target_mappings cleaning\n",
" customer_target_mappings = customer_target_mappings[[\"id\", \"customer_id\", \"target_id\"]]\n",
" \n",
" # Merge target et target_type\n",
" targets_full = pd.merge(targets, target_types, left_on='target_type_id', right_on='target_type_id', how='inner')\n",
" targets_full.drop(['target_type_id'], axis = 1, inplace=True)\n",
" \n",
" # Merge\n",
" targets_full = pd.merge(customer_target_mappings, targets_full, left_on='target_id', right_on='target_id', how='inner')\n",
" targets_full.drop(['target_id'], axis = 1, inplace=True)\n",
"\n",
" return targets_full"
]
},
{
"cell_type": "code",
2024-02-12 11:08:24 +01:00
"execution_count": 220,
"id": "5fbfd88b-b94c-489c-9201-670e96e453e7",
"metadata": {},
2024-02-10 23:19:35 +01:00
"outputs": [],
"source": [
"df1_target_information = preprocessing_target_area(targets = df1_targets, target_types = df1_target_types, customer_target_mappings = df1_customer_target_mappings)"
]
},
{
"cell_type": "markdown",
"id": "cdbb48b4-5e16-4ef4-8791-ed213d68d52f",
"metadata": {},
"source": [
"## Campaings area"
]
},
{
"cell_type": "code",
2024-02-12 11:08:24 +01:00
"execution_count": 221,
"id": "d883cc7b-ac43-4485-b86f-eaf595fbad85",
"metadata": {},
"outputs": [],
"source": [
"def preprocessing_campaigns_area(campaign_stats = None, campaigns = None):\n",
" # campaign_stats cleaning \n",
" campaign_stats = campaign_stats[[\"id\", \"campaign_id\", \"customer_id\", \"opened_at\", \"sent_at\", \"delivered_at\"]]\n",
" cleaning_date(campaign_stats, 'opened_at')\n",
" cleaning_date(campaign_stats, 'sent_at')\n",
" cleaning_date(campaign_stats, 'delivered_at')\n",
" \n",
" # campaigns cleaning\n",
" campaigns = campaigns[[\"id\", \"name\", \"service_id\", \"sent_at\"]].add_prefix(\"campaign_\")\n",
" cleaning_date(campaigns, 'campaign_sent_at')\n",
" \n",
" # Merge \n",
" campaigns_full = pd.merge(campaign_stats, campaigns, on = \"campaign_id\", how = \"left\")\n",
" campaigns_full.drop(['campaign_id'], axis = 1, inplace=True)\n",
"\n",
" return campaigns_full"
]
},
{
"cell_type": "code",
2024-02-12 11:08:24 +01:00
"execution_count": 222,
"id": "c8552dd6-52c5-4431-b43d-3cd6c578fd9f",
"metadata": {},
2024-02-10 23:19:35 +01:00
"outputs": [],
2024-02-05 12:51:35 +01:00
"source": [
"df1_campaigns_information = preprocessing_campaigns_area(campaign_stats = df1_campaign_stats, campaigns = df1_campaigns)"
]
},
{
"cell_type": "code",
2024-02-12 11:08:24 +01:00
"execution_count": 223,
2024-02-05 12:51:35 +01:00
"id": "c24457e7-3cad-451a-a65b-7373b656bd6e",
"metadata": {
"scrolled": true
},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>id</th>\n",
" <th>customer_id</th>\n",
" <th>opened_at</th>\n",
" <th>sent_at</th>\n",
" <th>delivered_at</th>\n",
" <th>campaign_name</th>\n",
" <th>campaign_service_id</th>\n",
" <th>campaign_sent_at</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>19793</td>\n",
" <td>112597</td>\n",
" <td>NaT</td>\n",
" <td>2021-03-28 16:01:09+00:00</td>\n",
" <td>2021-03-28 16:24:18+00:00</td>\n",
" <td>Le Mucem chez vous, gardons le lien #22</td>\n",
" <td>404</td>\n",
" <td>2021-03-27 23:00:00+00:00</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>14211</td>\n",
" <td>113666</td>\n",
" <td>NaT</td>\n",
" <td>2021-03-28 16:01:09+00:00</td>\n",
" <td>2021-03-28 16:21:02+00:00</td>\n",
" <td>Le Mucem chez vous, gardons le lien #22</td>\n",
" <td>404</td>\n",
" <td>2021-03-27 23:00:00+00:00</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>13150</td>\n",
" <td>280561</td>\n",
" <td>NaT</td>\n",
" <td>2021-03-28 16:00:59+00:00</td>\n",
" <td>2021-03-28 16:08:45+00:00</td>\n",
" <td>Le Mucem chez vous, gardons le lien #22</td>\n",
" <td>404</td>\n",
" <td>2021-03-27 23:00:00+00:00</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>7073</td>\n",
" <td>101007</td>\n",
" <td>2021-03-28 18:11:06+00:00</td>\n",
" <td>2021-03-28 16:00:59+00:00</td>\n",
" <td>2021-03-28 16:09:47+00:00</td>\n",
" <td>Le Mucem chez vous, gardons le lien #22</td>\n",
" <td>404</td>\n",
" <td>2021-03-27 23:00:00+00:00</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>5175</td>\n",
" <td>103972</td>\n",
" <td>NaT</td>\n",
" <td>2021-03-28 16:01:06+00:00</td>\n",
" <td>2021-03-28 16:05:03+00:00</td>\n",
" <td>Le Mucem chez vous, gardons le lien #22</td>\n",
" <td>404</td>\n",
" <td>2021-03-27 23:00:00+00:00</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
2024-02-05 22:10:07 +01:00
" id customer_id opened_at sent_at \\\n",
"0 19793 112597 NaT 2021-03-28 16:01:09+00:00 \n",
"1 14211 113666 NaT 2021-03-28 16:01:09+00:00 \n",
"2 13150 280561 NaT 2021-03-28 16:00:59+00:00 \n",
"3 7073 101007 2021-03-28 18:11:06+00:00 2021-03-28 16:00:59+00:00 \n",
"4 5175 103972 NaT 2021-03-28 16:01:06+00:00 \n",
"\n",
2024-02-05 22:10:07 +01:00
" delivered_at campaign_name \\\n",
"0 2021-03-28 16:24:18+00:00 Le Mucem chez vous, gardons le lien #22 \n",
"1 2021-03-28 16:21:02+00:00 Le Mucem chez vous, gardons le lien #22 \n",
"2 2021-03-28 16:08:45+00:00 Le Mucem chez vous, gardons le lien #22 \n",
"3 2021-03-28 16:09:47+00:00 Le Mucem chez vous, gardons le lien #22 \n",
"4 2021-03-28 16:05:03+00:00 Le Mucem chez vous, gardons le lien #22 \n",
"\n",
2024-02-05 22:10:07 +01:00
" campaign_service_id campaign_sent_at \n",
"0 404 2021-03-27 23:00:00+00:00 \n",
"1 404 2021-03-27 23:00:00+00:00 \n",
"2 404 2021-03-27 23:00:00+00:00 \n",
"3 404 2021-03-27 23:00:00+00:00 \n",
"4 404 2021-03-27 23:00:00+00:00 "
]
},
2024-02-12 11:08:24 +01:00
"execution_count": 223,
"metadata": {},
"output_type": "execute_result"
}
],
2024-02-05 12:51:35 +01:00
"source": [
2024-02-05 22:10:07 +01:00
"df1_campaigns_information.head()"
2024-02-05 12:51:35 +01:00
]
},
{
"cell_type": "markdown",
"id": "56520a97-ede8-4920-a211-3b5b136af33d",
"metadata": {},
"source": [
2024-02-10 22:24:37 +01:00
"## Product area"
2024-02-05 12:51:35 +01:00
]
},
{
"cell_type": "markdown",
"id": "9782e9d3-ba20-46bf-8562-bd0969972ddc",
"metadata": {},
"source": [
"Some useful functions"
]
},
{
"cell_type": "code",
2024-02-12 11:08:24 +01:00
"execution_count": 224,
2024-02-05 12:51:35 +01:00
"id": "30488a40-1b38-4b9a-9d3b-26a0597c5e6d",
"metadata": {},
"outputs": [],
"source": [
"BUCKET = \"bdc2324-data\"\n",
"directory_path = '1'"
]
},
{
"cell_type": "code",
2024-02-12 11:08:24 +01:00
"execution_count": 225,
2024-02-05 12:51:35 +01:00
"id": "607eb4b4-eed9-4b50-b823-f75c116dd37c",
"metadata": {},
"outputs": [],
"source": [
"def display_databases(file_name):\n",
" \"\"\"\n",
" This function returns the file from s3 storage\n",
" \"\"\"\n",
" file_path = BUCKET + \"/\" + directory_path + \"/\" + file_name\n",
" print(\"File path : \", file_path)\n",
" with fs.open(file_path, mode=\"rb\") as file_in:\n",
" df = pd.read_csv(file_in, sep=\",\")\n",
" \n",
" print(\"Shape : \", df.shape)\n",
" return df\n",
"\n",
"\n",
"def remove_horodates(df):\n",
" \"\"\"\n",
" this function remove horodate columns like created_at and updated_at\n",
" \"\"\"\n",
" df = df.drop(columns = [\"created_at\", \"updated_at\"])\n",
" return df\n",
"\n",
"\n",
"def order_columns_id(df):\n",
" \"\"\"\n",
" this function puts all id columns at the beginning in order to read the dataset easier\n",
" \"\"\"\n",
" substring = 'id'\n",
" id_columns = [col for col in df.columns if substring in col]\n",
" remaining_col = [col for col in df.columns if substring not in col]\n",
" new_order = id_columns + remaining_col\n",
" return df[new_order]\n",
"\n",
"\n",
"def process_df_2(df):\n",
" \"\"\"\n",
" This function organizes dataframe\n",
" \"\"\"\n",
" df = remove_horodates(df)\n",
" print(\"Number of columns : \", len(df.columns))\n",
" df = order_columns_id(df)\n",
" print(\"Columns : \", df.columns)\n",
" return df\n",
"\n",
"def load_dataset(name):\n",
" \"\"\"\n",
" This function loads csv file\n",
" \"\"\"\n",
" df = display_databases(name)\n",
" df = process_df_2(df)\n",
" # drop na :\n",
" #df = df.dropna(axis=1, thresh=len(df))\n",
" # if identifier in table : delete it\n",
" if 'identifier' in df.columns:\n",
" df = df.drop(columns = 'identifier')\n",
" return df"
]
},
{
"cell_type": "markdown",
"id": "d23f28c0-bc95-438b-8d14-5b7bb6e267bd",
"metadata": {},
"source": [
"Create theme tables"
]
},
{
"cell_type": "code",
2024-02-12 11:08:24 +01:00
"execution_count": 226,
2024-02-05 12:51:35 +01:00
"id": "350b09b9-451f-4d47-81fe-f34b892db027",
"metadata": {},
"outputs": [],
"source": [
"def create_products_table():\n",
" # first merge products and categories\n",
" print(\"first merge products and categories\")\n",
" products = load_dataset(\"1products.csv\")\n",
" categories = load_dataset(\"1categories.csv\")\n",
" # Drop useless columns\n",
" products = products.drop(columns = ['apply_price', 'extra_field', 'amount_consumption'])\n",
" categories = categories.drop(columns = ['extra_field', 'quota'])\n",
"\n",
" #Merge\n",
" products_theme = products.merge(categories, how = 'left', left_on = 'category_id',\n",
" right_on = 'id', suffixes=('_products', '_categories'))\n",
" products_theme = products_theme.rename(columns = {\"name\" : \"name_categories\"})\n",
" \n",
" # Second merge products_theme and type of categories\n",
" print(\"Second merge products_theme and type of categories\")\n",
" type_of_categories = load_dataset(\"1type_of_categories.csv\")\n",
" type_of_categories = type_of_categories.drop(columns = 'id')\n",
" products_theme = products_theme.merge(type_of_categories, how = 'left', left_on = 'category_id',\n",
" right_on = 'category_id' )\n",
"\n",
" # Index cleaning\n",
" products_theme = products_theme.drop(columns = ['id_categories'])\n",
" products_theme = order_columns_id(products_theme)\n",
" return products_theme\n",
"\n",
"\n",
"def create_events_table():\n",
" # first merge events and seasons : \n",
" print(\"first merge events and seasons : \")\n",
" events = load_dataset(\"1events.csv\")\n",
" seasons = load_dataset(\"1seasons.csv\")\n",
"\n",
" # Drop useless columns\n",
" events = events.drop(columns = ['manual_added', 'is_display'])\n",
" seasons = seasons.drop(columns = ['start_date_time'])\n",
" \n",
" events_theme = events.merge(seasons, how = 'left', left_on = 'season_id', right_on = 'id', suffixes=('_events', '_seasons'))\n",
"\n",
" # Secondly merge events_theme and event_types\n",
" print(\"Secondly merge events_theme and event_types : \")\n",
" event_types = load_dataset(\"1event_types.csv\")\n",
" event_types = event_types.drop(columns = ['fidelity_delay'])\n",
" \n",
" events_theme = events_theme.merge(event_types, how = 'left', left_on = 'event_type_id', right_on = 'id', suffixes=('_events', '_event_type'))\n",
" events_theme = events_theme.rename(columns = {\"name\" : \"name_event_types\"})\n",
" events_theme = events_theme.drop(columns = 'id')\n",
"\n",
" # thirdly merge events_theme and facilities\n",
" print(\"thirdly merge events_theme and facilities : \")\n",
" facilities = load_dataset(\"1facilities.csv\")\n",
" facilities = facilities.drop(columns = ['fixed_capacity'])\n",
" \n",
" events_theme = events_theme.merge(facilities, how = 'left', left_on = 'facility_id', right_on = 'id', suffixes=('_events', '_facility'))\n",
" events_theme = events_theme.rename(columns = {\"name\" : \"name_facilities\", \"id_events\" : \"event_id\"})\n",
" events_theme = events_theme.drop(columns = 'id')\n",
"\n",
" # Index cleaning\n",
" events_theme = events_theme.drop(columns = ['id_seasons'])\n",
" events_theme = order_columns_id(events_theme)\n",
" return events_theme\n",
"\n",
"\n",
"def create_representations_table():\n",
" representations = load_dataset(\"1representations.csv\")\n",
" representations = representations.drop(columns = ['serial', 'open', 'satisfaction', 'is_display', 'expected_filling',\n",
" 'max_filling', 'extra_field', 'start_date_time', 'end_date_time', 'name',\n",
" 'representation_type_id'])\n",
" \n",
" representations_capacity = load_dataset(\"1representation_category_capacities.csv\")\n",
" representations_capacity = representations_capacity.drop(columns = ['expected_filling', 'max_filling'])\n",
"\n",
" representations_theme = representations.merge(representations_capacity, how='left',\n",
" left_on='id', right_on='representation_id',\n",
" suffixes=('_representation', '_representation_cap'))\n",
" # index cleaning\n",
" representations_theme = representations_theme.drop(columns = [\"id_representation\"])\n",
" representations_theme = order_columns_id(representations_theme)\n",
" return representations_theme"
]
},
{
"cell_type": "code",
2024-02-12 11:08:24 +01:00
"execution_count": 227,
2024-02-05 12:51:35 +01:00
"id": "0fccc8ef-e575-4857-a401-94a7274394df",
"metadata": {},
"outputs": [
{
2024-02-05 12:51:35 +01:00
"name": "stdout",
"output_type": "stream",
"text": [
2024-02-05 12:51:35 +01:00
"first merge products and categories\n",
"File path : bdc2324-data/1/1products.csv\n",
"Shape : (94803, 14)\n",
"Number of columns : 12\n",
"Columns : Index(['id', 'representation_id', 'pricing_formula_id', 'category_id',\n",
" 'products_group_id', 'product_pack_id', 'identifier', 'amount',\n",
" 'is_full_price', 'apply_price', 'extra_field', 'amount_consumption'],\n",
" dtype='object')\n",
"File path : bdc2324-data/1/1categories.csv\n",
"Shape : (27, 7)\n",
"Number of columns : 5\n",
"Columns : Index(['id', 'identifier', 'name', 'extra_field', 'quota'], dtype='object')\n",
"Second merge products_theme and type of categories\n",
"File path : bdc2324-data/1/1type_of_categories.csv\n",
"Shape : (5, 6)\n",
"Number of columns : 4\n",
"Columns : Index(['id', 'type_of_id', 'category_id', 'identifier'], dtype='object')\n"
]
2024-02-05 12:51:35 +01:00
},
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>id_products</th>\n",
" <th>representation_id</th>\n",
" <th>pricing_formula_id</th>\n",
" <th>category_id</th>\n",
" <th>products_group_id</th>\n",
" <th>product_pack_id</th>\n",
" <th>type_of_id</th>\n",
" <th>amount</th>\n",
" <th>is_full_price</th>\n",
" <th>name_categories</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>10682</td>\n",
" <td>914</td>\n",
" <td>114</td>\n",
" <td>41</td>\n",
" <td>10655</td>\n",
" <td>1</td>\n",
" <td>NaN</td>\n",
" <td>9.0</td>\n",
" <td>False</td>\n",
" <td>indiv activité tr</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>478</td>\n",
" <td>273</td>\n",
" <td>131</td>\n",
" <td>1</td>\n",
" <td>471</td>\n",
" <td>1</td>\n",
" <td>12.0</td>\n",
" <td>9.5</td>\n",
" <td>False</td>\n",
" <td>indiv entrées tp</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>20873</td>\n",
" <td>275</td>\n",
" <td>137</td>\n",
" <td>1</td>\n",
" <td>20825</td>\n",
" <td>1</td>\n",
" <td>12.0</td>\n",
" <td>11.5</td>\n",
" <td>False</td>\n",
" <td>indiv entrées tp</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>157142</td>\n",
" <td>82519</td>\n",
" <td>9</td>\n",
" <td>5</td>\n",
" <td>156773</td>\n",
" <td>1</td>\n",
" <td>NaN</td>\n",
" <td>8.0</td>\n",
" <td>False</td>\n",
" <td>indiv entrées tr</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>1341</td>\n",
" <td>9</td>\n",
" <td>93</td>\n",
" <td>1</td>\n",
" <td>1175</td>\n",
" <td>1</td>\n",
" <td>12.0</td>\n",
" <td>8.5</td>\n",
" <td>False</td>\n",
" <td>indiv entrées tp</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" id_products representation_id pricing_formula_id category_id \\\n",
"0 10682 914 114 41 \n",
"1 478 273 131 1 \n",
"2 20873 275 137 1 \n",
"3 157142 82519 9 5 \n",
"4 1341 9 93 1 \n",
"\n",
" products_group_id product_pack_id type_of_id amount is_full_price \\\n",
"0 10655 1 NaN 9.0 False \n",
"1 471 1 12.0 9.5 False \n",
"2 20825 1 12.0 11.5 False \n",
"3 156773 1 NaN 8.0 False \n",
"4 1175 1 12.0 8.5 False \n",
"\n",
" name_categories \n",
"0 indiv activité tr \n",
"1 indiv entrées tp \n",
"2 indiv entrées tp \n",
"3 indiv entrées tr \n",
"4 indiv entrées tp "
]
},
2024-02-12 11:08:24 +01:00
"execution_count": 227,
2024-02-05 12:51:35 +01:00
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
2024-02-05 12:51:35 +01:00
"products_theme = create_products_table()\n",
"products_theme.head()"
]
},
{
"cell_type": "code",
2024-02-12 11:08:24 +01:00
"execution_count": 228,
2024-02-05 12:51:35 +01:00
"id": "779d8aaf-6668-4f66-8852-847304407ea3",
"metadata": {},
"outputs": [
2024-02-05 12:51:35 +01:00
{
"name": "stdout",
"output_type": "stream",
"text": [
"first merge events and seasons : \n",
"File path : bdc2324-data/1/1events.csv\n",
"Shape : (1232, 12)\n",
"Number of columns : 10\n",
"Columns : Index(['id', 'season_id', 'facility_id', 'event_type_id', 'event_type_key_id',\n",
" 'facility_key_id', 'identifier', 'name', 'manual_added', 'is_display'],\n",
" dtype='object')\n",
"File path : bdc2324-data/1/1seasons.csv\n",
"Shape : (13, 6)\n",
"Number of columns : 4\n",
"Columns : Index(['id', 'identifier', 'name', 'start_date_time'], dtype='object')\n",
"Secondly merge events_theme and event_types : \n",
"File path : bdc2324-data/1/1event_types.csv\n",
"Shape : (9, 6)\n",
"Number of columns : 4\n",
"Columns : Index(['id', 'fidelity_delay', 'identifier', 'name'], dtype='object')\n",
"thirdly merge events_theme and facilities : \n",
"File path : bdc2324-data/1/1facilities.csv\n",
"Shape : (2, 7)\n",
"Number of columns : 5\n",
"Columns : Index(['id', 'street_id', 'identifier', 'name', 'fixed_capacity'], dtype='object')\n"
]
},
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
2024-02-05 12:51:35 +01:00
" <th>event_id</th>\n",
" <th>season_id</th>\n",
" <th>facility_id</th>\n",
" <th>event_type_id</th>\n",
" <th>event_type_key_id</th>\n",
" <th>facility_key_id</th>\n",
" <th>street_id</th>\n",
" <th>name_events</th>\n",
" <th>name_seasons</th>\n",
" <th>name_event_types</th>\n",
" <th>name_facilities</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
2024-02-05 12:51:35 +01:00
" <td>192</td>\n",
" <td>16</td>\n",
" <td>1</td>\n",
" <td>4</td>\n",
" <td>4</td>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>frontières</td>\n",
" <td>2018</td>\n",
" <td>spectacle vivant</td>\n",
" <td>mucem</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
2024-02-05 12:51:35 +01:00
" <td>30329</td>\n",
" <td>2767</td>\n",
" <td>1</td>\n",
" <td>5</td>\n",
" <td>5</td>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>visite guidée une autre histoire du monde (1h00)</td>\n",
" <td>2023</td>\n",
" <td>offre muséale groupe</td>\n",
" <td>mucem</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
2024-02-05 12:51:35 +01:00
" <td>161</td>\n",
" <td>16</td>\n",
" <td>1</td>\n",
" <td>2</td>\n",
" <td>2</td>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>visite contée les chercheurs d'or indiv</td>\n",
" <td>2018</td>\n",
" <td>offre muséale individuel</td>\n",
" <td>mucem</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
2024-02-05 12:51:35 +01:00
" <td>5957</td>\n",
" <td>582</td>\n",
" <td>1</td>\n",
" <td>4</td>\n",
" <td>4</td>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>we dreamt of utopia and we woke up screaming.</td>\n",
" <td>2021</td>\n",
" <td>spectacle vivant</td>\n",
" <td>mucem</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
2024-02-05 12:51:35 +01:00
" <td>8337</td>\n",
" <td>582</td>\n",
" <td>1</td>\n",
" <td>4</td>\n",
" <td>4</td>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>jeff koons épisodes 4</td>\n",
" <td>2021</td>\n",
" <td>spectacle vivant</td>\n",
" <td>mucem</td>\n",
" </tr>\n",
2024-02-05 12:51:35 +01:00
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" event_id season_id facility_id event_type_id event_type_key_id \\\n",
"0 192 16 1 4 4 \n",
"1 30329 2767 1 5 5 \n",
"2 161 16 1 2 2 \n",
"3 5957 582 1 4 4 \n",
"4 8337 582 1 4 4 \n",
"\n",
" facility_key_id street_id \\\n",
"0 1 1 \n",
"1 1 1 \n",
"2 1 1 \n",
"3 1 1 \n",
"4 1 1 \n",
"\n",
" name_events name_seasons \\\n",
"0 frontières 2018 \n",
"1 visite guidée une autre histoire du monde (1h00) 2023 \n",
"2 visite contée les chercheurs d'or indiv 2018 \n",
"3 we dreamt of utopia and we woke up screaming. 2021 \n",
"4 jeff koons épisodes 4 2021 \n",
"\n",
" name_event_types name_facilities \n",
"0 spectacle vivant mucem \n",
"1 offre muséale groupe mucem \n",
"2 offre muséale individuel mucem \n",
"3 spectacle vivant mucem \n",
"4 spectacle vivant mucem "
]
},
2024-02-12 11:08:24 +01:00
"execution_count": 228,
2024-02-05 12:51:35 +01:00
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"events_theme= create_events_table()\n",
"events_theme.head()"
]
},
{
"cell_type": "code",
2024-02-12 11:08:24 +01:00
"execution_count": 229,
2024-02-05 12:51:35 +01:00
"id": "7714fa32-303b-4ea7-b174-3fd0fcab5af0",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"File path : bdc2324-data/1/1representations.csv\n",
"Shape : (36095, 16)\n",
"Number of columns : 14\n",
"Columns : Index(['id', 'event_id', 'representation_type_id', 'identifier', 'serial',\n",
" 'start_date_time', 'open', 'satisfaction', 'end_date_time', 'name',\n",
" 'is_display', 'expected_filling', 'max_filling', 'extra_field'],\n",
" dtype='object')\n",
"File path : bdc2324-data/1/1representation_category_capacities.csv\n",
"Shape : (65241, 7)\n",
"Number of columns : 5\n",
"Columns : Index(['id', 'representation_id', 'category_id', 'expected_filling',\n",
" 'max_filling'],\n",
" dtype='object')\n"
]
},
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>event_id</th>\n",
" <th>id_representation_cap</th>\n",
" <th>representation_id</th>\n",
" <th>category_id</th>\n",
" </tr>\n",
2024-02-05 12:51:35 +01:00
" </thead>\n",
" <tbody>\n",
" <tr>\n",
2024-02-05 12:51:35 +01:00
" <th>0</th>\n",
" <td>12384</td>\n",
" <td>123058</td>\n",
" <td>84820</td>\n",
" <td>2</td>\n",
" </tr>\n",
" <tr>\n",
2024-02-05 12:51:35 +01:00
" <th>1</th>\n",
" <td>37</td>\n",
" <td>2514</td>\n",
" <td>269</td>\n",
" <td>2</td>\n",
" </tr>\n",
" <tr>\n",
2024-02-05 12:51:35 +01:00
" <th>2</th>\n",
" <td>37</td>\n",
" <td>384</td>\n",
" <td>269</td>\n",
" <td>5</td>\n",
" </tr>\n",
" <tr>\n",
2024-02-05 12:51:35 +01:00
" <th>3</th>\n",
" <td>37</td>\n",
" <td>2515</td>\n",
" <td>269</td>\n",
" <td>10</td>\n",
" </tr>\n",
" <tr>\n",
2024-02-05 12:51:35 +01:00
" <th>4</th>\n",
" <td>37</td>\n",
" <td>383</td>\n",
" <td>269</td>\n",
" <td>1</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
2024-02-05 12:51:35 +01:00
" event_id id_representation_cap representation_id category_id\n",
"0 12384 123058 84820 2\n",
"1 37 2514 269 2\n",
"2 37 384 269 5\n",
"3 37 2515 269 10\n",
"4 37 383 269 1"
]
},
2024-02-12 11:08:24 +01:00
"execution_count": 229,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
2024-02-05 12:51:35 +01:00
"representation_theme = create_representations_table()\n",
"representation_theme.head()"
]
},
{
"cell_type": "markdown",
"id": "8fa191d5-c867-4d4d-bbab-f29d7d91ce6a",
"metadata": {},
"source": [
"Create uniform product database "
]
},
{
"cell_type": "code",
2024-02-12 11:08:24 +01:00
"execution_count": 230,
2024-02-05 12:51:35 +01:00
"id": "15a62ed6-35e4-4abc-aeef-a7daeec0a4ba",
"metadata": {},
"outputs": [],
"source": [
2024-02-05 12:51:35 +01:00
"def uniform_product_df():\n",
" \"\"\"\n",
" This function returns the uniform product dataset\n",
" \"\"\"\n",
" print(\"Products theme columns : \", products_theme.columns)\n",
" print(\"\\n Representation theme columns : \", representation_theme.columns)\n",
" print(\"\\n Events theme columns : \", events_theme.columns)\n",
"\n",
2024-02-05 12:51:35 +01:00
" products_global = products_theme.merge(representation_theme, how='left',\n",
" on= [\"representation_id\", \"category_id\"])\n",
" \n",
" products_global = products_global.merge(events_theme, how='left', on='event_id',\n",
" suffixes = (\"_representation\", \"_event\"))\n",
" \n",
" products_global = order_columns_id(products_global)\n",
"\n",
2024-02-05 12:51:35 +01:00
" # remove useless columns \n",
2024-02-07 23:28:55 +01:00
" products_global = products_global.drop(columns = ['type_of_id']) # 'name_events', 'name_seasons', 'name_categories'\n",
2024-02-05 12:51:35 +01:00
" return products_global"
]
},
{
"cell_type": "code",
2024-02-12 11:08:24 +01:00
"execution_count": 231,
2024-02-05 12:51:35 +01:00
"id": "89dc9685-1de9-4ce3-a6c0-8d7f1931a951",
"metadata": {},
"outputs": [
{
2024-02-05 12:51:35 +01:00
"name": "stdout",
"output_type": "stream",
"text": [
2024-02-05 12:51:35 +01:00
"Products theme columns : Index(['id_products', 'representation_id', 'pricing_formula_id', 'category_id',\n",
" 'products_group_id', 'product_pack_id', 'type_of_id', 'amount',\n",
" 'is_full_price', 'name_categories'],\n",
" dtype='object')\n",
"\n",
2024-02-05 12:51:35 +01:00
" Representation theme columns : Index(['event_id', 'id_representation_cap', 'representation_id',\n",
" 'category_id'],\n",
" dtype='object')\n",
"\n",
2024-02-05 12:51:35 +01:00
" Events theme columns : Index(['event_id', 'season_id', 'facility_id', 'event_type_id',\n",
" 'event_type_key_id', 'facility_key_id', 'street_id', 'name_events',\n",
" 'name_seasons', 'name_event_types', 'name_facilities'],\n",
" dtype='object')\n"
]
2024-02-05 12:51:35 +01:00
},
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
2024-02-05 12:51:35 +01:00
" <th>id_products</th>\n",
" <th>representation_id</th>\n",
" <th>pricing_formula_id</th>\n",
" <th>category_id</th>\n",
" <th>products_group_id</th>\n",
" <th>product_pack_id</th>\n",
" <th>event_id</th>\n",
" <th>id_representation_cap</th>\n",
" <th>season_id</th>\n",
" <th>facility_id</th>\n",
2024-02-07 23:28:55 +01:00
" <th>...</th>\n",
2024-02-05 12:51:35 +01:00
" <th>event_type_key_id</th>\n",
" <th>facility_key_id</th>\n",
" <th>street_id</th>\n",
" <th>amount</th>\n",
" <th>is_full_price</th>\n",
2024-02-07 23:28:55 +01:00
" <th>name_categories</th>\n",
" <th>name_events</th>\n",
" <th>name_seasons</th>\n",
2024-02-05 12:51:35 +01:00
" <th>name_event_types</th>\n",
" <th>name_facilities</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
2024-02-05 12:51:35 +01:00
" <td>10682</td>\n",
" <td>914</td>\n",
" <td>114</td>\n",
" <td>41</td>\n",
" <td>10655</td>\n",
" <td>1</td>\n",
" <td>132</td>\n",
" <td>8789</td>\n",
" <td>4</td>\n",
2024-02-05 12:51:35 +01:00
" <td>1</td>\n",
2024-02-07 23:28:55 +01:00
" <td>...</td>\n",
2024-02-05 12:51:35 +01:00
" <td>5</td>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>9.0</td>\n",
" <td>False</td>\n",
2024-02-07 23:28:55 +01:00
" <td>indiv activité tr</td>\n",
" <td>visite-jeu \"le classico des minots\" (1h30)</td>\n",
" <td>2017</td>\n",
2024-02-05 12:51:35 +01:00
" <td>offre muséale individuel</td>\n",
" <td>mucem</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
2024-02-05 12:51:35 +01:00
" <td>478</td>\n",
" <td>273</td>\n",
" <td>131</td>\n",
" <td>1</td>\n",
" <td>471</td>\n",
" <td>1</td>\n",
" <td>37</td>\n",
" <td>390</td>\n",
" <td>2</td>\n",
" <td>1</td>\n",
2024-02-07 23:28:55 +01:00
" <td>...</td>\n",
2024-02-05 12:51:35 +01:00
" <td>2</td>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>9.5</td>\n",
" <td>False</td>\n",
2024-02-07 23:28:55 +01:00
" <td>indiv entrées tp</td>\n",
" <td>billet mucem picasso</td>\n",
" <td>2016</td>\n",
2024-02-05 12:51:35 +01:00
" <td>offre muséale individuel</td>\n",
" <td>mucem</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
2024-02-05 12:51:35 +01:00
" <td>20873</td>\n",
" <td>275</td>\n",
" <td>137</td>\n",
" <td>1</td>\n",
" <td>20825</td>\n",
" <td>1</td>\n",
" <td>37</td>\n",
" <td>395</td>\n",
" <td>2</td>\n",
" <td>1</td>\n",
2024-02-07 23:28:55 +01:00
" <td>...</td>\n",
2024-02-05 12:51:35 +01:00
" <td>2</td>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>11.5</td>\n",
" <td>False</td>\n",
2024-02-07 23:28:55 +01:00
" <td>indiv entrées tp</td>\n",
" <td>billet mucem picasso</td>\n",
" <td>2016</td>\n",
2024-02-05 12:51:35 +01:00
" <td>offre muséale individuel</td>\n",
" <td>mucem</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
2024-02-05 12:51:35 +01:00
" <td>157142</td>\n",
" <td>82519</td>\n",
" <td>9</td>\n",
" <td>5</td>\n",
2024-02-05 12:51:35 +01:00
" <td>156773</td>\n",
" <td>1</td>\n",
" <td>12365</td>\n",
" <td>120199</td>\n",
" <td>1754</td>\n",
" <td>1</td>\n",
2024-02-07 23:28:55 +01:00
" <td>...</td>\n",
" <td>4</td>\n",
2024-02-05 12:51:35 +01:00
" <td>1</td>\n",
" <td>1</td>\n",
" <td>8.0</td>\n",
" <td>False</td>\n",
2024-02-07 23:28:55 +01:00
" <td>indiv entrées tr</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
2024-02-05 12:51:35 +01:00
" <td>offre muséale individuel</td>\n",
" <td>mucem</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
2024-02-05 12:51:35 +01:00
" <td>1341</td>\n",
" <td>9</td>\n",
" <td>93</td>\n",
" <td>1</td>\n",
2024-02-05 12:51:35 +01:00
" <td>1175</td>\n",
" <td>1</td>\n",
2024-02-05 12:51:35 +01:00
" <td>8</td>\n",
" <td>21</td>\n",
" <td>4</td>\n",
" <td>1</td>\n",
2024-02-07 23:28:55 +01:00
" <td>...</td>\n",
2024-02-05 12:51:35 +01:00
" <td>6</td>\n",
" <td>1</td>\n",
" <td>1</td>\n",
2024-02-05 12:51:35 +01:00
" <td>8.5</td>\n",
" <td>False</td>\n",
2024-02-07 23:28:55 +01:00
" <td>indiv entrées tp</td>\n",
" <td>non défini</td>\n",
" <td>2017</td>\n",
2024-02-05 12:51:35 +01:00
" <td>non défini</td>\n",
" <td>mucem</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
2024-02-07 23:28:55 +01:00
"<p>5 rows × 21 columns</p>\n",
"</div>"
],
"text/plain": [
2024-02-05 12:51:35 +01:00
" id_products representation_id pricing_formula_id category_id \\\n",
"0 10682 914 114 41 \n",
"1 478 273 131 1 \n",
"2 20873 275 137 1 \n",
"3 157142 82519 9 5 \n",
"4 1341 9 93 1 \n",
"\n",
2024-02-05 12:51:35 +01:00
" products_group_id product_pack_id event_id id_representation_cap \\\n",
"0 10655 1 132 8789 \n",
"1 471 1 37 390 \n",
"2 20825 1 37 395 \n",
"3 156773 1 12365 120199 \n",
"4 1175 1 8 21 \n",
"\n",
2024-02-07 23:28:55 +01:00
" season_id facility_id ... event_type_key_id facility_key_id street_id \\\n",
"0 4 1 ... 5 1 1 \n",
"1 2 1 ... 2 1 1 \n",
"2 2 1 ... 2 1 1 \n",
"3 1754 1 ... 4 1 1 \n",
"4 4 1 ... 6 1 1 \n",
"\n",
" amount is_full_price name_categories \\\n",
"0 9.0 False indiv activité tr \n",
"1 9.5 False indiv entrées tp \n",
"2 11.5 False indiv entrées tp \n",
"3 8.0 False indiv entrées tr \n",
"4 8.5 False indiv entrées tp \n",
"\n",
" name_events name_seasons \\\n",
"0 visite-jeu \"le classico des minots\" (1h30) 2017 \n",
"1 billet mucem picasso 2016 \n",
"2 billet mucem picasso 2016 \n",
"3 NaN NaN \n",
"4 non défini 2017 \n",
"\n",
" name_event_types name_facilities \n",
"0 offre muséale individuel mucem \n",
"1 offre muséale individuel mucem \n",
"2 offre muséale individuel mucem \n",
"3 offre muséale individuel mucem \n",
"4 non défini mucem \n",
2024-02-05 12:51:35 +01:00
"\n",
2024-02-07 23:28:55 +01:00
"[5 rows x 21 columns]"
]
},
2024-02-12 11:08:24 +01:00
"execution_count": 231,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
2024-02-05 12:51:35 +01:00
"products_global = uniform_product_df()\n",
"products_global.head()"
]
2024-02-05 12:51:35 +01:00
},
{
"cell_type": "code",
2024-02-12 11:08:24 +01:00
"execution_count": 232,
"id": "98f78cd5-b694-4cc6-b033-20170aa13e8d",
"metadata": {},
"outputs": [],
"source": [
"# Fusion liée au product\n",
2024-02-10 22:24:37 +01:00
"df1_products_purchased = pd.merge(df1_ticket_information, products_global, left_on = 'product_id', right_on = 'id_products', how = 'inner')\n",
"\n",
"# Selection des variables d'intérêts\n",
2024-02-12 11:08:24 +01:00
"df1_products_purchased_reduced = df1_products_purchased[['ticket_id', 'customer_id', 'purchase_id' ,'event_type_id', 'category_id', 'supplier_name', 'purchase_date', 'type_of_ticket_name', 'amount', 'children', 'is_full_price', 'name_event_types', 'name_facilities', 'name_categories', 'name_events', 'name_seasons']]"
]
},
{
"cell_type": "markdown",
"id": "d7c3668a-c016-4bd0-837e-04af328ff14f",
"metadata": {},
"source": [
"# Construction des variables explicatives"
]
},
2024-02-10 19:53:59 +01:00
{
"cell_type": "markdown",
2024-02-10 22:24:37 +01:00
"id": "314f1b7f-ae48-4c6f-8469-9ce879043243",
2024-02-10 19:53:59 +01:00
"metadata": {},
"source": [
2024-02-10 22:24:37 +01:00
"## KPI campaigns"
2024-02-10 19:53:59 +01:00
]
},
{
"cell_type": "code",
2024-02-12 11:08:24 +01:00
"execution_count": 233,
2024-02-10 22:24:37 +01:00
"id": "e2c88552-b863-47a2-be23-8d2898fb28bc",
"metadata": {},
"outputs": [],
"source": [
"def campaigns_kpi_function(campaigns_information = None):\n",
" # Nombre de campagnes de mails\n",
" nb_campaigns = campaigns_information[['customer_id', 'campaign_name']].groupby('customer_id').count().reset_index()\n",
" nb_campaigns.rename(columns = {'campaign_name' : 'nb_campaigns'}, inplace = True)\n",
" # Temps d'ouverture en min moyen \n",
" campaigns_information['time_to_open'] = campaigns_information['opened_at'] - campaigns_information['delivered_at']\n",
" time_to_open = campaigns_information[['customer_id', 'time_to_open']].groupby('customer_id').mean().reset_index()\n",
"\n",
" # Nombre de mail ouvert \n",
" opened_campaign = campaigns_information[['customer_id', 'campaign_name', 'opened_at']]\n",
" opened_campaign.dropna(subset=['opened_at'], inplace=True)\n",
" opened_campaign = opened_campaign[['customer_id', 'campaign_name']].groupby('customer_id').count().reset_index()\n",
" opened_campaign.rename(columns = {'campaign_name' : 'nb_campaigns_opened' }, inplace = True)\n",
"\n",
" # Fusion des indicateurs\n",
" campaigns_reduced = pd.merge(nb_campaigns, opened_campaign, on = 'customer_id', how = 'left')\n",
" campaigns_reduced = pd.merge(campaigns_reduced, time_to_open, on = 'customer_id', how = 'left')\n",
"\n",
" # Remplir les NaN : nb_campaigns_opened\n",
2024-02-11 17:59:45 +01:00
" campaigns_reduced['nb_campaigns_opened'].fillna(0)\n",
2024-02-10 22:24:37 +01:00
"\n",
" # Remplir les NaT : time_to_open (??)\n",
"\n",
" return campaigns_reduced\n",
" "
]
},
{
"cell_type": "code",
2024-02-12 11:08:24 +01:00
"execution_count": 234,
2024-02-10 22:24:37 +01:00
"id": "24537647-bc29-4777-9848-ac4120a4aa60",
"metadata": {},
2024-02-10 23:19:35 +01:00
"outputs": [],
2024-02-10 22:24:37 +01:00
"source": [
"df1_campaigns_kpi = campaigns_kpi_function(campaigns_information = df1_campaigns_information) "
]
},
{
"cell_type": "code",
2024-02-12 11:08:24 +01:00
"execution_count": 235,
2024-02-10 22:24:37 +01:00
"id": "6be2a9a6-056b-4e19-8c26-a18ba3df36b3",
2024-02-10 19:53:59 +01:00
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
2024-02-10 22:24:37 +01:00
" <th>customer_id</th>\n",
" <th>nb_campaigns</th>\n",
" <th>nb_campaigns_opened</th>\n",
" <th>time_to_open</th>\n",
2024-02-10 19:53:59 +01:00
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>2</td>\n",
2024-02-10 22:24:37 +01:00
" <td>4</td>\n",
2024-02-12 11:08:24 +01:00
" <td>NaN</td>\n",
2024-02-10 22:24:37 +01:00
" <td>NaT</td>\n",
2024-02-10 19:53:59 +01:00
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
2024-02-10 22:24:37 +01:00
" <td>3</td>\n",
" <td>222</td>\n",
" <td>124.0</td>\n",
" <td>1 days 00:28:30.169354838</td>\n",
2024-02-10 19:53:59 +01:00
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
2024-02-10 22:24:37 +01:00
" <td>4</td>\n",
" <td>7</td>\n",
" <td>7.0</td>\n",
" <td>1 days 04:31:01.428571428</td>\n",
2024-02-10 19:53:59 +01:00
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
2024-02-10 22:24:37 +01:00
" <td>5</td>\n",
" <td>4</td>\n",
2024-02-12 11:08:24 +01:00
" <td>NaN</td>\n",
2024-02-10 22:24:37 +01:00
" <td>NaT</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
2024-02-10 19:53:59 +01:00
" <td>6</td>\n",
2024-02-10 22:24:37 +01:00
" <td>20</td>\n",
2024-02-12 11:08:24 +01:00
" <td>NaN</td>\n",
2024-02-10 22:24:37 +01:00
" <td>NaT</td>\n",
2024-02-10 19:53:59 +01:00
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
2024-02-10 22:24:37 +01:00
" customer_id nb_campaigns nb_campaigns_opened time_to_open\n",
2024-02-12 11:08:24 +01:00
"0 2 4 NaN NaT\n",
2024-02-10 22:24:37 +01:00
"1 3 222 124.0 1 days 00:28:30.169354838\n",
"2 4 7 7.0 1 days 04:31:01.428571428\n",
2024-02-12 11:08:24 +01:00
"3 5 4 NaN NaT\n",
"4 6 20 NaN NaT"
2024-02-10 19:53:59 +01:00
]
},
2024-02-12 11:08:24 +01:00
"execution_count": 235,
2024-02-10 19:53:59 +01:00
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
2024-02-10 22:24:37 +01:00
"df1_campaigns_kpi.head()"
2024-02-10 19:53:59 +01:00
]
},
{
2024-02-10 22:24:37 +01:00
"cell_type": "markdown",
"id": "d4dcfbe0-c6ce-497e-b75e-dc9e938801b2",
"metadata": {},
"source": [
"## KPI tickets"
]
},
{
"cell_type": "code",
2024-02-12 11:08:24 +01:00
"execution_count": 236,
2024-02-10 22:24:37 +01:00
"id": "043303fe-e90f-4689-a2a9-5d690555a045",
"metadata": {},
"outputs": [],
"source": [
"def tickets_kpi_function(tickets_information = None):\n",
"\n",
" tickets_information_copy = tickets_information.copy()\n",
"\n",
" # Dummy : Canal de vente en ligne\n",
" liste_mots = ['en ligne', 'internet', 'web', 'net', 'vad', 'online'] # vad = vente à distance\n",
" tickets_information_copy['vente_internet'] = tickets_information_copy['supplier_name'].str.contains('|'.join(liste_mots), case=False).astype(int)\n",
"\n",
" # Proportion de vente en ligne\n",
" prop_vente_internet = tickets_information_copy[tickets_information_copy['vente_internet'] == 1].groupby(['customer_id', 'event_type_id'])['ticket_id'].count().reset_index()\n",
" prop_vente_internet.rename(columns = {'ticket_id' : 'nb_tickets_internet'}, inplace = True)\n",
2024-02-11 23:54:40 +01:00
"\n",
" # Average amount\n",
" avg_amount = (tickets_information_copy.groupby([\"event_type_id\", 'name_event_types'])\n",
" .agg({\"amount\" : \"mean\"}).reset_index()\n",
" .rename(columns = {'amount' : 'avg_amount'}))\n",
"\n",
2024-02-10 22:24:37 +01:00
" \n",
2024-02-11 17:59:45 +01:00
" tickets_kpi = (tickets_information_copy[['event_type_id', 'customer_id', 'purchase_id' ,'ticket_id','supplier_name', 'purchase_date', 'amount', 'vente_internet']]\n",
2024-02-10 22:24:37 +01:00
" .groupby(['customer_id', 'event_type_id']) \n",
" .agg({'ticket_id': 'count', \n",
2024-02-11 17:59:45 +01:00
" 'purchase_id' : 'nunique',\n",
2024-02-10 22:24:37 +01:00
" 'amount' : 'sum',\n",
" 'supplier_name': 'nunique',\n",
" 'vente_internet' : 'max',\n",
" 'purchase_date' : ['min', 'max']})\n",
" .reset_index()\n",
" )\n",
" \n",
" tickets_kpi.columns = tickets_kpi.columns.map('_'.join)\n",
" \n",
" tickets_kpi.rename(columns = {'ticket_id_count' : 'nb_tickets', \n",
2024-02-11 17:59:45 +01:00
" 'purchase_id_nunique' : 'nb_purchases',\n",
2024-02-10 22:24:37 +01:00
" 'amount_sum' : 'total_amount',\n",
" 'supplier_name_nunique' : 'nb_suppliers', \n",
" 'customer_id_' : 'customer_id',\n",
" 'event_type_id_' : 'event_type_id'}, inplace = True)\n",
" \n",
" tickets_kpi['time_between_purchase'] = tickets_kpi['purchase_date_max'] - tickets_kpi['purchase_date_min']\n",
2024-02-11 17:59:45 +01:00
" tickets_kpi['time_between_purchase'] = tickets_kpi['time_between_purchase'] / np.timedelta64(1, 'D') # En nombre de jours\n",
2024-02-10 22:24:37 +01:00
"\n",
2024-02-11 17:59:45 +01:00
" # Convertir date et en chiffre\n",
" max_date = tickets_kpi['purchase_date_max'].max()\n",
" tickets_kpi['purchase_date_max'] = (max_date - tickets_kpi['purchase_date_max']) / np.timedelta64(1, 'D')\n",
" tickets_kpi['purchase_date_min'] = (max_date - tickets_kpi['purchase_date_min']) / np.timedelta64(1, 'D')\n",
"\n",
" \n",
2024-02-10 22:24:37 +01:00
" tickets_kpi = tickets_kpi.merge(prop_vente_internet, on = ['customer_id', 'event_type_id'], how = 'left')\n",
" tickets_kpi['nb_tickets_internet'] = tickets_kpi['nb_tickets_internet'].fillna(0)\n",
2024-02-11 17:59:45 +01:00
"\n",
2024-02-11 23:54:40 +01:00
" tickets_kpi = tickets_kpi.merge(avg_amount, how='left', on= 'event_type_id')\n",
"\n",
2024-02-10 22:24:37 +01:00
" return tickets_kpi\n",
" "
]
},
{
"cell_type": "code",
2024-02-12 11:08:24 +01:00
"execution_count": 237,
2024-02-10 22:24:37 +01:00
"id": "5882234a-1ed5-4269-87a6-0d75613476e3",
"metadata": {},
"outputs": [],
"source": [
"df1_tickets_kpi = tickets_kpi_function(tickets_information = df1_products_purchased_reduced)"
]
},
{
"cell_type": "code",
2024-02-12 11:08:24 +01:00
"execution_count": 238,
2024-02-11 17:59:45 +01:00
"id": "5f2046cf-ffde-4521-91e7-b727b8bc17f5",
2024-02-10 19:53:59 +01:00
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>customer_id</th>\n",
" <th>event_type_id</th>\n",
" <th>nb_tickets</th>\n",
2024-02-11 17:59:45 +01:00
" <th>nb_purchases</th>\n",
2024-02-10 22:24:37 +01:00
" <th>total_amount</th>\n",
" <th>nb_suppliers</th>\n",
" <th>vente_internet_max</th>\n",
" <th>purchase_date_min</th>\n",
" <th>purchase_date_max</th>\n",
" <th>time_between_purchase</th>\n",
" <th>nb_tickets_internet</th>\n",
2024-02-12 11:08:24 +01:00
" <th>name_event_types</th>\n",
" <th>avg_amount</th>\n",
2024-02-10 19:53:59 +01:00
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>1</td>\n",
" <td>2</td>\n",
" <td>384226</td>\n",
2024-02-11 17:59:45 +01:00
" <td>194790</td>\n",
2024-02-10 22:24:37 +01:00
" <td>2686540.5</td>\n",
" <td>7</td>\n",
" <td>1</td>\n",
2024-02-11 17:59:45 +01:00
" <td>3262.190868</td>\n",
" <td>4.179306</td>\n",
" <td>3258.011562</td>\n",
2024-02-10 22:24:37 +01:00
" <td>51.0</td>\n",
2024-02-12 11:08:24 +01:00
" <td>offre muséale individuel</td>\n",
" <td>6.150659</td>\n",
2024-02-10 19:53:59 +01:00
" </tr>\n",
" <tr>\n",
2024-02-11 17:59:45 +01:00
" <th>1</th>\n",
2024-02-10 19:53:59 +01:00
" <td>1</td>\n",
2024-02-11 17:59:45 +01:00
" <td>4</td>\n",
" <td>453242</td>\n",
" <td>228945</td>\n",
" <td>3248965.5</td>\n",
2024-02-10 22:24:37 +01:00
" <td>6</td>\n",
" <td>1</td>\n",
2024-02-11 17:59:45 +01:00
" <td>3698.198229</td>\n",
" <td>5.221840</td>\n",
" <td>3692.976389</td>\n",
" <td>2988.0</td>\n",
2024-02-12 11:08:24 +01:00
" <td>spectacle vivant</td>\n",
" <td>7.762474</td>\n",
2024-02-10 19:53:59 +01:00
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>1</td>\n",
" <td>5</td>\n",
" <td>201750</td>\n",
2024-02-11 17:59:45 +01:00
" <td>107110</td>\n",
2024-02-10 22:24:37 +01:00
" <td>1459190.0</td>\n",
2024-02-10 19:53:59 +01:00
" <td>6</td>\n",
2024-02-10 22:24:37 +01:00
" <td>1</td>\n",
2024-02-11 17:59:45 +01:00
" <td>3803.369792</td>\n",
" <td>0.146331</td>\n",
" <td>3803.223461</td>\n",
2024-02-10 22:24:37 +01:00
" <td>9.0</td>\n",
2024-02-12 11:08:24 +01:00
" <td>offre muséale groupe</td>\n",
" <td>4.452618</td>\n",
2024-02-10 19:53:59 +01:00
" </tr>\n",
" <tr>\n",
2024-02-11 17:59:45 +01:00
" <th>3</th>\n",
" <td>1</td>\n",
2024-02-10 22:24:37 +01:00
" <td>6</td>\n",
2024-02-11 17:59:45 +01:00
" <td>217356</td>\n",
" <td>111786</td>\n",
" <td>1435871.5</td>\n",
" <td>5</td>\n",
" <td>1</td>\n",
" <td>2502.715509</td>\n",
" <td>1408.715532</td>\n",
" <td>1093.999977</td>\n",
" <td>5.0</td>\n",
2024-02-12 11:08:24 +01:00
" <td>formule adhésion</td>\n",
" <td>6.439463</td>\n",
2024-02-11 17:59:45 +01:00
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>2</td>\n",
" <td>2</td>\n",
" <td>143</td>\n",
" <td>143</td>\n",
2024-02-10 22:24:37 +01:00
" <td>0.0</td>\n",
" <td>1</td>\n",
2024-02-11 17:59:45 +01:00
" <td>0</td>\n",
" <td>2041.274549</td>\n",
" <td>1340.308160</td>\n",
" <td>700.966389</td>\n",
" <td>0.0</td>\n",
2024-02-12 11:08:24 +01:00
" <td>offre muséale individuel</td>\n",
" <td>6.150659</td>\n",
2024-02-10 19:53:59 +01:00
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
2024-02-11 17:59:45 +01:00
" customer_id event_type_id nb_tickets nb_purchases total_amount \\\n",
"0 1 2 384226 194790 2686540.5 \n",
"1 1 4 453242 228945 3248965.5 \n",
"2 1 5 201750 107110 1459190.0 \n",
"3 1 6 217356 111786 1435871.5 \n",
"4 2 2 143 143 0.0 \n",
"\n",
" nb_suppliers vente_internet_max purchase_date_min purchase_date_max \\\n",
"0 7 1 3262.190868 4.179306 \n",
"1 6 1 3698.198229 5.221840 \n",
"2 6 1 3803.369792 0.146331 \n",
"3 5 1 2502.715509 1408.715532 \n",
"4 1 0 2041.274549 1340.308160 \n",
"\n",
2024-02-12 11:08:24 +01:00
" time_between_purchase nb_tickets_internet name_event_types \\\n",
"0 3258.011562 51.0 offre muséale individuel \n",
"1 3692.976389 2988.0 spectacle vivant \n",
"2 3803.223461 9.0 offre muséale groupe \n",
"3 1093.999977 5.0 formule adhésion \n",
"4 700.966389 0.0 offre muséale individuel \n",
"\n",
" avg_amount \n",
"0 6.150659 \n",
"1 7.762474 \n",
"2 4.452618 \n",
"3 6.439463 \n",
"4 6.150659 "
2024-02-10 19:53:59 +01:00
]
},
2024-02-12 11:08:24 +01:00
"execution_count": 238,
2024-02-10 19:53:59 +01:00
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
2024-02-11 17:59:45 +01:00
"df1_tickets_kpi.head()"
]
},
{
"cell_type": "code",
2024-02-12 11:08:24 +01:00
"execution_count": 239,
2024-02-11 17:59:45 +01:00
"id": "a4a2311d-8a72-4030-afd5-218004d5d2a5",
"metadata": {},
"outputs": [],
"source": [
"# Exportation vers 'projet-bdc2324-team1'\n",
"BUCKET_OUT = \"projet-bdc2324-team1\"\n",
"FILE_KEY_OUT_S3 = \"0_Temp/Company 1 - Purchasing behaviour.csv\"\n",
"FILE_PATH_OUT_S3 = BUCKET_OUT + \"/\" + FILE_KEY_OUT_S3\n",
"\n",
"with fs.open(FILE_PATH_OUT_S3, 'w') as file_out:\n",
" df1_tickets_kpi.to_csv(file_out, index = False)"
2024-02-10 22:24:37 +01:00
]
},
{
"cell_type": "markdown",
"id": "f1d7f7ba-361b-467d-b375-b09c149185f7",
"metadata": {},
"source": [
"## Alexis' work"
2024-02-10 19:53:59 +01:00
]
},
2024-02-11 11:47:58 +01:00
{
"cell_type": "code",
2024-02-12 11:08:24 +01:00
"execution_count": 240,
"id": "484becad-0390-48a8-923b-b03a4facc7ae",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>ticket_id</th>\n",
" <th>customer_id</th>\n",
" <th>purchase_id</th>\n",
" <th>event_type_id</th>\n",
" <th>category_id</th>\n",
" <th>supplier_name</th>\n",
" <th>purchase_date</th>\n",
" <th>type_of_ticket_name</th>\n",
" <th>amount</th>\n",
" <th>children</th>\n",
" <th>is_full_price</th>\n",
" <th>name_event_types</th>\n",
" <th>name_facilities</th>\n",
" <th>name_categories</th>\n",
" <th>name_events</th>\n",
" <th>name_seasons</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>13070859</td>\n",
" <td>48187</td>\n",
" <td>5107462</td>\n",
" <td>4</td>\n",
" <td>13</td>\n",
" <td>vente en ligne</td>\n",
" <td>2018-12-28 14:47:50+00:00</td>\n",
" <td>Atelier</td>\n",
" <td>8.0</td>\n",
" <td>pricing_formula</td>\n",
" <td>False</td>\n",
" <td>spectacle vivant</td>\n",
" <td>mucem</td>\n",
" <td>indiv prog enfant</td>\n",
" <td>l'école des magiciens</td>\n",
" <td>2018</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>13070860</td>\n",
" <td>48187</td>\n",
" <td>5107462</td>\n",
" <td>4</td>\n",
" <td>13</td>\n",
" <td>vente en ligne</td>\n",
" <td>2018-12-28 14:47:50+00:00</td>\n",
" <td>Atelier</td>\n",
" <td>4.0</td>\n",
" <td>pricing_formula</td>\n",
" <td>False</td>\n",
" <td>spectacle vivant</td>\n",
" <td>mucem</td>\n",
" <td>indiv prog enfant</td>\n",
" <td>l'école des magiciens</td>\n",
" <td>2018</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>13070861</td>\n",
" <td>48187</td>\n",
" <td>5107462</td>\n",
" <td>4</td>\n",
" <td>13</td>\n",
" <td>vente en ligne</td>\n",
" <td>2018-12-28 14:47:50+00:00</td>\n",
" <td>Atelier</td>\n",
" <td>4.0</td>\n",
" <td>pricing_formula</td>\n",
" <td>False</td>\n",
" <td>spectacle vivant</td>\n",
" <td>mucem</td>\n",
" <td>indiv prog enfant</td>\n",
" <td>l'école des magiciens</td>\n",
" <td>2018</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>13070862</td>\n",
" <td>48187</td>\n",
" <td>5107462</td>\n",
" <td>4</td>\n",
" <td>13</td>\n",
" <td>vente en ligne</td>\n",
" <td>2018-12-28 14:47:50+00:00</td>\n",
" <td>Atelier</td>\n",
" <td>4.0</td>\n",
" <td>pricing_formula</td>\n",
" <td>False</td>\n",
" <td>spectacle vivant</td>\n",
" <td>mucem</td>\n",
" <td>indiv prog enfant</td>\n",
" <td>l'école des magiciens</td>\n",
" <td>2018</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>13070863</td>\n",
" <td>48187</td>\n",
" <td>5107462</td>\n",
" <td>4</td>\n",
" <td>13</td>\n",
" <td>vente en ligne</td>\n",
" <td>2018-12-28 14:47:50+00:00</td>\n",
" <td>Atelier</td>\n",
" <td>4.0</td>\n",
" <td>pricing_formula</td>\n",
" <td>False</td>\n",
" <td>spectacle vivant</td>\n",
" <td>mucem</td>\n",
" <td>indiv prog enfant</td>\n",
" <td>l'école des magiciens</td>\n",
" <td>2018</td>\n",
" </tr>\n",
" <tr>\n",
" <th>...</th>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1826667</th>\n",
" <td>20662815</td>\n",
" <td>1256135</td>\n",
" <td>8007697</td>\n",
" <td>5</td>\n",
" <td>1</td>\n",
" <td>vente en ligne</td>\n",
" <td>2023-11-08 17:23:54+00:00</td>\n",
" <td>Atelier</td>\n",
" <td>11.0</td>\n",
" <td>pricing_formula</td>\n",
" <td>False</td>\n",
" <td>offre muséale groupe</td>\n",
" <td>mucem</td>\n",
" <td>indiv entrées tp</td>\n",
" <td>NaN</td>\n",
" <td>2023</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1826668</th>\n",
" <td>20662816</td>\n",
" <td>1256136</td>\n",
" <td>8007698</td>\n",
" <td>5</td>\n",
" <td>1</td>\n",
" <td>vente en ligne</td>\n",
" <td>2023-11-08 18:32:18+00:00</td>\n",
" <td>Atelier</td>\n",
" <td>11.0</td>\n",
" <td>pricing_formula</td>\n",
" <td>False</td>\n",
" <td>offre muséale groupe</td>\n",
" <td>mucem</td>\n",
" <td>indiv entrées tp</td>\n",
" <td>NaN</td>\n",
" <td>2023</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1826669</th>\n",
" <td>20662817</td>\n",
" <td>1256136</td>\n",
" <td>8007698</td>\n",
" <td>5</td>\n",
" <td>1</td>\n",
" <td>vente en ligne</td>\n",
" <td>2023-11-08 18:32:18+00:00</td>\n",
" <td>Atelier</td>\n",
" <td>11.0</td>\n",
" <td>pricing_formula</td>\n",
" <td>False</td>\n",
" <td>offre muséale groupe</td>\n",
" <td>mucem</td>\n",
" <td>indiv entrées tp</td>\n",
" <td>NaN</td>\n",
" <td>2023</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1826670</th>\n",
" <td>20662818</td>\n",
" <td>1256137</td>\n",
" <td>8007699</td>\n",
" <td>5</td>\n",
" <td>1</td>\n",
" <td>vente en ligne</td>\n",
" <td>2023-11-08 19:30:28+00:00</td>\n",
" <td>Atelier</td>\n",
" <td>11.0</td>\n",
" <td>pricing_formula</td>\n",
" <td>False</td>\n",
" <td>offre muséale groupe</td>\n",
" <td>mucem</td>\n",
" <td>indiv entrées tp</td>\n",
" <td>NaN</td>\n",
" <td>2023</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1826671</th>\n",
" <td>20662819</td>\n",
" <td>1256137</td>\n",
" <td>8007699</td>\n",
" <td>5</td>\n",
" <td>1</td>\n",
" <td>vente en ligne</td>\n",
" <td>2023-11-08 19:30:28+00:00</td>\n",
" <td>Atelier</td>\n",
" <td>11.0</td>\n",
" <td>pricing_formula</td>\n",
" <td>False</td>\n",
" <td>offre muséale groupe</td>\n",
" <td>mucem</td>\n",
" <td>indiv entrées tp</td>\n",
" <td>NaN</td>\n",
" <td>2023</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"<p>1826672 rows × 16 columns</p>\n",
"</div>"
],
"text/plain": [
" ticket_id customer_id purchase_id event_type_id category_id \\\n",
"0 13070859 48187 5107462 4 13 \n",
"1 13070860 48187 5107462 4 13 \n",
"2 13070861 48187 5107462 4 13 \n",
"3 13070862 48187 5107462 4 13 \n",
"4 13070863 48187 5107462 4 13 \n",
"... ... ... ... ... ... \n",
"1826667 20662815 1256135 8007697 5 1 \n",
"1826668 20662816 1256136 8007698 5 1 \n",
"1826669 20662817 1256136 8007698 5 1 \n",
"1826670 20662818 1256137 8007699 5 1 \n",
"1826671 20662819 1256137 8007699 5 1 \n",
"\n",
" supplier_name purchase_date type_of_ticket_name amount \\\n",
"0 vente en ligne 2018-12-28 14:47:50+00:00 Atelier 8.0 \n",
"1 vente en ligne 2018-12-28 14:47:50+00:00 Atelier 4.0 \n",
"2 vente en ligne 2018-12-28 14:47:50+00:00 Atelier 4.0 \n",
"3 vente en ligne 2018-12-28 14:47:50+00:00 Atelier 4.0 \n",
"4 vente en ligne 2018-12-28 14:47:50+00:00 Atelier 4.0 \n",
"... ... ... ... ... \n",
"1826667 vente en ligne 2023-11-08 17:23:54+00:00 Atelier 11.0 \n",
"1826668 vente en ligne 2023-11-08 18:32:18+00:00 Atelier 11.0 \n",
"1826669 vente en ligne 2023-11-08 18:32:18+00:00 Atelier 11.0 \n",
"1826670 vente en ligne 2023-11-08 19:30:28+00:00 Atelier 11.0 \n",
"1826671 vente en ligne 2023-11-08 19:30:28+00:00 Atelier 11.0 \n",
"\n",
" children is_full_price name_event_types name_facilities \\\n",
"0 pricing_formula False spectacle vivant mucem \n",
"1 pricing_formula False spectacle vivant mucem \n",
"2 pricing_formula False spectacle vivant mucem \n",
"3 pricing_formula False spectacle vivant mucem \n",
"4 pricing_formula False spectacle vivant mucem \n",
"... ... ... ... ... \n",
"1826667 pricing_formula False offre muséale groupe mucem \n",
"1826668 pricing_formula False offre muséale groupe mucem \n",
"1826669 pricing_formula False offre muséale groupe mucem \n",
"1826670 pricing_formula False offre muséale groupe mucem \n",
"1826671 pricing_formula False offre muséale groupe mucem \n",
"\n",
" name_categories name_events name_seasons \n",
"0 indiv prog enfant l'école des magiciens 2018 \n",
"1 indiv prog enfant l'école des magiciens 2018 \n",
"2 indiv prog enfant l'école des magiciens 2018 \n",
"3 indiv prog enfant l'école des magiciens 2018 \n",
"4 indiv prog enfant l'école des magiciens 2018 \n",
"... ... ... ... \n",
"1826667 indiv entrées tp NaN 2023 \n",
"1826668 indiv entrées tp NaN 2023 \n",
"1826669 indiv entrées tp NaN 2023 \n",
"1826670 indiv entrées tp NaN 2023 \n",
"1826671 indiv entrées tp NaN 2023 \n",
"\n",
"[1826672 rows x 16 columns]"
]
},
"execution_count": 240,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Look at categories\n",
"df1_products_purchased_reduced\n"
]
},
{
"cell_type": "code",
"execution_count": 241,
"id": "abb171a5-145f-40d0-91af-c51db4593d04",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"array([ 13, 16, 10, 1, 3, 2, 4, 6982, 5, 28, 8,\n",
" 41, 15, 39, 17, 9, 26, 2252, 6, 7, 1108, 2395,\n",
" 30, 2450, 14, 11])"
]
},
"execution_count": 241,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df1_products_purchased_reduced['category_id'].unique()"
]
},
{
"cell_type": "code",
"execution_count": 242,
"id": "750c5753-ea45-4deb-a934-994568185013",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"array(['indiv prog enfant', 'indiv activité enfant', 'indiv entrées fa',\n",
" 'indiv entrées tp', 'indiv prog tp', 'indiv entrées gr',\n",
" 'indiv prog tr', nan, 'indiv entrées tr', 'indiv activité tp',\n",
" 'indiv prog gr', 'indiv activité tr', 'groupe autonome entrées tp',\n",
" 'indiv activité gr', 'groupe forfait scolaire',\n",
" 'groupe forfait entrées gr', 'en nb entrées tp',\n",
" 'groupe autonome gr', 'groupe forfait entrées tr',\n",
" 'groupe autonome entrées gr', 'groupe forfait adulte',\n",
" 'groupe autonome adulte', 'en nb entrées gr',\n",
" 'groupe forfait etudiant', 'en nb entrées tr',\n",
" 'groupe autonome entrées tr'], dtype=object)"
]
},
"execution_count": 242,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df1_products_purchased_reduced['name_categories'].unique()"
]
},
{
"cell_type": "code",
"execution_count": 243,
"id": "7a2c3124-26d0-49e1-9bb4-d7ff58053454",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>customer_id</th>\n",
" <th>event_type_id</th>\n",
" <th>nb_categories</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>1</td>\n",
" <td>2</td>\n",
" <td>14</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>1</td>\n",
" <td>4</td>\n",
" <td>9</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>1</td>\n",
" <td>5</td>\n",
" <td>5</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>1</td>\n",
" <td>6</td>\n",
" <td>9</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>2</td>\n",
" <td>2</td>\n",
" <td>1</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" customer_id event_type_id nb_categories\n",
"0 1 2 14\n",
"1 1 4 9\n",
"2 1 5 5\n",
"3 1 6 9\n",
"4 2 2 1"
]
},
"execution_count": 243,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"nb_cat = (df1_products_purchased_reduced.groupby(by=['customer_id', 'event_type_id'])\n",
" .agg({'category_id' : 'nunique'}).reset_index()\n",
" .rename(columns = {'category_id' : 'nb_categories'}))\n",
"\n",
"nb_cat.head()"
]
},
{
"cell_type": "code",
"execution_count": 244,
2024-02-11 17:59:45 +01:00
"id": "83230baa-9a8a-4614-b629-e99c2505c696",
2024-02-11 11:47:58 +01:00
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>customer_id</th>\n",
" <th>birthdate</th>\n",
" <th>street_id</th>\n",
" <th>is_partner</th>\n",
" <th>gender</th>\n",
" <th>is_email_true</th>\n",
" <th>opt_in</th>\n",
" <th>structure_id</th>\n",
" <th>profession</th>\n",
" <th>language</th>\n",
" <th>...</th>\n",
2024-02-11 17:59:45 +01:00
" <th>total_amount</th>\n",
" <th>nb_suppliers</th>\n",
2024-02-11 11:47:58 +01:00
" <th>vente_internet_max</th>\n",
" <th>purchase_date_min</th>\n",
" <th>purchase_date_max</th>\n",
" <th>time_between_purchase</th>\n",
" <th>nb_tickets_internet</th>\n",
" <th>name_event_types</th>\n",
" <th>avg_amount</th>\n",
2024-02-12 11:08:24 +01:00
" <th>nb_categories</th>\n",
2024-02-11 11:47:58 +01:00
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
2024-02-12 11:08:24 +01:00
" <th>0</th>\n",
2024-02-11 11:47:58 +01:00
" <td>1</td>\n",
" <td>NaN</td>\n",
" <td>2</td>\n",
" <td>False</td>\n",
" <td>2</td>\n",
" <td>True</td>\n",
" <td>False</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>...</td>\n",
2024-02-11 17:59:45 +01:00
" <td>2686540.5</td>\n",
" <td>7.0</td>\n",
2024-02-11 11:47:58 +01:00
" <td>1.0</td>\n",
2024-02-11 17:59:45 +01:00
" <td>3262.190868</td>\n",
" <td>4.179306</td>\n",
" <td>3258.011562</td>\n",
2024-02-11 11:47:58 +01:00
" <td>51.0</td>\n",
" <td>offre muséale individuel</td>\n",
" <td>6.150659</td>\n",
2024-02-12 11:08:24 +01:00
" <td>14.0</td>\n",
2024-02-11 11:47:58 +01:00
" </tr>\n",
" <tr>\n",
2024-02-12 11:08:24 +01:00
" <th>1</th>\n",
2024-02-11 11:47:58 +01:00
" <td>1</td>\n",
" <td>NaN</td>\n",
" <td>2</td>\n",
" <td>False</td>\n",
" <td>2</td>\n",
" <td>True</td>\n",
" <td>False</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>...</td>\n",
2024-02-11 17:59:45 +01:00
" <td>1435871.5</td>\n",
" <td>5.0</td>\n",
2024-02-11 11:47:58 +01:00
" <td>1.0</td>\n",
2024-02-11 17:59:45 +01:00
" <td>2502.715509</td>\n",
" <td>1408.715532</td>\n",
" <td>1093.999977</td>\n",
2024-02-11 11:47:58 +01:00
" <td>5.0</td>\n",
" <td>formule adhésion</td>\n",
" <td>6.439463</td>\n",
2024-02-12 11:08:24 +01:00
" <td>9.0</td>\n",
2024-02-11 11:47:58 +01:00
" </tr>\n",
" <tr>\n",
2024-02-12 11:08:24 +01:00
" <th>2</th>\n",
2024-02-11 11:47:58 +01:00
" <td>1</td>\n",
" <td>NaN</td>\n",
" <td>2</td>\n",
" <td>False</td>\n",
" <td>2</td>\n",
" <td>True</td>\n",
" <td>False</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>...</td>\n",
2024-02-11 17:59:45 +01:00
" <td>3248965.5</td>\n",
" <td>6.0</td>\n",
2024-02-11 11:47:58 +01:00
" <td>1.0</td>\n",
2024-02-11 17:59:45 +01:00
" <td>3698.198229</td>\n",
" <td>5.221840</td>\n",
" <td>3692.976389</td>\n",
2024-02-11 11:47:58 +01:00
" <td>2988.0</td>\n",
" <td>spectacle vivant</td>\n",
" <td>7.762474</td>\n",
2024-02-12 11:08:24 +01:00
" <td>9.0</td>\n",
2024-02-11 11:47:58 +01:00
" </tr>\n",
" <tr>\n",
2024-02-12 11:08:24 +01:00
" <th>3</th>\n",
2024-02-11 11:47:58 +01:00
" <td>1</td>\n",
" <td>NaN</td>\n",
" <td>2</td>\n",
" <td>False</td>\n",
" <td>2</td>\n",
" <td>True</td>\n",
" <td>False</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>...</td>\n",
2024-02-11 17:59:45 +01:00
" <td>1459190.0</td>\n",
" <td>6.0</td>\n",
2024-02-11 11:47:58 +01:00
" <td>1.0</td>\n",
2024-02-11 17:59:45 +01:00
" <td>3803.369792</td>\n",
" <td>0.146331</td>\n",
" <td>3803.223461</td>\n",
2024-02-11 11:47:58 +01:00
" <td>9.0</td>\n",
" <td>offre muséale groupe</td>\n",
" <td>4.452618</td>\n",
2024-02-12 11:08:24 +01:00
" <td>5.0</td>\n",
2024-02-11 11:47:58 +01:00
" </tr>\n",
" <tr>\n",
2024-02-12 11:08:24 +01:00
" <th>4</th>\n",
2024-02-11 11:47:58 +01:00
" <td>2</td>\n",
" <td>NaN</td>\n",
" <td>2</td>\n",
" <td>False</td>\n",
" <td>1</td>\n",
" <td>True</td>\n",
" <td>True</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>...</td>\n",
2024-02-11 17:59:45 +01:00
" <td>0.0</td>\n",
" <td>1.0</td>\n",
2024-02-11 11:47:58 +01:00
" <td>0.0</td>\n",
2024-02-11 17:59:45 +01:00
" <td>1705.261192</td>\n",
" <td>1456.333715</td>\n",
" <td>248.927477</td>\n",
2024-02-11 11:47:58 +01:00
" <td>0.0</td>\n",
" <td>formule adhésion</td>\n",
" <td>6.439463</td>\n",
2024-02-12 11:08:24 +01:00
" <td>1.0</td>\n",
2024-02-11 11:47:58 +01:00
" </tr>\n",
" </tbody>\n",
"</table>\n",
2024-02-12 11:08:24 +01:00
"<p>5 rows × 38 columns</p>\n",
2024-02-11 11:47:58 +01:00
"</div>"
],
"text/plain": [
2024-02-12 11:08:24 +01:00
" customer_id birthdate street_id is_partner gender is_email_true \\\n",
"0 1 NaN 2 False 2 True \n",
"1 1 NaN 2 False 2 True \n",
"2 1 NaN 2 False 2 True \n",
"3 1 NaN 2 False 2 True \n",
"4 2 NaN 2 False 1 True \n",
2024-02-11 11:47:58 +01:00
"\n",
2024-02-12 11:08:24 +01:00
" opt_in structure_id profession language ... total_amount nb_suppliers \\\n",
"0 False NaN NaN NaN ... 2686540.5 7.0 \n",
"1 False NaN NaN NaN ... 1435871.5 5.0 \n",
"2 False NaN NaN NaN ... 3248965.5 6.0 \n",
"3 False NaN NaN NaN ... 1459190.0 6.0 \n",
"4 True NaN NaN NaN ... 0.0 1.0 \n",
2024-02-11 17:59:45 +01:00
"\n",
2024-02-12 11:08:24 +01:00
" vente_internet_max purchase_date_min purchase_date_max \\\n",
"0 1.0 3262.190868 4.179306 \n",
"1 1.0 2502.715509 1408.715532 \n",
"2 1.0 3698.198229 5.221840 \n",
"3 1.0 3803.369792 0.146331 \n",
"4 0.0 1705.261192 1456.333715 \n",
2024-02-11 17:59:45 +01:00
"\n",
2024-02-12 11:08:24 +01:00
" time_between_purchase nb_tickets_internet name_event_types \\\n",
"0 3258.011562 51.0 offre muséale individuel \n",
"1 1093.999977 5.0 formule adhésion \n",
"2 3692.976389 2988.0 spectacle vivant \n",
"3 3803.223461 9.0 offre muséale groupe \n",
"4 248.927477 0.0 formule adhésion \n",
2024-02-11 17:59:45 +01:00
"\n",
2024-02-12 11:08:24 +01:00
" avg_amount nb_categories \n",
"0 6.150659 14.0 \n",
"1 6.439463 9.0 \n",
"2 7.762474 9.0 \n",
"3 4.452618 5.0 \n",
"4 6.439463 1.0 \n",
2024-02-11 17:59:45 +01:00
"\n",
2024-02-12 11:08:24 +01:00
"[5 rows x 38 columns]"
2024-02-11 11:47:58 +01:00
]
},
2024-02-12 11:08:24 +01:00
"execution_count": 244,
2024-02-11 11:47:58 +01:00
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
2024-02-11 17:59:45 +01:00
"## Add customer information\n",
"df1_customer = (df1_customerplus_clean.merge(df1_tickets_kpi, how = \"left\", on='customer_id')\n",
" .sort_values(by='customer_id', ascending=True))\n",
2024-02-12 11:08:24 +01:00
"\n",
"df1_customer = df1_customer.merge(nb_cat, how='left', on=['customer_id', 'event_type_id'])\n",
2024-02-11 11:47:58 +01:00
"df1_customer.head()"
]
},
{
"cell_type": "code",
2024-02-12 11:08:24 +01:00
"execution_count": 245,
2024-02-11 17:59:45 +01:00
"id": "433921de-03ad-4024-9462-ecd267db1756",
2024-02-11 11:47:58 +01:00
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>customer_id</th>\n",
" <th>birthdate</th>\n",
" <th>street_id</th>\n",
" <th>is_partner</th>\n",
" <th>gender</th>\n",
" <th>is_email_true</th>\n",
" <th>opt_in</th>\n",
" <th>structure_id</th>\n",
" <th>profession</th>\n",
" <th>language</th>\n",
" <th>...</th>\n",
2024-02-11 17:59:45 +01:00
" <th>purchase_date_min</th>\n",
" <th>purchase_date_max</th>\n",
" <th>time_between_purchase</th>\n",
" <th>nb_tickets_internet</th>\n",
" <th>name_event_types</th>\n",
" <th>avg_amount</th>\n",
2024-02-12 11:08:24 +01:00
" <th>nb_categories</th>\n",
2024-02-11 11:47:58 +01:00
" <th>nb_campaigns</th>\n",
" <th>nb_campaigns_opened</th>\n",
" <th>time_to_open</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
2024-02-11 17:59:45 +01:00
" <td>1</td>\n",
2024-02-11 11:47:58 +01:00
" <td>NaN</td>\n",
" <td>2</td>\n",
" <td>False</td>\n",
2024-02-11 17:59:45 +01:00
" <td>2</td>\n",
2024-02-11 11:47:58 +01:00
" <td>True</td>\n",
2024-02-11 17:59:45 +01:00
" <td>False</td>\n",
2024-02-11 11:47:58 +01:00
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>...</td>\n",
2024-02-11 17:59:45 +01:00
" <td>3262.190868</td>\n",
" <td>4.179306</td>\n",
" <td>3258.011562</td>\n",
" <td>51.0</td>\n",
" <td>offre muséale individuel</td>\n",
" <td>6.150659</td>\n",
2024-02-12 11:08:24 +01:00
" <td>14.0</td>\n",
2024-02-11 11:47:58 +01:00
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaT</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
2024-02-11 17:59:45 +01:00
" <td>1</td>\n",
2024-02-11 11:47:58 +01:00
" <td>NaN</td>\n",
" <td>2</td>\n",
" <td>False</td>\n",
" <td>2</td>\n",
" <td>True</td>\n",
2024-02-11 17:59:45 +01:00
" <td>False</td>\n",
2024-02-11 11:47:58 +01:00
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>...</td>\n",
2024-02-11 17:59:45 +01:00
" <td>2502.715509</td>\n",
" <td>1408.715532</td>\n",
" <td>1093.999977</td>\n",
" <td>5.0</td>\n",
" <td>formule adhésion</td>\n",
" <td>6.439463</td>\n",
2024-02-12 11:08:24 +01:00
" <td>9.0</td>\n",
2024-02-11 11:47:58 +01:00
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaT</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
2024-02-11 17:59:45 +01:00
" <td>1</td>\n",
2024-02-11 11:47:58 +01:00
" <td>NaN</td>\n",
" <td>2</td>\n",
" <td>False</td>\n",
2024-02-11 17:59:45 +01:00
" <td>2</td>\n",
2024-02-11 11:47:58 +01:00
" <td>True</td>\n",
2024-02-11 17:59:45 +01:00
" <td>False</td>\n",
2024-02-11 11:47:58 +01:00
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>...</td>\n",
2024-02-11 17:59:45 +01:00
" <td>3698.198229</td>\n",
" <td>5.221840</td>\n",
" <td>3692.976389</td>\n",
" <td>2988.0</td>\n",
" <td>spectacle vivant</td>\n",
" <td>7.762474</td>\n",
2024-02-12 11:08:24 +01:00
" <td>9.0</td>\n",
2024-02-10 19:53:59 +01:00
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaT</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
2024-02-11 17:59:45 +01:00
" <td>1</td>\n",
2024-02-10 19:53:59 +01:00
" <td>NaN</td>\n",
" <td>2</td>\n",
" <td>False</td>\n",
" <td>2</td>\n",
" <td>True</td>\n",
2024-02-11 17:59:45 +01:00
" <td>False</td>\n",
2024-02-10 19:53:59 +01:00
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
2024-02-10 22:24:37 +01:00
" <td>...</td>\n",
2024-02-11 17:59:45 +01:00
" <td>3803.369792</td>\n",
" <td>0.146331</td>\n",
" <td>3803.223461</td>\n",
" <td>9.0</td>\n",
" <td>offre muséale groupe</td>\n",
" <td>4.452618</td>\n",
2024-02-12 11:08:24 +01:00
" <td>5.0</td>\n",
2024-02-10 19:53:59 +01:00
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaT</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
2024-02-11 17:59:45 +01:00
" <td>2</td>\n",
2024-02-10 19:53:59 +01:00
" <td>NaN</td>\n",
" <td>2</td>\n",
" <td>False</td>\n",
2024-02-11 17:59:45 +01:00
" <td>1</td>\n",
" <td>True</td>\n",
" <td>True</td>\n",
2024-02-10 19:53:59 +01:00
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
2024-02-10 22:24:37 +01:00
" <td>...</td>\n",
2024-02-11 17:59:45 +01:00
" <td>1705.261192</td>\n",
" <td>1456.333715</td>\n",
" <td>248.927477</td>\n",
" <td>0.0</td>\n",
" <td>formule adhésion</td>\n",
" <td>6.439463</td>\n",
2024-02-12 11:08:24 +01:00
" <td>1.0</td>\n",
2024-02-11 17:59:45 +01:00
" <td>4.0</td>\n",
2024-02-12 11:08:24 +01:00
" <td>NaN</td>\n",
2024-02-11 17:59:45 +01:00
" <td>NaT</td>\n",
2024-02-10 19:53:59 +01:00
" </tr>\n",
" </tbody>\n",
"</table>\n",
2024-02-12 11:08:24 +01:00
"<p>5 rows × 41 columns</p>\n",
2024-02-10 19:53:59 +01:00
"</div>"
],
"text/plain": [
" customer_id birthdate street_id is_partner gender is_email_true \\\n",
2024-02-11 17:59:45 +01:00
"0 1 NaN 2 False 2 True \n",
"1 1 NaN 2 False 2 True \n",
"2 1 NaN 2 False 2 True \n",
"3 1 NaN 2 False 2 True \n",
"4 2 NaN 2 False 1 True \n",
2024-02-10 19:53:59 +01:00
"\n",
2024-02-12 11:08:24 +01:00
" opt_in structure_id profession language ... purchase_date_min \\\n",
"0 False NaN NaN NaN ... 3262.190868 \n",
"1 False NaN NaN NaN ... 2502.715509 \n",
"2 False NaN NaN NaN ... 3698.198229 \n",
"3 False NaN NaN NaN ... 3803.369792 \n",
"4 True NaN NaN NaN ... 1705.261192 \n",
2024-02-10 19:53:59 +01:00
"\n",
2024-02-12 11:08:24 +01:00
" purchase_date_max time_between_purchase nb_tickets_internet \\\n",
"0 4.179306 3258.011562 51.0 \n",
"1 1408.715532 1093.999977 5.0 \n",
"2 5.221840 3692.976389 2988.0 \n",
"3 0.146331 3803.223461 9.0 \n",
"4 1456.333715 248.927477 0.0 \n",
2024-02-10 19:53:59 +01:00
"\n",
2024-02-12 11:08:24 +01:00
" name_event_types avg_amount nb_categories nb_campaigns \\\n",
"0 offre muséale individuel 6.150659 14.0 NaN \n",
"1 formule adhésion 6.439463 9.0 NaN \n",
"2 spectacle vivant 7.762474 9.0 NaN \n",
"3 offre muséale groupe 4.452618 5.0 NaN \n",
"4 formule adhésion 6.439463 1.0 4.0 \n",
2024-02-10 19:53:59 +01:00
"\n",
2024-02-11 17:59:45 +01:00
" nb_campaigns_opened time_to_open \n",
"0 NaN NaT \n",
"1 NaN NaT \n",
"2 NaN NaT \n",
"3 NaN NaT \n",
2024-02-12 11:08:24 +01:00
"4 NaN NaT \n",
2024-02-11 17:59:45 +01:00
"\n",
2024-02-12 11:08:24 +01:00
"[5 rows x 41 columns]"
2024-02-10 19:53:59 +01:00
]
},
2024-02-12 11:08:24 +01:00
"execution_count": 245,
2024-02-10 19:53:59 +01:00
"metadata": {},
"output_type": "execute_result"
}
],
2024-02-11 17:59:45 +01:00
"source": [
"# Add campaigns information\n",
"\n",
"df1_customer = df1_customer.merge(df1_campaigns_kpi, how='left', on='customer_id')\n",
"df1_customer.head()"
]
},
{
"cell_type": "code",
2024-02-12 11:08:24 +01:00
"execution_count": 246,
2024-02-11 17:59:45 +01:00
"id": "25e54131-6835-4e94-86d3-1a78520ed7bc",
"metadata": {},
"outputs": [],
"source": [
"## Exportation\n",
"\n",
"# Exportation vers 'projet-bdc2324-team1'\n",
"BUCKET_OUT = \"projet-bdc2324-team1\"\n",
"FILE_KEY_OUT_S3 = \"0_Temp/Company 1 - customer_event.csv\"\n",
"FILE_PATH_OUT_S3 = BUCKET_OUT + \"/\" + FILE_KEY_OUT_S3\n",
"\n",
"with fs.open(FILE_PATH_OUT_S3, 'w') as file_out:\n",
" df1_customer.to_csv(file_out, index = False)"
]
},
{
"cell_type": "markdown",
"id": "edae177c-1247-454d-b3d1-08fea37001f7",
"metadata": {},
"source": [
"## End of Alexis' work"
]
},
{
"cell_type": "code",
"execution_count": 46,
"id": "8710611c-7eb8-45ca-bdcc-009f4081f9e2",
"metadata": {},
"outputs": [],
2024-02-10 19:53:59 +01:00
"source": [
"# Fusion avec KPI campaigns liés au customer\n",
2024-02-11 11:47:58 +01:00
"#df1_customer = pd.merge(df1_customerplus_clean, df1_campaigns_kpi, on = 'customer_id', how = 'left')\n",
"#df1_customer.head()"
2024-02-10 19:53:59 +01:00
]
},
{
"cell_type": "code",
2024-02-11 17:59:45 +01:00
"execution_count": null,
2024-02-10 19:53:59 +01:00
"id": "a89fad43-ee68-4081-9384-3e9f08ec6a59",
"metadata": {},
2024-02-11 17:59:45 +01:00
"outputs": [],
"source": [
"df1_customer_product = pd.merge(df1_customer, nb_tickets, on = 'customer_id', how = 'left')\n",
"print(\"shape : \", df1_customer_product.shape)\n",
"df1_customer_product.head()"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "a19fec00-4ece-400c-937c-ce5cd8daccfd",
"metadata": {},
"outputs": [],
"source": [
"df1_customer_product.to_csv(\"customer_product.csv\", index = False)"
]
},
{
"cell_type": "markdown",
"id": "7c3211a5-a851-43bc-a1f0-b39d51857fb7",
"metadata": {},
"source": [
"# Fusion des bases locales"
]
},
{
"cell_type": "code",
"execution_count": 63,
"id": "46de1912-4a66-46e5-8b9e-7768b2d2723b",
"metadata": {},
"outputs": [],
"source": [
"# Fusion avec KPI liés au customer\n",
"df1_customer = pd.merge(df1_customerplus_clean, df1_campaigns_kpi, on = 'customer_id', how = 'left')\n",
"\n",
"# Fill NaN values\n",
"df1_customer[['nb_campaigns', 'nb_campaigns_opened']] = df1_customer[['nb_campaigns', 'nb_campaigns_opened']].fillna(0)"
]
},
{
"cell_type": "code",
"execution_count": 64,
"id": "d53825e4-6453-45bc-94f2-7b2504ec4afb",
"metadata": {},
2024-02-10 19:53:59 +01:00
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>customer_id</th>\n",
" <th>birthdate</th>\n",
" <th>street_id</th>\n",
" <th>is_partner</th>\n",
" <th>gender</th>\n",
" <th>is_email_true</th>\n",
" <th>opt_in</th>\n",
" <th>structure_id</th>\n",
" <th>profession</th>\n",
" <th>language</th>\n",
2024-02-10 22:24:37 +01:00
" <th>...</th>\n",
2024-02-11 17:59:45 +01:00
" <th>average_ticket_basket</th>\n",
" <th>total_price</th>\n",
" <th>purchase_count</th>\n",
2024-02-10 19:53:59 +01:00
" <th>first_buying_date</th>\n",
" <th>country</th>\n",
" <th>age</th>\n",
" <th>tenant_id</th>\n",
" <th>nb_campaigns</th>\n",
" <th>nb_campaigns_opened</th>\n",
" <th>time_to_open</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>12751</td>\n",
" <td>NaN</td>\n",
" <td>2</td>\n",
" <td>False</td>\n",
" <td>1</td>\n",
" <td>True</td>\n",
" <td>True</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
2024-02-10 22:24:37 +01:00
" <td>...</td>\n",
2024-02-11 17:59:45 +01:00
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>0</td>\n",
2024-02-10 19:53:59 +01:00
" <td>NaT</td>\n",
" <td>fr</td>\n",
" <td>NaN</td>\n",
" <td>1311</td>\n",
2024-02-11 17:59:45 +01:00
" <td>0.0</td>\n",
" <td>0.0</td>\n",
2024-02-10 19:53:59 +01:00
" <td>NaT</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>12825</td>\n",
" <td>NaN</td>\n",
" <td>2</td>\n",
" <td>False</td>\n",
" <td>2</td>\n",
" <td>True</td>\n",
" <td>True</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
2024-02-10 22:24:37 +01:00
" <td>...</td>\n",
2024-02-11 17:59:45 +01:00
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>0</td>\n",
2024-02-10 22:24:37 +01:00
" <td>NaT</td>\n",
" <td>fr</td>\n",
2024-02-10 19:53:59 +01:00
" <td>NaN</td>\n",
" <td>1311</td>\n",
2024-02-11 17:59:45 +01:00
" <td>0.0</td>\n",
" <td>0.0</td>\n",
2024-02-10 19:53:59 +01:00
" <td>NaT</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>11261</td>\n",
" <td>NaN</td>\n",
" <td>2</td>\n",
" <td>False</td>\n",
" <td>1</td>\n",
" <td>True</td>\n",
" <td>True</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
2024-02-10 22:24:37 +01:00
" <td>...</td>\n",
2024-02-11 17:59:45 +01:00
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>0</td>\n",
2024-02-10 19:53:59 +01:00
" <td>NaT</td>\n",
" <td>fr</td>\n",
" <td>NaN</td>\n",
" <td>1311</td>\n",
2024-02-11 17:59:45 +01:00
" <td>0.0</td>\n",
" <td>0.0</td>\n",
2024-02-10 19:53:59 +01:00
" <td>NaT</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>13071</td>\n",
" <td>NaN</td>\n",
" <td>2</td>\n",
" <td>False</td>\n",
" <td>2</td>\n",
" <td>True</td>\n",
" <td>True</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
2024-02-10 22:24:37 +01:00
" <td>...</td>\n",
2024-02-11 17:59:45 +01:00
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>0</td>\n",
2024-02-10 19:53:59 +01:00
" <td>NaT</td>\n",
" <td>fr</td>\n",
" <td>NaN</td>\n",
" <td>1311</td>\n",
2024-02-11 17:59:45 +01:00
" <td>0.0</td>\n",
" <td>0.0</td>\n",
2024-02-10 19:53:59 +01:00
" <td>NaT</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>653061</td>\n",
" <td>NaN</td>\n",
" <td>10</td>\n",
" <td>False</td>\n",
" <td>2</td>\n",
" <td>True</td>\n",
" <td>False</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
2024-02-10 22:24:37 +01:00
" <td>...</td>\n",
2024-02-11 17:59:45 +01:00
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>0</td>\n",
2024-02-10 19:53:59 +01:00
" <td>NaT</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>1311</td>\n",
" <td>80.0</td>\n",
" <td>2.0</td>\n",
" <td>0 days 19:53:02.500000</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
2024-02-11 17:59:45 +01:00
"<p>5 rows × 28 columns</p>\n",
2024-02-10 19:53:59 +01:00
"</div>"
],
"text/plain": [
" customer_id birthdate street_id is_partner gender is_email_true \\\n",
"0 12751 NaN 2 False 1 True \n",
"1 12825 NaN 2 False 2 True \n",
"2 11261 NaN 2 False 1 True \n",
"3 13071 NaN 2 False 2 True \n",
"4 653061 NaN 10 False 2 True \n",
"\n",
2024-02-11 17:59:45 +01:00
" opt_in structure_id profession language ... average_ticket_basket \\\n",
"0 True NaN NaN NaN ... NaN \n",
"1 True NaN NaN NaN ... NaN \n",
"2 True NaN NaN NaN ... NaN \n",
"3 True NaN NaN NaN ... NaN \n",
"4 False NaN NaN NaN ... NaN \n",
"\n",
" total_price purchase_count first_buying_date country age tenant_id \\\n",
"0 NaN 0 NaT fr NaN 1311 \n",
"1 NaN 0 NaT fr NaN 1311 \n",
"2 NaN 0 NaT fr NaN 1311 \n",
"3 NaN 0 NaT fr NaN 1311 \n",
"4 NaN 0 NaT NaN NaN 1311 \n",
"\n",
" nb_campaigns nb_campaigns_opened time_to_open \n",
"0 0.0 0.0 NaT \n",
"1 0.0 0.0 NaT \n",
"2 0.0 0.0 NaT \n",
"3 0.0 0.0 NaT \n",
"4 80.0 2.0 0 days 19:53:02.500000 \n",
"\n",
"[5 rows x 28 columns]"
2024-02-10 19:53:59 +01:00
]
},
2024-02-11 17:59:45 +01:00
"execution_count": 64,
2024-02-10 19:53:59 +01:00
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
2024-02-11 17:59:45 +01:00
"df1_customer.head()"
2024-02-07 23:28:55 +01:00
]
},
{
"cell_type": "code",
2024-02-11 17:59:45 +01:00
"execution_count": 67,
"id": "1e42a790-b215-4107-a969-85005da06ebd",
2024-02-05 12:51:35 +01:00
"metadata": {},
"outputs": [],
"source": [
2024-02-11 17:59:45 +01:00
"# Fusion avec KPI liés au comportement d'achat\n",
"df1_customer_product = pd.merge(df1_tickets_kpi, df1_customer, on = 'customer_id', how = 'outer')\n",
"\n",
"# Fill NaN values\n",
"df1_customer_product[['nb_tickets', 'nb_purchases', 'total_amount', 'nb_suppliers', 'vente_internet_max', 'nb_tickets_internet']] = df1_customer_product[['nb_tickets', 'nb_purchases', 'total_amount', 'nb_suppliers', 'vente_internet_max', 'nb_tickets_internet']].fillna(0)"
2024-02-07 23:28:55 +01:00
]
},
{
"cell_type": "code",
2024-02-11 17:59:45 +01:00
"execution_count": 66,
"id": "d950f24d-a5d1-4f1e-aeaa-ca826470365f",
2024-02-07 23:28:55 +01:00
"metadata": {},
2024-02-11 17:59:45 +01:00
"outputs": [
{
"data": {
"text/plain": [
"Index(['customer_id', 'event_type_id', 'nb_tickets', 'nb_purchases',\n",
" 'total_amount', 'nb_suppliers', 'vente_internet_max',\n",
" 'purchase_date_min', 'purchase_date_max', 'time_between_purchase',\n",
" 'nb_tickets_internet', 'name_event_types', 'avg_amount', 'birthdate',\n",
" 'street_id', 'is_partner', 'gender', 'is_email_true', 'opt_in',\n",
" 'structure_id', 'profession', 'language', 'mcp_contact_id',\n",
" 'last_buying_date', 'max_price', 'ticket_sum', 'average_price',\n",
" 'fidelity', 'average_purchase_delay', 'average_price_basket',\n",
" 'average_ticket_basket', 'total_price', 'purchase_count',\n",
" 'first_buying_date', 'country', 'age', 'tenant_id', 'nb_campaigns',\n",
" 'nb_campaigns_opened', 'time_to_open'],\n",
" dtype='object')"
]
},
"execution_count": 66,
"metadata": {},
"output_type": "execute_result"
}
],
2024-02-07 23:28:55 +01:00
"source": [
2024-02-11 17:59:45 +01:00
"df1_customer_product"
2024-02-07 23:28:55 +01:00
]
},
{
"cell_type": "code",
2024-02-11 17:59:45 +01:00
"execution_count": 68,
"id": "ebf6d843-dcc0-4e83-b063-94806c0bac17",
2024-02-07 23:28:55 +01:00
"metadata": {},
"outputs": [],
"source": [
2024-02-11 17:59:45 +01:00
"## Exportation\n",
"\n",
"# Exportation vers 'projet-bdc2324-team1'\n",
"BUCKET_OUT = \"projet-bdc2324-team1\"\n",
"FILE_KEY_OUT_S3 = \"1_Output/Company 1 - Segmentation base.csv\"\n",
"FILE_PATH_OUT_S3 = BUCKET_OUT + \"/\" + FILE_KEY_OUT_S3\n",
"\n",
"with fs.open(FILE_PATH_OUT_S3, 'w') as file_out:\n",
" df1_customer_product.to_csv(file_out, index = False)"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
2024-02-12 11:08:24 +01:00
"version": "3.11.6"
}
},
"nbformat": 4,
"nbformat_minor": 5
}