2721 lines
97 KiB
Plaintext
2721 lines
97 KiB
Plaintext
{
|
||
"cells": [
|
||
{
|
||
"cell_type": "markdown",
|
||
"id": "ad414c84-be46-4d2c-be8b-9fc4d24cc672",
|
||
"metadata": {},
|
||
"source": [
|
||
"# Business Data Challenge - Team 1"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 1,
|
||
"id": "15103481-8d74-404c-aa09-7601fe7730da",
|
||
"metadata": {},
|
||
"outputs": [],
|
||
"source": [
|
||
"import pandas as pd\n",
|
||
"import numpy as np\n",
|
||
"import os\n",
|
||
"import s3fs\n",
|
||
"import re"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "markdown",
|
||
"id": "ee97665c-39af-4c1c-a62b-c9c79feae18f",
|
||
"metadata": {},
|
||
"source": [
|
||
"Configuration de l'accès aux données"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 2,
|
||
"id": "5d83bb1a-d341-446e-91f6-1c428607f6d4",
|
||
"metadata": {},
|
||
"outputs": [],
|
||
"source": [
|
||
"# Create filesystem object\n",
|
||
"fs = s3fs.S3FileSystem(client_kwargs={'endpoint_url': 'https://'+'minio-simple.lab.groupe-genes.fr'},key ='WKTGH4YGUBAT3TR0OSUR', secret = 'g8ozi6ZUrBy8DzaAip4F7zOizbr4DKf4RgYNseqU', token = 'eyJhbGciOiJIUzUxMiIsInR5cCI6IkpXVCJ9.eyJhY2Nlc3NLZXkiOiJXS1RHSDRZR1VCQVQzVFIwT1NVUiIsImFjciI6IjAiLCJhbGxvd2VkLW9yaWdpbnMiOlsiaHR0cHM6Ly9vbnl4aWEubGFiLmdyb3VwZS1nZW5lcy5mciJdLCJhdWQiOlsibWluaW8iLCJhY2NvdW50Il0sImF1dGhfdGltZSI6MTcwNzU4NjUwMCwiYXpwIjoib255eGlhLW1pbmlvIiwiZW1haWwiOiJhbnRvaW5lLmpvdWJyZWxAZW5zYWUuZnIiLCJlbWFpbF92ZXJpZmllZCI6dHJ1ZSwiZXhwIjoxNzA3NjczMDQ3LCJmYW1pbHlfbmFtZSI6IkpPVUJSRUwiLCJnaXZlbl9uYW1lIjoiQW50b2luZSIsImdyb3VwcyI6WyJiZGMyMzI0LXRlYW0xIl0sImlhdCI6MTcwNzU4NjY0NywiaXNzIjoiaHR0cHM6Ly9hdXRoLmdyb3VwZS1nZW5lcy5mci9yZWFsbXMvZ2VuZXMiLCJqdGkiOiI1MjQ2MDZmMS1lYWM3LTQxZDgtYTEzMy04MGZjMDk0MGVlNzEiLCJuYW1lIjoiQW50b2luZSBKT1VCUkVMIiwicG9saWN5Ijoic3Rzb25seSIsInByZWZlcnJlZF91c2VybmFtZSI6ImFqb3VicmVsLWVuc2FlIiwicmVhbG1fYWNjZXNzIjp7InJvbGVzIjpbIm9mZmxpbmVfYWNjZXNzIiwiZGVmYXVsdC1yb2xlcy1nZW5lcyIsInVtYV9hdXRob3JpemF0aW9uIl19LCJyZXNvdXJjZV9hY2Nlc3MiOnsiYWNjb3VudCI6eyJyb2xlcyI6WyJtYW5hZ2UtYWNjb3VudCIsIm1hbmFnZS1hY2NvdW50LWxpbmtzIiwidmlldy1wcm9maWxlIl19fSwic2NvcGUiOiJvcGVuaWQgcHJvZmlsZSBlbWFpbCIsInNlc3Npb25fc3RhdGUiOiI1OTk2MWNkYy0xNmFiLTQ4MTAtYWE4Zi1iZGUyMjkwNjhiNzUiLCJzaWQiOiI1OTk2MWNkYy0xNmFiLTQ4MTAtYWE4Zi1iZGUyMjkwNjhiNzUiLCJzdWIiOiIwNWYwZDk3Mi1jNWM4LTQyNmYtODAwZC00NmQ0OGU4NjkwMzUiLCJ0eXAiOiJCZWFyZXIifQ.-imw-N4bk1uCcQGobkxhsRoeBAqxC9rT7PifElbC7ODOStnwIulc7HRR2fmtiqI2PdyrfnVvzfmIPK1g056HbA')"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "markdown",
|
||
"id": "9cbd72c5-6f8e-4366-ab66-96c32c6e963a",
|
||
"metadata": {},
|
||
"source": [
|
||
"# Exemple sur Company 1"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "markdown",
|
||
"id": "db26e59a-927c-407e-b54b-1815473b0b34",
|
||
"metadata": {},
|
||
"source": [
|
||
"## Chargement données"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 3,
|
||
"id": "699664b9-eee4-4f8d-a207-e524526560c5",
|
||
"metadata": {},
|
||
"outputs": [],
|
||
"source": [
|
||
"BUCKET = \"bdc2324-data/1\"\n",
|
||
"liste_database = fs.ls(BUCKET)"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 4,
|
||
"id": "dd6a3518-b752-4a1e-b77b-9e03e853c3ed",
|
||
"metadata": {},
|
||
"outputs": [
|
||
{
|
||
"name": "stderr",
|
||
"output_type": "stream",
|
||
"text": [
|
||
"/tmp/ipykernel_42764/4081512283.py:10: DtypeWarning: Columns (1) have mixed types. Specify dtype option on import or set low_memory=False.\n",
|
||
" df = pd.read_csv(file_in)\n"
|
||
]
|
||
}
|
||
],
|
||
"source": [
|
||
"# loop to create dataframes from liste\n",
|
||
"files_path = liste_database\n",
|
||
"\n",
|
||
"client_number = files_path[0].split(\"/\")[1]\n",
|
||
"df_prefix = \"df\" + str(client_number) + \"_\"\n",
|
||
"\n",
|
||
"for i in range(len(files_path)) :\n",
|
||
" current_path = files_path[i]\n",
|
||
" with fs.open(current_path, mode=\"rb\") as file_in:\n",
|
||
" df = pd.read_csv(file_in)\n",
|
||
" # the pattern of the name is df1xxx\n",
|
||
" nom_dataframe = df_prefix + re.search(r'\\/(\\d+)\\/(\\d+)([a-zA-Z_]+)\\.csv$', current_path).group(3)\n",
|
||
" globals()[nom_dataframe] = df"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "markdown",
|
||
"id": "4004c8bf-11d9-413d-bb42-2cb8ddde7716",
|
||
"metadata": {},
|
||
"source": [
|
||
"## Cleaning functions"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 5,
|
||
"id": "d237be96-8c86-4a91-b7a1-487e87a16c3d",
|
||
"metadata": {},
|
||
"outputs": [],
|
||
"source": [
|
||
"def cleaning_date(df, column_name):\n",
|
||
" \"\"\"\n",
|
||
" Nettoie la colonne spécifiée du DataFrame en convertissant les valeurs en datetime avec le format ISO8601.\n",
|
||
"\n",
|
||
" Parameters:\n",
|
||
" - df: DataFrame\n",
|
||
" Le DataFrame contenant la colonne à nettoyer.\n",
|
||
" - column_name: str\n",
|
||
" Le nom de la colonne à nettoyer.\n",
|
||
"\n",
|
||
" Returns:\n",
|
||
" - DataFrame\n",
|
||
" Le DataFrame modifié avec la colonne nettoyée.\n",
|
||
" \"\"\"\n",
|
||
" df[column_name] = pd.to_datetime(df[column_name], utc = True, format = 'ISO8601')\n",
|
||
" return df"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "markdown",
|
||
"id": "398804d8-2225-4fd3-bceb-75ab1588e359",
|
||
"metadata": {},
|
||
"source": [
|
||
"## Preprocessing"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "markdown",
|
||
"id": "568cb180-0dd9-4b27-aecb-05e4c3775ba6",
|
||
"metadata": {},
|
||
"source": [
|
||
"## customer_plus"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 6,
|
||
"id": "7e7b90ce-da54-4f00-bc34-64c543b0858f",
|
||
"metadata": {},
|
||
"outputs": [],
|
||
"source": [
|
||
"def preprocessing_customerplus(customerplus = None):\n",
|
||
"\n",
|
||
" customerplus_copy = customerplus.copy()\n",
|
||
" \n",
|
||
" # Passage en format date\n",
|
||
" cleaning_date(customerplus_copy, 'first_buying_date')\n",
|
||
" cleaning_date(customerplus_copy, 'last_visiting_date')\n",
|
||
" \n",
|
||
" # Selection des variables\n",
|
||
" customerplus_copy.drop(['lastname', 'firstname', 'email', 'civility', 'note', 'created_at', 'updated_at', 'deleted_at', 'extra', 'reference', 'extra_field', 'identifier', 'need_reload', 'preferred_category', 'preferred_supplier', 'preferred_formula', 'zipcode', 'last_visiting_date'], axis = 1, inplace=True)\n",
|
||
" customerplus_copy.rename(columns = {'id' : 'customer_id'}, inplace = True)\n",
|
||
"\n",
|
||
" return customerplus_copy\n"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 7,
|
||
"id": "03329e32-00a5-42c8-9470-75f7b6216ccd",
|
||
"metadata": {},
|
||
"outputs": [],
|
||
"source": [
|
||
"df1_customerplus_clean = preprocessing_customerplus(df1_customersplus)"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "markdown",
|
||
"id": "bade04b1-0cdf-4d10-bcca-7dc7e4831656",
|
||
"metadata": {},
|
||
"source": [
|
||
"## Ticket area"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 8,
|
||
"id": "b95464b1-26bc-4aac-84b4-45da83b92251",
|
||
"metadata": {},
|
||
"outputs": [],
|
||
"source": [
|
||
"# Fonction de nettoyage et selection\n",
|
||
"def preprocessing_tickets_area(tickets = None, purchases = None, suppliers = None, type_ofs = None):\n",
|
||
" # Base des tickets\n",
|
||
" tickets = tickets[['id', 'purchase_id', 'product_id', 'is_from_subscription', 'type_of', 'supplier_id']]\n",
|
||
" tickets.rename(columns = {'id' : 'ticket_id'}, inplace = True)\n",
|
||
"\n",
|
||
" # Base des fournisseurs\n",
|
||
" suppliers = suppliers[['id', 'name']]\n",
|
||
" suppliers.rename(columns = {'name' : 'supplier_name'}, inplace = True)\n",
|
||
" suppliers['supplier_name'] = suppliers['supplier_name'].fillna('')\n",
|
||
"\n",
|
||
" # Base des types de billets\n",
|
||
" type_ofs = type_ofs[['id', 'name', 'children']]\n",
|
||
" type_ofs.rename(columns = {'name' : 'type_of_ticket_name'}, inplace = True)\n",
|
||
"\n",
|
||
" # Base des achats\n",
|
||
" # Nettoyage de la date d'achat\n",
|
||
" cleaning_date(purchases, 'purchase_date')\n",
|
||
" # Selection des variables\n",
|
||
" purchases = purchases[['id', 'purchase_date', 'customer_id']]\n",
|
||
"\n",
|
||
" # Fusions \n",
|
||
" # Fusion avec fournisseurs\n",
|
||
" ticket_information = pd.merge(tickets, suppliers, left_on = 'supplier_id', right_on = 'id', how = 'inner')\n",
|
||
" ticket_information.drop(['supplier_id', 'id'], axis = 1, inplace=True)\n",
|
||
" \n",
|
||
" # Fusion avec type de tickets\n",
|
||
" ticket_information = pd.merge(ticket_information, type_ofs, left_on = 'type_of', right_on = 'id', how = 'inner')\n",
|
||
" ticket_information.drop(['type_of', 'id'], axis = 1, inplace=True)\n",
|
||
" \n",
|
||
" # Fusion avec achats\n",
|
||
" ticket_information = pd.merge(ticket_information, purchases, left_on = 'purchase_id', right_on = 'id', how = 'inner')\n",
|
||
" ticket_information.drop(['purchase_id', 'id'], axis = 1, inplace=True)\n",
|
||
"\n",
|
||
" return ticket_information"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 9,
|
||
"id": "3e1d2ba7-ff4f-48eb-93a8-2bb648c70396",
|
||
"metadata": {},
|
||
"outputs": [
|
||
{
|
||
"name": "stderr",
|
||
"output_type": "stream",
|
||
"text": [
|
||
"/tmp/ipykernel_42764/3092893564.py:5: SettingWithCopyWarning: \n",
|
||
"A value is trying to be set on a copy of a slice from a DataFrame\n",
|
||
"\n",
|
||
"See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n",
|
||
" tickets.rename(columns = {'id' : 'ticket_id'}, inplace = True)\n",
|
||
"/tmp/ipykernel_42764/3092893564.py:9: SettingWithCopyWarning: \n",
|
||
"A value is trying to be set on a copy of a slice from a DataFrame\n",
|
||
"\n",
|
||
"See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n",
|
||
" suppliers.rename(columns = {'name' : 'supplier_name'}, inplace = True)\n",
|
||
"/tmp/ipykernel_42764/3092893564.py:10: SettingWithCopyWarning: \n",
|
||
"A value is trying to be set on a copy of a slice from a DataFrame.\n",
|
||
"Try using .loc[row_indexer,col_indexer] = value instead\n",
|
||
"\n",
|
||
"See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n",
|
||
" suppliers['supplier_name'] = suppliers['supplier_name'].fillna('')\n",
|
||
"/tmp/ipykernel_42764/3092893564.py:14: SettingWithCopyWarning: \n",
|
||
"A value is trying to be set on a copy of a slice from a DataFrame\n",
|
||
"\n",
|
||
"See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n",
|
||
" type_ofs.rename(columns = {'name' : 'type_of_ticket_name'}, inplace = True)\n"
|
||
]
|
||
}
|
||
],
|
||
"source": [
|
||
"df1_ticket_information = preprocessing_tickets_area(tickets = df1_tickets, purchases = df1_purchases, suppliers = df1_suppliers, type_ofs = df1_type_ofs)"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 10,
|
||
"id": "4b18edfc-6450-4c6a-9e7b-ee5a5808c8c9",
|
||
"metadata": {},
|
||
"outputs": [
|
||
{
|
||
"data": {
|
||
"text/html": [
|
||
"<div>\n",
|
||
"<style scoped>\n",
|
||
" .dataframe tbody tr th:only-of-type {\n",
|
||
" vertical-align: middle;\n",
|
||
" }\n",
|
||
"\n",
|
||
" .dataframe tbody tr th {\n",
|
||
" vertical-align: top;\n",
|
||
" }\n",
|
||
"\n",
|
||
" .dataframe thead th {\n",
|
||
" text-align: right;\n",
|
||
" }\n",
|
||
"</style>\n",
|
||
"<table border=\"1\" class=\"dataframe\">\n",
|
||
" <thead>\n",
|
||
" <tr style=\"text-align: right;\">\n",
|
||
" <th></th>\n",
|
||
" <th>ticket_id</th>\n",
|
||
" <th>product_id</th>\n",
|
||
" <th>is_from_subscription</th>\n",
|
||
" <th>supplier_name</th>\n",
|
||
" <th>type_of_ticket_name</th>\n",
|
||
" <th>children</th>\n",
|
||
" <th>purchase_date</th>\n",
|
||
" <th>customer_id</th>\n",
|
||
" </tr>\n",
|
||
" </thead>\n",
|
||
" <tbody>\n",
|
||
" <tr>\n",
|
||
" <th>0</th>\n",
|
||
" <td>13070859</td>\n",
|
||
" <td>225251</td>\n",
|
||
" <td>False</td>\n",
|
||
" <td>vente en ligne</td>\n",
|
||
" <td>Atelier</td>\n",
|
||
" <td>pricing_formula</td>\n",
|
||
" <td>2018-12-28 14:47:50+00:00</td>\n",
|
||
" <td>48187</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>1</th>\n",
|
||
" <td>13070860</td>\n",
|
||
" <td>224914</td>\n",
|
||
" <td>False</td>\n",
|
||
" <td>vente en ligne</td>\n",
|
||
" <td>Atelier</td>\n",
|
||
" <td>pricing_formula</td>\n",
|
||
" <td>2018-12-28 14:47:50+00:00</td>\n",
|
||
" <td>48187</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>2</th>\n",
|
||
" <td>13070861</td>\n",
|
||
" <td>224914</td>\n",
|
||
" <td>False</td>\n",
|
||
" <td>vente en ligne</td>\n",
|
||
" <td>Atelier</td>\n",
|
||
" <td>pricing_formula</td>\n",
|
||
" <td>2018-12-28 14:47:50+00:00</td>\n",
|
||
" <td>48187</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>3</th>\n",
|
||
" <td>13070862</td>\n",
|
||
" <td>224914</td>\n",
|
||
" <td>False</td>\n",
|
||
" <td>vente en ligne</td>\n",
|
||
" <td>Atelier</td>\n",
|
||
" <td>pricing_formula</td>\n",
|
||
" <td>2018-12-28 14:47:50+00:00</td>\n",
|
||
" <td>48187</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>4</th>\n",
|
||
" <td>13070863</td>\n",
|
||
" <td>224914</td>\n",
|
||
" <td>False</td>\n",
|
||
" <td>vente en ligne</td>\n",
|
||
" <td>Atelier</td>\n",
|
||
" <td>pricing_formula</td>\n",
|
||
" <td>2018-12-28 14:47:50+00:00</td>\n",
|
||
" <td>48187</td>\n",
|
||
" </tr>\n",
|
||
" </tbody>\n",
|
||
"</table>\n",
|
||
"</div>"
|
||
],
|
||
"text/plain": [
|
||
" ticket_id product_id is_from_subscription supplier_name \\\n",
|
||
"0 13070859 225251 False vente en ligne \n",
|
||
"1 13070860 224914 False vente en ligne \n",
|
||
"2 13070861 224914 False vente en ligne \n",
|
||
"3 13070862 224914 False vente en ligne \n",
|
||
"4 13070863 224914 False vente en ligne \n",
|
||
"\n",
|
||
" type_of_ticket_name children purchase_date customer_id \n",
|
||
"0 Atelier pricing_formula 2018-12-28 14:47:50+00:00 48187 \n",
|
||
"1 Atelier pricing_formula 2018-12-28 14:47:50+00:00 48187 \n",
|
||
"2 Atelier pricing_formula 2018-12-28 14:47:50+00:00 48187 \n",
|
||
"3 Atelier pricing_formula 2018-12-28 14:47:50+00:00 48187 \n",
|
||
"4 Atelier pricing_formula 2018-12-28 14:47:50+00:00 48187 "
|
||
]
|
||
},
|
||
"execution_count": 10,
|
||
"metadata": {},
|
||
"output_type": "execute_result"
|
||
}
|
||
],
|
||
"source": [
|
||
"df1_ticket_information.head()"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "markdown",
|
||
"id": "096e47f4-1d65-4575-989d-83227eedad2b",
|
||
"metadata": {},
|
||
"source": [
|
||
"## Target area"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 11,
|
||
"id": "baed146a-9d3a-4397-a812-3d50c9a2f038",
|
||
"metadata": {},
|
||
"outputs": [],
|
||
"source": [
|
||
"def preprocessing_target_area(targets = None, target_types = None, customer_target_mappings = None):\n",
|
||
" # Target.csv cleaning\n",
|
||
" targets = targets[[\"id\", \"target_type_id\", \"name\"]]\n",
|
||
" targets.rename(columns = {'id' : 'target_id' , 'name' : 'target_name'}, inplace = True)\n",
|
||
" \n",
|
||
" # target_type cleaning\n",
|
||
" target_types = target_types[[\"id\",\"is_import\",\"name\"]].add_prefix(\"target_type_\")\n",
|
||
" \n",
|
||
" #customer_target_mappings cleaning\n",
|
||
" customer_target_mappings = customer_target_mappings[[\"id\", \"customer_id\", \"target_id\"]]\n",
|
||
" \n",
|
||
" # Merge target et target_type\n",
|
||
" targets_full = pd.merge(targets, target_types, left_on='target_type_id', right_on='target_type_id', how='inner')\n",
|
||
" targets_full.drop(['target_type_id'], axis = 1, inplace=True)\n",
|
||
" \n",
|
||
" # Merge\n",
|
||
" targets_full = pd.merge(customer_target_mappings, targets_full, left_on='target_id', right_on='target_id', how='inner')\n",
|
||
" targets_full.drop(['target_id'], axis = 1, inplace=True)\n",
|
||
"\n",
|
||
" return targets_full"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 12,
|
||
"id": "5fbfd88b-b94c-489c-9201-670e96e453e7",
|
||
"metadata": {},
|
||
"outputs": [
|
||
{
|
||
"name": "stderr",
|
||
"output_type": "stream",
|
||
"text": [
|
||
"/tmp/ipykernel_42764/3848597476.py:4: SettingWithCopyWarning: \n",
|
||
"A value is trying to be set on a copy of a slice from a DataFrame\n",
|
||
"\n",
|
||
"See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n",
|
||
" targets.rename(columns = {'id' : 'target_id' , 'name' : 'target_name'}, inplace = True)\n"
|
||
]
|
||
}
|
||
],
|
||
"source": [
|
||
"df1_target_information = preprocessing_target_area(targets = df1_targets, target_types = df1_target_types, customer_target_mappings = df1_customer_target_mappings)"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "markdown",
|
||
"id": "cdbb48b4-5e16-4ef4-8791-ed213d68d52f",
|
||
"metadata": {},
|
||
"source": [
|
||
"## Campaings area"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 13,
|
||
"id": "d883cc7b-ac43-4485-b86f-eaf595fbad85",
|
||
"metadata": {},
|
||
"outputs": [],
|
||
"source": [
|
||
"def preprocessing_campaigns_area(campaign_stats = None, campaigns = None):\n",
|
||
" # campaign_stats cleaning \n",
|
||
" campaign_stats = campaign_stats[[\"id\", \"campaign_id\", \"customer_id\", \"opened_at\", \"sent_at\", \"delivered_at\"]]\n",
|
||
" cleaning_date(campaign_stats, 'opened_at')\n",
|
||
" cleaning_date(campaign_stats, 'sent_at')\n",
|
||
" cleaning_date(campaign_stats, 'delivered_at')\n",
|
||
" \n",
|
||
" # campaigns cleaning\n",
|
||
" campaigns = campaigns[[\"id\", \"name\", \"service_id\", \"sent_at\"]].add_prefix(\"campaign_\")\n",
|
||
" cleaning_date(campaigns, 'campaign_sent_at')\n",
|
||
" \n",
|
||
" # Merge \n",
|
||
" campaigns_full = pd.merge(campaign_stats, campaigns, on = \"campaign_id\", how = \"left\")\n",
|
||
" campaigns_full.drop(['campaign_id'], axis = 1, inplace=True)\n",
|
||
"\n",
|
||
" return campaigns_full"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 14,
|
||
"id": "c8552dd6-52c5-4431-b43d-3cd6c578fd9f",
|
||
"metadata": {},
|
||
"outputs": [
|
||
{
|
||
"name": "stderr",
|
||
"output_type": "stream",
|
||
"text": [
|
||
"/tmp/ipykernel_42764/1967867975.py:15: SettingWithCopyWarning: \n",
|
||
"A value is trying to be set on a copy of a slice from a DataFrame.\n",
|
||
"Try using .loc[row_indexer,col_indexer] = value instead\n",
|
||
"\n",
|
||
"See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n",
|
||
" df[column_name] = pd.to_datetime(df[column_name], utc = True, format = 'ISO8601')\n",
|
||
"/tmp/ipykernel_42764/1967867975.py:15: SettingWithCopyWarning: \n",
|
||
"A value is trying to be set on a copy of a slice from a DataFrame.\n",
|
||
"Try using .loc[row_indexer,col_indexer] = value instead\n",
|
||
"\n",
|
||
"See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n",
|
||
" df[column_name] = pd.to_datetime(df[column_name], utc = True, format = 'ISO8601')\n",
|
||
"/tmp/ipykernel_42764/1967867975.py:15: SettingWithCopyWarning: \n",
|
||
"A value is trying to be set on a copy of a slice from a DataFrame.\n",
|
||
"Try using .loc[row_indexer,col_indexer] = value instead\n",
|
||
"\n",
|
||
"See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n",
|
||
" df[column_name] = pd.to_datetime(df[column_name], utc = True, format = 'ISO8601')\n"
|
||
]
|
||
}
|
||
],
|
||
"source": [
|
||
"df1_campaigns_information = preprocessing_campaigns_area(campaign_stats = df1_campaign_stats, campaigns = df1_campaigns)"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 15,
|
||
"id": "c24457e7-3cad-451a-a65b-7373b656bd6e",
|
||
"metadata": {
|
||
"scrolled": true
|
||
},
|
||
"outputs": [
|
||
{
|
||
"data": {
|
||
"text/html": [
|
||
"<div>\n",
|
||
"<style scoped>\n",
|
||
" .dataframe tbody tr th:only-of-type {\n",
|
||
" vertical-align: middle;\n",
|
||
" }\n",
|
||
"\n",
|
||
" .dataframe tbody tr th {\n",
|
||
" vertical-align: top;\n",
|
||
" }\n",
|
||
"\n",
|
||
" .dataframe thead th {\n",
|
||
" text-align: right;\n",
|
||
" }\n",
|
||
"</style>\n",
|
||
"<table border=\"1\" class=\"dataframe\">\n",
|
||
" <thead>\n",
|
||
" <tr style=\"text-align: right;\">\n",
|
||
" <th></th>\n",
|
||
" <th>id</th>\n",
|
||
" <th>customer_id</th>\n",
|
||
" <th>opened_at</th>\n",
|
||
" <th>sent_at</th>\n",
|
||
" <th>delivered_at</th>\n",
|
||
" <th>campaign_name</th>\n",
|
||
" <th>campaign_service_id</th>\n",
|
||
" <th>campaign_sent_at</th>\n",
|
||
" </tr>\n",
|
||
" </thead>\n",
|
||
" <tbody>\n",
|
||
" <tr>\n",
|
||
" <th>0</th>\n",
|
||
" <td>19793</td>\n",
|
||
" <td>112597</td>\n",
|
||
" <td>NaT</td>\n",
|
||
" <td>2021-03-28 16:01:09+00:00</td>\n",
|
||
" <td>2021-03-28 16:24:18+00:00</td>\n",
|
||
" <td>Le Mucem chez vous, gardons le lien #22</td>\n",
|
||
" <td>404</td>\n",
|
||
" <td>2021-03-27 23:00:00+00:00</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>1</th>\n",
|
||
" <td>14211</td>\n",
|
||
" <td>113666</td>\n",
|
||
" <td>NaT</td>\n",
|
||
" <td>2021-03-28 16:01:09+00:00</td>\n",
|
||
" <td>2021-03-28 16:21:02+00:00</td>\n",
|
||
" <td>Le Mucem chez vous, gardons le lien #22</td>\n",
|
||
" <td>404</td>\n",
|
||
" <td>2021-03-27 23:00:00+00:00</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>2</th>\n",
|
||
" <td>13150</td>\n",
|
||
" <td>280561</td>\n",
|
||
" <td>NaT</td>\n",
|
||
" <td>2021-03-28 16:00:59+00:00</td>\n",
|
||
" <td>2021-03-28 16:08:45+00:00</td>\n",
|
||
" <td>Le Mucem chez vous, gardons le lien #22</td>\n",
|
||
" <td>404</td>\n",
|
||
" <td>2021-03-27 23:00:00+00:00</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>3</th>\n",
|
||
" <td>7073</td>\n",
|
||
" <td>101007</td>\n",
|
||
" <td>2021-03-28 18:11:06+00:00</td>\n",
|
||
" <td>2021-03-28 16:00:59+00:00</td>\n",
|
||
" <td>2021-03-28 16:09:47+00:00</td>\n",
|
||
" <td>Le Mucem chez vous, gardons le lien #22</td>\n",
|
||
" <td>404</td>\n",
|
||
" <td>2021-03-27 23:00:00+00:00</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>4</th>\n",
|
||
" <td>5175</td>\n",
|
||
" <td>103972</td>\n",
|
||
" <td>NaT</td>\n",
|
||
" <td>2021-03-28 16:01:06+00:00</td>\n",
|
||
" <td>2021-03-28 16:05:03+00:00</td>\n",
|
||
" <td>Le Mucem chez vous, gardons le lien #22</td>\n",
|
||
" <td>404</td>\n",
|
||
" <td>2021-03-27 23:00:00+00:00</td>\n",
|
||
" </tr>\n",
|
||
" </tbody>\n",
|
||
"</table>\n",
|
||
"</div>"
|
||
],
|
||
"text/plain": [
|
||
" id customer_id opened_at sent_at \\\n",
|
||
"0 19793 112597 NaT 2021-03-28 16:01:09+00:00 \n",
|
||
"1 14211 113666 NaT 2021-03-28 16:01:09+00:00 \n",
|
||
"2 13150 280561 NaT 2021-03-28 16:00:59+00:00 \n",
|
||
"3 7073 101007 2021-03-28 18:11:06+00:00 2021-03-28 16:00:59+00:00 \n",
|
||
"4 5175 103972 NaT 2021-03-28 16:01:06+00:00 \n",
|
||
"\n",
|
||
" delivered_at campaign_name \\\n",
|
||
"0 2021-03-28 16:24:18+00:00 Le Mucem chez vous, gardons le lien #22 \n",
|
||
"1 2021-03-28 16:21:02+00:00 Le Mucem chez vous, gardons le lien #22 \n",
|
||
"2 2021-03-28 16:08:45+00:00 Le Mucem chez vous, gardons le lien #22 \n",
|
||
"3 2021-03-28 16:09:47+00:00 Le Mucem chez vous, gardons le lien #22 \n",
|
||
"4 2021-03-28 16:05:03+00:00 Le Mucem chez vous, gardons le lien #22 \n",
|
||
"\n",
|
||
" campaign_service_id campaign_sent_at \n",
|
||
"0 404 2021-03-27 23:00:00+00:00 \n",
|
||
"1 404 2021-03-27 23:00:00+00:00 \n",
|
||
"2 404 2021-03-27 23:00:00+00:00 \n",
|
||
"3 404 2021-03-27 23:00:00+00:00 \n",
|
||
"4 404 2021-03-27 23:00:00+00:00 "
|
||
]
|
||
},
|
||
"execution_count": 15,
|
||
"metadata": {},
|
||
"output_type": "execute_result"
|
||
}
|
||
],
|
||
"source": [
|
||
"df1_campaigns_information.head()"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "markdown",
|
||
"id": "56520a97-ede8-4920-a211-3b5b136af33d",
|
||
"metadata": {},
|
||
"source": [
|
||
"## Product area"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "markdown",
|
||
"id": "9782e9d3-ba20-46bf-8562-bd0969972ddc",
|
||
"metadata": {},
|
||
"source": [
|
||
"Some useful functions"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 16,
|
||
"id": "30488a40-1b38-4b9a-9d3b-26a0597c5e6d",
|
||
"metadata": {},
|
||
"outputs": [],
|
||
"source": [
|
||
"BUCKET = \"bdc2324-data\"\n",
|
||
"directory_path = '1'"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 17,
|
||
"id": "607eb4b4-eed9-4b50-b823-f75c116dd37c",
|
||
"metadata": {},
|
||
"outputs": [],
|
||
"source": [
|
||
"def display_databases(file_name):\n",
|
||
" \"\"\"\n",
|
||
" This function returns the file from s3 storage\n",
|
||
" \"\"\"\n",
|
||
" file_path = BUCKET + \"/\" + directory_path + \"/\" + file_name\n",
|
||
" print(\"File path : \", file_path)\n",
|
||
" with fs.open(file_path, mode=\"rb\") as file_in:\n",
|
||
" df = pd.read_csv(file_in, sep=\",\")\n",
|
||
" \n",
|
||
" print(\"Shape : \", df.shape)\n",
|
||
" return df\n",
|
||
"\n",
|
||
"\n",
|
||
"def remove_horodates(df):\n",
|
||
" \"\"\"\n",
|
||
" this function remove horodate columns like created_at and updated_at\n",
|
||
" \"\"\"\n",
|
||
" df = df.drop(columns = [\"created_at\", \"updated_at\"])\n",
|
||
" return df\n",
|
||
"\n",
|
||
"\n",
|
||
"def order_columns_id(df):\n",
|
||
" \"\"\"\n",
|
||
" this function puts all id columns at the beginning in order to read the dataset easier\n",
|
||
" \"\"\"\n",
|
||
" substring = 'id'\n",
|
||
" id_columns = [col for col in df.columns if substring in col]\n",
|
||
" remaining_col = [col for col in df.columns if substring not in col]\n",
|
||
" new_order = id_columns + remaining_col\n",
|
||
" return df[new_order]\n",
|
||
"\n",
|
||
"\n",
|
||
"def process_df_2(df):\n",
|
||
" \"\"\"\n",
|
||
" This function organizes dataframe\n",
|
||
" \"\"\"\n",
|
||
" df = remove_horodates(df)\n",
|
||
" print(\"Number of columns : \", len(df.columns))\n",
|
||
" df = order_columns_id(df)\n",
|
||
" print(\"Columns : \", df.columns)\n",
|
||
" return df\n",
|
||
"\n",
|
||
"def load_dataset(name):\n",
|
||
" \"\"\"\n",
|
||
" This function loads csv file\n",
|
||
" \"\"\"\n",
|
||
" df = display_databases(name)\n",
|
||
" df = process_df_2(df)\n",
|
||
" # drop na :\n",
|
||
" #df = df.dropna(axis=1, thresh=len(df))\n",
|
||
" # if identifier in table : delete it\n",
|
||
" if 'identifier' in df.columns:\n",
|
||
" df = df.drop(columns = 'identifier')\n",
|
||
" return df"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "markdown",
|
||
"id": "d23f28c0-bc95-438b-8d14-5b7bb6e267bd",
|
||
"metadata": {},
|
||
"source": [
|
||
"Create theme tables"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 18,
|
||
"id": "350b09b9-451f-4d47-81fe-f34b892db027",
|
||
"metadata": {},
|
||
"outputs": [],
|
||
"source": [
|
||
"def create_products_table():\n",
|
||
" # first merge products and categories\n",
|
||
" print(\"first merge products and categories\")\n",
|
||
" products = load_dataset(\"1products.csv\")\n",
|
||
" categories = load_dataset(\"1categories.csv\")\n",
|
||
" # Drop useless columns\n",
|
||
" products = products.drop(columns = ['apply_price', 'extra_field', 'amount_consumption'])\n",
|
||
" categories = categories.drop(columns = ['extra_field', 'quota'])\n",
|
||
"\n",
|
||
" #Merge\n",
|
||
" products_theme = products.merge(categories, how = 'left', left_on = 'category_id',\n",
|
||
" right_on = 'id', suffixes=('_products', '_categories'))\n",
|
||
" products_theme = products_theme.rename(columns = {\"name\" : \"name_categories\"})\n",
|
||
" \n",
|
||
" # Second merge products_theme and type of categories\n",
|
||
" print(\"Second merge products_theme and type of categories\")\n",
|
||
" type_of_categories = load_dataset(\"1type_of_categories.csv\")\n",
|
||
" type_of_categories = type_of_categories.drop(columns = 'id')\n",
|
||
" products_theme = products_theme.merge(type_of_categories, how = 'left', left_on = 'category_id',\n",
|
||
" right_on = 'category_id' )\n",
|
||
"\n",
|
||
" # Index cleaning\n",
|
||
" products_theme = products_theme.drop(columns = ['id_categories'])\n",
|
||
" products_theme = order_columns_id(products_theme)\n",
|
||
" return products_theme\n",
|
||
"\n",
|
||
"\n",
|
||
"def create_events_table():\n",
|
||
" # first merge events and seasons : \n",
|
||
" print(\"first merge events and seasons : \")\n",
|
||
" events = load_dataset(\"1events.csv\")\n",
|
||
" seasons = load_dataset(\"1seasons.csv\")\n",
|
||
"\n",
|
||
" # Drop useless columns\n",
|
||
" events = events.drop(columns = ['manual_added', 'is_display'])\n",
|
||
" seasons = seasons.drop(columns = ['start_date_time'])\n",
|
||
" \n",
|
||
" events_theme = events.merge(seasons, how = 'left', left_on = 'season_id', right_on = 'id', suffixes=('_events', '_seasons'))\n",
|
||
"\n",
|
||
" # Secondly merge events_theme and event_types\n",
|
||
" print(\"Secondly merge events_theme and event_types : \")\n",
|
||
" event_types = load_dataset(\"1event_types.csv\")\n",
|
||
" event_types = event_types.drop(columns = ['fidelity_delay'])\n",
|
||
" \n",
|
||
" events_theme = events_theme.merge(event_types, how = 'left', left_on = 'event_type_id', right_on = 'id', suffixes=('_events', '_event_type'))\n",
|
||
" events_theme = events_theme.rename(columns = {\"name\" : \"name_event_types\"})\n",
|
||
" events_theme = events_theme.drop(columns = 'id')\n",
|
||
"\n",
|
||
" # thirdly merge events_theme and facilities\n",
|
||
" print(\"thirdly merge events_theme and facilities : \")\n",
|
||
" facilities = load_dataset(\"1facilities.csv\")\n",
|
||
" facilities = facilities.drop(columns = ['fixed_capacity'])\n",
|
||
" \n",
|
||
" events_theme = events_theme.merge(facilities, how = 'left', left_on = 'facility_id', right_on = 'id', suffixes=('_events', '_facility'))\n",
|
||
" events_theme = events_theme.rename(columns = {\"name\" : \"name_facilities\", \"id_events\" : \"event_id\"})\n",
|
||
" events_theme = events_theme.drop(columns = 'id')\n",
|
||
"\n",
|
||
" # Index cleaning\n",
|
||
" events_theme = events_theme.drop(columns = ['id_seasons'])\n",
|
||
" events_theme = order_columns_id(events_theme)\n",
|
||
" return events_theme\n",
|
||
"\n",
|
||
"\n",
|
||
"def create_representations_table():\n",
|
||
" representations = load_dataset(\"1representations.csv\")\n",
|
||
" representations = representations.drop(columns = ['serial', 'open', 'satisfaction', 'is_display', 'expected_filling',\n",
|
||
" 'max_filling', 'extra_field', 'start_date_time', 'end_date_time', 'name',\n",
|
||
" 'representation_type_id'])\n",
|
||
" \n",
|
||
" representations_capacity = load_dataset(\"1representation_category_capacities.csv\")\n",
|
||
" representations_capacity = representations_capacity.drop(columns = ['expected_filling', 'max_filling'])\n",
|
||
"\n",
|
||
" representations_theme = representations.merge(representations_capacity, how='left',\n",
|
||
" left_on='id', right_on='representation_id',\n",
|
||
" suffixes=('_representation', '_representation_cap'))\n",
|
||
" # index cleaning\n",
|
||
" representations_theme = representations_theme.drop(columns = [\"id_representation\"])\n",
|
||
" representations_theme = order_columns_id(representations_theme)\n",
|
||
" return representations_theme"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 19,
|
||
"id": "0fccc8ef-e575-4857-a401-94a7274394df",
|
||
"metadata": {},
|
||
"outputs": [
|
||
{
|
||
"name": "stdout",
|
||
"output_type": "stream",
|
||
"text": [
|
||
"first merge products and categories\n",
|
||
"File path : bdc2324-data/1/1products.csv\n",
|
||
"Shape : (94803, 14)\n",
|
||
"Number of columns : 12\n",
|
||
"Columns : Index(['id', 'representation_id', 'pricing_formula_id', 'category_id',\n",
|
||
" 'products_group_id', 'product_pack_id', 'identifier', 'amount',\n",
|
||
" 'is_full_price', 'apply_price', 'extra_field', 'amount_consumption'],\n",
|
||
" dtype='object')\n",
|
||
"File path : bdc2324-data/1/1categories.csv\n",
|
||
"Shape : (27, 7)\n",
|
||
"Number of columns : 5\n",
|
||
"Columns : Index(['id', 'identifier', 'name', 'extra_field', 'quota'], dtype='object')\n",
|
||
"Second merge products_theme and type of categories\n",
|
||
"File path : bdc2324-data/1/1type_of_categories.csv\n",
|
||
"Shape : (5, 6)\n",
|
||
"Number of columns : 4\n",
|
||
"Columns : Index(['id', 'type_of_id', 'category_id', 'identifier'], dtype='object')\n"
|
||
]
|
||
},
|
||
{
|
||
"data": {
|
||
"text/html": [
|
||
"<div>\n",
|
||
"<style scoped>\n",
|
||
" .dataframe tbody tr th:only-of-type {\n",
|
||
" vertical-align: middle;\n",
|
||
" }\n",
|
||
"\n",
|
||
" .dataframe tbody tr th {\n",
|
||
" vertical-align: top;\n",
|
||
" }\n",
|
||
"\n",
|
||
" .dataframe thead th {\n",
|
||
" text-align: right;\n",
|
||
" }\n",
|
||
"</style>\n",
|
||
"<table border=\"1\" class=\"dataframe\">\n",
|
||
" <thead>\n",
|
||
" <tr style=\"text-align: right;\">\n",
|
||
" <th></th>\n",
|
||
" <th>id_products</th>\n",
|
||
" <th>representation_id</th>\n",
|
||
" <th>pricing_formula_id</th>\n",
|
||
" <th>category_id</th>\n",
|
||
" <th>products_group_id</th>\n",
|
||
" <th>product_pack_id</th>\n",
|
||
" <th>type_of_id</th>\n",
|
||
" <th>amount</th>\n",
|
||
" <th>is_full_price</th>\n",
|
||
" <th>name_categories</th>\n",
|
||
" </tr>\n",
|
||
" </thead>\n",
|
||
" <tbody>\n",
|
||
" <tr>\n",
|
||
" <th>0</th>\n",
|
||
" <td>10682</td>\n",
|
||
" <td>914</td>\n",
|
||
" <td>114</td>\n",
|
||
" <td>41</td>\n",
|
||
" <td>10655</td>\n",
|
||
" <td>1</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>9.0</td>\n",
|
||
" <td>False</td>\n",
|
||
" <td>indiv activité tr</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>1</th>\n",
|
||
" <td>478</td>\n",
|
||
" <td>273</td>\n",
|
||
" <td>131</td>\n",
|
||
" <td>1</td>\n",
|
||
" <td>471</td>\n",
|
||
" <td>1</td>\n",
|
||
" <td>12.0</td>\n",
|
||
" <td>9.5</td>\n",
|
||
" <td>False</td>\n",
|
||
" <td>indiv entrées tp</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>2</th>\n",
|
||
" <td>20873</td>\n",
|
||
" <td>275</td>\n",
|
||
" <td>137</td>\n",
|
||
" <td>1</td>\n",
|
||
" <td>20825</td>\n",
|
||
" <td>1</td>\n",
|
||
" <td>12.0</td>\n",
|
||
" <td>11.5</td>\n",
|
||
" <td>False</td>\n",
|
||
" <td>indiv entrées tp</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>3</th>\n",
|
||
" <td>157142</td>\n",
|
||
" <td>82519</td>\n",
|
||
" <td>9</td>\n",
|
||
" <td>5</td>\n",
|
||
" <td>156773</td>\n",
|
||
" <td>1</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>8.0</td>\n",
|
||
" <td>False</td>\n",
|
||
" <td>indiv entrées tr</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>4</th>\n",
|
||
" <td>1341</td>\n",
|
||
" <td>9</td>\n",
|
||
" <td>93</td>\n",
|
||
" <td>1</td>\n",
|
||
" <td>1175</td>\n",
|
||
" <td>1</td>\n",
|
||
" <td>12.0</td>\n",
|
||
" <td>8.5</td>\n",
|
||
" <td>False</td>\n",
|
||
" <td>indiv entrées tp</td>\n",
|
||
" </tr>\n",
|
||
" </tbody>\n",
|
||
"</table>\n",
|
||
"</div>"
|
||
],
|
||
"text/plain": [
|
||
" id_products representation_id pricing_formula_id category_id \\\n",
|
||
"0 10682 914 114 41 \n",
|
||
"1 478 273 131 1 \n",
|
||
"2 20873 275 137 1 \n",
|
||
"3 157142 82519 9 5 \n",
|
||
"4 1341 9 93 1 \n",
|
||
"\n",
|
||
" products_group_id product_pack_id type_of_id amount is_full_price \\\n",
|
||
"0 10655 1 NaN 9.0 False \n",
|
||
"1 471 1 12.0 9.5 False \n",
|
||
"2 20825 1 12.0 11.5 False \n",
|
||
"3 156773 1 NaN 8.0 False \n",
|
||
"4 1175 1 12.0 8.5 False \n",
|
||
"\n",
|
||
" name_categories \n",
|
||
"0 indiv activité tr \n",
|
||
"1 indiv entrées tp \n",
|
||
"2 indiv entrées tp \n",
|
||
"3 indiv entrées tr \n",
|
||
"4 indiv entrées tp "
|
||
]
|
||
},
|
||
"execution_count": 19,
|
||
"metadata": {},
|
||
"output_type": "execute_result"
|
||
}
|
||
],
|
||
"source": [
|
||
"products_theme = create_products_table()\n",
|
||
"products_theme.head()"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 20,
|
||
"id": "779d8aaf-6668-4f66-8852-847304407ea3",
|
||
"metadata": {},
|
||
"outputs": [
|
||
{
|
||
"name": "stdout",
|
||
"output_type": "stream",
|
||
"text": [
|
||
"first merge events and seasons : \n",
|
||
"File path : bdc2324-data/1/1events.csv\n",
|
||
"Shape : (1232, 12)\n",
|
||
"Number of columns : 10\n",
|
||
"Columns : Index(['id', 'season_id', 'facility_id', 'event_type_id', 'event_type_key_id',\n",
|
||
" 'facility_key_id', 'identifier', 'name', 'manual_added', 'is_display'],\n",
|
||
" dtype='object')\n",
|
||
"File path : bdc2324-data/1/1seasons.csv\n",
|
||
"Shape : (13, 6)\n",
|
||
"Number of columns : 4\n",
|
||
"Columns : Index(['id', 'identifier', 'name', 'start_date_time'], dtype='object')\n",
|
||
"Secondly merge events_theme and event_types : \n",
|
||
"File path : bdc2324-data/1/1event_types.csv\n",
|
||
"Shape : (9, 6)\n",
|
||
"Number of columns : 4\n",
|
||
"Columns : Index(['id', 'fidelity_delay', 'identifier', 'name'], dtype='object')\n",
|
||
"thirdly merge events_theme and facilities : \n",
|
||
"File path : bdc2324-data/1/1facilities.csv\n",
|
||
"Shape : (2, 7)\n",
|
||
"Number of columns : 5\n",
|
||
"Columns : Index(['id', 'street_id', 'identifier', 'name', 'fixed_capacity'], dtype='object')\n"
|
||
]
|
||
},
|
||
{
|
||
"data": {
|
||
"text/html": [
|
||
"<div>\n",
|
||
"<style scoped>\n",
|
||
" .dataframe tbody tr th:only-of-type {\n",
|
||
" vertical-align: middle;\n",
|
||
" }\n",
|
||
"\n",
|
||
" .dataframe tbody tr th {\n",
|
||
" vertical-align: top;\n",
|
||
" }\n",
|
||
"\n",
|
||
" .dataframe thead th {\n",
|
||
" text-align: right;\n",
|
||
" }\n",
|
||
"</style>\n",
|
||
"<table border=\"1\" class=\"dataframe\">\n",
|
||
" <thead>\n",
|
||
" <tr style=\"text-align: right;\">\n",
|
||
" <th></th>\n",
|
||
" <th>event_id</th>\n",
|
||
" <th>season_id</th>\n",
|
||
" <th>facility_id</th>\n",
|
||
" <th>event_type_id</th>\n",
|
||
" <th>event_type_key_id</th>\n",
|
||
" <th>facility_key_id</th>\n",
|
||
" <th>street_id</th>\n",
|
||
" <th>name_events</th>\n",
|
||
" <th>name_seasons</th>\n",
|
||
" <th>name_event_types</th>\n",
|
||
" <th>name_facilities</th>\n",
|
||
" </tr>\n",
|
||
" </thead>\n",
|
||
" <tbody>\n",
|
||
" <tr>\n",
|
||
" <th>0</th>\n",
|
||
" <td>192</td>\n",
|
||
" <td>16</td>\n",
|
||
" <td>1</td>\n",
|
||
" <td>4</td>\n",
|
||
" <td>4</td>\n",
|
||
" <td>1</td>\n",
|
||
" <td>1</td>\n",
|
||
" <td>frontières</td>\n",
|
||
" <td>2018</td>\n",
|
||
" <td>spectacle vivant</td>\n",
|
||
" <td>mucem</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>1</th>\n",
|
||
" <td>30329</td>\n",
|
||
" <td>2767</td>\n",
|
||
" <td>1</td>\n",
|
||
" <td>5</td>\n",
|
||
" <td>5</td>\n",
|
||
" <td>1</td>\n",
|
||
" <td>1</td>\n",
|
||
" <td>visite guidée une autre histoire du monde (1h00)</td>\n",
|
||
" <td>2023</td>\n",
|
||
" <td>offre muséale groupe</td>\n",
|
||
" <td>mucem</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>2</th>\n",
|
||
" <td>161</td>\n",
|
||
" <td>16</td>\n",
|
||
" <td>1</td>\n",
|
||
" <td>2</td>\n",
|
||
" <td>2</td>\n",
|
||
" <td>1</td>\n",
|
||
" <td>1</td>\n",
|
||
" <td>visite contée les chercheurs d'or indiv</td>\n",
|
||
" <td>2018</td>\n",
|
||
" <td>offre muséale individuel</td>\n",
|
||
" <td>mucem</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>3</th>\n",
|
||
" <td>5957</td>\n",
|
||
" <td>582</td>\n",
|
||
" <td>1</td>\n",
|
||
" <td>4</td>\n",
|
||
" <td>4</td>\n",
|
||
" <td>1</td>\n",
|
||
" <td>1</td>\n",
|
||
" <td>we dreamt of utopia and we woke up screaming.</td>\n",
|
||
" <td>2021</td>\n",
|
||
" <td>spectacle vivant</td>\n",
|
||
" <td>mucem</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>4</th>\n",
|
||
" <td>8337</td>\n",
|
||
" <td>582</td>\n",
|
||
" <td>1</td>\n",
|
||
" <td>4</td>\n",
|
||
" <td>4</td>\n",
|
||
" <td>1</td>\n",
|
||
" <td>1</td>\n",
|
||
" <td>jeff koons épisodes 4</td>\n",
|
||
" <td>2021</td>\n",
|
||
" <td>spectacle vivant</td>\n",
|
||
" <td>mucem</td>\n",
|
||
" </tr>\n",
|
||
" </tbody>\n",
|
||
"</table>\n",
|
||
"</div>"
|
||
],
|
||
"text/plain": [
|
||
" event_id season_id facility_id event_type_id event_type_key_id \\\n",
|
||
"0 192 16 1 4 4 \n",
|
||
"1 30329 2767 1 5 5 \n",
|
||
"2 161 16 1 2 2 \n",
|
||
"3 5957 582 1 4 4 \n",
|
||
"4 8337 582 1 4 4 \n",
|
||
"\n",
|
||
" facility_key_id street_id \\\n",
|
||
"0 1 1 \n",
|
||
"1 1 1 \n",
|
||
"2 1 1 \n",
|
||
"3 1 1 \n",
|
||
"4 1 1 \n",
|
||
"\n",
|
||
" name_events name_seasons \\\n",
|
||
"0 frontières 2018 \n",
|
||
"1 visite guidée une autre histoire du monde (1h00) 2023 \n",
|
||
"2 visite contée les chercheurs d'or indiv 2018 \n",
|
||
"3 we dreamt of utopia and we woke up screaming. 2021 \n",
|
||
"4 jeff koons épisodes 4 2021 \n",
|
||
"\n",
|
||
" name_event_types name_facilities \n",
|
||
"0 spectacle vivant mucem \n",
|
||
"1 offre muséale groupe mucem \n",
|
||
"2 offre muséale individuel mucem \n",
|
||
"3 spectacle vivant mucem \n",
|
||
"4 spectacle vivant mucem "
|
||
]
|
||
},
|
||
"execution_count": 20,
|
||
"metadata": {},
|
||
"output_type": "execute_result"
|
||
}
|
||
],
|
||
"source": [
|
||
"events_theme= create_events_table()\n",
|
||
"events_theme.head()"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 21,
|
||
"id": "7714fa32-303b-4ea7-b174-3fd0fcab5af0",
|
||
"metadata": {},
|
||
"outputs": [
|
||
{
|
||
"name": "stdout",
|
||
"output_type": "stream",
|
||
"text": [
|
||
"File path : bdc2324-data/1/1representations.csv\n",
|
||
"Shape : (36095, 16)\n",
|
||
"Number of columns : 14\n",
|
||
"Columns : Index(['id', 'event_id', 'representation_type_id', 'identifier', 'serial',\n",
|
||
" 'start_date_time', 'open', 'satisfaction', 'end_date_time', 'name',\n",
|
||
" 'is_display', 'expected_filling', 'max_filling', 'extra_field'],\n",
|
||
" dtype='object')\n",
|
||
"File path : bdc2324-data/1/1representation_category_capacities.csv\n",
|
||
"Shape : (65241, 7)\n",
|
||
"Number of columns : 5\n",
|
||
"Columns : Index(['id', 'representation_id', 'category_id', 'expected_filling',\n",
|
||
" 'max_filling'],\n",
|
||
" dtype='object')\n"
|
||
]
|
||
},
|
||
{
|
||
"data": {
|
||
"text/html": [
|
||
"<div>\n",
|
||
"<style scoped>\n",
|
||
" .dataframe tbody tr th:only-of-type {\n",
|
||
" vertical-align: middle;\n",
|
||
" }\n",
|
||
"\n",
|
||
" .dataframe tbody tr th {\n",
|
||
" vertical-align: top;\n",
|
||
" }\n",
|
||
"\n",
|
||
" .dataframe thead th {\n",
|
||
" text-align: right;\n",
|
||
" }\n",
|
||
"</style>\n",
|
||
"<table border=\"1\" class=\"dataframe\">\n",
|
||
" <thead>\n",
|
||
" <tr style=\"text-align: right;\">\n",
|
||
" <th></th>\n",
|
||
" <th>event_id</th>\n",
|
||
" <th>id_representation_cap</th>\n",
|
||
" <th>representation_id</th>\n",
|
||
" <th>category_id</th>\n",
|
||
" </tr>\n",
|
||
" </thead>\n",
|
||
" <tbody>\n",
|
||
" <tr>\n",
|
||
" <th>0</th>\n",
|
||
" <td>12384</td>\n",
|
||
" <td>123058</td>\n",
|
||
" <td>84820</td>\n",
|
||
" <td>2</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>1</th>\n",
|
||
" <td>37</td>\n",
|
||
" <td>2514</td>\n",
|
||
" <td>269</td>\n",
|
||
" <td>2</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>2</th>\n",
|
||
" <td>37</td>\n",
|
||
" <td>384</td>\n",
|
||
" <td>269</td>\n",
|
||
" <td>5</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>3</th>\n",
|
||
" <td>37</td>\n",
|
||
" <td>2515</td>\n",
|
||
" <td>269</td>\n",
|
||
" <td>10</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>4</th>\n",
|
||
" <td>37</td>\n",
|
||
" <td>383</td>\n",
|
||
" <td>269</td>\n",
|
||
" <td>1</td>\n",
|
||
" </tr>\n",
|
||
" </tbody>\n",
|
||
"</table>\n",
|
||
"</div>"
|
||
],
|
||
"text/plain": [
|
||
" event_id id_representation_cap representation_id category_id\n",
|
||
"0 12384 123058 84820 2\n",
|
||
"1 37 2514 269 2\n",
|
||
"2 37 384 269 5\n",
|
||
"3 37 2515 269 10\n",
|
||
"4 37 383 269 1"
|
||
]
|
||
},
|
||
"execution_count": 21,
|
||
"metadata": {},
|
||
"output_type": "execute_result"
|
||
}
|
||
],
|
||
"source": [
|
||
"representation_theme = create_representations_table()\n",
|
||
"representation_theme.head()"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "markdown",
|
||
"id": "8fa191d5-c867-4d4d-bbab-f29d7d91ce6a",
|
||
"metadata": {},
|
||
"source": [
|
||
"Create uniform product database "
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 22,
|
||
"id": "15a62ed6-35e4-4abc-aeef-a7daeec0a4ba",
|
||
"metadata": {},
|
||
"outputs": [],
|
||
"source": [
|
||
"def uniform_product_df():\n",
|
||
" \"\"\"\n",
|
||
" This function returns the uniform product dataset\n",
|
||
" \"\"\"\n",
|
||
" print(\"Products theme columns : \", products_theme.columns)\n",
|
||
" print(\"\\n Representation theme columns : \", representation_theme.columns)\n",
|
||
" print(\"\\n Events theme columns : \", events_theme.columns)\n",
|
||
"\n",
|
||
" products_global = products_theme.merge(representation_theme, how='left',\n",
|
||
" on= [\"representation_id\", \"category_id\"])\n",
|
||
" \n",
|
||
" products_global = products_global.merge(events_theme, how='left', on='event_id',\n",
|
||
" suffixes = (\"_representation\", \"_event\"))\n",
|
||
" \n",
|
||
" products_global = order_columns_id(products_global)\n",
|
||
"\n",
|
||
" # remove useless columns \n",
|
||
" products_global = products_global.drop(columns = ['type_of_id']) # 'name_events', 'name_seasons', 'name_categories'\n",
|
||
" return products_global"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 23,
|
||
"id": "89dc9685-1de9-4ce3-a6c0-8d7f1931a951",
|
||
"metadata": {},
|
||
"outputs": [
|
||
{
|
||
"name": "stdout",
|
||
"output_type": "stream",
|
||
"text": [
|
||
"Products theme columns : Index(['id_products', 'representation_id', 'pricing_formula_id', 'category_id',\n",
|
||
" 'products_group_id', 'product_pack_id', 'type_of_id', 'amount',\n",
|
||
" 'is_full_price', 'name_categories'],\n",
|
||
" dtype='object')\n",
|
||
"\n",
|
||
" Representation theme columns : Index(['event_id', 'id_representation_cap', 'representation_id',\n",
|
||
" 'category_id'],\n",
|
||
" dtype='object')\n",
|
||
"\n",
|
||
" Events theme columns : Index(['event_id', 'season_id', 'facility_id', 'event_type_id',\n",
|
||
" 'event_type_key_id', 'facility_key_id', 'street_id', 'name_events',\n",
|
||
" 'name_seasons', 'name_event_types', 'name_facilities'],\n",
|
||
" dtype='object')\n"
|
||
]
|
||
},
|
||
{
|
||
"data": {
|
||
"text/html": [
|
||
"<div>\n",
|
||
"<style scoped>\n",
|
||
" .dataframe tbody tr th:only-of-type {\n",
|
||
" vertical-align: middle;\n",
|
||
" }\n",
|
||
"\n",
|
||
" .dataframe tbody tr th {\n",
|
||
" vertical-align: top;\n",
|
||
" }\n",
|
||
"\n",
|
||
" .dataframe thead th {\n",
|
||
" text-align: right;\n",
|
||
" }\n",
|
||
"</style>\n",
|
||
"<table border=\"1\" class=\"dataframe\">\n",
|
||
" <thead>\n",
|
||
" <tr style=\"text-align: right;\">\n",
|
||
" <th></th>\n",
|
||
" <th>id_products</th>\n",
|
||
" <th>representation_id</th>\n",
|
||
" <th>pricing_formula_id</th>\n",
|
||
" <th>category_id</th>\n",
|
||
" <th>products_group_id</th>\n",
|
||
" <th>product_pack_id</th>\n",
|
||
" <th>event_id</th>\n",
|
||
" <th>id_representation_cap</th>\n",
|
||
" <th>season_id</th>\n",
|
||
" <th>facility_id</th>\n",
|
||
" <th>...</th>\n",
|
||
" <th>event_type_key_id</th>\n",
|
||
" <th>facility_key_id</th>\n",
|
||
" <th>street_id</th>\n",
|
||
" <th>amount</th>\n",
|
||
" <th>is_full_price</th>\n",
|
||
" <th>name_categories</th>\n",
|
||
" <th>name_events</th>\n",
|
||
" <th>name_seasons</th>\n",
|
||
" <th>name_event_types</th>\n",
|
||
" <th>name_facilities</th>\n",
|
||
" </tr>\n",
|
||
" </thead>\n",
|
||
" <tbody>\n",
|
||
" <tr>\n",
|
||
" <th>0</th>\n",
|
||
" <td>10682</td>\n",
|
||
" <td>914</td>\n",
|
||
" <td>114</td>\n",
|
||
" <td>41</td>\n",
|
||
" <td>10655</td>\n",
|
||
" <td>1</td>\n",
|
||
" <td>132</td>\n",
|
||
" <td>8789</td>\n",
|
||
" <td>4</td>\n",
|
||
" <td>1</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>5</td>\n",
|
||
" <td>1</td>\n",
|
||
" <td>1</td>\n",
|
||
" <td>9.0</td>\n",
|
||
" <td>False</td>\n",
|
||
" <td>indiv activité tr</td>\n",
|
||
" <td>visite-jeu \"le classico des minots\" (1h30)</td>\n",
|
||
" <td>2017</td>\n",
|
||
" <td>offre muséale individuel</td>\n",
|
||
" <td>mucem</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>1</th>\n",
|
||
" <td>478</td>\n",
|
||
" <td>273</td>\n",
|
||
" <td>131</td>\n",
|
||
" <td>1</td>\n",
|
||
" <td>471</td>\n",
|
||
" <td>1</td>\n",
|
||
" <td>37</td>\n",
|
||
" <td>390</td>\n",
|
||
" <td>2</td>\n",
|
||
" <td>1</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>2</td>\n",
|
||
" <td>1</td>\n",
|
||
" <td>1</td>\n",
|
||
" <td>9.5</td>\n",
|
||
" <td>False</td>\n",
|
||
" <td>indiv entrées tp</td>\n",
|
||
" <td>billet mucem picasso</td>\n",
|
||
" <td>2016</td>\n",
|
||
" <td>offre muséale individuel</td>\n",
|
||
" <td>mucem</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>2</th>\n",
|
||
" <td>20873</td>\n",
|
||
" <td>275</td>\n",
|
||
" <td>137</td>\n",
|
||
" <td>1</td>\n",
|
||
" <td>20825</td>\n",
|
||
" <td>1</td>\n",
|
||
" <td>37</td>\n",
|
||
" <td>395</td>\n",
|
||
" <td>2</td>\n",
|
||
" <td>1</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>2</td>\n",
|
||
" <td>1</td>\n",
|
||
" <td>1</td>\n",
|
||
" <td>11.5</td>\n",
|
||
" <td>False</td>\n",
|
||
" <td>indiv entrées tp</td>\n",
|
||
" <td>billet mucem picasso</td>\n",
|
||
" <td>2016</td>\n",
|
||
" <td>offre muséale individuel</td>\n",
|
||
" <td>mucem</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>3</th>\n",
|
||
" <td>157142</td>\n",
|
||
" <td>82519</td>\n",
|
||
" <td>9</td>\n",
|
||
" <td>5</td>\n",
|
||
" <td>156773</td>\n",
|
||
" <td>1</td>\n",
|
||
" <td>12365</td>\n",
|
||
" <td>120199</td>\n",
|
||
" <td>1754</td>\n",
|
||
" <td>1</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>4</td>\n",
|
||
" <td>1</td>\n",
|
||
" <td>1</td>\n",
|
||
" <td>8.0</td>\n",
|
||
" <td>False</td>\n",
|
||
" <td>indiv entrées tr</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>offre muséale individuel</td>\n",
|
||
" <td>mucem</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>4</th>\n",
|
||
" <td>1341</td>\n",
|
||
" <td>9</td>\n",
|
||
" <td>93</td>\n",
|
||
" <td>1</td>\n",
|
||
" <td>1175</td>\n",
|
||
" <td>1</td>\n",
|
||
" <td>8</td>\n",
|
||
" <td>21</td>\n",
|
||
" <td>4</td>\n",
|
||
" <td>1</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>6</td>\n",
|
||
" <td>1</td>\n",
|
||
" <td>1</td>\n",
|
||
" <td>8.5</td>\n",
|
||
" <td>False</td>\n",
|
||
" <td>indiv entrées tp</td>\n",
|
||
" <td>non défini</td>\n",
|
||
" <td>2017</td>\n",
|
||
" <td>non défini</td>\n",
|
||
" <td>mucem</td>\n",
|
||
" </tr>\n",
|
||
" </tbody>\n",
|
||
"</table>\n",
|
||
"<p>5 rows × 21 columns</p>\n",
|
||
"</div>"
|
||
],
|
||
"text/plain": [
|
||
" id_products representation_id pricing_formula_id category_id \\\n",
|
||
"0 10682 914 114 41 \n",
|
||
"1 478 273 131 1 \n",
|
||
"2 20873 275 137 1 \n",
|
||
"3 157142 82519 9 5 \n",
|
||
"4 1341 9 93 1 \n",
|
||
"\n",
|
||
" products_group_id product_pack_id event_id id_representation_cap \\\n",
|
||
"0 10655 1 132 8789 \n",
|
||
"1 471 1 37 390 \n",
|
||
"2 20825 1 37 395 \n",
|
||
"3 156773 1 12365 120199 \n",
|
||
"4 1175 1 8 21 \n",
|
||
"\n",
|
||
" season_id facility_id ... event_type_key_id facility_key_id street_id \\\n",
|
||
"0 4 1 ... 5 1 1 \n",
|
||
"1 2 1 ... 2 1 1 \n",
|
||
"2 2 1 ... 2 1 1 \n",
|
||
"3 1754 1 ... 4 1 1 \n",
|
||
"4 4 1 ... 6 1 1 \n",
|
||
"\n",
|
||
" amount is_full_price name_categories \\\n",
|
||
"0 9.0 False indiv activité tr \n",
|
||
"1 9.5 False indiv entrées tp \n",
|
||
"2 11.5 False indiv entrées tp \n",
|
||
"3 8.0 False indiv entrées tr \n",
|
||
"4 8.5 False indiv entrées tp \n",
|
||
"\n",
|
||
" name_events name_seasons \\\n",
|
||
"0 visite-jeu \"le classico des minots\" (1h30) 2017 \n",
|
||
"1 billet mucem picasso 2016 \n",
|
||
"2 billet mucem picasso 2016 \n",
|
||
"3 NaN NaN \n",
|
||
"4 non défini 2017 \n",
|
||
"\n",
|
||
" name_event_types name_facilities \n",
|
||
"0 offre muséale individuel mucem \n",
|
||
"1 offre muséale individuel mucem \n",
|
||
"2 offre muséale individuel mucem \n",
|
||
"3 offre muséale individuel mucem \n",
|
||
"4 non défini mucem \n",
|
||
"\n",
|
||
"[5 rows x 21 columns]"
|
||
]
|
||
},
|
||
"execution_count": 23,
|
||
"metadata": {},
|
||
"output_type": "execute_result"
|
||
}
|
||
],
|
||
"source": [
|
||
"products_global = uniform_product_df()\n",
|
||
"products_global.head()"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 24,
|
||
"id": "98f78cd5-b694-4cc6-b033-20170aa13e8d",
|
||
"metadata": {},
|
||
"outputs": [],
|
||
"source": [
|
||
"# Fusion liée au product\n",
|
||
"df1_products_purchased = pd.merge(df1_ticket_information, products_global, left_on = 'product_id', right_on = 'id_products', how = 'inner')\n",
|
||
"\n",
|
||
"# Selection des variables d'intérêts\n",
|
||
"df1_products_purchased_reduced = df1_products_purchased[['ticket_id', 'customer_id', 'event_type_id', 'supplier_name', 'purchase_date', 'type_of_ticket_name', 'amount', 'children', 'is_full_price', 'name_event_types', 'name_facilities', 'name_categories', 'name_events', 'name_seasons']]"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "markdown",
|
||
"id": "d7c3668a-c016-4bd0-837e-04af328ff14f",
|
||
"metadata": {},
|
||
"source": [
|
||
"# Construction des variables explicatives"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "markdown",
|
||
"id": "314f1b7f-ae48-4c6f-8469-9ce879043243",
|
||
"metadata": {},
|
||
"source": [
|
||
"## KPI campaigns"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 28,
|
||
"id": "e2c88552-b863-47a2-be23-8d2898fb28bc",
|
||
"metadata": {},
|
||
"outputs": [],
|
||
"source": [
|
||
"def campaigns_kpi_function(campaigns_information = None):\n",
|
||
" # Nombre de campagnes de mails\n",
|
||
" nb_campaigns = campaigns_information[['customer_id', 'campaign_name']].groupby('customer_id').count().reset_index()\n",
|
||
" nb_campaigns.rename(columns = {'campaign_name' : 'nb_campaigns'}, inplace = True)\n",
|
||
" # Temps d'ouverture en min moyen \n",
|
||
" campaigns_information['time_to_open'] = campaigns_information['opened_at'] - campaigns_information['delivered_at']\n",
|
||
" time_to_open = campaigns_information[['customer_id', 'time_to_open']].groupby('customer_id').mean().reset_index()\n",
|
||
"\n",
|
||
" # Nombre de mail ouvert \n",
|
||
" opened_campaign = campaigns_information[['customer_id', 'campaign_name', 'opened_at']]\n",
|
||
" opened_campaign.dropna(subset=['opened_at'], inplace=True)\n",
|
||
" opened_campaign = opened_campaign[['customer_id', 'campaign_name']].groupby('customer_id').count().reset_index()\n",
|
||
" opened_campaign.rename(columns = {'campaign_name' : 'nb_campaigns_opened' }, inplace = True)\n",
|
||
"\n",
|
||
" # Fusion des indicateurs\n",
|
||
" campaigns_reduced = pd.merge(nb_campaigns, opened_campaign, on = 'customer_id', how = 'left')\n",
|
||
" campaigns_reduced = pd.merge(campaigns_reduced, time_to_open, on = 'customer_id', how = 'left')\n",
|
||
"\n",
|
||
" # Remplir les NaN : nb_campaigns_opened\n",
|
||
" campaigns_reduced['nb_campaigns_opened'].fillna(0, inplace=True)\n",
|
||
"\n",
|
||
" # Remplir les NaT : time_to_open (??)\n",
|
||
"\n",
|
||
" return campaigns_reduced\n",
|
||
" "
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 29,
|
||
"id": "24537647-bc29-4777-9848-ac4120a4aa60",
|
||
"metadata": {},
|
||
"outputs": [
|
||
{
|
||
"name": "stderr",
|
||
"output_type": "stream",
|
||
"text": [
|
||
"/tmp/ipykernel_42764/3700263836.py:11: SettingWithCopyWarning: \n",
|
||
"A value is trying to be set on a copy of a slice from a DataFrame\n",
|
||
"\n",
|
||
"See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n",
|
||
" opened_campaign.dropna(subset=['opened_at'], inplace=True)\n"
|
||
]
|
||
}
|
||
],
|
||
"source": [
|
||
"df1_campaigns_kpi = campaigns_kpi_function(campaigns_information = df1_campaigns_information) "
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 30,
|
||
"id": "6be2a9a6-056b-4e19-8c26-a18ba3df36b3",
|
||
"metadata": {},
|
||
"outputs": [
|
||
{
|
||
"data": {
|
||
"text/html": [
|
||
"<div>\n",
|
||
"<style scoped>\n",
|
||
" .dataframe tbody tr th:only-of-type {\n",
|
||
" vertical-align: middle;\n",
|
||
" }\n",
|
||
"\n",
|
||
" .dataframe tbody tr th {\n",
|
||
" vertical-align: top;\n",
|
||
" }\n",
|
||
"\n",
|
||
" .dataframe thead th {\n",
|
||
" text-align: right;\n",
|
||
" }\n",
|
||
"</style>\n",
|
||
"<table border=\"1\" class=\"dataframe\">\n",
|
||
" <thead>\n",
|
||
" <tr style=\"text-align: right;\">\n",
|
||
" <th></th>\n",
|
||
" <th>customer_id</th>\n",
|
||
" <th>nb_campaigns</th>\n",
|
||
" <th>nb_campaigns_opened</th>\n",
|
||
" <th>time_to_open</th>\n",
|
||
" </tr>\n",
|
||
" </thead>\n",
|
||
" <tbody>\n",
|
||
" <tr>\n",
|
||
" <th>0</th>\n",
|
||
" <td>2</td>\n",
|
||
" <td>4</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>NaT</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>1</th>\n",
|
||
" <td>3</td>\n",
|
||
" <td>222</td>\n",
|
||
" <td>124.0</td>\n",
|
||
" <td>1 days 00:28:30.169354838</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>2</th>\n",
|
||
" <td>4</td>\n",
|
||
" <td>7</td>\n",
|
||
" <td>7.0</td>\n",
|
||
" <td>1 days 04:31:01.428571428</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>3</th>\n",
|
||
" <td>5</td>\n",
|
||
" <td>4</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>NaT</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>4</th>\n",
|
||
" <td>6</td>\n",
|
||
" <td>20</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>NaT</td>\n",
|
||
" </tr>\n",
|
||
" </tbody>\n",
|
||
"</table>\n",
|
||
"</div>"
|
||
],
|
||
"text/plain": [
|
||
" customer_id nb_campaigns nb_campaigns_opened time_to_open\n",
|
||
"0 2 4 0.0 NaT\n",
|
||
"1 3 222 124.0 1 days 00:28:30.169354838\n",
|
||
"2 4 7 7.0 1 days 04:31:01.428571428\n",
|
||
"3 5 4 0.0 NaT\n",
|
||
"4 6 20 0.0 NaT"
|
||
]
|
||
},
|
||
"execution_count": 30,
|
||
"metadata": {},
|
||
"output_type": "execute_result"
|
||
}
|
||
],
|
||
"source": [
|
||
"df1_campaigns_kpi.head()"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "markdown",
|
||
"id": "d4dcfbe0-c6ce-497e-b75e-dc9e938801b2",
|
||
"metadata": {},
|
||
"source": [
|
||
"## KPI tickets"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 31,
|
||
"id": "b913a69e-3146-4919-b5f6-a6108532bffa",
|
||
"metadata": {},
|
||
"outputs": [
|
||
{
|
||
"data": {
|
||
"text/plain": [
|
||
"array(['spectacle vivant', 'offre muséale individuel', 'formule adhésion',\n",
|
||
" 'offre muséale groupe'], dtype=object)"
|
||
]
|
||
},
|
||
"execution_count": 31,
|
||
"metadata": {},
|
||
"output_type": "execute_result"
|
||
}
|
||
],
|
||
"source": [
|
||
"df1_products_purchased_reduced['name_event_types'].unique()"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 32,
|
||
"id": "2bda0b97-b28b-4070-a57d-aeab0e2f7dfe",
|
||
"metadata": {},
|
||
"outputs": [],
|
||
"source": [
|
||
"# Nombre de client assistant à plus de 2 type d'événement\n",
|
||
"nb_event_types = df1_products_purchased_reduced[['customer_id', 'name_event_types']].groupby('customer_id').nunique()"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 33,
|
||
"id": "043303fe-e90f-4689-a2a9-5d690555a045",
|
||
"metadata": {},
|
||
"outputs": [],
|
||
"source": [
|
||
"def tickets_kpi_function(tickets_information = None):\n",
|
||
"\n",
|
||
" tickets_information_copy = tickets_information.copy()\n",
|
||
"\n",
|
||
" # Dummy : Canal de vente en ligne\n",
|
||
" liste_mots = ['en ligne', 'internet', 'web', 'net', 'vad', 'online'] # vad = vente à distance\n",
|
||
" tickets_information_copy['vente_internet'] = tickets_information_copy['supplier_name'].str.contains('|'.join(liste_mots), case=False).astype(int)\n",
|
||
"\n",
|
||
" # Proportion de vente en ligne\n",
|
||
" prop_vente_internet = tickets_information_copy[tickets_information_copy['vente_internet'] == 1].groupby(['customer_id', 'event_type_id'])['ticket_id'].count().reset_index()\n",
|
||
" prop_vente_internet.rename(columns = {'ticket_id' : 'nb_tickets_internet'}, inplace = True)\n",
|
||
" \n",
|
||
" tickets_kpi = (tickets_information_copy[['event_type_id', 'customer_id', 'ticket_id','supplier_name', 'purchase_date', 'amount', 'vente_internet']]\n",
|
||
" .groupby(['customer_id', 'event_type_id']) \n",
|
||
" .agg({'ticket_id': 'count', \n",
|
||
" 'amount' : 'sum',\n",
|
||
" 'supplier_name': 'nunique',\n",
|
||
" 'vente_internet' : 'max',\n",
|
||
" 'purchase_date' : ['min', 'max']})\n",
|
||
" .reset_index()\n",
|
||
" )\n",
|
||
" \n",
|
||
" tickets_kpi.columns = tickets_kpi.columns.map('_'.join)\n",
|
||
" \n",
|
||
" tickets_kpi.rename(columns = {'ticket_id_count' : 'nb_tickets', \n",
|
||
" 'amount_sum' : 'total_amount',\n",
|
||
" 'supplier_name_nunique' : 'nb_suppliers', \n",
|
||
" 'customer_id_' : 'customer_id',\n",
|
||
" 'event_type_id_' : 'event_type_id'}, inplace = True)\n",
|
||
" \n",
|
||
" tickets_kpi['time_between_purchase'] = tickets_kpi['purchase_date_max'] - tickets_kpi['purchase_date_min']\n",
|
||
"\n",
|
||
" tickets_kpi = tickets_kpi.merge(prop_vente_internet, on = ['customer_id', 'event_type_id'], how = 'left')\n",
|
||
" tickets_kpi['nb_tickets_internet'] = tickets_kpi['nb_tickets_internet'].fillna(0)\n",
|
||
" \n",
|
||
" return tickets_kpi\n",
|
||
" "
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 34,
|
||
"id": "5882234a-1ed5-4269-87a6-0d75613476e3",
|
||
"metadata": {},
|
||
"outputs": [],
|
||
"source": [
|
||
"df1_tickets_kpi = tickets_kpi_function(tickets_information = df1_products_purchased_reduced)"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "markdown",
|
||
"id": "597b241e-a83d-4b7c-8ad7-eec50295dff2",
|
||
"metadata": {},
|
||
"source": [
|
||
"#### Exportation"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 35,
|
||
"id": "a4a2311d-8a72-4030-afd5-218004d5d2a5",
|
||
"metadata": {},
|
||
"outputs": [],
|
||
"source": [
|
||
"# Exportation vers 'projet-bdc2324-team1'\n",
|
||
"BUCKET_OUT = \"projet-bdc2324-team1\"\n",
|
||
"FILE_KEY_OUT_S3 = \"0_Temp/Company 1 - Purchasing behaviour.csv\"\n",
|
||
"FILE_PATH_OUT_S3 = BUCKET_OUT + \"/\" + FILE_KEY_OUT_S3\n",
|
||
"\n",
|
||
"with fs.open(FILE_PATH_OUT_S3, 'w') as file_out:\n",
|
||
" df1_tickets_kpi.to_csv(file_out, index = False)"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 36,
|
||
"id": "a7a452a6-cd5e-4c8b-b250-8a7d26e48fad",
|
||
"metadata": {},
|
||
"outputs": [
|
||
{
|
||
"data": {
|
||
"text/html": [
|
||
"<div>\n",
|
||
"<style scoped>\n",
|
||
" .dataframe tbody tr th:only-of-type {\n",
|
||
" vertical-align: middle;\n",
|
||
" }\n",
|
||
"\n",
|
||
" .dataframe tbody tr th {\n",
|
||
" vertical-align: top;\n",
|
||
" }\n",
|
||
"\n",
|
||
" .dataframe thead th {\n",
|
||
" text-align: right;\n",
|
||
" }\n",
|
||
"</style>\n",
|
||
"<table border=\"1\" class=\"dataframe\">\n",
|
||
" <thead>\n",
|
||
" <tr style=\"text-align: right;\">\n",
|
||
" <th></th>\n",
|
||
" <th>customer_id</th>\n",
|
||
" <th>event_type_id</th>\n",
|
||
" <th>nb_tickets</th>\n",
|
||
" <th>total_amount</th>\n",
|
||
" <th>nb_suppliers</th>\n",
|
||
" <th>vente_internet_max</th>\n",
|
||
" <th>purchase_date_min</th>\n",
|
||
" <th>purchase_date_max</th>\n",
|
||
" <th>time_between_purchase</th>\n",
|
||
" <th>nb_tickets_internet</th>\n",
|
||
" </tr>\n",
|
||
" </thead>\n",
|
||
" <tbody>\n",
|
||
" <tr>\n",
|
||
" <th>1</th>\n",
|
||
" <td>1</td>\n",
|
||
" <td>4</td>\n",
|
||
" <td>453242</td>\n",
|
||
" <td>3248965.5</td>\n",
|
||
" <td>6</td>\n",
|
||
" <td>1</td>\n",
|
||
" <td>2013-09-23 14:45:01+00:00</td>\n",
|
||
" <td>2023-11-03 14:11:01+00:00</td>\n",
|
||
" <td>3692 days 23:26:00</td>\n",
|
||
" <td>2988.0</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>0</th>\n",
|
||
" <td>1</td>\n",
|
||
" <td>2</td>\n",
|
||
" <td>384226</td>\n",
|
||
" <td>2686540.5</td>\n",
|
||
" <td>7</td>\n",
|
||
" <td>1</td>\n",
|
||
" <td>2014-12-03 14:55:37+00:00</td>\n",
|
||
" <td>2023-11-04 15:12:16+00:00</td>\n",
|
||
" <td>3258 days 00:16:39</td>\n",
|
||
" <td>51.0</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>3</th>\n",
|
||
" <td>1</td>\n",
|
||
" <td>6</td>\n",
|
||
" <td>217356</td>\n",
|
||
" <td>1435871.5</td>\n",
|
||
" <td>5</td>\n",
|
||
" <td>1</td>\n",
|
||
" <td>2017-01-01 02:20:08+00:00</td>\n",
|
||
" <td>2019-12-31 02:20:06+00:00</td>\n",
|
||
" <td>1093 days 23:59:58</td>\n",
|
||
" <td>5.0</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>2</th>\n",
|
||
" <td>1</td>\n",
|
||
" <td>5</td>\n",
|
||
" <td>201750</td>\n",
|
||
" <td>1459190.0</td>\n",
|
||
" <td>6</td>\n",
|
||
" <td>1</td>\n",
|
||
" <td>2013-06-10 10:37:58+00:00</td>\n",
|
||
" <td>2023-11-08 15:59:45+00:00</td>\n",
|
||
" <td>3803 days 05:21:47</td>\n",
|
||
" <td>9.0</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>5032</th>\n",
|
||
" <td>6733</td>\n",
|
||
" <td>6</td>\n",
|
||
" <td>14208</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>3</td>\n",
|
||
" <td>1</td>\n",
|
||
" <td>2017-01-11 15:00:54+00:00</td>\n",
|
||
" <td>2019-11-27 09:47:06+00:00</td>\n",
|
||
" <td>1049 days 18:46:12</td>\n",
|
||
" <td>13497.0</td>\n",
|
||
" </tr>\n",
|
||
" </tbody>\n",
|
||
"</table>\n",
|
||
"</div>"
|
||
],
|
||
"text/plain": [
|
||
" customer_id event_type_id nb_tickets total_amount nb_suppliers \\\n",
|
||
"1 1 4 453242 3248965.5 6 \n",
|
||
"0 1 2 384226 2686540.5 7 \n",
|
||
"3 1 6 217356 1435871.5 5 \n",
|
||
"2 1 5 201750 1459190.0 6 \n",
|
||
"5032 6733 6 14208 0.0 3 \n",
|
||
"\n",
|
||
" vente_internet_max purchase_date_min purchase_date_max \\\n",
|
||
"1 1 2013-09-23 14:45:01+00:00 2023-11-03 14:11:01+00:00 \n",
|
||
"0 1 2014-12-03 14:55:37+00:00 2023-11-04 15:12:16+00:00 \n",
|
||
"3 1 2017-01-01 02:20:08+00:00 2019-12-31 02:20:06+00:00 \n",
|
||
"2 1 2013-06-10 10:37:58+00:00 2023-11-08 15:59:45+00:00 \n",
|
||
"5032 1 2017-01-11 15:00:54+00:00 2019-11-27 09:47:06+00:00 \n",
|
||
"\n",
|
||
" time_between_purchase nb_tickets_internet \n",
|
||
"1 3692 days 23:26:00 2988.0 \n",
|
||
"0 3258 days 00:16:39 51.0 \n",
|
||
"3 1093 days 23:59:58 5.0 \n",
|
||
"2 3803 days 05:21:47 9.0 \n",
|
||
"5032 1049 days 18:46:12 13497.0 "
|
||
]
|
||
},
|
||
"execution_count": 36,
|
||
"metadata": {},
|
||
"output_type": "execute_result"
|
||
}
|
||
],
|
||
"source": [
|
||
"df1_tickets_kpi.sort_values(by='nb_tickets', ascending=False).head(5)"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "markdown",
|
||
"id": "f1d7f7ba-361b-467d-b375-b09c149185f7",
|
||
"metadata": {},
|
||
"source": [
|
||
"## Alexis' work"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 37,
|
||
"id": "4ab1c0d2-0097-4669-b984-b6822c976740",
|
||
"metadata": {},
|
||
"outputs": [
|
||
{
|
||
"data": {
|
||
"text/html": [
|
||
"<div>\n",
|
||
"<style scoped>\n",
|
||
" .dataframe tbody tr th:only-of-type {\n",
|
||
" vertical-align: middle;\n",
|
||
" }\n",
|
||
"\n",
|
||
" .dataframe tbody tr th {\n",
|
||
" vertical-align: top;\n",
|
||
" }\n",
|
||
"\n",
|
||
" .dataframe thead th {\n",
|
||
" text-align: right;\n",
|
||
" }\n",
|
||
"</style>\n",
|
||
"<table border=\"1\" class=\"dataframe\">\n",
|
||
" <thead>\n",
|
||
" <tr style=\"text-align: right;\">\n",
|
||
" <th></th>\n",
|
||
" <th>event_type_id</th>\n",
|
||
" <th>avg_amount</th>\n",
|
||
" </tr>\n",
|
||
" </thead>\n",
|
||
" <tbody>\n",
|
||
" <tr>\n",
|
||
" <th>0</th>\n",
|
||
" <td>2</td>\n",
|
||
" <td>6.150659</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>1</th>\n",
|
||
" <td>4</td>\n",
|
||
" <td>7.762474</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>2</th>\n",
|
||
" <td>5</td>\n",
|
||
" <td>4.452618</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>3</th>\n",
|
||
" <td>6</td>\n",
|
||
" <td>6.439463</td>\n",
|
||
" </tr>\n",
|
||
" </tbody>\n",
|
||
"</table>\n",
|
||
"</div>"
|
||
],
|
||
"text/plain": [
|
||
" event_type_id avg_amount\n",
|
||
"0 2 6.150659\n",
|
||
"1 4 7.762474\n",
|
||
"2 5 4.452618\n",
|
||
"3 6 6.439463"
|
||
]
|
||
},
|
||
"execution_count": 37,
|
||
"metadata": {},
|
||
"output_type": "execute_result"
|
||
}
|
||
],
|
||
"source": [
|
||
"avg_amount = (df1_products_purchased_reduced.groupby([\"event_type_id\"])\n",
|
||
" .agg({\"amount\" : \"mean\"}).reset_index()\n",
|
||
" .rename(columns = {'amount' : 'avg_amount'}))\n",
|
||
"\n",
|
||
"avg_amount"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 38,
|
||
"id": "a9c62b39-389e-4dac-89a6-ac8a59fea58a",
|
||
"metadata": {},
|
||
"outputs": [
|
||
{
|
||
"data": {
|
||
"text/html": [
|
||
"<div>\n",
|
||
"<style scoped>\n",
|
||
" .dataframe tbody tr th:only-of-type {\n",
|
||
" vertical-align: middle;\n",
|
||
" }\n",
|
||
"\n",
|
||
" .dataframe tbody tr th {\n",
|
||
" vertical-align: top;\n",
|
||
" }\n",
|
||
"\n",
|
||
" .dataframe thead th {\n",
|
||
" text-align: right;\n",
|
||
" }\n",
|
||
"</style>\n",
|
||
"<table border=\"1\" class=\"dataframe\">\n",
|
||
" <thead>\n",
|
||
" <tr style=\"text-align: right;\">\n",
|
||
" <th></th>\n",
|
||
" <th>customer_id</th>\n",
|
||
" <th>event_type_id</th>\n",
|
||
" <th>nb_tickets</th>\n",
|
||
" <th>avg_amount</th>\n",
|
||
" </tr>\n",
|
||
" </thead>\n",
|
||
" <tbody>\n",
|
||
" <tr>\n",
|
||
" <th>0</th>\n",
|
||
" <td>1</td>\n",
|
||
" <td>2</td>\n",
|
||
" <td>384226</td>\n",
|
||
" <td>6.150659</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>1</th>\n",
|
||
" <td>1</td>\n",
|
||
" <td>4</td>\n",
|
||
" <td>453242</td>\n",
|
||
" <td>7.762474</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>2</th>\n",
|
||
" <td>1</td>\n",
|
||
" <td>5</td>\n",
|
||
" <td>201750</td>\n",
|
||
" <td>4.452618</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>3</th>\n",
|
||
" <td>1</td>\n",
|
||
" <td>6</td>\n",
|
||
" <td>217356</td>\n",
|
||
" <td>6.439463</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>4</th>\n",
|
||
" <td>2</td>\n",
|
||
" <td>2</td>\n",
|
||
" <td>143</td>\n",
|
||
" <td>6.150659</td>\n",
|
||
" </tr>\n",
|
||
" </tbody>\n",
|
||
"</table>\n",
|
||
"</div>"
|
||
],
|
||
"text/plain": [
|
||
" customer_id event_type_id nb_tickets avg_amount\n",
|
||
"0 1 2 384226 6.150659\n",
|
||
"1 1 4 453242 7.762474\n",
|
||
"2 1 5 201750 4.452618\n",
|
||
"3 1 6 217356 6.439463\n",
|
||
"4 2 2 143 6.150659"
|
||
]
|
||
},
|
||
"execution_count": 38,
|
||
"metadata": {},
|
||
"output_type": "execute_result"
|
||
}
|
||
],
|
||
"source": [
|
||
"nb_tickets = (df1_products_purchased_reduced.groupby([\"customer_id\", \"event_type_id\"])\n",
|
||
" .agg({\"ticket_id\" : \"count\"}).reset_index()\n",
|
||
" .rename(columns = {'ticket_id' : 'nb_tickets'})\n",
|
||
" .merge(avg_amount, how='left', on='event_type_id'))\n",
|
||
"nb_tickets.head()"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 39,
|
||
"id": "8710611c-7eb8-45ca-bdcc-009f4081f9e2",
|
||
"metadata": {},
|
||
"outputs": [
|
||
{
|
||
"data": {
|
||
"text/html": [
|
||
"<div>\n",
|
||
"<style scoped>\n",
|
||
" .dataframe tbody tr th:only-of-type {\n",
|
||
" vertical-align: middle;\n",
|
||
" }\n",
|
||
"\n",
|
||
" .dataframe tbody tr th {\n",
|
||
" vertical-align: top;\n",
|
||
" }\n",
|
||
"\n",
|
||
" .dataframe thead th {\n",
|
||
" text-align: right;\n",
|
||
" }\n",
|
||
"</style>\n",
|
||
"<table border=\"1\" class=\"dataframe\">\n",
|
||
" <thead>\n",
|
||
" <tr style=\"text-align: right;\">\n",
|
||
" <th></th>\n",
|
||
" <th>customer_id</th>\n",
|
||
" <th>birthdate</th>\n",
|
||
" <th>street_id</th>\n",
|
||
" <th>is_partner</th>\n",
|
||
" <th>gender</th>\n",
|
||
" <th>is_email_true</th>\n",
|
||
" <th>opt_in</th>\n",
|
||
" <th>structure_id</th>\n",
|
||
" <th>profession</th>\n",
|
||
" <th>language</th>\n",
|
||
" <th>...</th>\n",
|
||
" <th>average_ticket_basket</th>\n",
|
||
" <th>total_price</th>\n",
|
||
" <th>purchase_count</th>\n",
|
||
" <th>first_buying_date</th>\n",
|
||
" <th>country</th>\n",
|
||
" <th>age</th>\n",
|
||
" <th>tenant_id</th>\n",
|
||
" <th>nb_campaigns</th>\n",
|
||
" <th>nb_campaigns_opened</th>\n",
|
||
" <th>time_to_open</th>\n",
|
||
" </tr>\n",
|
||
" </thead>\n",
|
||
" <tbody>\n",
|
||
" <tr>\n",
|
||
" <th>0</th>\n",
|
||
" <td>12751</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>2</td>\n",
|
||
" <td>False</td>\n",
|
||
" <td>1</td>\n",
|
||
" <td>True</td>\n",
|
||
" <td>True</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>NaT</td>\n",
|
||
" <td>fr</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>1311</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>NaT</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>1</th>\n",
|
||
" <td>12825</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>2</td>\n",
|
||
" <td>False</td>\n",
|
||
" <td>2</td>\n",
|
||
" <td>True</td>\n",
|
||
" <td>True</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>NaT</td>\n",
|
||
" <td>fr</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>1311</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>NaT</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>2</th>\n",
|
||
" <td>11261</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>2</td>\n",
|
||
" <td>False</td>\n",
|
||
" <td>1</td>\n",
|
||
" <td>True</td>\n",
|
||
" <td>True</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>NaT</td>\n",
|
||
" <td>fr</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>1311</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>NaT</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>3</th>\n",
|
||
" <td>13071</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>2</td>\n",
|
||
" <td>False</td>\n",
|
||
" <td>2</td>\n",
|
||
" <td>True</td>\n",
|
||
" <td>True</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>NaT</td>\n",
|
||
" <td>fr</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>1311</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>NaT</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>4</th>\n",
|
||
" <td>653061</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>10</td>\n",
|
||
" <td>False</td>\n",
|
||
" <td>2</td>\n",
|
||
" <td>True</td>\n",
|
||
" <td>False</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>NaT</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>1311</td>\n",
|
||
" <td>80.0</td>\n",
|
||
" <td>2.0</td>\n",
|
||
" <td>0 days 19:53:02.500000</td>\n",
|
||
" </tr>\n",
|
||
" </tbody>\n",
|
||
"</table>\n",
|
||
"<p>5 rows × 28 columns</p>\n",
|
||
"</div>"
|
||
],
|
||
"text/plain": [
|
||
" customer_id birthdate street_id is_partner gender is_email_true \\\n",
|
||
"0 12751 NaN 2 False 1 True \n",
|
||
"1 12825 NaN 2 False 2 True \n",
|
||
"2 11261 NaN 2 False 1 True \n",
|
||
"3 13071 NaN 2 False 2 True \n",
|
||
"4 653061 NaN 10 False 2 True \n",
|
||
"\n",
|
||
" opt_in structure_id profession language ... average_ticket_basket \\\n",
|
||
"0 True NaN NaN NaN ... NaN \n",
|
||
"1 True NaN NaN NaN ... NaN \n",
|
||
"2 True NaN NaN NaN ... NaN \n",
|
||
"3 True NaN NaN NaN ... NaN \n",
|
||
"4 False NaN NaN NaN ... NaN \n",
|
||
"\n",
|
||
" total_price purchase_count first_buying_date country age tenant_id \\\n",
|
||
"0 NaN 0 NaT fr NaN 1311 \n",
|
||
"1 NaN 0 NaT fr NaN 1311 \n",
|
||
"2 NaN 0 NaT fr NaN 1311 \n",
|
||
"3 NaN 0 NaT fr NaN 1311 \n",
|
||
"4 NaN 0 NaT NaN NaN 1311 \n",
|
||
"\n",
|
||
" nb_campaigns nb_campaigns_opened time_to_open \n",
|
||
"0 NaN NaN NaT \n",
|
||
"1 NaN NaN NaT \n",
|
||
"2 NaN NaN NaT \n",
|
||
"3 NaN NaN NaT \n",
|
||
"4 80.0 2.0 0 days 19:53:02.500000 \n",
|
||
"\n",
|
||
"[5 rows x 28 columns]"
|
||
]
|
||
},
|
||
"execution_count": 39,
|
||
"metadata": {},
|
||
"output_type": "execute_result"
|
||
}
|
||
],
|
||
"source": [
|
||
"# Fusion avec KPI campaigns liés au customer\n",
|
||
"df1_customer = pd.merge(df1_customerplus_clean, df1_campaigns_kpi, on = 'customer_id', how = 'left')\n",
|
||
"df1_customer.head()"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 40,
|
||
"id": "a89fad43-ee68-4081-9384-3e9f08ec6a59",
|
||
"metadata": {},
|
||
"outputs": [
|
||
{
|
||
"name": "stdout",
|
||
"output_type": "stream",
|
||
"text": [
|
||
"shape : (156289, 31)\n"
|
||
]
|
||
},
|
||
{
|
||
"data": {
|
||
"text/html": [
|
||
"<div>\n",
|
||
"<style scoped>\n",
|
||
" .dataframe tbody tr th:only-of-type {\n",
|
||
" vertical-align: middle;\n",
|
||
" }\n",
|
||
"\n",
|
||
" .dataframe tbody tr th {\n",
|
||
" vertical-align: top;\n",
|
||
" }\n",
|
||
"\n",
|
||
" .dataframe thead th {\n",
|
||
" text-align: right;\n",
|
||
" }\n",
|
||
"</style>\n",
|
||
"<table border=\"1\" class=\"dataframe\">\n",
|
||
" <thead>\n",
|
||
" <tr style=\"text-align: right;\">\n",
|
||
" <th></th>\n",
|
||
" <th>customer_id</th>\n",
|
||
" <th>birthdate</th>\n",
|
||
" <th>street_id</th>\n",
|
||
" <th>is_partner</th>\n",
|
||
" <th>gender</th>\n",
|
||
" <th>is_email_true</th>\n",
|
||
" <th>opt_in</th>\n",
|
||
" <th>structure_id</th>\n",
|
||
" <th>profession</th>\n",
|
||
" <th>language</th>\n",
|
||
" <th>...</th>\n",
|
||
" <th>first_buying_date</th>\n",
|
||
" <th>country</th>\n",
|
||
" <th>age</th>\n",
|
||
" <th>tenant_id</th>\n",
|
||
" <th>nb_campaigns</th>\n",
|
||
" <th>nb_campaigns_opened</th>\n",
|
||
" <th>time_to_open</th>\n",
|
||
" <th>event_type_id</th>\n",
|
||
" <th>nb_tickets</th>\n",
|
||
" <th>avg_amount</th>\n",
|
||
" </tr>\n",
|
||
" </thead>\n",
|
||
" <tbody>\n",
|
||
" <tr>\n",
|
||
" <th>0</th>\n",
|
||
" <td>12751</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>2</td>\n",
|
||
" <td>False</td>\n",
|
||
" <td>1</td>\n",
|
||
" <td>True</td>\n",
|
||
" <td>True</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>NaT</td>\n",
|
||
" <td>fr</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>1311</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>NaT</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>1</th>\n",
|
||
" <td>12825</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>2</td>\n",
|
||
" <td>False</td>\n",
|
||
" <td>2</td>\n",
|
||
" <td>True</td>\n",
|
||
" <td>True</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>NaT</td>\n",
|
||
" <td>fr</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>1311</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>NaT</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>2</th>\n",
|
||
" <td>11261</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>2</td>\n",
|
||
" <td>False</td>\n",
|
||
" <td>1</td>\n",
|
||
" <td>True</td>\n",
|
||
" <td>True</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>NaT</td>\n",
|
||
" <td>fr</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>1311</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>NaT</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>3</th>\n",
|
||
" <td>13071</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>2</td>\n",
|
||
" <td>False</td>\n",
|
||
" <td>2</td>\n",
|
||
" <td>True</td>\n",
|
||
" <td>True</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>NaT</td>\n",
|
||
" <td>fr</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>1311</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>NaT</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>4</th>\n",
|
||
" <td>653061</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>10</td>\n",
|
||
" <td>False</td>\n",
|
||
" <td>2</td>\n",
|
||
" <td>True</td>\n",
|
||
" <td>False</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>NaT</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>1311</td>\n",
|
||
" <td>80.0</td>\n",
|
||
" <td>2.0</td>\n",
|
||
" <td>0 days 19:53:02.500000</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" </tr>\n",
|
||
" </tbody>\n",
|
||
"</table>\n",
|
||
"<p>5 rows × 31 columns</p>\n",
|
||
"</div>"
|
||
],
|
||
"text/plain": [
|
||
" customer_id birthdate street_id is_partner gender is_email_true \\\n",
|
||
"0 12751 NaN 2 False 1 True \n",
|
||
"1 12825 NaN 2 False 2 True \n",
|
||
"2 11261 NaN 2 False 1 True \n",
|
||
"3 13071 NaN 2 False 2 True \n",
|
||
"4 653061 NaN 10 False 2 True \n",
|
||
"\n",
|
||
" opt_in structure_id profession language ... first_buying_date country \\\n",
|
||
"0 True NaN NaN NaN ... NaT fr \n",
|
||
"1 True NaN NaN NaN ... NaT fr \n",
|
||
"2 True NaN NaN NaN ... NaT fr \n",
|
||
"3 True NaN NaN NaN ... NaT fr \n",
|
||
"4 False NaN NaN NaN ... NaT NaN \n",
|
||
"\n",
|
||
" age tenant_id nb_campaigns nb_campaigns_opened time_to_open \\\n",
|
||
"0 NaN 1311 NaN NaN NaT \n",
|
||
"1 NaN 1311 NaN NaN NaT \n",
|
||
"2 NaN 1311 NaN NaN NaT \n",
|
||
"3 NaN 1311 NaN NaN NaT \n",
|
||
"4 NaN 1311 80.0 2.0 0 days 19:53:02.500000 \n",
|
||
"\n",
|
||
" event_type_id nb_tickets avg_amount \n",
|
||
"0 NaN NaN NaN \n",
|
||
"1 NaN NaN NaN \n",
|
||
"2 NaN NaN NaN \n",
|
||
"3 NaN NaN NaN \n",
|
||
"4 NaN NaN NaN \n",
|
||
"\n",
|
||
"[5 rows x 31 columns]"
|
||
]
|
||
},
|
||
"execution_count": 40,
|
||
"metadata": {},
|
||
"output_type": "execute_result"
|
||
}
|
||
],
|
||
"source": [
|
||
"df1_customer_product = pd.merge(df1_customer, nb_tickets, on = 'customer_id', how = 'left')\n",
|
||
"print(\"shape : \", df1_customer_product.shape)\n",
|
||
"df1_customer_product.head()"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 41,
|
||
"id": "a19fec00-4ece-400c-937c-ce5cd8daccfd",
|
||
"metadata": {},
|
||
"outputs": [],
|
||
"source": [
|
||
"# df1_customer_product.to_csv(\"customer_product.csv\", index = False)"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "markdown",
|
||
"id": "7c3211a5-a851-43bc-a1f0-b39d51857fb7",
|
||
"metadata": {},
|
||
"source": [
|
||
"# Fusion des bases locales"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 42,
|
||
"id": "46de1912-4a66-46e5-8b9e-7768b2d2723b",
|
||
"metadata": {},
|
||
"outputs": [],
|
||
"source": [
|
||
"# Fusion avec KPI liés au customer\n",
|
||
"df1_customer = pd.merge(df1_customerplus_clean, df1_campaigns_kpi, on = 'customer_id', how = 'left')"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 43,
|
||
"id": "1e42a790-b215-4107-a969-85005da06ebd",
|
||
"metadata": {},
|
||
"outputs": [],
|
||
"source": [
|
||
"# Fusion avec KPI liés au comportement d'achat\n",
|
||
"df1_customer_product = pd.merge(df1_tickets_kpi, df1_customer, on = 'customer_id', how = 'outer')"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": null,
|
||
"id": "d950f24d-a5d1-4f1e-aeaa-ca826470365f",
|
||
"metadata": {},
|
||
"outputs": [],
|
||
"source": [
|
||
"# df1_customer_product"
|
||
]
|
||
}
|
||
],
|
||
"metadata": {
|
||
"kernelspec": {
|
||
"display_name": "Python 3 (ipykernel)",
|
||
"language": "python",
|
||
"name": "python3"
|
||
},
|
||
"language_info": {
|
||
"codemirror_mode": {
|
||
"name": "ipython",
|
||
"version": 3
|
||
},
|
||
"file_extension": ".py",
|
||
"mimetype": "text/x-python",
|
||
"name": "python",
|
||
"nbconvert_exporter": "python",
|
||
"pygments_lexer": "ipython3",
|
||
"version": "3.10.13"
|
||
}
|
||
},
|
||
"nbformat": 4,
|
||
"nbformat_minor": 5
|
||
}
|