2024-02-04 16:02:50 +01:00
|
|
|
|
{
|
|
|
|
|
"cells": [
|
|
|
|
|
{
|
|
|
|
|
"cell_type": "markdown",
|
|
|
|
|
"id": "ad414c84-be46-4d2c-be8b-9fc4d24cc672",
|
|
|
|
|
"metadata": {},
|
|
|
|
|
"source": [
|
|
|
|
|
"# Business Data Challenge - Team 1"
|
|
|
|
|
]
|
|
|
|
|
},
|
|
|
|
|
{
|
|
|
|
|
"cell_type": "code",
|
2024-02-12 11:08:24 +01:00
|
|
|
|
"execution_count": 208,
|
2024-02-04 16:02:50 +01:00
|
|
|
|
"id": "15103481-8d74-404c-aa09-7601fe7730da",
|
|
|
|
|
"metadata": {},
|
|
|
|
|
"outputs": [],
|
|
|
|
|
"source": [
|
|
|
|
|
"import pandas as pd\n",
|
|
|
|
|
"import numpy as np\n",
|
|
|
|
|
"import os\n",
|
|
|
|
|
"import s3fs\n",
|
2024-02-10 23:19:35 +01:00
|
|
|
|
"import re\n",
|
|
|
|
|
"import warnings"
|
2024-02-04 16:02:50 +01:00
|
|
|
|
]
|
|
|
|
|
},
|
|
|
|
|
{
|
|
|
|
|
"cell_type": "markdown",
|
|
|
|
|
"id": "ee97665c-39af-4c1c-a62b-c9c79feae18f",
|
|
|
|
|
"metadata": {},
|
|
|
|
|
"source": [
|
|
|
|
|
"Configuration de l'accès aux données"
|
|
|
|
|
]
|
|
|
|
|
},
|
|
|
|
|
{
|
|
|
|
|
"cell_type": "code",
|
2024-02-12 11:08:24 +01:00
|
|
|
|
"execution_count": 209,
|
2024-02-04 16:02:50 +01:00
|
|
|
|
"id": "5d83bb1a-d341-446e-91f6-1c428607f6d4",
|
|
|
|
|
"metadata": {},
|
|
|
|
|
"outputs": [],
|
|
|
|
|
"source": [
|
|
|
|
|
"# Create filesystem object\n",
|
2024-02-10 23:19:35 +01:00
|
|
|
|
"S3_ENDPOINT_URL = \"https://\" + os.environ[\"AWS_S3_ENDPOINT\"]\n",
|
|
|
|
|
"fs = s3fs.S3FileSystem(client_kwargs={'endpoint_url': S3_ENDPOINT_URL})"
|
|
|
|
|
]
|
|
|
|
|
},
|
|
|
|
|
{
|
|
|
|
|
"cell_type": "code",
|
2024-02-12 11:08:24 +01:00
|
|
|
|
"execution_count": 210,
|
2024-02-10 23:19:35 +01:00
|
|
|
|
"id": "a9b84234-d5df-4c43-a9cd-80cfe2f1e34d",
|
|
|
|
|
"metadata": {},
|
|
|
|
|
"outputs": [],
|
|
|
|
|
"source": [
|
|
|
|
|
"# Ignore warning\n",
|
|
|
|
|
"warnings.filterwarnings('ignore')"
|
2024-02-04 16:02:50 +01:00
|
|
|
|
]
|
|
|
|
|
},
|
|
|
|
|
{
|
|
|
|
|
"cell_type": "markdown",
|
|
|
|
|
"id": "9cbd72c5-6f8e-4366-ab66-96c32c6e963a",
|
|
|
|
|
"metadata": {},
|
|
|
|
|
"source": [
|
|
|
|
|
"# Exemple sur Company 1"
|
|
|
|
|
]
|
|
|
|
|
},
|
|
|
|
|
{
|
|
|
|
|
"cell_type": "markdown",
|
|
|
|
|
"id": "db26e59a-927c-407e-b54b-1815473b0b34",
|
|
|
|
|
"metadata": {},
|
|
|
|
|
"source": [
|
|
|
|
|
"## Chargement données"
|
|
|
|
|
]
|
|
|
|
|
},
|
|
|
|
|
{
|
|
|
|
|
"cell_type": "code",
|
2024-02-12 11:08:24 +01:00
|
|
|
|
"execution_count": 211,
|
2024-02-04 16:02:50 +01:00
|
|
|
|
"id": "699664b9-eee4-4f8d-a207-e524526560c5",
|
|
|
|
|
"metadata": {},
|
|
|
|
|
"outputs": [],
|
|
|
|
|
"source": [
|
|
|
|
|
"BUCKET = \"bdc2324-data/1\"\n",
|
|
|
|
|
"liste_database = fs.ls(BUCKET)"
|
|
|
|
|
]
|
|
|
|
|
},
|
|
|
|
|
{
|
|
|
|
|
"cell_type": "code",
|
2024-02-12 11:08:24 +01:00
|
|
|
|
"execution_count": 212,
|
2024-02-04 16:02:50 +01:00
|
|
|
|
"id": "dd6a3518-b752-4a1e-b77b-9e03e853c3ed",
|
|
|
|
|
"metadata": {},
|
2024-02-10 23:19:35 +01:00
|
|
|
|
"outputs": [],
|
2024-02-04 16:02:50 +01:00
|
|
|
|
"source": [
|
|
|
|
|
"# loop to create dataframes from liste\n",
|
2024-02-10 23:19:35 +01:00
|
|
|
|
"\n",
|
2024-02-04 16:02:50 +01:00
|
|
|
|
"files_path = liste_database\n",
|
|
|
|
|
"\n",
|
|
|
|
|
"client_number = files_path[0].split(\"/\")[1]\n",
|
|
|
|
|
"df_prefix = \"df\" + str(client_number) + \"_\"\n",
|
|
|
|
|
"\n",
|
|
|
|
|
"for i in range(len(files_path)) :\n",
|
|
|
|
|
" current_path = files_path[i]\n",
|
|
|
|
|
" with fs.open(current_path, mode=\"rb\") as file_in:\n",
|
|
|
|
|
" df = pd.read_csv(file_in)\n",
|
|
|
|
|
" # the pattern of the name is df1xxx\n",
|
|
|
|
|
" nom_dataframe = df_prefix + re.search(r'\\/(\\d+)\\/(\\d+)([a-zA-Z_]+)\\.csv$', current_path).group(3)\n",
|
|
|
|
|
" globals()[nom_dataframe] = df"
|
|
|
|
|
]
|
|
|
|
|
},
|
|
|
|
|
{
|
|
|
|
|
"cell_type": "markdown",
|
|
|
|
|
"id": "4004c8bf-11d9-413d-bb42-2cb8ddde7716",
|
|
|
|
|
"metadata": {},
|
|
|
|
|
"source": [
|
|
|
|
|
"## Cleaning functions"
|
|
|
|
|
]
|
|
|
|
|
},
|
|
|
|
|
{
|
|
|
|
|
"cell_type": "code",
|
2024-02-12 11:08:24 +01:00
|
|
|
|
"execution_count": 213,
|
2024-02-04 16:02:50 +01:00
|
|
|
|
"id": "d237be96-8c86-4a91-b7a1-487e87a16c3d",
|
|
|
|
|
"metadata": {},
|
|
|
|
|
"outputs": [],
|
|
|
|
|
"source": [
|
|
|
|
|
"def cleaning_date(df, column_name):\n",
|
|
|
|
|
" \"\"\"\n",
|
|
|
|
|
" Nettoie la colonne spécifiée du DataFrame en convertissant les valeurs en datetime avec le format ISO8601.\n",
|
|
|
|
|
"\n",
|
|
|
|
|
" Parameters:\n",
|
|
|
|
|
" - df: DataFrame\n",
|
|
|
|
|
" Le DataFrame contenant la colonne à nettoyer.\n",
|
|
|
|
|
" - column_name: str\n",
|
|
|
|
|
" Le nom de la colonne à nettoyer.\n",
|
|
|
|
|
"\n",
|
|
|
|
|
" Returns:\n",
|
|
|
|
|
" - DataFrame\n",
|
|
|
|
|
" Le DataFrame modifié avec la colonne nettoyée.\n",
|
|
|
|
|
" \"\"\"\n",
|
|
|
|
|
" df[column_name] = pd.to_datetime(df[column_name], utc = True, format = 'ISO8601')\n",
|
|
|
|
|
" return df"
|
|
|
|
|
]
|
|
|
|
|
},
|
|
|
|
|
{
|
|
|
|
|
"cell_type": "markdown",
|
|
|
|
|
"id": "398804d8-2225-4fd3-bceb-75ab1588e359",
|
|
|
|
|
"metadata": {},
|
|
|
|
|
"source": [
|
|
|
|
|
"## Preprocessing"
|
|
|
|
|
]
|
|
|
|
|
},
|
|
|
|
|
{
|
|
|
|
|
"cell_type": "markdown",
|
|
|
|
|
"id": "568cb180-0dd9-4b27-aecb-05e4c3775ba6",
|
|
|
|
|
"metadata": {},
|
|
|
|
|
"source": [
|
|
|
|
|
"## customer_plus"
|
|
|
|
|
]
|
|
|
|
|
},
|
|
|
|
|
{
|
|
|
|
|
"cell_type": "code",
|
2024-02-12 11:08:24 +01:00
|
|
|
|
"execution_count": 214,
|
2024-02-04 16:02:50 +01:00
|
|
|
|
"id": "7e7b90ce-da54-4f00-bc34-64c543b0858f",
|
|
|
|
|
"metadata": {},
|
|
|
|
|
"outputs": [],
|
2024-02-05 22:03:49 +01:00
|
|
|
|
"source": [
|
|
|
|
|
"def preprocessing_customerplus(customerplus = None):\n",
|
|
|
|
|
"\n",
|
|
|
|
|
" customerplus_copy = customerplus.copy()\n",
|
|
|
|
|
" \n",
|
|
|
|
|
" # Passage en format date\n",
|
|
|
|
|
" cleaning_date(customerplus_copy, 'first_buying_date')\n",
|
|
|
|
|
" cleaning_date(customerplus_copy, 'last_visiting_date')\n",
|
|
|
|
|
" \n",
|
|
|
|
|
" # Selection des variables\n",
|
|
|
|
|
" customerplus_copy.drop(['lastname', 'firstname', 'email', 'civility', 'note', 'created_at', 'updated_at', 'deleted_at', 'extra', 'reference', 'extra_field', 'identifier', 'need_reload', 'preferred_category', 'preferred_supplier', 'preferred_formula', 'zipcode', 'last_visiting_date'], axis = 1, inplace=True)\n",
|
|
|
|
|
" customerplus_copy.rename(columns = {'id' : 'customer_id'}, inplace = True)\n",
|
|
|
|
|
"\n",
|
|
|
|
|
" return customerplus_copy\n"
|
|
|
|
|
]
|
|
|
|
|
},
|
|
|
|
|
{
|
|
|
|
|
"cell_type": "code",
|
2024-02-12 11:08:24 +01:00
|
|
|
|
"execution_count": 215,
|
2024-02-05 22:03:49 +01:00
|
|
|
|
"id": "03329e32-00a5-42c8-9470-75f7b6216ccd",
|
|
|
|
|
"metadata": {},
|
|
|
|
|
"outputs": [],
|
|
|
|
|
"source": [
|
|
|
|
|
"df1_customerplus_clean = preprocessing_customerplus(df1_customersplus)"
|
|
|
|
|
]
|
2024-02-04 16:02:50 +01:00
|
|
|
|
},
|
|
|
|
|
{
|
|
|
|
|
"cell_type": "markdown",
|
|
|
|
|
"id": "bade04b1-0cdf-4d10-bcca-7dc7e4831656",
|
|
|
|
|
"metadata": {},
|
|
|
|
|
"source": [
|
|
|
|
|
"## Ticket area"
|
|
|
|
|
]
|
|
|
|
|
},
|
|
|
|
|
{
|
|
|
|
|
"cell_type": "code",
|
2024-02-12 11:08:24 +01:00
|
|
|
|
"execution_count": 216,
|
2024-02-04 16:02:50 +01:00
|
|
|
|
"id": "b95464b1-26bc-4aac-84b4-45da83b92251",
|
|
|
|
|
"metadata": {},
|
|
|
|
|
"outputs": [],
|
|
|
|
|
"source": [
|
|
|
|
|
"# Fonction de nettoyage et selection\n",
|
|
|
|
|
"def preprocessing_tickets_area(tickets = None, purchases = None, suppliers = None, type_ofs = None):\n",
|
|
|
|
|
" # Base des tickets\n",
|
|
|
|
|
" tickets = tickets[['id', 'purchase_id', 'product_id', 'is_from_subscription', 'type_of', 'supplier_id']]\n",
|
|
|
|
|
" tickets.rename(columns = {'id' : 'ticket_id'}, inplace = True)\n",
|
|
|
|
|
"\n",
|
|
|
|
|
" # Base des fournisseurs\n",
|
|
|
|
|
" suppliers = suppliers[['id', 'name']]\n",
|
|
|
|
|
" suppliers.rename(columns = {'name' : 'supplier_name'}, inplace = True)\n",
|
2024-02-10 14:23:17 +01:00
|
|
|
|
" suppliers['supplier_name'] = suppliers['supplier_name'].fillna('')\n",
|
2024-02-04 16:02:50 +01:00
|
|
|
|
"\n",
|
|
|
|
|
" # Base des types de billets\n",
|
2024-02-05 22:03:49 +01:00
|
|
|
|
" type_ofs = type_ofs[['id', 'name', 'children']]\n",
|
|
|
|
|
" type_ofs.rename(columns = {'name' : 'type_of_ticket_name'}, inplace = True)\n",
|
2024-02-04 16:02:50 +01:00
|
|
|
|
"\n",
|
|
|
|
|
" # Base des achats\n",
|
|
|
|
|
" # Nettoyage de la date d'achat\n",
|
|
|
|
|
" cleaning_date(purchases, 'purchase_date')\n",
|
|
|
|
|
" # Selection des variables\n",
|
|
|
|
|
" purchases = purchases[['id', 'purchase_date', 'customer_id']]\n",
|
|
|
|
|
"\n",
|
|
|
|
|
" # Fusions \n",
|
|
|
|
|
" # Fusion avec fournisseurs\n",
|
|
|
|
|
" ticket_information = pd.merge(tickets, suppliers, left_on = 'supplier_id', right_on = 'id', how = 'inner')\n",
|
|
|
|
|
" ticket_information.drop(['supplier_id', 'id'], axis = 1, inplace=True)\n",
|
|
|
|
|
" \n",
|
2024-02-05 22:03:49 +01:00
|
|
|
|
" # Fusion avec type de tickets\n",
|
|
|
|
|
" ticket_information = pd.merge(ticket_information, type_ofs, left_on = 'type_of', right_on = 'id', how = 'inner')\n",
|
|
|
|
|
" ticket_information.drop(['type_of', 'id'], axis = 1, inplace=True)\n",
|
2024-02-04 16:02:50 +01:00
|
|
|
|
" \n",
|
|
|
|
|
" # Fusion avec achats\n",
|
|
|
|
|
" ticket_information = pd.merge(ticket_information, purchases, left_on = 'purchase_id', right_on = 'id', how = 'inner')\n",
|
2024-02-11 17:59:45 +01:00
|
|
|
|
" ticket_information.drop(['id'], axis = 1, inplace=True)\n",
|
2024-02-04 16:02:50 +01:00
|
|
|
|
"\n",
|
|
|
|
|
" return ticket_information"
|
|
|
|
|
]
|
|
|
|
|
},
|
|
|
|
|
{
|
|
|
|
|
"cell_type": "code",
|
2024-02-12 11:08:24 +01:00
|
|
|
|
"execution_count": 217,
|
2024-02-04 16:02:50 +01:00
|
|
|
|
"id": "3e1d2ba7-ff4f-48eb-93a8-2bb648c70396",
|
|
|
|
|
"metadata": {},
|
2024-02-10 23:19:35 +01:00
|
|
|
|
"outputs": [],
|
2024-02-04 16:02:50 +01:00
|
|
|
|
"source": [
|
|
|
|
|
"df1_ticket_information = preprocessing_tickets_area(tickets = df1_tickets, purchases = df1_purchases, suppliers = df1_suppliers, type_ofs = df1_type_ofs)"
|
|
|
|
|
]
|
|
|
|
|
},
|
|
|
|
|
{
|
|
|
|
|
"cell_type": "code",
|
2024-02-12 11:08:24 +01:00
|
|
|
|
"execution_count": 218,
|
2024-02-04 16:02:50 +01:00
|
|
|
|
"id": "4b18edfc-6450-4c6a-9e7b-ee5a5808c8c9",
|
|
|
|
|
"metadata": {},
|
2024-02-05 22:03:49 +01:00
|
|
|
|
"outputs": [
|
|
|
|
|
{
|
|
|
|
|
"data": {
|
|
|
|
|
"text/html": [
|
|
|
|
|
"<div>\n",
|
|
|
|
|
"<style scoped>\n",
|
|
|
|
|
" .dataframe tbody tr th:only-of-type {\n",
|
|
|
|
|
" vertical-align: middle;\n",
|
|
|
|
|
" }\n",
|
|
|
|
|
"\n",
|
|
|
|
|
" .dataframe tbody tr th {\n",
|
|
|
|
|
" vertical-align: top;\n",
|
|
|
|
|
" }\n",
|
|
|
|
|
"\n",
|
|
|
|
|
" .dataframe thead th {\n",
|
|
|
|
|
" text-align: right;\n",
|
|
|
|
|
" }\n",
|
|
|
|
|
"</style>\n",
|
|
|
|
|
"<table border=\"1\" class=\"dataframe\">\n",
|
|
|
|
|
" <thead>\n",
|
|
|
|
|
" <tr style=\"text-align: right;\">\n",
|
|
|
|
|
" <th></th>\n",
|
|
|
|
|
" <th>ticket_id</th>\n",
|
2024-02-11 17:59:45 +01:00
|
|
|
|
" <th>purchase_id</th>\n",
|
2024-02-05 22:03:49 +01:00
|
|
|
|
" <th>product_id</th>\n",
|
|
|
|
|
" <th>is_from_subscription</th>\n",
|
|
|
|
|
" <th>supplier_name</th>\n",
|
|
|
|
|
" <th>type_of_ticket_name</th>\n",
|
|
|
|
|
" <th>children</th>\n",
|
|
|
|
|
" <th>purchase_date</th>\n",
|
|
|
|
|
" <th>customer_id</th>\n",
|
|
|
|
|
" </tr>\n",
|
|
|
|
|
" </thead>\n",
|
|
|
|
|
" <tbody>\n",
|
|
|
|
|
" <tr>\n",
|
|
|
|
|
" <th>0</th>\n",
|
|
|
|
|
" <td>13070859</td>\n",
|
2024-02-11 17:59:45 +01:00
|
|
|
|
" <td>5107462</td>\n",
|
2024-02-05 22:03:49 +01:00
|
|
|
|
" <td>225251</td>\n",
|
|
|
|
|
" <td>False</td>\n",
|
|
|
|
|
" <td>vente en ligne</td>\n",
|
|
|
|
|
" <td>Atelier</td>\n",
|
|
|
|
|
" <td>pricing_formula</td>\n",
|
|
|
|
|
" <td>2018-12-28 14:47:50+00:00</td>\n",
|
|
|
|
|
" <td>48187</td>\n",
|
|
|
|
|
" </tr>\n",
|
|
|
|
|
" <tr>\n",
|
|
|
|
|
" <th>1</th>\n",
|
|
|
|
|
" <td>13070860</td>\n",
|
2024-02-11 17:59:45 +01:00
|
|
|
|
" <td>5107462</td>\n",
|
2024-02-05 22:03:49 +01:00
|
|
|
|
" <td>224914</td>\n",
|
|
|
|
|
" <td>False</td>\n",
|
|
|
|
|
" <td>vente en ligne</td>\n",
|
|
|
|
|
" <td>Atelier</td>\n",
|
|
|
|
|
" <td>pricing_formula</td>\n",
|
|
|
|
|
" <td>2018-12-28 14:47:50+00:00</td>\n",
|
|
|
|
|
" <td>48187</td>\n",
|
|
|
|
|
" </tr>\n",
|
|
|
|
|
" <tr>\n",
|
|
|
|
|
" <th>2</th>\n",
|
|
|
|
|
" <td>13070861</td>\n",
|
2024-02-11 17:59:45 +01:00
|
|
|
|
" <td>5107462</td>\n",
|
2024-02-05 22:03:49 +01:00
|
|
|
|
" <td>224914</td>\n",
|
|
|
|
|
" <td>False</td>\n",
|
|
|
|
|
" <td>vente en ligne</td>\n",
|
|
|
|
|
" <td>Atelier</td>\n",
|
|
|
|
|
" <td>pricing_formula</td>\n",
|
|
|
|
|
" <td>2018-12-28 14:47:50+00:00</td>\n",
|
|
|
|
|
" <td>48187</td>\n",
|
|
|
|
|
" </tr>\n",
|
|
|
|
|
" <tr>\n",
|
|
|
|
|
" <th>3</th>\n",
|
|
|
|
|
" <td>13070862</td>\n",
|
2024-02-11 17:59:45 +01:00
|
|
|
|
" <td>5107462</td>\n",
|
2024-02-05 22:03:49 +01:00
|
|
|
|
" <td>224914</td>\n",
|
|
|
|
|
" <td>False</td>\n",
|
|
|
|
|
" <td>vente en ligne</td>\n",
|
|
|
|
|
" <td>Atelier</td>\n",
|
|
|
|
|
" <td>pricing_formula</td>\n",
|
|
|
|
|
" <td>2018-12-28 14:47:50+00:00</td>\n",
|
|
|
|
|
" <td>48187</td>\n",
|
|
|
|
|
" </tr>\n",
|
|
|
|
|
" <tr>\n",
|
|
|
|
|
" <th>4</th>\n",
|
|
|
|
|
" <td>13070863</td>\n",
|
2024-02-11 17:59:45 +01:00
|
|
|
|
" <td>5107462</td>\n",
|
2024-02-05 22:03:49 +01:00
|
|
|
|
" <td>224914</td>\n",
|
|
|
|
|
" <td>False</td>\n",
|
|
|
|
|
" <td>vente en ligne</td>\n",
|
|
|
|
|
" <td>Atelier</td>\n",
|
|
|
|
|
" <td>pricing_formula</td>\n",
|
|
|
|
|
" <td>2018-12-28 14:47:50+00:00</td>\n",
|
|
|
|
|
" <td>48187</td>\n",
|
|
|
|
|
" </tr>\n",
|
|
|
|
|
" </tbody>\n",
|
|
|
|
|
"</table>\n",
|
|
|
|
|
"</div>"
|
|
|
|
|
],
|
|
|
|
|
"text/plain": [
|
2024-02-11 17:59:45 +01:00
|
|
|
|
" ticket_id purchase_id product_id is_from_subscription supplier_name \\\n",
|
|
|
|
|
"0 13070859 5107462 225251 False vente en ligne \n",
|
|
|
|
|
"1 13070860 5107462 224914 False vente en ligne \n",
|
|
|
|
|
"2 13070861 5107462 224914 False vente en ligne \n",
|
|
|
|
|
"3 13070862 5107462 224914 False vente en ligne \n",
|
|
|
|
|
"4 13070863 5107462 224914 False vente en ligne \n",
|
2024-02-05 22:03:49 +01:00
|
|
|
|
"\n",
|
2024-02-05 22:10:07 +01:00
|
|
|
|
" type_of_ticket_name children purchase_date customer_id \n",
|
|
|
|
|
"0 Atelier pricing_formula 2018-12-28 14:47:50+00:00 48187 \n",
|
|
|
|
|
"1 Atelier pricing_formula 2018-12-28 14:47:50+00:00 48187 \n",
|
|
|
|
|
"2 Atelier pricing_formula 2018-12-28 14:47:50+00:00 48187 \n",
|
|
|
|
|
"3 Atelier pricing_formula 2018-12-28 14:47:50+00:00 48187 \n",
|
|
|
|
|
"4 Atelier pricing_formula 2018-12-28 14:47:50+00:00 48187 "
|
2024-02-05 22:03:49 +01:00
|
|
|
|
]
|
|
|
|
|
},
|
2024-02-12 11:08:24 +01:00
|
|
|
|
"execution_count": 218,
|
2024-02-05 22:03:49 +01:00
|
|
|
|
"metadata": {},
|
|
|
|
|
"output_type": "execute_result"
|
|
|
|
|
}
|
|
|
|
|
],
|
2024-02-04 16:02:50 +01:00
|
|
|
|
"source": [
|
2024-02-05 22:10:07 +01:00
|
|
|
|
"df1_ticket_information.head()"
|
2024-02-04 16:02:50 +01:00
|
|
|
|
]
|
|
|
|
|
},
|
|
|
|
|
{
|
|
|
|
|
"cell_type": "markdown",
|
|
|
|
|
"id": "096e47f4-1d65-4575-989d-83227eedad2b",
|
|
|
|
|
"metadata": {},
|
|
|
|
|
"source": [
|
|
|
|
|
"## Target area"
|
|
|
|
|
]
|
|
|
|
|
},
|
|
|
|
|
{
|
|
|
|
|
"cell_type": "code",
|
2024-02-12 11:08:24 +01:00
|
|
|
|
"execution_count": 219,
|
2024-02-04 16:02:50 +01:00
|
|
|
|
"id": "baed146a-9d3a-4397-a812-3d50c9a2f038",
|
|
|
|
|
"metadata": {},
|
|
|
|
|
"outputs": [],
|
|
|
|
|
"source": [
|
|
|
|
|
"def preprocessing_target_area(targets = None, target_types = None, customer_target_mappings = None):\n",
|
|
|
|
|
" # Target.csv cleaning\n",
|
|
|
|
|
" targets = targets[[\"id\", \"target_type_id\", \"name\"]]\n",
|
|
|
|
|
" targets.rename(columns = {'id' : 'target_id' , 'name' : 'target_name'}, inplace = True)\n",
|
|
|
|
|
" \n",
|
|
|
|
|
" # target_type cleaning\n",
|
|
|
|
|
" target_types = target_types[[\"id\",\"is_import\",\"name\"]].add_prefix(\"target_type_\")\n",
|
|
|
|
|
" \n",
|
|
|
|
|
" #customer_target_mappings cleaning\n",
|
|
|
|
|
" customer_target_mappings = customer_target_mappings[[\"id\", \"customer_id\", \"target_id\"]]\n",
|
|
|
|
|
" \n",
|
|
|
|
|
" # Merge target et target_type\n",
|
|
|
|
|
" targets_full = pd.merge(targets, target_types, left_on='target_type_id', right_on='target_type_id', how='inner')\n",
|
|
|
|
|
" targets_full.drop(['target_type_id'], axis = 1, inplace=True)\n",
|
|
|
|
|
" \n",
|
|
|
|
|
" # Merge\n",
|
|
|
|
|
" targets_full = pd.merge(customer_target_mappings, targets_full, left_on='target_id', right_on='target_id', how='inner')\n",
|
|
|
|
|
" targets_full.drop(['target_id'], axis = 1, inplace=True)\n",
|
|
|
|
|
"\n",
|
|
|
|
|
" return targets_full"
|
|
|
|
|
]
|
|
|
|
|
},
|
|
|
|
|
{
|
|
|
|
|
"cell_type": "code",
|
2024-02-12 11:08:24 +01:00
|
|
|
|
"execution_count": 220,
|
2024-02-04 16:02:50 +01:00
|
|
|
|
"id": "5fbfd88b-b94c-489c-9201-670e96e453e7",
|
|
|
|
|
"metadata": {},
|
2024-02-10 23:19:35 +01:00
|
|
|
|
"outputs": [],
|
2024-02-04 16:02:50 +01:00
|
|
|
|
"source": [
|
|
|
|
|
"df1_target_information = preprocessing_target_area(targets = df1_targets, target_types = df1_target_types, customer_target_mappings = df1_customer_target_mappings)"
|
|
|
|
|
]
|
|
|
|
|
},
|
|
|
|
|
{
|
|
|
|
|
"cell_type": "markdown",
|
|
|
|
|
"id": "cdbb48b4-5e16-4ef4-8791-ed213d68d52f",
|
|
|
|
|
"metadata": {},
|
|
|
|
|
"source": [
|
|
|
|
|
"## Campaings area"
|
|
|
|
|
]
|
|
|
|
|
},
|
|
|
|
|
{
|
|
|
|
|
"cell_type": "code",
|
2024-02-12 11:08:24 +01:00
|
|
|
|
"execution_count": 221,
|
2024-02-04 16:02:50 +01:00
|
|
|
|
"id": "d883cc7b-ac43-4485-b86f-eaf595fbad85",
|
|
|
|
|
"metadata": {},
|
|
|
|
|
"outputs": [],
|
|
|
|
|
"source": [
|
|
|
|
|
"def preprocessing_campaigns_area(campaign_stats = None, campaigns = None):\n",
|
|
|
|
|
" # campaign_stats cleaning \n",
|
|
|
|
|
" campaign_stats = campaign_stats[[\"id\", \"campaign_id\", \"customer_id\", \"opened_at\", \"sent_at\", \"delivered_at\"]]\n",
|
|
|
|
|
" cleaning_date(campaign_stats, 'opened_at')\n",
|
|
|
|
|
" cleaning_date(campaign_stats, 'sent_at')\n",
|
|
|
|
|
" cleaning_date(campaign_stats, 'delivered_at')\n",
|
|
|
|
|
" \n",
|
|
|
|
|
" # campaigns cleaning\n",
|
|
|
|
|
" campaigns = campaigns[[\"id\", \"name\", \"service_id\", \"sent_at\"]].add_prefix(\"campaign_\")\n",
|
|
|
|
|
" cleaning_date(campaigns, 'campaign_sent_at')\n",
|
|
|
|
|
" \n",
|
|
|
|
|
" # Merge \n",
|
|
|
|
|
" campaigns_full = pd.merge(campaign_stats, campaigns, on = \"campaign_id\", how = \"left\")\n",
|
|
|
|
|
" campaigns_full.drop(['campaign_id'], axis = 1, inplace=True)\n",
|
|
|
|
|
"\n",
|
|
|
|
|
" return campaigns_full"
|
|
|
|
|
]
|
|
|
|
|
},
|
|
|
|
|
{
|
|
|
|
|
"cell_type": "code",
|
2024-02-12 11:08:24 +01:00
|
|
|
|
"execution_count": 222,
|
2024-02-04 16:02:50 +01:00
|
|
|
|
"id": "c8552dd6-52c5-4431-b43d-3cd6c578fd9f",
|
|
|
|
|
"metadata": {},
|
2024-02-10 23:19:35 +01:00
|
|
|
|
"outputs": [],
|
2024-02-05 12:51:35 +01:00
|
|
|
|
"source": [
|
|
|
|
|
"df1_campaigns_information = preprocessing_campaigns_area(campaign_stats = df1_campaign_stats, campaigns = df1_campaigns)"
|
|
|
|
|
]
|
|
|
|
|
},
|
|
|
|
|
{
|
|
|
|
|
"cell_type": "code",
|
2024-02-12 11:08:24 +01:00
|
|
|
|
"execution_count": 223,
|
2024-02-05 12:51:35 +01:00
|
|
|
|
"id": "c24457e7-3cad-451a-a65b-7373b656bd6e",
|
2024-02-05 22:03:49 +01:00
|
|
|
|
"metadata": {
|
|
|
|
|
"scrolled": true
|
|
|
|
|
},
|
|
|
|
|
"outputs": [
|
|
|
|
|
{
|
|
|
|
|
"data": {
|
|
|
|
|
"text/html": [
|
|
|
|
|
"<div>\n",
|
|
|
|
|
"<style scoped>\n",
|
|
|
|
|
" .dataframe tbody tr th:only-of-type {\n",
|
|
|
|
|
" vertical-align: middle;\n",
|
|
|
|
|
" }\n",
|
|
|
|
|
"\n",
|
|
|
|
|
" .dataframe tbody tr th {\n",
|
|
|
|
|
" vertical-align: top;\n",
|
|
|
|
|
" }\n",
|
|
|
|
|
"\n",
|
|
|
|
|
" .dataframe thead th {\n",
|
|
|
|
|
" text-align: right;\n",
|
|
|
|
|
" }\n",
|
|
|
|
|
"</style>\n",
|
|
|
|
|
"<table border=\"1\" class=\"dataframe\">\n",
|
|
|
|
|
" <thead>\n",
|
|
|
|
|
" <tr style=\"text-align: right;\">\n",
|
|
|
|
|
" <th></th>\n",
|
|
|
|
|
" <th>id</th>\n",
|
|
|
|
|
" <th>customer_id</th>\n",
|
|
|
|
|
" <th>opened_at</th>\n",
|
|
|
|
|
" <th>sent_at</th>\n",
|
|
|
|
|
" <th>delivered_at</th>\n",
|
|
|
|
|
" <th>campaign_name</th>\n",
|
|
|
|
|
" <th>campaign_service_id</th>\n",
|
|
|
|
|
" <th>campaign_sent_at</th>\n",
|
|
|
|
|
" </tr>\n",
|
|
|
|
|
" </thead>\n",
|
|
|
|
|
" <tbody>\n",
|
|
|
|
|
" <tr>\n",
|
|
|
|
|
" <th>0</th>\n",
|
|
|
|
|
" <td>19793</td>\n",
|
|
|
|
|
" <td>112597</td>\n",
|
|
|
|
|
" <td>NaT</td>\n",
|
|
|
|
|
" <td>2021-03-28 16:01:09+00:00</td>\n",
|
|
|
|
|
" <td>2021-03-28 16:24:18+00:00</td>\n",
|
|
|
|
|
" <td>Le Mucem chez vous, gardons le lien #22</td>\n",
|
|
|
|
|
" <td>404</td>\n",
|
|
|
|
|
" <td>2021-03-27 23:00:00+00:00</td>\n",
|
|
|
|
|
" </tr>\n",
|
|
|
|
|
" <tr>\n",
|
|
|
|
|
" <th>1</th>\n",
|
|
|
|
|
" <td>14211</td>\n",
|
|
|
|
|
" <td>113666</td>\n",
|
|
|
|
|
" <td>NaT</td>\n",
|
|
|
|
|
" <td>2021-03-28 16:01:09+00:00</td>\n",
|
|
|
|
|
" <td>2021-03-28 16:21:02+00:00</td>\n",
|
|
|
|
|
" <td>Le Mucem chez vous, gardons le lien #22</td>\n",
|
|
|
|
|
" <td>404</td>\n",
|
|
|
|
|
" <td>2021-03-27 23:00:00+00:00</td>\n",
|
|
|
|
|
" </tr>\n",
|
|
|
|
|
" <tr>\n",
|
|
|
|
|
" <th>2</th>\n",
|
|
|
|
|
" <td>13150</td>\n",
|
|
|
|
|
" <td>280561</td>\n",
|
|
|
|
|
" <td>NaT</td>\n",
|
|
|
|
|
" <td>2021-03-28 16:00:59+00:00</td>\n",
|
|
|
|
|
" <td>2021-03-28 16:08:45+00:00</td>\n",
|
|
|
|
|
" <td>Le Mucem chez vous, gardons le lien #22</td>\n",
|
|
|
|
|
" <td>404</td>\n",
|
|
|
|
|
" <td>2021-03-27 23:00:00+00:00</td>\n",
|
|
|
|
|
" </tr>\n",
|
|
|
|
|
" <tr>\n",
|
|
|
|
|
" <th>3</th>\n",
|
|
|
|
|
" <td>7073</td>\n",
|
|
|
|
|
" <td>101007</td>\n",
|
|
|
|
|
" <td>2021-03-28 18:11:06+00:00</td>\n",
|
|
|
|
|
" <td>2021-03-28 16:00:59+00:00</td>\n",
|
|
|
|
|
" <td>2021-03-28 16:09:47+00:00</td>\n",
|
|
|
|
|
" <td>Le Mucem chez vous, gardons le lien #22</td>\n",
|
|
|
|
|
" <td>404</td>\n",
|
|
|
|
|
" <td>2021-03-27 23:00:00+00:00</td>\n",
|
|
|
|
|
" </tr>\n",
|
|
|
|
|
" <tr>\n",
|
|
|
|
|
" <th>4</th>\n",
|
|
|
|
|
" <td>5175</td>\n",
|
|
|
|
|
" <td>103972</td>\n",
|
|
|
|
|
" <td>NaT</td>\n",
|
|
|
|
|
" <td>2021-03-28 16:01:06+00:00</td>\n",
|
|
|
|
|
" <td>2021-03-28 16:05:03+00:00</td>\n",
|
|
|
|
|
" <td>Le Mucem chez vous, gardons le lien #22</td>\n",
|
|
|
|
|
" <td>404</td>\n",
|
|
|
|
|
" <td>2021-03-27 23:00:00+00:00</td>\n",
|
|
|
|
|
" </tr>\n",
|
|
|
|
|
" </tbody>\n",
|
|
|
|
|
"</table>\n",
|
|
|
|
|
"</div>"
|
|
|
|
|
],
|
|
|
|
|
"text/plain": [
|
2024-02-05 22:10:07 +01:00
|
|
|
|
" id customer_id opened_at sent_at \\\n",
|
|
|
|
|
"0 19793 112597 NaT 2021-03-28 16:01:09+00:00 \n",
|
|
|
|
|
"1 14211 113666 NaT 2021-03-28 16:01:09+00:00 \n",
|
|
|
|
|
"2 13150 280561 NaT 2021-03-28 16:00:59+00:00 \n",
|
|
|
|
|
"3 7073 101007 2021-03-28 18:11:06+00:00 2021-03-28 16:00:59+00:00 \n",
|
|
|
|
|
"4 5175 103972 NaT 2021-03-28 16:01:06+00:00 \n",
|
2024-02-05 22:03:49 +01:00
|
|
|
|
"\n",
|
2024-02-05 22:10:07 +01:00
|
|
|
|
" delivered_at campaign_name \\\n",
|
|
|
|
|
"0 2021-03-28 16:24:18+00:00 Le Mucem chez vous, gardons le lien #22 \n",
|
|
|
|
|
"1 2021-03-28 16:21:02+00:00 Le Mucem chez vous, gardons le lien #22 \n",
|
|
|
|
|
"2 2021-03-28 16:08:45+00:00 Le Mucem chez vous, gardons le lien #22 \n",
|
|
|
|
|
"3 2021-03-28 16:09:47+00:00 Le Mucem chez vous, gardons le lien #22 \n",
|
|
|
|
|
"4 2021-03-28 16:05:03+00:00 Le Mucem chez vous, gardons le lien #22 \n",
|
2024-02-05 22:03:49 +01:00
|
|
|
|
"\n",
|
2024-02-05 22:10:07 +01:00
|
|
|
|
" campaign_service_id campaign_sent_at \n",
|
|
|
|
|
"0 404 2021-03-27 23:00:00+00:00 \n",
|
|
|
|
|
"1 404 2021-03-27 23:00:00+00:00 \n",
|
|
|
|
|
"2 404 2021-03-27 23:00:00+00:00 \n",
|
|
|
|
|
"3 404 2021-03-27 23:00:00+00:00 \n",
|
|
|
|
|
"4 404 2021-03-27 23:00:00+00:00 "
|
2024-02-05 22:03:49 +01:00
|
|
|
|
]
|
|
|
|
|
},
|
2024-02-12 11:08:24 +01:00
|
|
|
|
"execution_count": 223,
|
2024-02-05 22:03:49 +01:00
|
|
|
|
"metadata": {},
|
|
|
|
|
"output_type": "execute_result"
|
|
|
|
|
}
|
|
|
|
|
],
|
2024-02-05 12:51:35 +01:00
|
|
|
|
"source": [
|
2024-02-05 22:10:07 +01:00
|
|
|
|
"df1_campaigns_information.head()"
|
2024-02-05 12:51:35 +01:00
|
|
|
|
]
|
|
|
|
|
},
|
|
|
|
|
{
|
|
|
|
|
"cell_type": "markdown",
|
|
|
|
|
"id": "56520a97-ede8-4920-a211-3b5b136af33d",
|
|
|
|
|
"metadata": {},
|
|
|
|
|
"source": [
|
2024-02-10 22:24:37 +01:00
|
|
|
|
"## Product area"
|
2024-02-05 12:51:35 +01:00
|
|
|
|
]
|
|
|
|
|
},
|
|
|
|
|
{
|
|
|
|
|
"cell_type": "markdown",
|
|
|
|
|
"id": "9782e9d3-ba20-46bf-8562-bd0969972ddc",
|
|
|
|
|
"metadata": {},
|
|
|
|
|
"source": [
|
|
|
|
|
"Some useful functions"
|
|
|
|
|
]
|
|
|
|
|
},
|
|
|
|
|
{
|
|
|
|
|
"cell_type": "code",
|
2024-02-12 11:08:24 +01:00
|
|
|
|
"execution_count": 224,
|
2024-02-05 12:51:35 +01:00
|
|
|
|
"id": "30488a40-1b38-4b9a-9d3b-26a0597c5e6d",
|
|
|
|
|
"metadata": {},
|
|
|
|
|
"outputs": [],
|
|
|
|
|
"source": [
|
|
|
|
|
"BUCKET = \"bdc2324-data\"\n",
|
|
|
|
|
"directory_path = '1'"
|
|
|
|
|
]
|
|
|
|
|
},
|
|
|
|
|
{
|
|
|
|
|
"cell_type": "code",
|
2024-02-12 11:08:24 +01:00
|
|
|
|
"execution_count": 225,
|
2024-02-05 12:51:35 +01:00
|
|
|
|
"id": "607eb4b4-eed9-4b50-b823-f75c116dd37c",
|
|
|
|
|
"metadata": {},
|
|
|
|
|
"outputs": [],
|
|
|
|
|
"source": [
|
|
|
|
|
"def display_databases(file_name):\n",
|
|
|
|
|
" \"\"\"\n",
|
|
|
|
|
" This function returns the file from s3 storage\n",
|
|
|
|
|
" \"\"\"\n",
|
|
|
|
|
" file_path = BUCKET + \"/\" + directory_path + \"/\" + file_name\n",
|
|
|
|
|
" print(\"File path : \", file_path)\n",
|
|
|
|
|
" with fs.open(file_path, mode=\"rb\") as file_in:\n",
|
|
|
|
|
" df = pd.read_csv(file_in, sep=\",\")\n",
|
|
|
|
|
" \n",
|
|
|
|
|
" print(\"Shape : \", df.shape)\n",
|
|
|
|
|
" return df\n",
|
|
|
|
|
"\n",
|
|
|
|
|
"\n",
|
|
|
|
|
"def remove_horodates(df):\n",
|
|
|
|
|
" \"\"\"\n",
|
|
|
|
|
" this function remove horodate columns like created_at and updated_at\n",
|
|
|
|
|
" \"\"\"\n",
|
|
|
|
|
" df = df.drop(columns = [\"created_at\", \"updated_at\"])\n",
|
|
|
|
|
" return df\n",
|
|
|
|
|
"\n",
|
|
|
|
|
"\n",
|
|
|
|
|
"def order_columns_id(df):\n",
|
|
|
|
|
" \"\"\"\n",
|
|
|
|
|
" this function puts all id columns at the beginning in order to read the dataset easier\n",
|
|
|
|
|
" \"\"\"\n",
|
|
|
|
|
" substring = 'id'\n",
|
|
|
|
|
" id_columns = [col for col in df.columns if substring in col]\n",
|
|
|
|
|
" remaining_col = [col for col in df.columns if substring not in col]\n",
|
|
|
|
|
" new_order = id_columns + remaining_col\n",
|
|
|
|
|
" return df[new_order]\n",
|
|
|
|
|
"\n",
|
|
|
|
|
"\n",
|
|
|
|
|
"def process_df_2(df):\n",
|
|
|
|
|
" \"\"\"\n",
|
|
|
|
|
" This function organizes dataframe\n",
|
|
|
|
|
" \"\"\"\n",
|
|
|
|
|
" df = remove_horodates(df)\n",
|
|
|
|
|
" print(\"Number of columns : \", len(df.columns))\n",
|
|
|
|
|
" df = order_columns_id(df)\n",
|
|
|
|
|
" print(\"Columns : \", df.columns)\n",
|
|
|
|
|
" return df\n",
|
|
|
|
|
"\n",
|
|
|
|
|
"def load_dataset(name):\n",
|
|
|
|
|
" \"\"\"\n",
|
|
|
|
|
" This function loads csv file\n",
|
|
|
|
|
" \"\"\"\n",
|
|
|
|
|
" df = display_databases(name)\n",
|
|
|
|
|
" df = process_df_2(df)\n",
|
|
|
|
|
" # drop na :\n",
|
|
|
|
|
" #df = df.dropna(axis=1, thresh=len(df))\n",
|
|
|
|
|
" # if identifier in table : delete it\n",
|
|
|
|
|
" if 'identifier' in df.columns:\n",
|
|
|
|
|
" df = df.drop(columns = 'identifier')\n",
|
|
|
|
|
" return df"
|
|
|
|
|
]
|
|
|
|
|
},
|
|
|
|
|
{
|
|
|
|
|
"cell_type": "markdown",
|
|
|
|
|
"id": "d23f28c0-bc95-438b-8d14-5b7bb6e267bd",
|
|
|
|
|
"metadata": {},
|
|
|
|
|
"source": [
|
|
|
|
|
"Create theme tables"
|
|
|
|
|
]
|
|
|
|
|
},
|
|
|
|
|
{
|
|
|
|
|
"cell_type": "code",
|
2024-02-12 11:08:24 +01:00
|
|
|
|
"execution_count": 226,
|
2024-02-05 12:51:35 +01:00
|
|
|
|
"id": "350b09b9-451f-4d47-81fe-f34b892db027",
|
|
|
|
|
"metadata": {},
|
|
|
|
|
"outputs": [],
|
|
|
|
|
"source": [
|
|
|
|
|
"def create_products_table():\n",
|
|
|
|
|
" # first merge products and categories\n",
|
|
|
|
|
" print(\"first merge products and categories\")\n",
|
|
|
|
|
" products = load_dataset(\"1products.csv\")\n",
|
|
|
|
|
" categories = load_dataset(\"1categories.csv\")\n",
|
|
|
|
|
" # Drop useless columns\n",
|
|
|
|
|
" products = products.drop(columns = ['apply_price', 'extra_field', 'amount_consumption'])\n",
|
|
|
|
|
" categories = categories.drop(columns = ['extra_field', 'quota'])\n",
|
|
|
|
|
"\n",
|
|
|
|
|
" #Merge\n",
|
|
|
|
|
" products_theme = products.merge(categories, how = 'left', left_on = 'category_id',\n",
|
|
|
|
|
" right_on = 'id', suffixes=('_products', '_categories'))\n",
|
|
|
|
|
" products_theme = products_theme.rename(columns = {\"name\" : \"name_categories\"})\n",
|
|
|
|
|
" \n",
|
|
|
|
|
" # Second merge products_theme and type of categories\n",
|
|
|
|
|
" print(\"Second merge products_theme and type of categories\")\n",
|
|
|
|
|
" type_of_categories = load_dataset(\"1type_of_categories.csv\")\n",
|
|
|
|
|
" type_of_categories = type_of_categories.drop(columns = 'id')\n",
|
|
|
|
|
" products_theme = products_theme.merge(type_of_categories, how = 'left', left_on = 'category_id',\n",
|
|
|
|
|
" right_on = 'category_id' )\n",
|
|
|
|
|
"\n",
|
|
|
|
|
" # Index cleaning\n",
|
|
|
|
|
" products_theme = products_theme.drop(columns = ['id_categories'])\n",
|
|
|
|
|
" products_theme = order_columns_id(products_theme)\n",
|
|
|
|
|
" return products_theme\n",
|
|
|
|
|
"\n",
|
|
|
|
|
"\n",
|
|
|
|
|
"def create_events_table():\n",
|
|
|
|
|
" # first merge events and seasons : \n",
|
|
|
|
|
" print(\"first merge events and seasons : \")\n",
|
|
|
|
|
" events = load_dataset(\"1events.csv\")\n",
|
|
|
|
|
" seasons = load_dataset(\"1seasons.csv\")\n",
|
|
|
|
|
"\n",
|
|
|
|
|
" # Drop useless columns\n",
|
|
|
|
|
" events = events.drop(columns = ['manual_added', 'is_display'])\n",
|
|
|
|
|
" seasons = seasons.drop(columns = ['start_date_time'])\n",
|
|
|
|
|
" \n",
|
|
|
|
|
" events_theme = events.merge(seasons, how = 'left', left_on = 'season_id', right_on = 'id', suffixes=('_events', '_seasons'))\n",
|
|
|
|
|
"\n",
|
|
|
|
|
" # Secondly merge events_theme and event_types\n",
|
|
|
|
|
" print(\"Secondly merge events_theme and event_types : \")\n",
|
|
|
|
|
" event_types = load_dataset(\"1event_types.csv\")\n",
|
|
|
|
|
" event_types = event_types.drop(columns = ['fidelity_delay'])\n",
|
|
|
|
|
" \n",
|
|
|
|
|
" events_theme = events_theme.merge(event_types, how = 'left', left_on = 'event_type_id', right_on = 'id', suffixes=('_events', '_event_type'))\n",
|
|
|
|
|
" events_theme = events_theme.rename(columns = {\"name\" : \"name_event_types\"})\n",
|
|
|
|
|
" events_theme = events_theme.drop(columns = 'id')\n",
|
|
|
|
|
"\n",
|
|
|
|
|
" # thirdly merge events_theme and facilities\n",
|
|
|
|
|
" print(\"thirdly merge events_theme and facilities : \")\n",
|
|
|
|
|
" facilities = load_dataset(\"1facilities.csv\")\n",
|
|
|
|
|
" facilities = facilities.drop(columns = ['fixed_capacity'])\n",
|
|
|
|
|
" \n",
|
|
|
|
|
" events_theme = events_theme.merge(facilities, how = 'left', left_on = 'facility_id', right_on = 'id', suffixes=('_events', '_facility'))\n",
|
|
|
|
|
" events_theme = events_theme.rename(columns = {\"name\" : \"name_facilities\", \"id_events\" : \"event_id\"})\n",
|
|
|
|
|
" events_theme = events_theme.drop(columns = 'id')\n",
|
|
|
|
|
"\n",
|
|
|
|
|
" # Index cleaning\n",
|
|
|
|
|
" events_theme = events_theme.drop(columns = ['id_seasons'])\n",
|
|
|
|
|
" events_theme = order_columns_id(events_theme)\n",
|
|
|
|
|
" return events_theme\n",
|
|
|
|
|
"\n",
|
|
|
|
|
"\n",
|
|
|
|
|
"def create_representations_table():\n",
|
|
|
|
|
" representations = load_dataset(\"1representations.csv\")\n",
|
|
|
|
|
" representations = representations.drop(columns = ['serial', 'open', 'satisfaction', 'is_display', 'expected_filling',\n",
|
|
|
|
|
" 'max_filling', 'extra_field', 'start_date_time', 'end_date_time', 'name',\n",
|
|
|
|
|
" 'representation_type_id'])\n",
|
|
|
|
|
" \n",
|
|
|
|
|
" representations_capacity = load_dataset(\"1representation_category_capacities.csv\")\n",
|
|
|
|
|
" representations_capacity = representations_capacity.drop(columns = ['expected_filling', 'max_filling'])\n",
|
|
|
|
|
"\n",
|
|
|
|
|
" representations_theme = representations.merge(representations_capacity, how='left',\n",
|
|
|
|
|
" left_on='id', right_on='representation_id',\n",
|
|
|
|
|
" suffixes=('_representation', '_representation_cap'))\n",
|
|
|
|
|
" # index cleaning\n",
|
|
|
|
|
" representations_theme = representations_theme.drop(columns = [\"id_representation\"])\n",
|
|
|
|
|
" representations_theme = order_columns_id(representations_theme)\n",
|
|
|
|
|
" return representations_theme"
|
|
|
|
|
]
|
|
|
|
|
},
|
|
|
|
|
{
|
|
|
|
|
"cell_type": "code",
|
2024-02-12 11:08:24 +01:00
|
|
|
|
"execution_count": 227,
|
2024-02-05 12:51:35 +01:00
|
|
|
|
"id": "0fccc8ef-e575-4857-a401-94a7274394df",
|
|
|
|
|
"metadata": {},
|
2024-02-04 16:02:50 +01:00
|
|
|
|
"outputs": [
|
|
|
|
|
{
|
2024-02-05 12:51:35 +01:00
|
|
|
|
"name": "stdout",
|
2024-02-04 16:02:50 +01:00
|
|
|
|
"output_type": "stream",
|
|
|
|
|
"text": [
|
2024-02-05 12:51:35 +01:00
|
|
|
|
"first merge products and categories\n",
|
|
|
|
|
"File path : bdc2324-data/1/1products.csv\n",
|
|
|
|
|
"Shape : (94803, 14)\n",
|
|
|
|
|
"Number of columns : 12\n",
|
|
|
|
|
"Columns : Index(['id', 'representation_id', 'pricing_formula_id', 'category_id',\n",
|
|
|
|
|
" 'products_group_id', 'product_pack_id', 'identifier', 'amount',\n",
|
|
|
|
|
" 'is_full_price', 'apply_price', 'extra_field', 'amount_consumption'],\n",
|
|
|
|
|
" dtype='object')\n",
|
|
|
|
|
"File path : bdc2324-data/1/1categories.csv\n",
|
|
|
|
|
"Shape : (27, 7)\n",
|
|
|
|
|
"Number of columns : 5\n",
|
|
|
|
|
"Columns : Index(['id', 'identifier', 'name', 'extra_field', 'quota'], dtype='object')\n",
|
|
|
|
|
"Second merge products_theme and type of categories\n",
|
|
|
|
|
"File path : bdc2324-data/1/1type_of_categories.csv\n",
|
|
|
|
|
"Shape : (5, 6)\n",
|
|
|
|
|
"Number of columns : 4\n",
|
|
|
|
|
"Columns : Index(['id', 'type_of_id', 'category_id', 'identifier'], dtype='object')\n"
|
2024-02-04 16:02:50 +01:00
|
|
|
|
]
|
2024-02-05 12:51:35 +01:00
|
|
|
|
},
|
|
|
|
|
{
|
|
|
|
|
"data": {
|
|
|
|
|
"text/html": [
|
|
|
|
|
"<div>\n",
|
|
|
|
|
"<style scoped>\n",
|
|
|
|
|
" .dataframe tbody tr th:only-of-type {\n",
|
|
|
|
|
" vertical-align: middle;\n",
|
|
|
|
|
" }\n",
|
|
|
|
|
"\n",
|
|
|
|
|
" .dataframe tbody tr th {\n",
|
|
|
|
|
" vertical-align: top;\n",
|
|
|
|
|
" }\n",
|
|
|
|
|
"\n",
|
|
|
|
|
" .dataframe thead th {\n",
|
|
|
|
|
" text-align: right;\n",
|
|
|
|
|
" }\n",
|
|
|
|
|
"</style>\n",
|
|
|
|
|
"<table border=\"1\" class=\"dataframe\">\n",
|
|
|
|
|
" <thead>\n",
|
|
|
|
|
" <tr style=\"text-align: right;\">\n",
|
|
|
|
|
" <th></th>\n",
|
|
|
|
|
" <th>id_products</th>\n",
|
|
|
|
|
" <th>representation_id</th>\n",
|
|
|
|
|
" <th>pricing_formula_id</th>\n",
|
|
|
|
|
" <th>category_id</th>\n",
|
|
|
|
|
" <th>products_group_id</th>\n",
|
|
|
|
|
" <th>product_pack_id</th>\n",
|
|
|
|
|
" <th>type_of_id</th>\n",
|
|
|
|
|
" <th>amount</th>\n",
|
|
|
|
|
" <th>is_full_price</th>\n",
|
|
|
|
|
" <th>name_categories</th>\n",
|
|
|
|
|
" </tr>\n",
|
|
|
|
|
" </thead>\n",
|
|
|
|
|
" <tbody>\n",
|
|
|
|
|
" <tr>\n",
|
|
|
|
|
" <th>0</th>\n",
|
|
|
|
|
" <td>10682</td>\n",
|
|
|
|
|
" <td>914</td>\n",
|
|
|
|
|
" <td>114</td>\n",
|
|
|
|
|
" <td>41</td>\n",
|
|
|
|
|
" <td>10655</td>\n",
|
|
|
|
|
" <td>1</td>\n",
|
|
|
|
|
" <td>NaN</td>\n",
|
|
|
|
|
" <td>9.0</td>\n",
|
|
|
|
|
" <td>False</td>\n",
|
|
|
|
|
" <td>indiv activité tr</td>\n",
|
|
|
|
|
" </tr>\n",
|
|
|
|
|
" <tr>\n",
|
|
|
|
|
" <th>1</th>\n",
|
|
|
|
|
" <td>478</td>\n",
|
|
|
|
|
" <td>273</td>\n",
|
|
|
|
|
" <td>131</td>\n",
|
|
|
|
|
" <td>1</td>\n",
|
|
|
|
|
" <td>471</td>\n",
|
|
|
|
|
" <td>1</td>\n",
|
|
|
|
|
" <td>12.0</td>\n",
|
|
|
|
|
" <td>9.5</td>\n",
|
|
|
|
|
" <td>False</td>\n",
|
|
|
|
|
" <td>indiv entrées tp</td>\n",
|
|
|
|
|
" </tr>\n",
|
|
|
|
|
" <tr>\n",
|
|
|
|
|
" <th>2</th>\n",
|
|
|
|
|
" <td>20873</td>\n",
|
|
|
|
|
" <td>275</td>\n",
|
|
|
|
|
" <td>137</td>\n",
|
|
|
|
|
" <td>1</td>\n",
|
|
|
|
|
" <td>20825</td>\n",
|
|
|
|
|
" <td>1</td>\n",
|
|
|
|
|
" <td>12.0</td>\n",
|
|
|
|
|
" <td>11.5</td>\n",
|
|
|
|
|
" <td>False</td>\n",
|
|
|
|
|
" <td>indiv entrées tp</td>\n",
|
|
|
|
|
" </tr>\n",
|
|
|
|
|
" <tr>\n",
|
|
|
|
|
" <th>3</th>\n",
|
|
|
|
|
" <td>157142</td>\n",
|
|
|
|
|
" <td>82519</td>\n",
|
|
|
|
|
" <td>9</td>\n",
|
|
|
|
|
" <td>5</td>\n",
|
|
|
|
|
" <td>156773</td>\n",
|
|
|
|
|
" <td>1</td>\n",
|
|
|
|
|
" <td>NaN</td>\n",
|
|
|
|
|
" <td>8.0</td>\n",
|
|
|
|
|
" <td>False</td>\n",
|
|
|
|
|
" <td>indiv entrées tr</td>\n",
|
|
|
|
|
" </tr>\n",
|
|
|
|
|
" <tr>\n",
|
|
|
|
|
" <th>4</th>\n",
|
|
|
|
|
" <td>1341</td>\n",
|
|
|
|
|
" <td>9</td>\n",
|
|
|
|
|
" <td>93</td>\n",
|
|
|
|
|
" <td>1</td>\n",
|
|
|
|
|
" <td>1175</td>\n",
|
|
|
|
|
" <td>1</td>\n",
|
|
|
|
|
" <td>12.0</td>\n",
|
|
|
|
|
" <td>8.5</td>\n",
|
|
|
|
|
" <td>False</td>\n",
|
|
|
|
|
" <td>indiv entrées tp</td>\n",
|
|
|
|
|
" </tr>\n",
|
|
|
|
|
" </tbody>\n",
|
|
|
|
|
"</table>\n",
|
|
|
|
|
"</div>"
|
|
|
|
|
],
|
|
|
|
|
"text/plain": [
|
|
|
|
|
" id_products representation_id pricing_formula_id category_id \\\n",
|
|
|
|
|
"0 10682 914 114 41 \n",
|
|
|
|
|
"1 478 273 131 1 \n",
|
|
|
|
|
"2 20873 275 137 1 \n",
|
|
|
|
|
"3 157142 82519 9 5 \n",
|
|
|
|
|
"4 1341 9 93 1 \n",
|
|
|
|
|
"\n",
|
|
|
|
|
" products_group_id product_pack_id type_of_id amount is_full_price \\\n",
|
|
|
|
|
"0 10655 1 NaN 9.0 False \n",
|
|
|
|
|
"1 471 1 12.0 9.5 False \n",
|
|
|
|
|
"2 20825 1 12.0 11.5 False \n",
|
|
|
|
|
"3 156773 1 NaN 8.0 False \n",
|
|
|
|
|
"4 1175 1 12.0 8.5 False \n",
|
|
|
|
|
"\n",
|
|
|
|
|
" name_categories \n",
|
|
|
|
|
"0 indiv activité tr \n",
|
|
|
|
|
"1 indiv entrées tp \n",
|
|
|
|
|
"2 indiv entrées tp \n",
|
|
|
|
|
"3 indiv entrées tr \n",
|
|
|
|
|
"4 indiv entrées tp "
|
|
|
|
|
]
|
|
|
|
|
},
|
2024-02-12 11:08:24 +01:00
|
|
|
|
"execution_count": 227,
|
2024-02-05 12:51:35 +01:00
|
|
|
|
"metadata": {},
|
|
|
|
|
"output_type": "execute_result"
|
2024-02-04 16:02:50 +01:00
|
|
|
|
}
|
|
|
|
|
],
|
|
|
|
|
"source": [
|
2024-02-05 12:51:35 +01:00
|
|
|
|
"products_theme = create_products_table()\n",
|
|
|
|
|
"products_theme.head()"
|
2024-02-04 16:02:50 +01:00
|
|
|
|
]
|
|
|
|
|
},
|
|
|
|
|
{
|
|
|
|
|
"cell_type": "code",
|
2024-02-12 11:08:24 +01:00
|
|
|
|
"execution_count": 228,
|
2024-02-05 12:51:35 +01:00
|
|
|
|
"id": "779d8aaf-6668-4f66-8852-847304407ea3",
|
2024-02-04 16:02:50 +01:00
|
|
|
|
"metadata": {},
|
|
|
|
|
"outputs": [
|
2024-02-05 12:51:35 +01:00
|
|
|
|
{
|
|
|
|
|
"name": "stdout",
|
|
|
|
|
"output_type": "stream",
|
|
|
|
|
"text": [
|
|
|
|
|
"first merge events and seasons : \n",
|
|
|
|
|
"File path : bdc2324-data/1/1events.csv\n",
|
|
|
|
|
"Shape : (1232, 12)\n",
|
|
|
|
|
"Number of columns : 10\n",
|
|
|
|
|
"Columns : Index(['id', 'season_id', 'facility_id', 'event_type_id', 'event_type_key_id',\n",
|
|
|
|
|
" 'facility_key_id', 'identifier', 'name', 'manual_added', 'is_display'],\n",
|
|
|
|
|
" dtype='object')\n",
|
|
|
|
|
"File path : bdc2324-data/1/1seasons.csv\n",
|
|
|
|
|
"Shape : (13, 6)\n",
|
|
|
|
|
"Number of columns : 4\n",
|
|
|
|
|
"Columns : Index(['id', 'identifier', 'name', 'start_date_time'], dtype='object')\n",
|
|
|
|
|
"Secondly merge events_theme and event_types : \n",
|
|
|
|
|
"File path : bdc2324-data/1/1event_types.csv\n",
|
|
|
|
|
"Shape : (9, 6)\n",
|
|
|
|
|
"Number of columns : 4\n",
|
|
|
|
|
"Columns : Index(['id', 'fidelity_delay', 'identifier', 'name'], dtype='object')\n",
|
|
|
|
|
"thirdly merge events_theme and facilities : \n",
|
|
|
|
|
"File path : bdc2324-data/1/1facilities.csv\n",
|
|
|
|
|
"Shape : (2, 7)\n",
|
|
|
|
|
"Number of columns : 5\n",
|
|
|
|
|
"Columns : Index(['id', 'street_id', 'identifier', 'name', 'fixed_capacity'], dtype='object')\n"
|
|
|
|
|
]
|
|
|
|
|
},
|
2024-02-04 16:02:50 +01:00
|
|
|
|
{
|
|
|
|
|
"data": {
|
|
|
|
|
"text/html": [
|
|
|
|
|
"<div>\n",
|
|
|
|
|
"<style scoped>\n",
|
|
|
|
|
" .dataframe tbody tr th:only-of-type {\n",
|
|
|
|
|
" vertical-align: middle;\n",
|
|
|
|
|
" }\n",
|
|
|
|
|
"\n",
|
|
|
|
|
" .dataframe tbody tr th {\n",
|
|
|
|
|
" vertical-align: top;\n",
|
|
|
|
|
" }\n",
|
|
|
|
|
"\n",
|
|
|
|
|
" .dataframe thead th {\n",
|
|
|
|
|
" text-align: right;\n",
|
|
|
|
|
" }\n",
|
|
|
|
|
"</style>\n",
|
|
|
|
|
"<table border=\"1\" class=\"dataframe\">\n",
|
|
|
|
|
" <thead>\n",
|
|
|
|
|
" <tr style=\"text-align: right;\">\n",
|
|
|
|
|
" <th></th>\n",
|
2024-02-05 12:51:35 +01:00
|
|
|
|
" <th>event_id</th>\n",
|
|
|
|
|
" <th>season_id</th>\n",
|
|
|
|
|
" <th>facility_id</th>\n",
|
|
|
|
|
" <th>event_type_id</th>\n",
|
|
|
|
|
" <th>event_type_key_id</th>\n",
|
|
|
|
|
" <th>facility_key_id</th>\n",
|
|
|
|
|
" <th>street_id</th>\n",
|
|
|
|
|
" <th>name_events</th>\n",
|
|
|
|
|
" <th>name_seasons</th>\n",
|
|
|
|
|
" <th>name_event_types</th>\n",
|
|
|
|
|
" <th>name_facilities</th>\n",
|
2024-02-04 16:02:50 +01:00
|
|
|
|
" </tr>\n",
|
|
|
|
|
" </thead>\n",
|
|
|
|
|
" <tbody>\n",
|
|
|
|
|
" <tr>\n",
|
|
|
|
|
" <th>0</th>\n",
|
2024-02-05 12:51:35 +01:00
|
|
|
|
" <td>192</td>\n",
|
|
|
|
|
" <td>16</td>\n",
|
|
|
|
|
" <td>1</td>\n",
|
|
|
|
|
" <td>4</td>\n",
|
|
|
|
|
" <td>4</td>\n",
|
|
|
|
|
" <td>1</td>\n",
|
|
|
|
|
" <td>1</td>\n",
|
|
|
|
|
" <td>frontières</td>\n",
|
|
|
|
|
" <td>2018</td>\n",
|
|
|
|
|
" <td>spectacle vivant</td>\n",
|
|
|
|
|
" <td>mucem</td>\n",
|
2024-02-04 16:02:50 +01:00
|
|
|
|
" </tr>\n",
|
|
|
|
|
" <tr>\n",
|
|
|
|
|
" <th>1</th>\n",
|
2024-02-05 12:51:35 +01:00
|
|
|
|
" <td>30329</td>\n",
|
|
|
|
|
" <td>2767</td>\n",
|
|
|
|
|
" <td>1</td>\n",
|
|
|
|
|
" <td>5</td>\n",
|
|
|
|
|
" <td>5</td>\n",
|
|
|
|
|
" <td>1</td>\n",
|
|
|
|
|
" <td>1</td>\n",
|
|
|
|
|
" <td>visite guidée une autre histoire du monde (1h00)</td>\n",
|
|
|
|
|
" <td>2023</td>\n",
|
|
|
|
|
" <td>offre muséale groupe</td>\n",
|
|
|
|
|
" <td>mucem</td>\n",
|
2024-02-04 16:02:50 +01:00
|
|
|
|
" </tr>\n",
|
|
|
|
|
" <tr>\n",
|
|
|
|
|
" <th>2</th>\n",
|
2024-02-05 12:51:35 +01:00
|
|
|
|
" <td>161</td>\n",
|
|
|
|
|
" <td>16</td>\n",
|
|
|
|
|
" <td>1</td>\n",
|
|
|
|
|
" <td>2</td>\n",
|
|
|
|
|
" <td>2</td>\n",
|
|
|
|
|
" <td>1</td>\n",
|
|
|
|
|
" <td>1</td>\n",
|
|
|
|
|
" <td>visite contée les chercheurs d'or indiv</td>\n",
|
|
|
|
|
" <td>2018</td>\n",
|
|
|
|
|
" <td>offre muséale individuel</td>\n",
|
|
|
|
|
" <td>mucem</td>\n",
|
2024-02-04 16:02:50 +01:00
|
|
|
|
" </tr>\n",
|
|
|
|
|
" <tr>\n",
|
|
|
|
|
" <th>3</th>\n",
|
2024-02-05 12:51:35 +01:00
|
|
|
|
" <td>5957</td>\n",
|
|
|
|
|
" <td>582</td>\n",
|
|
|
|
|
" <td>1</td>\n",
|
|
|
|
|
" <td>4</td>\n",
|
|
|
|
|
" <td>4</td>\n",
|
|
|
|
|
" <td>1</td>\n",
|
|
|
|
|
" <td>1</td>\n",
|
|
|
|
|
" <td>we dreamt of utopia and we woke up screaming.</td>\n",
|
|
|
|
|
" <td>2021</td>\n",
|
|
|
|
|
" <td>spectacle vivant</td>\n",
|
|
|
|
|
" <td>mucem</td>\n",
|
2024-02-04 16:02:50 +01:00
|
|
|
|
" </tr>\n",
|
|
|
|
|
" <tr>\n",
|
|
|
|
|
" <th>4</th>\n",
|
2024-02-05 12:51:35 +01:00
|
|
|
|
" <td>8337</td>\n",
|
|
|
|
|
" <td>582</td>\n",
|
|
|
|
|
" <td>1</td>\n",
|
|
|
|
|
" <td>4</td>\n",
|
|
|
|
|
" <td>4</td>\n",
|
|
|
|
|
" <td>1</td>\n",
|
|
|
|
|
" <td>1</td>\n",
|
|
|
|
|
" <td>jeff koons épisodes 4</td>\n",
|
|
|
|
|
" <td>2021</td>\n",
|
|
|
|
|
" <td>spectacle vivant</td>\n",
|
|
|
|
|
" <td>mucem</td>\n",
|
2024-02-04 16:02:50 +01:00
|
|
|
|
" </tr>\n",
|
2024-02-05 12:51:35 +01:00
|
|
|
|
" </tbody>\n",
|
|
|
|
|
"</table>\n",
|
|
|
|
|
"</div>"
|
|
|
|
|
],
|
|
|
|
|
"text/plain": [
|
|
|
|
|
" event_id season_id facility_id event_type_id event_type_key_id \\\n",
|
|
|
|
|
"0 192 16 1 4 4 \n",
|
|
|
|
|
"1 30329 2767 1 5 5 \n",
|
|
|
|
|
"2 161 16 1 2 2 \n",
|
|
|
|
|
"3 5957 582 1 4 4 \n",
|
|
|
|
|
"4 8337 582 1 4 4 \n",
|
|
|
|
|
"\n",
|
|
|
|
|
" facility_key_id street_id \\\n",
|
|
|
|
|
"0 1 1 \n",
|
|
|
|
|
"1 1 1 \n",
|
|
|
|
|
"2 1 1 \n",
|
|
|
|
|
"3 1 1 \n",
|
|
|
|
|
"4 1 1 \n",
|
|
|
|
|
"\n",
|
|
|
|
|
" name_events name_seasons \\\n",
|
|
|
|
|
"0 frontières 2018 \n",
|
|
|
|
|
"1 visite guidée une autre histoire du monde (1h00) 2023 \n",
|
|
|
|
|
"2 visite contée les chercheurs d'or indiv 2018 \n",
|
|
|
|
|
"3 we dreamt of utopia and we woke up screaming. 2021 \n",
|
|
|
|
|
"4 jeff koons épisodes 4 2021 \n",
|
|
|
|
|
"\n",
|
|
|
|
|
" name_event_types name_facilities \n",
|
|
|
|
|
"0 spectacle vivant mucem \n",
|
|
|
|
|
"1 offre muséale groupe mucem \n",
|
|
|
|
|
"2 offre muséale individuel mucem \n",
|
|
|
|
|
"3 spectacle vivant mucem \n",
|
|
|
|
|
"4 spectacle vivant mucem "
|
|
|
|
|
]
|
|
|
|
|
},
|
2024-02-12 11:08:24 +01:00
|
|
|
|
"execution_count": 228,
|
2024-02-05 12:51:35 +01:00
|
|
|
|
"metadata": {},
|
|
|
|
|
"output_type": "execute_result"
|
|
|
|
|
}
|
|
|
|
|
],
|
|
|
|
|
"source": [
|
|
|
|
|
"events_theme= create_events_table()\n",
|
|
|
|
|
"events_theme.head()"
|
|
|
|
|
]
|
|
|
|
|
},
|
|
|
|
|
{
|
|
|
|
|
"cell_type": "code",
|
2024-02-12 11:08:24 +01:00
|
|
|
|
"execution_count": 229,
|
2024-02-05 12:51:35 +01:00
|
|
|
|
"id": "7714fa32-303b-4ea7-b174-3fd0fcab5af0",
|
|
|
|
|
"metadata": {},
|
|
|
|
|
"outputs": [
|
|
|
|
|
{
|
|
|
|
|
"name": "stdout",
|
|
|
|
|
"output_type": "stream",
|
|
|
|
|
"text": [
|
|
|
|
|
"File path : bdc2324-data/1/1representations.csv\n",
|
|
|
|
|
"Shape : (36095, 16)\n",
|
|
|
|
|
"Number of columns : 14\n",
|
|
|
|
|
"Columns : Index(['id', 'event_id', 'representation_type_id', 'identifier', 'serial',\n",
|
|
|
|
|
" 'start_date_time', 'open', 'satisfaction', 'end_date_time', 'name',\n",
|
|
|
|
|
" 'is_display', 'expected_filling', 'max_filling', 'extra_field'],\n",
|
|
|
|
|
" dtype='object')\n",
|
|
|
|
|
"File path : bdc2324-data/1/1representation_category_capacities.csv\n",
|
|
|
|
|
"Shape : (65241, 7)\n",
|
|
|
|
|
"Number of columns : 5\n",
|
|
|
|
|
"Columns : Index(['id', 'representation_id', 'category_id', 'expected_filling',\n",
|
|
|
|
|
" 'max_filling'],\n",
|
|
|
|
|
" dtype='object')\n"
|
|
|
|
|
]
|
|
|
|
|
},
|
|
|
|
|
{
|
|
|
|
|
"data": {
|
|
|
|
|
"text/html": [
|
|
|
|
|
"<div>\n",
|
|
|
|
|
"<style scoped>\n",
|
|
|
|
|
" .dataframe tbody tr th:only-of-type {\n",
|
|
|
|
|
" vertical-align: middle;\n",
|
|
|
|
|
" }\n",
|
|
|
|
|
"\n",
|
|
|
|
|
" .dataframe tbody tr th {\n",
|
|
|
|
|
" vertical-align: top;\n",
|
|
|
|
|
" }\n",
|
|
|
|
|
"\n",
|
|
|
|
|
" .dataframe thead th {\n",
|
|
|
|
|
" text-align: right;\n",
|
|
|
|
|
" }\n",
|
|
|
|
|
"</style>\n",
|
|
|
|
|
"<table border=\"1\" class=\"dataframe\">\n",
|
|
|
|
|
" <thead>\n",
|
|
|
|
|
" <tr style=\"text-align: right;\">\n",
|
|
|
|
|
" <th></th>\n",
|
|
|
|
|
" <th>event_id</th>\n",
|
|
|
|
|
" <th>id_representation_cap</th>\n",
|
|
|
|
|
" <th>representation_id</th>\n",
|
|
|
|
|
" <th>category_id</th>\n",
|
2024-02-04 16:02:50 +01:00
|
|
|
|
" </tr>\n",
|
2024-02-05 12:51:35 +01:00
|
|
|
|
" </thead>\n",
|
|
|
|
|
" <tbody>\n",
|
2024-02-04 16:02:50 +01:00
|
|
|
|
" <tr>\n",
|
2024-02-05 12:51:35 +01:00
|
|
|
|
" <th>0</th>\n",
|
|
|
|
|
" <td>12384</td>\n",
|
|
|
|
|
" <td>123058</td>\n",
|
|
|
|
|
" <td>84820</td>\n",
|
|
|
|
|
" <td>2</td>\n",
|
2024-02-04 16:02:50 +01:00
|
|
|
|
" </tr>\n",
|
|
|
|
|
" <tr>\n",
|
2024-02-05 12:51:35 +01:00
|
|
|
|
" <th>1</th>\n",
|
|
|
|
|
" <td>37</td>\n",
|
|
|
|
|
" <td>2514</td>\n",
|
|
|
|
|
" <td>269</td>\n",
|
|
|
|
|
" <td>2</td>\n",
|
2024-02-04 16:02:50 +01:00
|
|
|
|
" </tr>\n",
|
|
|
|
|
" <tr>\n",
|
2024-02-05 12:51:35 +01:00
|
|
|
|
" <th>2</th>\n",
|
|
|
|
|
" <td>37</td>\n",
|
|
|
|
|
" <td>384</td>\n",
|
|
|
|
|
" <td>269</td>\n",
|
|
|
|
|
" <td>5</td>\n",
|
2024-02-04 16:02:50 +01:00
|
|
|
|
" </tr>\n",
|
|
|
|
|
" <tr>\n",
|
2024-02-05 12:51:35 +01:00
|
|
|
|
" <th>3</th>\n",
|
|
|
|
|
" <td>37</td>\n",
|
|
|
|
|
" <td>2515</td>\n",
|
|
|
|
|
" <td>269</td>\n",
|
|
|
|
|
" <td>10</td>\n",
|
2024-02-04 16:02:50 +01:00
|
|
|
|
" </tr>\n",
|
|
|
|
|
" <tr>\n",
|
2024-02-05 12:51:35 +01:00
|
|
|
|
" <th>4</th>\n",
|
|
|
|
|
" <td>37</td>\n",
|
|
|
|
|
" <td>383</td>\n",
|
|
|
|
|
" <td>269</td>\n",
|
|
|
|
|
" <td>1</td>\n",
|
2024-02-04 16:02:50 +01:00
|
|
|
|
" </tr>\n",
|
|
|
|
|
" </tbody>\n",
|
|
|
|
|
"</table>\n",
|
|
|
|
|
"</div>"
|
|
|
|
|
],
|
|
|
|
|
"text/plain": [
|
2024-02-05 12:51:35 +01:00
|
|
|
|
" event_id id_representation_cap representation_id category_id\n",
|
|
|
|
|
"0 12384 123058 84820 2\n",
|
|
|
|
|
"1 37 2514 269 2\n",
|
|
|
|
|
"2 37 384 269 5\n",
|
|
|
|
|
"3 37 2515 269 10\n",
|
|
|
|
|
"4 37 383 269 1"
|
2024-02-04 16:02:50 +01:00
|
|
|
|
]
|
|
|
|
|
},
|
2024-02-12 11:08:24 +01:00
|
|
|
|
"execution_count": 229,
|
2024-02-04 16:02:50 +01:00
|
|
|
|
"metadata": {},
|
|
|
|
|
"output_type": "execute_result"
|
|
|
|
|
}
|
|
|
|
|
],
|
|
|
|
|
"source": [
|
2024-02-05 12:51:35 +01:00
|
|
|
|
"representation_theme = create_representations_table()\n",
|
|
|
|
|
"representation_theme.head()"
|
|
|
|
|
]
|
|
|
|
|
},
|
|
|
|
|
{
|
|
|
|
|
"cell_type": "markdown",
|
|
|
|
|
"id": "8fa191d5-c867-4d4d-bbab-f29d7d91ce6a",
|
|
|
|
|
"metadata": {},
|
|
|
|
|
"source": [
|
|
|
|
|
"Create uniform product database "
|
2024-02-04 16:02:50 +01:00
|
|
|
|
]
|
|
|
|
|
},
|
|
|
|
|
{
|
|
|
|
|
"cell_type": "code",
|
2024-02-12 11:08:24 +01:00
|
|
|
|
"execution_count": 230,
|
2024-02-05 12:51:35 +01:00
|
|
|
|
"id": "15a62ed6-35e4-4abc-aeef-a7daeec0a4ba",
|
2024-02-04 16:02:50 +01:00
|
|
|
|
"metadata": {},
|
|
|
|
|
"outputs": [],
|
|
|
|
|
"source": [
|
2024-02-05 12:51:35 +01:00
|
|
|
|
"def uniform_product_df():\n",
|
|
|
|
|
" \"\"\"\n",
|
|
|
|
|
" This function returns the uniform product dataset\n",
|
|
|
|
|
" \"\"\"\n",
|
|
|
|
|
" print(\"Products theme columns : \", products_theme.columns)\n",
|
|
|
|
|
" print(\"\\n Representation theme columns : \", representation_theme.columns)\n",
|
|
|
|
|
" print(\"\\n Events theme columns : \", events_theme.columns)\n",
|
2024-02-04 16:02:50 +01:00
|
|
|
|
"\n",
|
2024-02-05 12:51:35 +01:00
|
|
|
|
" products_global = products_theme.merge(representation_theme, how='left',\n",
|
|
|
|
|
" on= [\"representation_id\", \"category_id\"])\n",
|
|
|
|
|
" \n",
|
|
|
|
|
" products_global = products_global.merge(events_theme, how='left', on='event_id',\n",
|
|
|
|
|
" suffixes = (\"_representation\", \"_event\"))\n",
|
|
|
|
|
" \n",
|
|
|
|
|
" products_global = order_columns_id(products_global)\n",
|
2024-02-04 16:02:50 +01:00
|
|
|
|
"\n",
|
2024-02-05 12:51:35 +01:00
|
|
|
|
" # remove useless columns \n",
|
2024-02-07 23:28:55 +01:00
|
|
|
|
" products_global = products_global.drop(columns = ['type_of_id']) # 'name_events', 'name_seasons', 'name_categories'\n",
|
2024-02-05 12:51:35 +01:00
|
|
|
|
" return products_global"
|
2024-02-04 16:02:50 +01:00
|
|
|
|
]
|
|
|
|
|
},
|
|
|
|
|
{
|
|
|
|
|
"cell_type": "code",
|
2024-02-12 11:08:24 +01:00
|
|
|
|
"execution_count": 231,
|
2024-02-05 12:51:35 +01:00
|
|
|
|
"id": "89dc9685-1de9-4ce3-a6c0-8d7f1931a951",
|
2024-02-04 16:02:50 +01:00
|
|
|
|
"metadata": {},
|
|
|
|
|
"outputs": [
|
|
|
|
|
{
|
2024-02-05 12:51:35 +01:00
|
|
|
|
"name": "stdout",
|
2024-02-04 16:02:50 +01:00
|
|
|
|
"output_type": "stream",
|
|
|
|
|
"text": [
|
2024-02-05 12:51:35 +01:00
|
|
|
|
"Products theme columns : Index(['id_products', 'representation_id', 'pricing_formula_id', 'category_id',\n",
|
|
|
|
|
" 'products_group_id', 'product_pack_id', 'type_of_id', 'amount',\n",
|
|
|
|
|
" 'is_full_price', 'name_categories'],\n",
|
|
|
|
|
" dtype='object')\n",
|
2024-02-04 16:02:50 +01:00
|
|
|
|
"\n",
|
2024-02-05 12:51:35 +01:00
|
|
|
|
" Representation theme columns : Index(['event_id', 'id_representation_cap', 'representation_id',\n",
|
|
|
|
|
" 'category_id'],\n",
|
|
|
|
|
" dtype='object')\n",
|
2024-02-04 16:02:50 +01:00
|
|
|
|
"\n",
|
2024-02-05 12:51:35 +01:00
|
|
|
|
" Events theme columns : Index(['event_id', 'season_id', 'facility_id', 'event_type_id',\n",
|
|
|
|
|
" 'event_type_key_id', 'facility_key_id', 'street_id', 'name_events',\n",
|
|
|
|
|
" 'name_seasons', 'name_event_types', 'name_facilities'],\n",
|
|
|
|
|
" dtype='object')\n"
|
2024-02-04 16:02:50 +01:00
|
|
|
|
]
|
2024-02-05 12:51:35 +01:00
|
|
|
|
},
|
2024-02-04 16:02:50 +01:00
|
|
|
|
{
|
|
|
|
|
"data": {
|
|
|
|
|
"text/html": [
|
|
|
|
|
"<div>\n",
|
|
|
|
|
"<style scoped>\n",
|
|
|
|
|
" .dataframe tbody tr th:only-of-type {\n",
|
|
|
|
|
" vertical-align: middle;\n",
|
|
|
|
|
" }\n",
|
|
|
|
|
"\n",
|
|
|
|
|
" .dataframe tbody tr th {\n",
|
|
|
|
|
" vertical-align: top;\n",
|
|
|
|
|
" }\n",
|
|
|
|
|
"\n",
|
|
|
|
|
" .dataframe thead th {\n",
|
|
|
|
|
" text-align: right;\n",
|
|
|
|
|
" }\n",
|
|
|
|
|
"</style>\n",
|
|
|
|
|
"<table border=\"1\" class=\"dataframe\">\n",
|
|
|
|
|
" <thead>\n",
|
|
|
|
|
" <tr style=\"text-align: right;\">\n",
|
|
|
|
|
" <th></th>\n",
|
2024-02-05 12:51:35 +01:00
|
|
|
|
" <th>id_products</th>\n",
|
|
|
|
|
" <th>representation_id</th>\n",
|
|
|
|
|
" <th>pricing_formula_id</th>\n",
|
|
|
|
|
" <th>category_id</th>\n",
|
|
|
|
|
" <th>products_group_id</th>\n",
|
|
|
|
|
" <th>product_pack_id</th>\n",
|
|
|
|
|
" <th>event_id</th>\n",
|
|
|
|
|
" <th>id_representation_cap</th>\n",
|
|
|
|
|
" <th>season_id</th>\n",
|
|
|
|
|
" <th>facility_id</th>\n",
|
2024-02-07 23:28:55 +01:00
|
|
|
|
" <th>...</th>\n",
|
2024-02-05 12:51:35 +01:00
|
|
|
|
" <th>event_type_key_id</th>\n",
|
|
|
|
|
" <th>facility_key_id</th>\n",
|
|
|
|
|
" <th>street_id</th>\n",
|
|
|
|
|
" <th>amount</th>\n",
|
|
|
|
|
" <th>is_full_price</th>\n",
|
2024-02-07 23:28:55 +01:00
|
|
|
|
" <th>name_categories</th>\n",
|
|
|
|
|
" <th>name_events</th>\n",
|
|
|
|
|
" <th>name_seasons</th>\n",
|
2024-02-05 12:51:35 +01:00
|
|
|
|
" <th>name_event_types</th>\n",
|
|
|
|
|
" <th>name_facilities</th>\n",
|
2024-02-04 16:02:50 +01:00
|
|
|
|
" </tr>\n",
|
|
|
|
|
" </thead>\n",
|
|
|
|
|
" <tbody>\n",
|
|
|
|
|
" <tr>\n",
|
|
|
|
|
" <th>0</th>\n",
|
2024-02-05 12:51:35 +01:00
|
|
|
|
" <td>10682</td>\n",
|
|
|
|
|
" <td>914</td>\n",
|
|
|
|
|
" <td>114</td>\n",
|
|
|
|
|
" <td>41</td>\n",
|
|
|
|
|
" <td>10655</td>\n",
|
|
|
|
|
" <td>1</td>\n",
|
|
|
|
|
" <td>132</td>\n",
|
|
|
|
|
" <td>8789</td>\n",
|
2024-02-04 16:02:50 +01:00
|
|
|
|
" <td>4</td>\n",
|
2024-02-05 12:51:35 +01:00
|
|
|
|
" <td>1</td>\n",
|
2024-02-07 23:28:55 +01:00
|
|
|
|
" <td>...</td>\n",
|
2024-02-05 12:51:35 +01:00
|
|
|
|
" <td>5</td>\n",
|
|
|
|
|
" <td>1</td>\n",
|
|
|
|
|
" <td>1</td>\n",
|
|
|
|
|
" <td>9.0</td>\n",
|
|
|
|
|
" <td>False</td>\n",
|
2024-02-07 23:28:55 +01:00
|
|
|
|
" <td>indiv activité tr</td>\n",
|
|
|
|
|
" <td>visite-jeu \"le classico des minots\" (1h30)</td>\n",
|
|
|
|
|
" <td>2017</td>\n",
|
2024-02-05 12:51:35 +01:00
|
|
|
|
" <td>offre muséale individuel</td>\n",
|
|
|
|
|
" <td>mucem</td>\n",
|
2024-02-04 16:02:50 +01:00
|
|
|
|
" </tr>\n",
|
|
|
|
|
" <tr>\n",
|
|
|
|
|
" <th>1</th>\n",
|
2024-02-05 12:51:35 +01:00
|
|
|
|
" <td>478</td>\n",
|
|
|
|
|
" <td>273</td>\n",
|
|
|
|
|
" <td>131</td>\n",
|
|
|
|
|
" <td>1</td>\n",
|
|
|
|
|
" <td>471</td>\n",
|
|
|
|
|
" <td>1</td>\n",
|
|
|
|
|
" <td>37</td>\n",
|
|
|
|
|
" <td>390</td>\n",
|
|
|
|
|
" <td>2</td>\n",
|
|
|
|
|
" <td>1</td>\n",
|
2024-02-07 23:28:55 +01:00
|
|
|
|
" <td>...</td>\n",
|
2024-02-05 12:51:35 +01:00
|
|
|
|
" <td>2</td>\n",
|
|
|
|
|
" <td>1</td>\n",
|
|
|
|
|
" <td>1</td>\n",
|
|
|
|
|
" <td>9.5</td>\n",
|
|
|
|
|
" <td>False</td>\n",
|
2024-02-07 23:28:55 +01:00
|
|
|
|
" <td>indiv entrées tp</td>\n",
|
|
|
|
|
" <td>billet mucem picasso</td>\n",
|
|
|
|
|
" <td>2016</td>\n",
|
2024-02-05 12:51:35 +01:00
|
|
|
|
" <td>offre muséale individuel</td>\n",
|
|
|
|
|
" <td>mucem</td>\n",
|
2024-02-04 16:02:50 +01:00
|
|
|
|
" </tr>\n",
|
|
|
|
|
" <tr>\n",
|
|
|
|
|
" <th>2</th>\n",
|
2024-02-05 12:51:35 +01:00
|
|
|
|
" <td>20873</td>\n",
|
|
|
|
|
" <td>275</td>\n",
|
|
|
|
|
" <td>137</td>\n",
|
|
|
|
|
" <td>1</td>\n",
|
|
|
|
|
" <td>20825</td>\n",
|
|
|
|
|
" <td>1</td>\n",
|
|
|
|
|
" <td>37</td>\n",
|
|
|
|
|
" <td>395</td>\n",
|
|
|
|
|
" <td>2</td>\n",
|
|
|
|
|
" <td>1</td>\n",
|
2024-02-07 23:28:55 +01:00
|
|
|
|
" <td>...</td>\n",
|
2024-02-05 12:51:35 +01:00
|
|
|
|
" <td>2</td>\n",
|
|
|
|
|
" <td>1</td>\n",
|
|
|
|
|
" <td>1</td>\n",
|
|
|
|
|
" <td>11.5</td>\n",
|
|
|
|
|
" <td>False</td>\n",
|
2024-02-07 23:28:55 +01:00
|
|
|
|
" <td>indiv entrées tp</td>\n",
|
|
|
|
|
" <td>billet mucem picasso</td>\n",
|
|
|
|
|
" <td>2016</td>\n",
|
2024-02-05 12:51:35 +01:00
|
|
|
|
" <td>offre muséale individuel</td>\n",
|
|
|
|
|
" <td>mucem</td>\n",
|
2024-02-04 16:02:50 +01:00
|
|
|
|
" </tr>\n",
|
|
|
|
|
" <tr>\n",
|
|
|
|
|
" <th>3</th>\n",
|
2024-02-05 12:51:35 +01:00
|
|
|
|
" <td>157142</td>\n",
|
|
|
|
|
" <td>82519</td>\n",
|
|
|
|
|
" <td>9</td>\n",
|
2024-02-04 16:02:50 +01:00
|
|
|
|
" <td>5</td>\n",
|
2024-02-05 12:51:35 +01:00
|
|
|
|
" <td>156773</td>\n",
|
|
|
|
|
" <td>1</td>\n",
|
|
|
|
|
" <td>12365</td>\n",
|
|
|
|
|
" <td>120199</td>\n",
|
|
|
|
|
" <td>1754</td>\n",
|
|
|
|
|
" <td>1</td>\n",
|
2024-02-07 23:28:55 +01:00
|
|
|
|
" <td>...</td>\n",
|
2024-02-04 16:02:50 +01:00
|
|
|
|
" <td>4</td>\n",
|
2024-02-05 12:51:35 +01:00
|
|
|
|
" <td>1</td>\n",
|
|
|
|
|
" <td>1</td>\n",
|
|
|
|
|
" <td>8.0</td>\n",
|
|
|
|
|
" <td>False</td>\n",
|
2024-02-07 23:28:55 +01:00
|
|
|
|
" <td>indiv entrées tr</td>\n",
|
|
|
|
|
" <td>NaN</td>\n",
|
|
|
|
|
" <td>NaN</td>\n",
|
2024-02-05 12:51:35 +01:00
|
|
|
|
" <td>offre muséale individuel</td>\n",
|
|
|
|
|
" <td>mucem</td>\n",
|
2024-02-04 16:02:50 +01:00
|
|
|
|
" </tr>\n",
|
|
|
|
|
" <tr>\n",
|
|
|
|
|
" <th>4</th>\n",
|
2024-02-05 12:51:35 +01:00
|
|
|
|
" <td>1341</td>\n",
|
|
|
|
|
" <td>9</td>\n",
|
|
|
|
|
" <td>93</td>\n",
|
2024-02-04 16:02:50 +01:00
|
|
|
|
" <td>1</td>\n",
|
2024-02-05 12:51:35 +01:00
|
|
|
|
" <td>1175</td>\n",
|
2024-02-04 16:02:50 +01:00
|
|
|
|
" <td>1</td>\n",
|
2024-02-05 12:51:35 +01:00
|
|
|
|
" <td>8</td>\n",
|
|
|
|
|
" <td>21</td>\n",
|
|
|
|
|
" <td>4</td>\n",
|
2024-02-04 16:02:50 +01:00
|
|
|
|
" <td>1</td>\n",
|
2024-02-07 23:28:55 +01:00
|
|
|
|
" <td>...</td>\n",
|
2024-02-05 12:51:35 +01:00
|
|
|
|
" <td>6</td>\n",
|
2024-02-04 16:02:50 +01:00
|
|
|
|
" <td>1</td>\n",
|
|
|
|
|
" <td>1</td>\n",
|
2024-02-05 12:51:35 +01:00
|
|
|
|
" <td>8.5</td>\n",
|
|
|
|
|
" <td>False</td>\n",
|
2024-02-07 23:28:55 +01:00
|
|
|
|
" <td>indiv entrées tp</td>\n",
|
|
|
|
|
" <td>non défini</td>\n",
|
|
|
|
|
" <td>2017</td>\n",
|
2024-02-05 12:51:35 +01:00
|
|
|
|
" <td>non défini</td>\n",
|
|
|
|
|
" <td>mucem</td>\n",
|
2024-02-04 16:02:50 +01:00
|
|
|
|
" </tr>\n",
|
|
|
|
|
" </tbody>\n",
|
|
|
|
|
"</table>\n",
|
2024-02-07 23:28:55 +01:00
|
|
|
|
"<p>5 rows × 21 columns</p>\n",
|
2024-02-04 16:02:50 +01:00
|
|
|
|
"</div>"
|
|
|
|
|
],
|
|
|
|
|
"text/plain": [
|
2024-02-05 12:51:35 +01:00
|
|
|
|
" id_products representation_id pricing_formula_id category_id \\\n",
|
|
|
|
|
"0 10682 914 114 41 \n",
|
|
|
|
|
"1 478 273 131 1 \n",
|
|
|
|
|
"2 20873 275 137 1 \n",
|
|
|
|
|
"3 157142 82519 9 5 \n",
|
|
|
|
|
"4 1341 9 93 1 \n",
|
2024-02-04 16:02:50 +01:00
|
|
|
|
"\n",
|
2024-02-05 12:51:35 +01:00
|
|
|
|
" products_group_id product_pack_id event_id id_representation_cap \\\n",
|
|
|
|
|
"0 10655 1 132 8789 \n",
|
|
|
|
|
"1 471 1 37 390 \n",
|
|
|
|
|
"2 20825 1 37 395 \n",
|
|
|
|
|
"3 156773 1 12365 120199 \n",
|
|
|
|
|
"4 1175 1 8 21 \n",
|
2024-02-04 16:02:50 +01:00
|
|
|
|
"\n",
|
2024-02-07 23:28:55 +01:00
|
|
|
|
" season_id facility_id ... event_type_key_id facility_key_id street_id \\\n",
|
|
|
|
|
"0 4 1 ... 5 1 1 \n",
|
|
|
|
|
"1 2 1 ... 2 1 1 \n",
|
|
|
|
|
"2 2 1 ... 2 1 1 \n",
|
|
|
|
|
"3 1754 1 ... 4 1 1 \n",
|
|
|
|
|
"4 4 1 ... 6 1 1 \n",
|
|
|
|
|
"\n",
|
|
|
|
|
" amount is_full_price name_categories \\\n",
|
|
|
|
|
"0 9.0 False indiv activité tr \n",
|
|
|
|
|
"1 9.5 False indiv entrées tp \n",
|
|
|
|
|
"2 11.5 False indiv entrées tp \n",
|
|
|
|
|
"3 8.0 False indiv entrées tr \n",
|
|
|
|
|
"4 8.5 False indiv entrées tp \n",
|
|
|
|
|
"\n",
|
|
|
|
|
" name_events name_seasons \\\n",
|
|
|
|
|
"0 visite-jeu \"le classico des minots\" (1h30) 2017 \n",
|
|
|
|
|
"1 billet mucem picasso 2016 \n",
|
|
|
|
|
"2 billet mucem picasso 2016 \n",
|
|
|
|
|
"3 NaN NaN \n",
|
|
|
|
|
"4 non défini 2017 \n",
|
|
|
|
|
"\n",
|
|
|
|
|
" name_event_types name_facilities \n",
|
|
|
|
|
"0 offre muséale individuel mucem \n",
|
|
|
|
|
"1 offre muséale individuel mucem \n",
|
|
|
|
|
"2 offre muséale individuel mucem \n",
|
|
|
|
|
"3 offre muséale individuel mucem \n",
|
|
|
|
|
"4 non défini mucem \n",
|
2024-02-05 12:51:35 +01:00
|
|
|
|
"\n",
|
2024-02-07 23:28:55 +01:00
|
|
|
|
"[5 rows x 21 columns]"
|
2024-02-04 16:02:50 +01:00
|
|
|
|
]
|
|
|
|
|
},
|
2024-02-12 11:08:24 +01:00
|
|
|
|
"execution_count": 231,
|
2024-02-04 16:02:50 +01:00
|
|
|
|
"metadata": {},
|
|
|
|
|
"output_type": "execute_result"
|
|
|
|
|
}
|
|
|
|
|
],
|
|
|
|
|
"source": [
|
2024-02-05 12:51:35 +01:00
|
|
|
|
"products_global = uniform_product_df()\n",
|
|
|
|
|
"products_global.head()"
|
2024-02-04 16:02:50 +01:00
|
|
|
|
]
|
2024-02-05 12:51:35 +01:00
|
|
|
|
},
|
2024-02-06 22:09:08 +01:00
|
|
|
|
{
|
|
|
|
|
"cell_type": "code",
|
2024-02-12 11:08:24 +01:00
|
|
|
|
"execution_count": 232,
|
2024-02-06 22:09:08 +01:00
|
|
|
|
"id": "98f78cd5-b694-4cc6-b033-20170aa13e8d",
|
|
|
|
|
"metadata": {},
|
|
|
|
|
"outputs": [],
|
|
|
|
|
"source": [
|
|
|
|
|
"# Fusion liée au product\n",
|
2024-02-10 22:24:37 +01:00
|
|
|
|
"df1_products_purchased = pd.merge(df1_ticket_information, products_global, left_on = 'product_id', right_on = 'id_products', how = 'inner')\n",
|
|
|
|
|
"\n",
|
|
|
|
|
"# Selection des variables d'intérêts\n",
|
2024-02-12 11:08:24 +01:00
|
|
|
|
"df1_products_purchased_reduced = df1_products_purchased[['ticket_id', 'customer_id', 'purchase_id' ,'event_type_id', 'category_id', 'supplier_name', 'purchase_date', 'type_of_ticket_name', 'amount', 'children', 'is_full_price', 'name_event_types', 'name_facilities', 'name_categories', 'name_events', 'name_seasons']]"
|
2024-02-06 22:09:08 +01:00
|
|
|
|
]
|
|
|
|
|
},
|
2024-02-10 19:26:46 +01:00
|
|
|
|
{
|
|
|
|
|
"cell_type": "markdown",
|
|
|
|
|
"id": "d7c3668a-c016-4bd0-837e-04af328ff14f",
|
|
|
|
|
"metadata": {},
|
|
|
|
|
"source": [
|
|
|
|
|
"# Construction des variables explicatives"
|
|
|
|
|
]
|
|
|
|
|
},
|
2024-02-10 19:53:59 +01:00
|
|
|
|
{
|
|
|
|
|
"cell_type": "markdown",
|
2024-02-10 22:24:37 +01:00
|
|
|
|
"id": "314f1b7f-ae48-4c6f-8469-9ce879043243",
|
2024-02-10 19:53:59 +01:00
|
|
|
|
"metadata": {},
|
|
|
|
|
"source": [
|
2024-02-10 22:24:37 +01:00
|
|
|
|
"## KPI campaigns"
|
2024-02-10 19:53:59 +01:00
|
|
|
|
]
|
|
|
|
|
},
|
|
|
|
|
{
|
|
|
|
|
"cell_type": "code",
|
2024-02-12 11:08:24 +01:00
|
|
|
|
"execution_count": 233,
|
2024-02-10 22:24:37 +01:00
|
|
|
|
"id": "e2c88552-b863-47a2-be23-8d2898fb28bc",
|
|
|
|
|
"metadata": {},
|
|
|
|
|
"outputs": [],
|
|
|
|
|
"source": [
|
|
|
|
|
"def campaigns_kpi_function(campaigns_information = None):\n",
|
|
|
|
|
" # Nombre de campagnes de mails\n",
|
|
|
|
|
" nb_campaigns = campaigns_information[['customer_id', 'campaign_name']].groupby('customer_id').count().reset_index()\n",
|
|
|
|
|
" nb_campaigns.rename(columns = {'campaign_name' : 'nb_campaigns'}, inplace = True)\n",
|
|
|
|
|
" # Temps d'ouverture en min moyen \n",
|
|
|
|
|
" campaigns_information['time_to_open'] = campaigns_information['opened_at'] - campaigns_information['delivered_at']\n",
|
|
|
|
|
" time_to_open = campaigns_information[['customer_id', 'time_to_open']].groupby('customer_id').mean().reset_index()\n",
|
|
|
|
|
"\n",
|
|
|
|
|
" # Nombre de mail ouvert \n",
|
|
|
|
|
" opened_campaign = campaigns_information[['customer_id', 'campaign_name', 'opened_at']]\n",
|
|
|
|
|
" opened_campaign.dropna(subset=['opened_at'], inplace=True)\n",
|
|
|
|
|
" opened_campaign = opened_campaign[['customer_id', 'campaign_name']].groupby('customer_id').count().reset_index()\n",
|
|
|
|
|
" opened_campaign.rename(columns = {'campaign_name' : 'nb_campaigns_opened' }, inplace = True)\n",
|
|
|
|
|
"\n",
|
|
|
|
|
" # Fusion des indicateurs\n",
|
|
|
|
|
" campaigns_reduced = pd.merge(nb_campaigns, opened_campaign, on = 'customer_id', how = 'left')\n",
|
|
|
|
|
" campaigns_reduced = pd.merge(campaigns_reduced, time_to_open, on = 'customer_id', how = 'left')\n",
|
|
|
|
|
"\n",
|
|
|
|
|
" # Remplir les NaN : nb_campaigns_opened\n",
|
2024-02-11 17:59:45 +01:00
|
|
|
|
" campaigns_reduced['nb_campaigns_opened'].fillna(0)\n",
|
2024-02-10 22:24:37 +01:00
|
|
|
|
"\n",
|
|
|
|
|
" # Remplir les NaT : time_to_open (??)\n",
|
|
|
|
|
"\n",
|
|
|
|
|
" return campaigns_reduced\n",
|
|
|
|
|
" "
|
|
|
|
|
]
|
|
|
|
|
},
|
|
|
|
|
{
|
|
|
|
|
"cell_type": "code",
|
2024-02-12 11:08:24 +01:00
|
|
|
|
"execution_count": 234,
|
2024-02-10 22:24:37 +01:00
|
|
|
|
"id": "24537647-bc29-4777-9848-ac4120a4aa60",
|
|
|
|
|
"metadata": {},
|
2024-02-10 23:19:35 +01:00
|
|
|
|
"outputs": [],
|
2024-02-10 22:24:37 +01:00
|
|
|
|
"source": [
|
|
|
|
|
"df1_campaigns_kpi = campaigns_kpi_function(campaigns_information = df1_campaigns_information) "
|
|
|
|
|
]
|
|
|
|
|
},
|
|
|
|
|
{
|
|
|
|
|
"cell_type": "code",
|
2024-02-12 11:08:24 +01:00
|
|
|
|
"execution_count": 235,
|
2024-02-10 22:24:37 +01:00
|
|
|
|
"id": "6be2a9a6-056b-4e19-8c26-a18ba3df36b3",
|
2024-02-10 19:53:59 +01:00
|
|
|
|
"metadata": {},
|
|
|
|
|
"outputs": [
|
|
|
|
|
{
|
|
|
|
|
"data": {
|
|
|
|
|
"text/html": [
|
|
|
|
|
"<div>\n",
|
|
|
|
|
"<style scoped>\n",
|
|
|
|
|
" .dataframe tbody tr th:only-of-type {\n",
|
|
|
|
|
" vertical-align: middle;\n",
|
|
|
|
|
" }\n",
|
|
|
|
|
"\n",
|
|
|
|
|
" .dataframe tbody tr th {\n",
|
|
|
|
|
" vertical-align: top;\n",
|
|
|
|
|
" }\n",
|
|
|
|
|
"\n",
|
|
|
|
|
" .dataframe thead th {\n",
|
|
|
|
|
" text-align: right;\n",
|
|
|
|
|
" }\n",
|
|
|
|
|
"</style>\n",
|
|
|
|
|
"<table border=\"1\" class=\"dataframe\">\n",
|
|
|
|
|
" <thead>\n",
|
|
|
|
|
" <tr style=\"text-align: right;\">\n",
|
|
|
|
|
" <th></th>\n",
|
2024-02-10 22:24:37 +01:00
|
|
|
|
" <th>customer_id</th>\n",
|
|
|
|
|
" <th>nb_campaigns</th>\n",
|
|
|
|
|
" <th>nb_campaigns_opened</th>\n",
|
|
|
|
|
" <th>time_to_open</th>\n",
|
2024-02-10 19:53:59 +01:00
|
|
|
|
" </tr>\n",
|
|
|
|
|
" </thead>\n",
|
|
|
|
|
" <tbody>\n",
|
|
|
|
|
" <tr>\n",
|
|
|
|
|
" <th>0</th>\n",
|
|
|
|
|
" <td>2</td>\n",
|
2024-02-10 22:24:37 +01:00
|
|
|
|
" <td>4</td>\n",
|
2024-02-12 11:08:24 +01:00
|
|
|
|
" <td>NaN</td>\n",
|
2024-02-10 22:24:37 +01:00
|
|
|
|
" <td>NaT</td>\n",
|
2024-02-10 19:53:59 +01:00
|
|
|
|
" </tr>\n",
|
|
|
|
|
" <tr>\n",
|
|
|
|
|
" <th>1</th>\n",
|
2024-02-10 22:24:37 +01:00
|
|
|
|
" <td>3</td>\n",
|
|
|
|
|
" <td>222</td>\n",
|
|
|
|
|
" <td>124.0</td>\n",
|
|
|
|
|
" <td>1 days 00:28:30.169354838</td>\n",
|
2024-02-10 19:53:59 +01:00
|
|
|
|
" </tr>\n",
|
|
|
|
|
" <tr>\n",
|
|
|
|
|
" <th>2</th>\n",
|
2024-02-10 22:24:37 +01:00
|
|
|
|
" <td>4</td>\n",
|
|
|
|
|
" <td>7</td>\n",
|
|
|
|
|
" <td>7.0</td>\n",
|
|
|
|
|
" <td>1 days 04:31:01.428571428</td>\n",
|
2024-02-10 19:53:59 +01:00
|
|
|
|
" </tr>\n",
|
|
|
|
|
" <tr>\n",
|
|
|
|
|
" <th>3</th>\n",
|
2024-02-10 22:24:37 +01:00
|
|
|
|
" <td>5</td>\n",
|
|
|
|
|
" <td>4</td>\n",
|
2024-02-12 11:08:24 +01:00
|
|
|
|
" <td>NaN</td>\n",
|
2024-02-10 22:24:37 +01:00
|
|
|
|
" <td>NaT</td>\n",
|
|
|
|
|
" </tr>\n",
|
|
|
|
|
" <tr>\n",
|
|
|
|
|
" <th>4</th>\n",
|
2024-02-10 19:53:59 +01:00
|
|
|
|
" <td>6</td>\n",
|
2024-02-10 22:24:37 +01:00
|
|
|
|
" <td>20</td>\n",
|
2024-02-12 11:08:24 +01:00
|
|
|
|
" <td>NaN</td>\n",
|
2024-02-10 22:24:37 +01:00
|
|
|
|
" <td>NaT</td>\n",
|
2024-02-10 19:53:59 +01:00
|
|
|
|
" </tr>\n",
|
|
|
|
|
" </tbody>\n",
|
|
|
|
|
"</table>\n",
|
|
|
|
|
"</div>"
|
|
|
|
|
],
|
|
|
|
|
"text/plain": [
|
2024-02-10 22:24:37 +01:00
|
|
|
|
" customer_id nb_campaigns nb_campaigns_opened time_to_open\n",
|
2024-02-12 11:08:24 +01:00
|
|
|
|
"0 2 4 NaN NaT\n",
|
2024-02-10 22:24:37 +01:00
|
|
|
|
"1 3 222 124.0 1 days 00:28:30.169354838\n",
|
|
|
|
|
"2 4 7 7.0 1 days 04:31:01.428571428\n",
|
2024-02-12 11:08:24 +01:00
|
|
|
|
"3 5 4 NaN NaT\n",
|
|
|
|
|
"4 6 20 NaN NaT"
|
2024-02-10 19:53:59 +01:00
|
|
|
|
]
|
|
|
|
|
},
|
2024-02-12 11:08:24 +01:00
|
|
|
|
"execution_count": 235,
|
2024-02-10 19:53:59 +01:00
|
|
|
|
"metadata": {},
|
|
|
|
|
"output_type": "execute_result"
|
|
|
|
|
}
|
|
|
|
|
],
|
|
|
|
|
"source": [
|
2024-02-10 22:24:37 +01:00
|
|
|
|
"df1_campaigns_kpi.head()"
|
2024-02-10 19:53:59 +01:00
|
|
|
|
]
|
|
|
|
|
},
|
|
|
|
|
{
|
2024-02-10 22:24:37 +01:00
|
|
|
|
"cell_type": "markdown",
|
|
|
|
|
"id": "d4dcfbe0-c6ce-497e-b75e-dc9e938801b2",
|
|
|
|
|
"metadata": {},
|
|
|
|
|
"source": [
|
|
|
|
|
"## KPI tickets"
|
|
|
|
|
]
|
|
|
|
|
},
|
|
|
|
|
{
|
|
|
|
|
"cell_type": "code",
|
2024-02-12 11:08:24 +01:00
|
|
|
|
"execution_count": 236,
|
2024-02-10 22:24:37 +01:00
|
|
|
|
"id": "043303fe-e90f-4689-a2a9-5d690555a045",
|
|
|
|
|
"metadata": {},
|
|
|
|
|
"outputs": [],
|
|
|
|
|
"source": [
|
|
|
|
|
"def tickets_kpi_function(tickets_information = None):\n",
|
|
|
|
|
"\n",
|
|
|
|
|
" tickets_information_copy = tickets_information.copy()\n",
|
|
|
|
|
"\n",
|
|
|
|
|
" # Dummy : Canal de vente en ligne\n",
|
|
|
|
|
" liste_mots = ['en ligne', 'internet', 'web', 'net', 'vad', 'online'] # vad = vente à distance\n",
|
|
|
|
|
" tickets_information_copy['vente_internet'] = tickets_information_copy['supplier_name'].str.contains('|'.join(liste_mots), case=False).astype(int)\n",
|
|
|
|
|
"\n",
|
|
|
|
|
" # Proportion de vente en ligne\n",
|
|
|
|
|
" prop_vente_internet = tickets_information_copy[tickets_information_copy['vente_internet'] == 1].groupby(['customer_id', 'event_type_id'])['ticket_id'].count().reset_index()\n",
|
|
|
|
|
" prop_vente_internet.rename(columns = {'ticket_id' : 'nb_tickets_internet'}, inplace = True)\n",
|
2024-02-11 23:54:40 +01:00
|
|
|
|
"\n",
|
|
|
|
|
" # Average amount\n",
|
|
|
|
|
" avg_amount = (tickets_information_copy.groupby([\"event_type_id\", 'name_event_types'])\n",
|
|
|
|
|
" .agg({\"amount\" : \"mean\"}).reset_index()\n",
|
|
|
|
|
" .rename(columns = {'amount' : 'avg_amount'}))\n",
|
|
|
|
|
"\n",
|
2024-02-10 22:24:37 +01:00
|
|
|
|
" \n",
|
2024-02-11 17:59:45 +01:00
|
|
|
|
" tickets_kpi = (tickets_information_copy[['event_type_id', 'customer_id', 'purchase_id' ,'ticket_id','supplier_name', 'purchase_date', 'amount', 'vente_internet']]\n",
|
2024-02-10 22:24:37 +01:00
|
|
|
|
" .groupby(['customer_id', 'event_type_id']) \n",
|
|
|
|
|
" .agg({'ticket_id': 'count', \n",
|
2024-02-11 17:59:45 +01:00
|
|
|
|
" 'purchase_id' : 'nunique',\n",
|
2024-02-10 22:24:37 +01:00
|
|
|
|
" 'amount' : 'sum',\n",
|
|
|
|
|
" 'supplier_name': 'nunique',\n",
|
|
|
|
|
" 'vente_internet' : 'max',\n",
|
|
|
|
|
" 'purchase_date' : ['min', 'max']})\n",
|
|
|
|
|
" .reset_index()\n",
|
|
|
|
|
" )\n",
|
|
|
|
|
" \n",
|
|
|
|
|
" tickets_kpi.columns = tickets_kpi.columns.map('_'.join)\n",
|
|
|
|
|
" \n",
|
|
|
|
|
" tickets_kpi.rename(columns = {'ticket_id_count' : 'nb_tickets', \n",
|
2024-02-11 17:59:45 +01:00
|
|
|
|
" 'purchase_id_nunique' : 'nb_purchases',\n",
|
2024-02-10 22:24:37 +01:00
|
|
|
|
" 'amount_sum' : 'total_amount',\n",
|
|
|
|
|
" 'supplier_name_nunique' : 'nb_suppliers', \n",
|
|
|
|
|
" 'customer_id_' : 'customer_id',\n",
|
|
|
|
|
" 'event_type_id_' : 'event_type_id'}, inplace = True)\n",
|
|
|
|
|
" \n",
|
|
|
|
|
" tickets_kpi['time_between_purchase'] = tickets_kpi['purchase_date_max'] - tickets_kpi['purchase_date_min']\n",
|
2024-02-11 17:59:45 +01:00
|
|
|
|
" tickets_kpi['time_between_purchase'] = tickets_kpi['time_between_purchase'] / np.timedelta64(1, 'D') # En nombre de jours\n",
|
2024-02-10 22:24:37 +01:00
|
|
|
|
"\n",
|
2024-02-11 17:59:45 +01:00
|
|
|
|
" # Convertir date et en chiffre\n",
|
|
|
|
|
" max_date = tickets_kpi['purchase_date_max'].max()\n",
|
|
|
|
|
" tickets_kpi['purchase_date_max'] = (max_date - tickets_kpi['purchase_date_max']) / np.timedelta64(1, 'D')\n",
|
|
|
|
|
" tickets_kpi['purchase_date_min'] = (max_date - tickets_kpi['purchase_date_min']) / np.timedelta64(1, 'D')\n",
|
|
|
|
|
"\n",
|
|
|
|
|
" \n",
|
2024-02-10 22:24:37 +01:00
|
|
|
|
" tickets_kpi = tickets_kpi.merge(prop_vente_internet, on = ['customer_id', 'event_type_id'], how = 'left')\n",
|
|
|
|
|
" tickets_kpi['nb_tickets_internet'] = tickets_kpi['nb_tickets_internet'].fillna(0)\n",
|
2024-02-11 17:59:45 +01:00
|
|
|
|
"\n",
|
2024-02-11 23:54:40 +01:00
|
|
|
|
" tickets_kpi = tickets_kpi.merge(avg_amount, how='left', on= 'event_type_id')\n",
|
|
|
|
|
"\n",
|
2024-02-10 22:24:37 +01:00
|
|
|
|
" return tickets_kpi\n",
|
|
|
|
|
" "
|
|
|
|
|
]
|
|
|
|
|
},
|
|
|
|
|
{
|
|
|
|
|
"cell_type": "code",
|
2024-02-12 11:08:24 +01:00
|
|
|
|
"execution_count": 237,
|
2024-02-10 22:24:37 +01:00
|
|
|
|
"id": "5882234a-1ed5-4269-87a6-0d75613476e3",
|
|
|
|
|
"metadata": {},
|
|
|
|
|
"outputs": [],
|
|
|
|
|
"source": [
|
|
|
|
|
"df1_tickets_kpi = tickets_kpi_function(tickets_information = df1_products_purchased_reduced)"
|
|
|
|
|
]
|
|
|
|
|
},
|
|
|
|
|
{
|
|
|
|
|
"cell_type": "code",
|
2024-02-12 11:08:24 +01:00
|
|
|
|
"execution_count": 238,
|
2024-02-11 17:59:45 +01:00
|
|
|
|
"id": "5f2046cf-ffde-4521-91e7-b727b8bc17f5",
|
2024-02-10 19:53:59 +01:00
|
|
|
|
"metadata": {},
|
|
|
|
|
"outputs": [
|
|
|
|
|
{
|
|
|
|
|
"data": {
|
|
|
|
|
"text/html": [
|
|
|
|
|
"<div>\n",
|
|
|
|
|
"<style scoped>\n",
|
|
|
|
|
" .dataframe tbody tr th:only-of-type {\n",
|
|
|
|
|
" vertical-align: middle;\n",
|
|
|
|
|
" }\n",
|
|
|
|
|
"\n",
|
|
|
|
|
" .dataframe tbody tr th {\n",
|
|
|
|
|
" vertical-align: top;\n",
|
|
|
|
|
" }\n",
|
|
|
|
|
"\n",
|
|
|
|
|
" .dataframe thead th {\n",
|
|
|
|
|
" text-align: right;\n",
|
|
|
|
|
" }\n",
|
|
|
|
|
"</style>\n",
|
|
|
|
|
"<table border=\"1\" class=\"dataframe\">\n",
|
|
|
|
|
" <thead>\n",
|
|
|
|
|
" <tr style=\"text-align: right;\">\n",
|
|
|
|
|
" <th></th>\n",
|
|
|
|
|
" <th>customer_id</th>\n",
|
|
|
|
|
" <th>event_type_id</th>\n",
|
|
|
|
|
" <th>nb_tickets</th>\n",
|
2024-02-11 17:59:45 +01:00
|
|
|
|
" <th>nb_purchases</th>\n",
|
2024-02-10 22:24:37 +01:00
|
|
|
|
" <th>total_amount</th>\n",
|
|
|
|
|
" <th>nb_suppliers</th>\n",
|
|
|
|
|
" <th>vente_internet_max</th>\n",
|
|
|
|
|
" <th>purchase_date_min</th>\n",
|
|
|
|
|
" <th>purchase_date_max</th>\n",
|
|
|
|
|
" <th>time_between_purchase</th>\n",
|
|
|
|
|
" <th>nb_tickets_internet</th>\n",
|
2024-02-12 11:08:24 +01:00
|
|
|
|
" <th>name_event_types</th>\n",
|
|
|
|
|
" <th>avg_amount</th>\n",
|
2024-02-10 19:53:59 +01:00
|
|
|
|
" </tr>\n",
|
|
|
|
|
" </thead>\n",
|
|
|
|
|
" <tbody>\n",
|
|
|
|
|
" <tr>\n",
|
|
|
|
|
" <th>0</th>\n",
|
|
|
|
|
" <td>1</td>\n",
|
|
|
|
|
" <td>2</td>\n",
|
|
|
|
|
" <td>384226</td>\n",
|
2024-02-11 17:59:45 +01:00
|
|
|
|
" <td>194790</td>\n",
|
2024-02-10 22:24:37 +01:00
|
|
|
|
" <td>2686540.5</td>\n",
|
|
|
|
|
" <td>7</td>\n",
|
|
|
|
|
" <td>1</td>\n",
|
2024-02-11 17:59:45 +01:00
|
|
|
|
" <td>3262.190868</td>\n",
|
|
|
|
|
" <td>4.179306</td>\n",
|
|
|
|
|
" <td>3258.011562</td>\n",
|
2024-02-10 22:24:37 +01:00
|
|
|
|
" <td>51.0</td>\n",
|
2024-02-12 11:08:24 +01:00
|
|
|
|
" <td>offre muséale individuel</td>\n",
|
|
|
|
|
" <td>6.150659</td>\n",
|
2024-02-10 19:53:59 +01:00
|
|
|
|
" </tr>\n",
|
|
|
|
|
" <tr>\n",
|
2024-02-11 17:59:45 +01:00
|
|
|
|
" <th>1</th>\n",
|
2024-02-10 19:53:59 +01:00
|
|
|
|
" <td>1</td>\n",
|
2024-02-11 17:59:45 +01:00
|
|
|
|
" <td>4</td>\n",
|
|
|
|
|
" <td>453242</td>\n",
|
|
|
|
|
" <td>228945</td>\n",
|
|
|
|
|
" <td>3248965.5</td>\n",
|
2024-02-10 22:24:37 +01:00
|
|
|
|
" <td>6</td>\n",
|
|
|
|
|
" <td>1</td>\n",
|
2024-02-11 17:59:45 +01:00
|
|
|
|
" <td>3698.198229</td>\n",
|
|
|
|
|
" <td>5.221840</td>\n",
|
|
|
|
|
" <td>3692.976389</td>\n",
|
|
|
|
|
" <td>2988.0</td>\n",
|
2024-02-12 11:08:24 +01:00
|
|
|
|
" <td>spectacle vivant</td>\n",
|
|
|
|
|
" <td>7.762474</td>\n",
|
2024-02-10 19:53:59 +01:00
|
|
|
|
" </tr>\n",
|
|
|
|
|
" <tr>\n",
|
|
|
|
|
" <th>2</th>\n",
|
|
|
|
|
" <td>1</td>\n",
|
|
|
|
|
" <td>5</td>\n",
|
|
|
|
|
" <td>201750</td>\n",
|
2024-02-11 17:59:45 +01:00
|
|
|
|
" <td>107110</td>\n",
|
2024-02-10 22:24:37 +01:00
|
|
|
|
" <td>1459190.0</td>\n",
|
2024-02-10 19:53:59 +01:00
|
|
|
|
" <td>6</td>\n",
|
2024-02-10 22:24:37 +01:00
|
|
|
|
" <td>1</td>\n",
|
2024-02-11 17:59:45 +01:00
|
|
|
|
" <td>3803.369792</td>\n",
|
|
|
|
|
" <td>0.146331</td>\n",
|
|
|
|
|
" <td>3803.223461</td>\n",
|
2024-02-10 22:24:37 +01:00
|
|
|
|
" <td>9.0</td>\n",
|
2024-02-12 11:08:24 +01:00
|
|
|
|
" <td>offre muséale groupe</td>\n",
|
|
|
|
|
" <td>4.452618</td>\n",
|
2024-02-10 19:53:59 +01:00
|
|
|
|
" </tr>\n",
|
|
|
|
|
" <tr>\n",
|
2024-02-11 17:59:45 +01:00
|
|
|
|
" <th>3</th>\n",
|
|
|
|
|
" <td>1</td>\n",
|
2024-02-10 22:24:37 +01:00
|
|
|
|
" <td>6</td>\n",
|
2024-02-11 17:59:45 +01:00
|
|
|
|
" <td>217356</td>\n",
|
|
|
|
|
" <td>111786</td>\n",
|
|
|
|
|
" <td>1435871.5</td>\n",
|
|
|
|
|
" <td>5</td>\n",
|
|
|
|
|
" <td>1</td>\n",
|
|
|
|
|
" <td>2502.715509</td>\n",
|
|
|
|
|
" <td>1408.715532</td>\n",
|
|
|
|
|
" <td>1093.999977</td>\n",
|
|
|
|
|
" <td>5.0</td>\n",
|
2024-02-12 11:08:24 +01:00
|
|
|
|
" <td>formule adhésion</td>\n",
|
|
|
|
|
" <td>6.439463</td>\n",
|
2024-02-11 17:59:45 +01:00
|
|
|
|
" </tr>\n",
|
|
|
|
|
" <tr>\n",
|
|
|
|
|
" <th>4</th>\n",
|
|
|
|
|
" <td>2</td>\n",
|
|
|
|
|
" <td>2</td>\n",
|
|
|
|
|
" <td>143</td>\n",
|
|
|
|
|
" <td>143</td>\n",
|
2024-02-10 22:24:37 +01:00
|
|
|
|
" <td>0.0</td>\n",
|
|
|
|
|
" <td>1</td>\n",
|
2024-02-11 17:59:45 +01:00
|
|
|
|
" <td>0</td>\n",
|
|
|
|
|
" <td>2041.274549</td>\n",
|
|
|
|
|
" <td>1340.308160</td>\n",
|
|
|
|
|
" <td>700.966389</td>\n",
|
|
|
|
|
" <td>0.0</td>\n",
|
2024-02-12 11:08:24 +01:00
|
|
|
|
" <td>offre muséale individuel</td>\n",
|
|
|
|
|
" <td>6.150659</td>\n",
|
2024-02-10 19:53:59 +01:00
|
|
|
|
" </tr>\n",
|
|
|
|
|
" </tbody>\n",
|
|
|
|
|
"</table>\n",
|
|
|
|
|
"</div>"
|
|
|
|
|
],
|
|
|
|
|
"text/plain": [
|
2024-02-11 17:59:45 +01:00
|
|
|
|
" customer_id event_type_id nb_tickets nb_purchases total_amount \\\n",
|
|
|
|
|
"0 1 2 384226 194790 2686540.5 \n",
|
|
|
|
|
"1 1 4 453242 228945 3248965.5 \n",
|
|
|
|
|
"2 1 5 201750 107110 1459190.0 \n",
|
|
|
|
|
"3 1 6 217356 111786 1435871.5 \n",
|
|
|
|
|
"4 2 2 143 143 0.0 \n",
|
|
|
|
|
"\n",
|
|
|
|
|
" nb_suppliers vente_internet_max purchase_date_min purchase_date_max \\\n",
|
|
|
|
|
"0 7 1 3262.190868 4.179306 \n",
|
|
|
|
|
"1 6 1 3698.198229 5.221840 \n",
|
|
|
|
|
"2 6 1 3803.369792 0.146331 \n",
|
|
|
|
|
"3 5 1 2502.715509 1408.715532 \n",
|
|
|
|
|
"4 1 0 2041.274549 1340.308160 \n",
|
|
|
|
|
"\n",
|
2024-02-12 11:08:24 +01:00
|
|
|
|
" time_between_purchase nb_tickets_internet name_event_types \\\n",
|
|
|
|
|
"0 3258.011562 51.0 offre muséale individuel \n",
|
|
|
|
|
"1 3692.976389 2988.0 spectacle vivant \n",
|
|
|
|
|
"2 3803.223461 9.0 offre muséale groupe \n",
|
|
|
|
|
"3 1093.999977 5.0 formule adhésion \n",
|
|
|
|
|
"4 700.966389 0.0 offre muséale individuel \n",
|
|
|
|
|
"\n",
|
|
|
|
|
" avg_amount \n",
|
|
|
|
|
"0 6.150659 \n",
|
|
|
|
|
"1 7.762474 \n",
|
|
|
|
|
"2 4.452618 \n",
|
|
|
|
|
"3 6.439463 \n",
|
|
|
|
|
"4 6.150659 "
|
2024-02-10 19:53:59 +01:00
|
|
|
|
]
|
|
|
|
|
},
|
2024-02-12 11:08:24 +01:00
|
|
|
|
"execution_count": 238,
|
2024-02-10 19:53:59 +01:00
|
|
|
|
"metadata": {},
|
|
|
|
|
"output_type": "execute_result"
|
|
|
|
|
}
|
|
|
|
|
],
|
|
|
|
|
"source": [
|
2024-02-11 17:59:45 +01:00
|
|
|
|
"df1_tickets_kpi.head()"
|
|
|
|
|
]
|
|
|
|
|
},
|
|
|
|
|
{
|
|
|
|
|
"cell_type": "code",
|
2024-02-12 11:08:24 +01:00
|
|
|
|
"execution_count": 239,
|
2024-02-11 17:59:45 +01:00
|
|
|
|
"id": "a4a2311d-8a72-4030-afd5-218004d5d2a5",
|
|
|
|
|
"metadata": {},
|
|
|
|
|
"outputs": [],
|
|
|
|
|
"source": [
|
|
|
|
|
"# Exportation vers 'projet-bdc2324-team1'\n",
|
|
|
|
|
"BUCKET_OUT = \"projet-bdc2324-team1\"\n",
|
|
|
|
|
"FILE_KEY_OUT_S3 = \"0_Temp/Company 1 - Purchasing behaviour.csv\"\n",
|
|
|
|
|
"FILE_PATH_OUT_S3 = BUCKET_OUT + \"/\" + FILE_KEY_OUT_S3\n",
|
|
|
|
|
"\n",
|
|
|
|
|
"with fs.open(FILE_PATH_OUT_S3, 'w') as file_out:\n",
|
|
|
|
|
" df1_tickets_kpi.to_csv(file_out, index = False)"
|
2024-02-10 22:24:37 +01:00
|
|
|
|
]
|
|
|
|
|
},
|
|
|
|
|
{
|
|
|
|
|
"cell_type": "markdown",
|
|
|
|
|
"id": "f1d7f7ba-361b-467d-b375-b09c149185f7",
|
|
|
|
|
"metadata": {},
|
|
|
|
|
"source": [
|
|
|
|
|
"## Alexis' work"
|
2024-02-10 19:53:59 +01:00
|
|
|
|
]
|
|
|
|
|
},
|
2024-02-11 11:47:58 +01:00
|
|
|
|
{
|
|
|
|
|
"cell_type": "code",
|
2024-02-12 11:08:24 +01:00
|
|
|
|
"execution_count": 240,
|
|
|
|
|
"id": "484becad-0390-48a8-923b-b03a4facc7ae",
|
|
|
|
|
"metadata": {},
|
|
|
|
|
"outputs": [
|
|
|
|
|
{
|
|
|
|
|
"data": {
|
|
|
|
|
"text/html": [
|
|
|
|
|
"<div>\n",
|
|
|
|
|
"<style scoped>\n",
|
|
|
|
|
" .dataframe tbody tr th:only-of-type {\n",
|
|
|
|
|
" vertical-align: middle;\n",
|
|
|
|
|
" }\n",
|
|
|
|
|
"\n",
|
|
|
|
|
" .dataframe tbody tr th {\n",
|
|
|
|
|
" vertical-align: top;\n",
|
|
|
|
|
" }\n",
|
|
|
|
|
"\n",
|
|
|
|
|
" .dataframe thead th {\n",
|
|
|
|
|
" text-align: right;\n",
|
|
|
|
|
" }\n",
|
|
|
|
|
"</style>\n",
|
|
|
|
|
"<table border=\"1\" class=\"dataframe\">\n",
|
|
|
|
|
" <thead>\n",
|
|
|
|
|
" <tr style=\"text-align: right;\">\n",
|
|
|
|
|
" <th></th>\n",
|
|
|
|
|
" <th>ticket_id</th>\n",
|
|
|
|
|
" <th>customer_id</th>\n",
|
|
|
|
|
" <th>purchase_id</th>\n",
|
|
|
|
|
" <th>event_type_id</th>\n",
|
|
|
|
|
" <th>category_id</th>\n",
|
|
|
|
|
" <th>supplier_name</th>\n",
|
|
|
|
|
" <th>purchase_date</th>\n",
|
|
|
|
|
" <th>type_of_ticket_name</th>\n",
|
|
|
|
|
" <th>amount</th>\n",
|
|
|
|
|
" <th>children</th>\n",
|
|
|
|
|
" <th>is_full_price</th>\n",
|
|
|
|
|
" <th>name_event_types</th>\n",
|
|
|
|
|
" <th>name_facilities</th>\n",
|
|
|
|
|
" <th>name_categories</th>\n",
|
|
|
|
|
" <th>name_events</th>\n",
|
|
|
|
|
" <th>name_seasons</th>\n",
|
|
|
|
|
" </tr>\n",
|
|
|
|
|
" </thead>\n",
|
|
|
|
|
" <tbody>\n",
|
|
|
|
|
" <tr>\n",
|
|
|
|
|
" <th>0</th>\n",
|
|
|
|
|
" <td>13070859</td>\n",
|
|
|
|
|
" <td>48187</td>\n",
|
|
|
|
|
" <td>5107462</td>\n",
|
|
|
|
|
" <td>4</td>\n",
|
|
|
|
|
" <td>13</td>\n",
|
|
|
|
|
" <td>vente en ligne</td>\n",
|
|
|
|
|
" <td>2018-12-28 14:47:50+00:00</td>\n",
|
|
|
|
|
" <td>Atelier</td>\n",
|
|
|
|
|
" <td>8.0</td>\n",
|
|
|
|
|
" <td>pricing_formula</td>\n",
|
|
|
|
|
" <td>False</td>\n",
|
|
|
|
|
" <td>spectacle vivant</td>\n",
|
|
|
|
|
" <td>mucem</td>\n",
|
|
|
|
|
" <td>indiv prog enfant</td>\n",
|
|
|
|
|
" <td>l'école des magiciens</td>\n",
|
|
|
|
|
" <td>2018</td>\n",
|
|
|
|
|
" </tr>\n",
|
|
|
|
|
" <tr>\n",
|
|
|
|
|
" <th>1</th>\n",
|
|
|
|
|
" <td>13070860</td>\n",
|
|
|
|
|
" <td>48187</td>\n",
|
|
|
|
|
" <td>5107462</td>\n",
|
|
|
|
|
" <td>4</td>\n",
|
|
|
|
|
" <td>13</td>\n",
|
|
|
|
|
" <td>vente en ligne</td>\n",
|
|
|
|
|
" <td>2018-12-28 14:47:50+00:00</td>\n",
|
|
|
|
|
" <td>Atelier</td>\n",
|
|
|
|
|
" <td>4.0</td>\n",
|
|
|
|
|
" <td>pricing_formula</td>\n",
|
|
|
|
|
" <td>False</td>\n",
|
|
|
|
|
" <td>spectacle vivant</td>\n",
|
|
|
|
|
" <td>mucem</td>\n",
|
|
|
|
|
" <td>indiv prog enfant</td>\n",
|
|
|
|
|
" <td>l'école des magiciens</td>\n",
|
|
|
|
|
" <td>2018</td>\n",
|
|
|
|
|
" </tr>\n",
|
|
|
|
|
" <tr>\n",
|
|
|
|
|
" <th>2</th>\n",
|
|
|
|
|
" <td>13070861</td>\n",
|
|
|
|
|
" <td>48187</td>\n",
|
|
|
|
|
" <td>5107462</td>\n",
|
|
|
|
|
" <td>4</td>\n",
|
|
|
|
|
" <td>13</td>\n",
|
|
|
|
|
" <td>vente en ligne</td>\n",
|
|
|
|
|
" <td>2018-12-28 14:47:50+00:00</td>\n",
|
|
|
|
|
" <td>Atelier</td>\n",
|
|
|
|
|
" <td>4.0</td>\n",
|
|
|
|
|
" <td>pricing_formula</td>\n",
|
|
|
|
|
" <td>False</td>\n",
|
|
|
|
|
" <td>spectacle vivant</td>\n",
|
|
|
|
|
" <td>mucem</td>\n",
|
|
|
|
|
" <td>indiv prog enfant</td>\n",
|
|
|
|
|
" <td>l'école des magiciens</td>\n",
|
|
|
|
|
" <td>2018</td>\n",
|
|
|
|
|
" </tr>\n",
|
|
|
|
|
" <tr>\n",
|
|
|
|
|
" <th>3</th>\n",
|
|
|
|
|
" <td>13070862</td>\n",
|
|
|
|
|
" <td>48187</td>\n",
|
|
|
|
|
" <td>5107462</td>\n",
|
|
|
|
|
" <td>4</td>\n",
|
|
|
|
|
" <td>13</td>\n",
|
|
|
|
|
" <td>vente en ligne</td>\n",
|
|
|
|
|
" <td>2018-12-28 14:47:50+00:00</td>\n",
|
|
|
|
|
" <td>Atelier</td>\n",
|
|
|
|
|
" <td>4.0</td>\n",
|
|
|
|
|
" <td>pricing_formula</td>\n",
|
|
|
|
|
" <td>False</td>\n",
|
|
|
|
|
" <td>spectacle vivant</td>\n",
|
|
|
|
|
" <td>mucem</td>\n",
|
|
|
|
|
" <td>indiv prog enfant</td>\n",
|
|
|
|
|
" <td>l'école des magiciens</td>\n",
|
|
|
|
|
" <td>2018</td>\n",
|
|
|
|
|
" </tr>\n",
|
|
|
|
|
" <tr>\n",
|
|
|
|
|
" <th>4</th>\n",
|
|
|
|
|
" <td>13070863</td>\n",
|
|
|
|
|
" <td>48187</td>\n",
|
|
|
|
|
" <td>5107462</td>\n",
|
|
|
|
|
" <td>4</td>\n",
|
|
|
|
|
" <td>13</td>\n",
|
|
|
|
|
" <td>vente en ligne</td>\n",
|
|
|
|
|
" <td>2018-12-28 14:47:50+00:00</td>\n",
|
|
|
|
|
" <td>Atelier</td>\n",
|
|
|
|
|
" <td>4.0</td>\n",
|
|
|
|
|
" <td>pricing_formula</td>\n",
|
|
|
|
|
" <td>False</td>\n",
|
|
|
|
|
" <td>spectacle vivant</td>\n",
|
|
|
|
|
" <td>mucem</td>\n",
|
|
|
|
|
" <td>indiv prog enfant</td>\n",
|
|
|
|
|
" <td>l'école des magiciens</td>\n",
|
|
|
|
|
" <td>2018</td>\n",
|
|
|
|
|
" </tr>\n",
|
|
|
|
|
" <tr>\n",
|
|
|
|
|
" <th>...</th>\n",
|
|
|
|
|
" <td>...</td>\n",
|
|
|
|
|
" <td>...</td>\n",
|
|
|
|
|
" <td>...</td>\n",
|
|
|
|
|
" <td>...</td>\n",
|
|
|
|
|
" <td>...</td>\n",
|
|
|
|
|
" <td>...</td>\n",
|
|
|
|
|
" <td>...</td>\n",
|
|
|
|
|
" <td>...</td>\n",
|
|
|
|
|
" <td>...</td>\n",
|
|
|
|
|
" <td>...</td>\n",
|
|
|
|
|
" <td>...</td>\n",
|
|
|
|
|
" <td>...</td>\n",
|
|
|
|
|
" <td>...</td>\n",
|
|
|
|
|
" <td>...</td>\n",
|
|
|
|
|
" <td>...</td>\n",
|
|
|
|
|
" <td>...</td>\n",
|
|
|
|
|
" </tr>\n",
|
|
|
|
|
" <tr>\n",
|
|
|
|
|
" <th>1826667</th>\n",
|
|
|
|
|
" <td>20662815</td>\n",
|
|
|
|
|
" <td>1256135</td>\n",
|
|
|
|
|
" <td>8007697</td>\n",
|
|
|
|
|
" <td>5</td>\n",
|
|
|
|
|
" <td>1</td>\n",
|
|
|
|
|
" <td>vente en ligne</td>\n",
|
|
|
|
|
" <td>2023-11-08 17:23:54+00:00</td>\n",
|
|
|
|
|
" <td>Atelier</td>\n",
|
|
|
|
|
" <td>11.0</td>\n",
|
|
|
|
|
" <td>pricing_formula</td>\n",
|
|
|
|
|
" <td>False</td>\n",
|
|
|
|
|
" <td>offre muséale groupe</td>\n",
|
|
|
|
|
" <td>mucem</td>\n",
|
|
|
|
|
" <td>indiv entrées tp</td>\n",
|
|
|
|
|
" <td>NaN</td>\n",
|
|
|
|
|
" <td>2023</td>\n",
|
|
|
|
|
" </tr>\n",
|
|
|
|
|
" <tr>\n",
|
|
|
|
|
" <th>1826668</th>\n",
|
|
|
|
|
" <td>20662816</td>\n",
|
|
|
|
|
" <td>1256136</td>\n",
|
|
|
|
|
" <td>8007698</td>\n",
|
|
|
|
|
" <td>5</td>\n",
|
|
|
|
|
" <td>1</td>\n",
|
|
|
|
|
" <td>vente en ligne</td>\n",
|
|
|
|
|
" <td>2023-11-08 18:32:18+00:00</td>\n",
|
|
|
|
|
" <td>Atelier</td>\n",
|
|
|
|
|
" <td>11.0</td>\n",
|
|
|
|
|
" <td>pricing_formula</td>\n",
|
|
|
|
|
" <td>False</td>\n",
|
|
|
|
|
" <td>offre muséale groupe</td>\n",
|
|
|
|
|
" <td>mucem</td>\n",
|
|
|
|
|
" <td>indiv entrées tp</td>\n",
|
|
|
|
|
" <td>NaN</td>\n",
|
|
|
|
|
" <td>2023</td>\n",
|
|
|
|
|
" </tr>\n",
|
|
|
|
|
" <tr>\n",
|
|
|
|
|
" <th>1826669</th>\n",
|
|
|
|
|
" <td>20662817</td>\n",
|
|
|
|
|
" <td>1256136</td>\n",
|
|
|
|
|
" <td>8007698</td>\n",
|
|
|
|
|
" <td>5</td>\n",
|
|
|
|
|
" <td>1</td>\n",
|
|
|
|
|
" <td>vente en ligne</td>\n",
|
|
|
|
|
" <td>2023-11-08 18:32:18+00:00</td>\n",
|
|
|
|
|
" <td>Atelier</td>\n",
|
|
|
|
|
" <td>11.0</td>\n",
|
|
|
|
|
" <td>pricing_formula</td>\n",
|
|
|
|
|
" <td>False</td>\n",
|
|
|
|
|
" <td>offre muséale groupe</td>\n",
|
|
|
|
|
" <td>mucem</td>\n",
|
|
|
|
|
" <td>indiv entrées tp</td>\n",
|
|
|
|
|
" <td>NaN</td>\n",
|
|
|
|
|
" <td>2023</td>\n",
|
|
|
|
|
" </tr>\n",
|
|
|
|
|
" <tr>\n",
|
|
|
|
|
" <th>1826670</th>\n",
|
|
|
|
|
" <td>20662818</td>\n",
|
|
|
|
|
" <td>1256137</td>\n",
|
|
|
|
|
" <td>8007699</td>\n",
|
|
|
|
|
" <td>5</td>\n",
|
|
|
|
|
" <td>1</td>\n",
|
|
|
|
|
" <td>vente en ligne</td>\n",
|
|
|
|
|
" <td>2023-11-08 19:30:28+00:00</td>\n",
|
|
|
|
|
" <td>Atelier</td>\n",
|
|
|
|
|
" <td>11.0</td>\n",
|
|
|
|
|
" <td>pricing_formula</td>\n",
|
|
|
|
|
" <td>False</td>\n",
|
|
|
|
|
" <td>offre muséale groupe</td>\n",
|
|
|
|
|
" <td>mucem</td>\n",
|
|
|
|
|
" <td>indiv entrées tp</td>\n",
|
|
|
|
|
" <td>NaN</td>\n",
|
|
|
|
|
" <td>2023</td>\n",
|
|
|
|
|
" </tr>\n",
|
|
|
|
|
" <tr>\n",
|
|
|
|
|
" <th>1826671</th>\n",
|
|
|
|
|
" <td>20662819</td>\n",
|
|
|
|
|
" <td>1256137</td>\n",
|
|
|
|
|
" <td>8007699</td>\n",
|
|
|
|
|
" <td>5</td>\n",
|
|
|
|
|
" <td>1</td>\n",
|
|
|
|
|
" <td>vente en ligne</td>\n",
|
|
|
|
|
" <td>2023-11-08 19:30:28+00:00</td>\n",
|
|
|
|
|
" <td>Atelier</td>\n",
|
|
|
|
|
" <td>11.0</td>\n",
|
|
|
|
|
" <td>pricing_formula</td>\n",
|
|
|
|
|
" <td>False</td>\n",
|
|
|
|
|
" <td>offre muséale groupe</td>\n",
|
|
|
|
|
" <td>mucem</td>\n",
|
|
|
|
|
" <td>indiv entrées tp</td>\n",
|
|
|
|
|
" <td>NaN</td>\n",
|
|
|
|
|
" <td>2023</td>\n",
|
|
|
|
|
" </tr>\n",
|
|
|
|
|
" </tbody>\n",
|
|
|
|
|
"</table>\n",
|
|
|
|
|
"<p>1826672 rows × 16 columns</p>\n",
|
|
|
|
|
"</div>"
|
|
|
|
|
],
|
|
|
|
|
"text/plain": [
|
|
|
|
|
" ticket_id customer_id purchase_id event_type_id category_id \\\n",
|
|
|
|
|
"0 13070859 48187 5107462 4 13 \n",
|
|
|
|
|
"1 13070860 48187 5107462 4 13 \n",
|
|
|
|
|
"2 13070861 48187 5107462 4 13 \n",
|
|
|
|
|
"3 13070862 48187 5107462 4 13 \n",
|
|
|
|
|
"4 13070863 48187 5107462 4 13 \n",
|
|
|
|
|
"... ... ... ... ... ... \n",
|
|
|
|
|
"1826667 20662815 1256135 8007697 5 1 \n",
|
|
|
|
|
"1826668 20662816 1256136 8007698 5 1 \n",
|
|
|
|
|
"1826669 20662817 1256136 8007698 5 1 \n",
|
|
|
|
|
"1826670 20662818 1256137 8007699 5 1 \n",
|
|
|
|
|
"1826671 20662819 1256137 8007699 5 1 \n",
|
|
|
|
|
"\n",
|
|
|
|
|
" supplier_name purchase_date type_of_ticket_name amount \\\n",
|
|
|
|
|
"0 vente en ligne 2018-12-28 14:47:50+00:00 Atelier 8.0 \n",
|
|
|
|
|
"1 vente en ligne 2018-12-28 14:47:50+00:00 Atelier 4.0 \n",
|
|
|
|
|
"2 vente en ligne 2018-12-28 14:47:50+00:00 Atelier 4.0 \n",
|
|
|
|
|
"3 vente en ligne 2018-12-28 14:47:50+00:00 Atelier 4.0 \n",
|
|
|
|
|
"4 vente en ligne 2018-12-28 14:47:50+00:00 Atelier 4.0 \n",
|
|
|
|
|
"... ... ... ... ... \n",
|
|
|
|
|
"1826667 vente en ligne 2023-11-08 17:23:54+00:00 Atelier 11.0 \n",
|
|
|
|
|
"1826668 vente en ligne 2023-11-08 18:32:18+00:00 Atelier 11.0 \n",
|
|
|
|
|
"1826669 vente en ligne 2023-11-08 18:32:18+00:00 Atelier 11.0 \n",
|
|
|
|
|
"1826670 vente en ligne 2023-11-08 19:30:28+00:00 Atelier 11.0 \n",
|
|
|
|
|
"1826671 vente en ligne 2023-11-08 19:30:28+00:00 Atelier 11.0 \n",
|
|
|
|
|
"\n",
|
|
|
|
|
" children is_full_price name_event_types name_facilities \\\n",
|
|
|
|
|
"0 pricing_formula False spectacle vivant mucem \n",
|
|
|
|
|
"1 pricing_formula False spectacle vivant mucem \n",
|
|
|
|
|
"2 pricing_formula False spectacle vivant mucem \n",
|
|
|
|
|
"3 pricing_formula False spectacle vivant mucem \n",
|
|
|
|
|
"4 pricing_formula False spectacle vivant mucem \n",
|
|
|
|
|
"... ... ... ... ... \n",
|
|
|
|
|
"1826667 pricing_formula False offre muséale groupe mucem \n",
|
|
|
|
|
"1826668 pricing_formula False offre muséale groupe mucem \n",
|
|
|
|
|
"1826669 pricing_formula False offre muséale groupe mucem \n",
|
|
|
|
|
"1826670 pricing_formula False offre muséale groupe mucem \n",
|
|
|
|
|
"1826671 pricing_formula False offre muséale groupe mucem \n",
|
|
|
|
|
"\n",
|
|
|
|
|
" name_categories name_events name_seasons \n",
|
|
|
|
|
"0 indiv prog enfant l'école des magiciens 2018 \n",
|
|
|
|
|
"1 indiv prog enfant l'école des magiciens 2018 \n",
|
|
|
|
|
"2 indiv prog enfant l'école des magiciens 2018 \n",
|
|
|
|
|
"3 indiv prog enfant l'école des magiciens 2018 \n",
|
|
|
|
|
"4 indiv prog enfant l'école des magiciens 2018 \n",
|
|
|
|
|
"... ... ... ... \n",
|
|
|
|
|
"1826667 indiv entrées tp NaN 2023 \n",
|
|
|
|
|
"1826668 indiv entrées tp NaN 2023 \n",
|
|
|
|
|
"1826669 indiv entrées tp NaN 2023 \n",
|
|
|
|
|
"1826670 indiv entrées tp NaN 2023 \n",
|
|
|
|
|
"1826671 indiv entrées tp NaN 2023 \n",
|
|
|
|
|
"\n",
|
|
|
|
|
"[1826672 rows x 16 columns]"
|
|
|
|
|
]
|
|
|
|
|
},
|
|
|
|
|
"execution_count": 240,
|
|
|
|
|
"metadata": {},
|
|
|
|
|
"output_type": "execute_result"
|
|
|
|
|
}
|
|
|
|
|
],
|
|
|
|
|
"source": [
|
|
|
|
|
"# Look at categories\n",
|
|
|
|
|
"df1_products_purchased_reduced\n"
|
|
|
|
|
]
|
|
|
|
|
},
|
|
|
|
|
{
|
|
|
|
|
"cell_type": "code",
|
|
|
|
|
"execution_count": 241,
|
|
|
|
|
"id": "abb171a5-145f-40d0-91af-c51db4593d04",
|
|
|
|
|
"metadata": {},
|
|
|
|
|
"outputs": [
|
|
|
|
|
{
|
|
|
|
|
"data": {
|
|
|
|
|
"text/plain": [
|
|
|
|
|
"array([ 13, 16, 10, 1, 3, 2, 4, 6982, 5, 28, 8,\n",
|
|
|
|
|
" 41, 15, 39, 17, 9, 26, 2252, 6, 7, 1108, 2395,\n",
|
|
|
|
|
" 30, 2450, 14, 11])"
|
|
|
|
|
]
|
|
|
|
|
},
|
|
|
|
|
"execution_count": 241,
|
|
|
|
|
"metadata": {},
|
|
|
|
|
"output_type": "execute_result"
|
|
|
|
|
}
|
|
|
|
|
],
|
|
|
|
|
"source": [
|
|
|
|
|
"df1_products_purchased_reduced['category_id'].unique()"
|
|
|
|
|
]
|
|
|
|
|
},
|
|
|
|
|
{
|
|
|
|
|
"cell_type": "code",
|
|
|
|
|
"execution_count": 242,
|
|
|
|
|
"id": "750c5753-ea45-4deb-a934-994568185013",
|
|
|
|
|
"metadata": {},
|
|
|
|
|
"outputs": [
|
|
|
|
|
{
|
|
|
|
|
"data": {
|
|
|
|
|
"text/plain": [
|
|
|
|
|
"array(['indiv prog enfant', 'indiv activité enfant', 'indiv entrées fa',\n",
|
|
|
|
|
" 'indiv entrées tp', 'indiv prog tp', 'indiv entrées gr',\n",
|
|
|
|
|
" 'indiv prog tr', nan, 'indiv entrées tr', 'indiv activité tp',\n",
|
|
|
|
|
" 'indiv prog gr', 'indiv activité tr', 'groupe autonome entrées tp',\n",
|
|
|
|
|
" 'indiv activité gr', 'groupe forfait scolaire',\n",
|
|
|
|
|
" 'groupe forfait entrées gr', 'en nb entrées tp',\n",
|
|
|
|
|
" 'groupe autonome gr', 'groupe forfait entrées tr',\n",
|
|
|
|
|
" 'groupe autonome entrées gr', 'groupe forfait adulte',\n",
|
|
|
|
|
" 'groupe autonome adulte', 'en nb entrées gr',\n",
|
|
|
|
|
" 'groupe forfait etudiant', 'en nb entrées tr',\n",
|
|
|
|
|
" 'groupe autonome entrées tr'], dtype=object)"
|
|
|
|
|
]
|
|
|
|
|
},
|
|
|
|
|
"execution_count": 242,
|
|
|
|
|
"metadata": {},
|
|
|
|
|
"output_type": "execute_result"
|
|
|
|
|
}
|
|
|
|
|
],
|
|
|
|
|
"source": [
|
|
|
|
|
"df1_products_purchased_reduced['name_categories'].unique()"
|
|
|
|
|
]
|
|
|
|
|
},
|
|
|
|
|
{
|
|
|
|
|
"cell_type": "code",
|
|
|
|
|
"execution_count": 243,
|
|
|
|
|
"id": "7a2c3124-26d0-49e1-9bb4-d7ff58053454",
|
|
|
|
|
"metadata": {},
|
|
|
|
|
"outputs": [
|
|
|
|
|
{
|
|
|
|
|
"data": {
|
|
|
|
|
"text/html": [
|
|
|
|
|
"<div>\n",
|
|
|
|
|
"<style scoped>\n",
|
|
|
|
|
" .dataframe tbody tr th:only-of-type {\n",
|
|
|
|
|
" vertical-align: middle;\n",
|
|
|
|
|
" }\n",
|
|
|
|
|
"\n",
|
|
|
|
|
" .dataframe tbody tr th {\n",
|
|
|
|
|
" vertical-align: top;\n",
|
|
|
|
|
" }\n",
|
|
|
|
|
"\n",
|
|
|
|
|
" .dataframe thead th {\n",
|
|
|
|
|
" text-align: right;\n",
|
|
|
|
|
" }\n",
|
|
|
|
|
"</style>\n",
|
|
|
|
|
"<table border=\"1\" class=\"dataframe\">\n",
|
|
|
|
|
" <thead>\n",
|
|
|
|
|
" <tr style=\"text-align: right;\">\n",
|
|
|
|
|
" <th></th>\n",
|
|
|
|
|
" <th>customer_id</th>\n",
|
|
|
|
|
" <th>event_type_id</th>\n",
|
|
|
|
|
" <th>nb_categories</th>\n",
|
|
|
|
|
" </tr>\n",
|
|
|
|
|
" </thead>\n",
|
|
|
|
|
" <tbody>\n",
|
|
|
|
|
" <tr>\n",
|
|
|
|
|
" <th>0</th>\n",
|
|
|
|
|
" <td>1</td>\n",
|
|
|
|
|
" <td>2</td>\n",
|
|
|
|
|
" <td>14</td>\n",
|
|
|
|
|
" </tr>\n",
|
|
|
|
|
" <tr>\n",
|
|
|
|
|
" <th>1</th>\n",
|
|
|
|
|
" <td>1</td>\n",
|
|
|
|
|
" <td>4</td>\n",
|
|
|
|
|
" <td>9</td>\n",
|
|
|
|
|
" </tr>\n",
|
|
|
|
|
" <tr>\n",
|
|
|
|
|
" <th>2</th>\n",
|
|
|
|
|
" <td>1</td>\n",
|
|
|
|
|
" <td>5</td>\n",
|
|
|
|
|
" <td>5</td>\n",
|
|
|
|
|
" </tr>\n",
|
|
|
|
|
" <tr>\n",
|
|
|
|
|
" <th>3</th>\n",
|
|
|
|
|
" <td>1</td>\n",
|
|
|
|
|
" <td>6</td>\n",
|
|
|
|
|
" <td>9</td>\n",
|
|
|
|
|
" </tr>\n",
|
|
|
|
|
" <tr>\n",
|
|
|
|
|
" <th>4</th>\n",
|
|
|
|
|
" <td>2</td>\n",
|
|
|
|
|
" <td>2</td>\n",
|
|
|
|
|
" <td>1</td>\n",
|
|
|
|
|
" </tr>\n",
|
|
|
|
|
" </tbody>\n",
|
|
|
|
|
"</table>\n",
|
|
|
|
|
"</div>"
|
|
|
|
|
],
|
|
|
|
|
"text/plain": [
|
|
|
|
|
" customer_id event_type_id nb_categories\n",
|
|
|
|
|
"0 1 2 14\n",
|
|
|
|
|
"1 1 4 9\n",
|
|
|
|
|
"2 1 5 5\n",
|
|
|
|
|
"3 1 6 9\n",
|
|
|
|
|
"4 2 2 1"
|
|
|
|
|
]
|
|
|
|
|
},
|
|
|
|
|
"execution_count": 243,
|
|
|
|
|
"metadata": {},
|
|
|
|
|
"output_type": "execute_result"
|
|
|
|
|
}
|
|
|
|
|
],
|
|
|
|
|
"source": [
|
|
|
|
|
"nb_cat = (df1_products_purchased_reduced.groupby(by=['customer_id', 'event_type_id'])\n",
|
|
|
|
|
" .agg({'category_id' : 'nunique'}).reset_index()\n",
|
|
|
|
|
" .rename(columns = {'category_id' : 'nb_categories'}))\n",
|
|
|
|
|
"\n",
|
|
|
|
|
"nb_cat.head()"
|
|
|
|
|
]
|
|
|
|
|
},
|
|
|
|
|
{
|
|
|
|
|
"cell_type": "code",
|
|
|
|
|
"execution_count": 244,
|
2024-02-11 17:59:45 +01:00
|
|
|
|
"id": "83230baa-9a8a-4614-b629-e99c2505c696",
|
2024-02-11 11:47:58 +01:00
|
|
|
|
"metadata": {},
|
|
|
|
|
"outputs": [
|
|
|
|
|
{
|
|
|
|
|
"data": {
|
|
|
|
|
"text/html": [
|
|
|
|
|
"<div>\n",
|
|
|
|
|
"<style scoped>\n",
|
|
|
|
|
" .dataframe tbody tr th:only-of-type {\n",
|
|
|
|
|
" vertical-align: middle;\n",
|
|
|
|
|
" }\n",
|
|
|
|
|
"\n",
|
|
|
|
|
" .dataframe tbody tr th {\n",
|
|
|
|
|
" vertical-align: top;\n",
|
|
|
|
|
" }\n",
|
|
|
|
|
"\n",
|
|
|
|
|
" .dataframe thead th {\n",
|
|
|
|
|
" text-align: right;\n",
|
|
|
|
|
" }\n",
|
|
|
|
|
"</style>\n",
|
|
|
|
|
"<table border=\"1\" class=\"dataframe\">\n",
|
|
|
|
|
" <thead>\n",
|
|
|
|
|
" <tr style=\"text-align: right;\">\n",
|
|
|
|
|
" <th></th>\n",
|
|
|
|
|
" <th>customer_id</th>\n",
|
|
|
|
|
" <th>birthdate</th>\n",
|
|
|
|
|
" <th>street_id</th>\n",
|
|
|
|
|
" <th>is_partner</th>\n",
|
|
|
|
|
" <th>gender</th>\n",
|
|
|
|
|
" <th>is_email_true</th>\n",
|
|
|
|
|
" <th>opt_in</th>\n",
|
|
|
|
|
" <th>structure_id</th>\n",
|
|
|
|
|
" <th>profession</th>\n",
|
|
|
|
|
" <th>language</th>\n",
|
|
|
|
|
" <th>...</th>\n",
|
2024-02-11 17:59:45 +01:00
|
|
|
|
" <th>total_amount</th>\n",
|
|
|
|
|
" <th>nb_suppliers</th>\n",
|
2024-02-11 11:47:58 +01:00
|
|
|
|
" <th>vente_internet_max</th>\n",
|
|
|
|
|
" <th>purchase_date_min</th>\n",
|
|
|
|
|
" <th>purchase_date_max</th>\n",
|
|
|
|
|
" <th>time_between_purchase</th>\n",
|
|
|
|
|
" <th>nb_tickets_internet</th>\n",
|
|
|
|
|
" <th>name_event_types</th>\n",
|
|
|
|
|
" <th>avg_amount</th>\n",
|
2024-02-12 11:08:24 +01:00
|
|
|
|
" <th>nb_categories</th>\n",
|
2024-02-11 11:47:58 +01:00
|
|
|
|
" </tr>\n",
|
|
|
|
|
" </thead>\n",
|
|
|
|
|
" <tbody>\n",
|
|
|
|
|
" <tr>\n",
|
2024-02-12 11:08:24 +01:00
|
|
|
|
" <th>0</th>\n",
|
2024-02-11 11:47:58 +01:00
|
|
|
|
" <td>1</td>\n",
|
|
|
|
|
" <td>NaN</td>\n",
|
|
|
|
|
" <td>2</td>\n",
|
|
|
|
|
" <td>False</td>\n",
|
|
|
|
|
" <td>2</td>\n",
|
|
|
|
|
" <td>True</td>\n",
|
|
|
|
|
" <td>False</td>\n",
|
|
|
|
|
" <td>NaN</td>\n",
|
|
|
|
|
" <td>NaN</td>\n",
|
|
|
|
|
" <td>NaN</td>\n",
|
|
|
|
|
" <td>...</td>\n",
|
2024-02-11 17:59:45 +01:00
|
|
|
|
" <td>2686540.5</td>\n",
|
|
|
|
|
" <td>7.0</td>\n",
|
2024-02-11 11:47:58 +01:00
|
|
|
|
" <td>1.0</td>\n",
|
2024-02-11 17:59:45 +01:00
|
|
|
|
" <td>3262.190868</td>\n",
|
|
|
|
|
" <td>4.179306</td>\n",
|
|
|
|
|
" <td>3258.011562</td>\n",
|
2024-02-11 11:47:58 +01:00
|
|
|
|
" <td>51.0</td>\n",
|
|
|
|
|
" <td>offre muséale individuel</td>\n",
|
|
|
|
|
" <td>6.150659</td>\n",
|
2024-02-12 11:08:24 +01:00
|
|
|
|
" <td>14.0</td>\n",
|
2024-02-11 11:47:58 +01:00
|
|
|
|
" </tr>\n",
|
|
|
|
|
" <tr>\n",
|
2024-02-12 11:08:24 +01:00
|
|
|
|
" <th>1</th>\n",
|
2024-02-11 11:47:58 +01:00
|
|
|
|
" <td>1</td>\n",
|
|
|
|
|
" <td>NaN</td>\n",
|
|
|
|
|
" <td>2</td>\n",
|
|
|
|
|
" <td>False</td>\n",
|
|
|
|
|
" <td>2</td>\n",
|
|
|
|
|
" <td>True</td>\n",
|
|
|
|
|
" <td>False</td>\n",
|
|
|
|
|
" <td>NaN</td>\n",
|
|
|
|
|
" <td>NaN</td>\n",
|
|
|
|
|
" <td>NaN</td>\n",
|
|
|
|
|
" <td>...</td>\n",
|
2024-02-11 17:59:45 +01:00
|
|
|
|
" <td>1435871.5</td>\n",
|
|
|
|
|
" <td>5.0</td>\n",
|
2024-02-11 11:47:58 +01:00
|
|
|
|
" <td>1.0</td>\n",
|
2024-02-11 17:59:45 +01:00
|
|
|
|
" <td>2502.715509</td>\n",
|
|
|
|
|
" <td>1408.715532</td>\n",
|
|
|
|
|
" <td>1093.999977</td>\n",
|
2024-02-11 11:47:58 +01:00
|
|
|
|
" <td>5.0</td>\n",
|
|
|
|
|
" <td>formule adhésion</td>\n",
|
|
|
|
|
" <td>6.439463</td>\n",
|
2024-02-12 11:08:24 +01:00
|
|
|
|
" <td>9.0</td>\n",
|
2024-02-11 11:47:58 +01:00
|
|
|
|
" </tr>\n",
|
|
|
|
|
" <tr>\n",
|
2024-02-12 11:08:24 +01:00
|
|
|
|
" <th>2</th>\n",
|
2024-02-11 11:47:58 +01:00
|
|
|
|
" <td>1</td>\n",
|
|
|
|
|
" <td>NaN</td>\n",
|
|
|
|
|
" <td>2</td>\n",
|
|
|
|
|
" <td>False</td>\n",
|
|
|
|
|
" <td>2</td>\n",
|
|
|
|
|
" <td>True</td>\n",
|
|
|
|
|
" <td>False</td>\n",
|
|
|
|
|
" <td>NaN</td>\n",
|
|
|
|
|
" <td>NaN</td>\n",
|
|
|
|
|
" <td>NaN</td>\n",
|
|
|
|
|
" <td>...</td>\n",
|
2024-02-11 17:59:45 +01:00
|
|
|
|
" <td>3248965.5</td>\n",
|
|
|
|
|
" <td>6.0</td>\n",
|
2024-02-11 11:47:58 +01:00
|
|
|
|
" <td>1.0</td>\n",
|
2024-02-11 17:59:45 +01:00
|
|
|
|
" <td>3698.198229</td>\n",
|
|
|
|
|
" <td>5.221840</td>\n",
|
|
|
|
|
" <td>3692.976389</td>\n",
|
2024-02-11 11:47:58 +01:00
|
|
|
|
" <td>2988.0</td>\n",
|
|
|
|
|
" <td>spectacle vivant</td>\n",
|
|
|
|
|
" <td>7.762474</td>\n",
|
2024-02-12 11:08:24 +01:00
|
|
|
|
" <td>9.0</td>\n",
|
2024-02-11 11:47:58 +01:00
|
|
|
|
" </tr>\n",
|
|
|
|
|
" <tr>\n",
|
2024-02-12 11:08:24 +01:00
|
|
|
|
" <th>3</th>\n",
|
2024-02-11 11:47:58 +01:00
|
|
|
|
" <td>1</td>\n",
|
|
|
|
|
" <td>NaN</td>\n",
|
|
|
|
|
" <td>2</td>\n",
|
|
|
|
|
" <td>False</td>\n",
|
|
|
|
|
" <td>2</td>\n",
|
|
|
|
|
" <td>True</td>\n",
|
|
|
|
|
" <td>False</td>\n",
|
|
|
|
|
" <td>NaN</td>\n",
|
|
|
|
|
" <td>NaN</td>\n",
|
|
|
|
|
" <td>NaN</td>\n",
|
|
|
|
|
" <td>...</td>\n",
|
2024-02-11 17:59:45 +01:00
|
|
|
|
" <td>1459190.0</td>\n",
|
|
|
|
|
" <td>6.0</td>\n",
|
2024-02-11 11:47:58 +01:00
|
|
|
|
" <td>1.0</td>\n",
|
2024-02-11 17:59:45 +01:00
|
|
|
|
" <td>3803.369792</td>\n",
|
|
|
|
|
" <td>0.146331</td>\n",
|
|
|
|
|
" <td>3803.223461</td>\n",
|
2024-02-11 11:47:58 +01:00
|
|
|
|
" <td>9.0</td>\n",
|
|
|
|
|
" <td>offre muséale groupe</td>\n",
|
|
|
|
|
" <td>4.452618</td>\n",
|
2024-02-12 11:08:24 +01:00
|
|
|
|
" <td>5.0</td>\n",
|
2024-02-11 11:47:58 +01:00
|
|
|
|
" </tr>\n",
|
|
|
|
|
" <tr>\n",
|
2024-02-12 11:08:24 +01:00
|
|
|
|
" <th>4</th>\n",
|
2024-02-11 11:47:58 +01:00
|
|
|
|
" <td>2</td>\n",
|
|
|
|
|
" <td>NaN</td>\n",
|
|
|
|
|
" <td>2</td>\n",
|
|
|
|
|
" <td>False</td>\n",
|
|
|
|
|
" <td>1</td>\n",
|
|
|
|
|
" <td>True</td>\n",
|
|
|
|
|
" <td>True</td>\n",
|
|
|
|
|
" <td>NaN</td>\n",
|
|
|
|
|
" <td>NaN</td>\n",
|
|
|
|
|
" <td>NaN</td>\n",
|
|
|
|
|
" <td>...</td>\n",
|
2024-02-11 17:59:45 +01:00
|
|
|
|
" <td>0.0</td>\n",
|
|
|
|
|
" <td>1.0</td>\n",
|
2024-02-11 11:47:58 +01:00
|
|
|
|
" <td>0.0</td>\n",
|
2024-02-11 17:59:45 +01:00
|
|
|
|
" <td>1705.261192</td>\n",
|
|
|
|
|
" <td>1456.333715</td>\n",
|
|
|
|
|
" <td>248.927477</td>\n",
|
2024-02-11 11:47:58 +01:00
|
|
|
|
" <td>0.0</td>\n",
|
|
|
|
|
" <td>formule adhésion</td>\n",
|
|
|
|
|
" <td>6.439463</td>\n",
|
2024-02-12 11:08:24 +01:00
|
|
|
|
" <td>1.0</td>\n",
|
2024-02-11 11:47:58 +01:00
|
|
|
|
" </tr>\n",
|
|
|
|
|
" </tbody>\n",
|
|
|
|
|
"</table>\n",
|
2024-02-12 11:08:24 +01:00
|
|
|
|
"<p>5 rows × 38 columns</p>\n",
|
2024-02-11 11:47:58 +01:00
|
|
|
|
"</div>"
|
|
|
|
|
],
|
|
|
|
|
"text/plain": [
|
2024-02-12 11:08:24 +01:00
|
|
|
|
" customer_id birthdate street_id is_partner gender is_email_true \\\n",
|
|
|
|
|
"0 1 NaN 2 False 2 True \n",
|
|
|
|
|
"1 1 NaN 2 False 2 True \n",
|
|
|
|
|
"2 1 NaN 2 False 2 True \n",
|
|
|
|
|
"3 1 NaN 2 False 2 True \n",
|
|
|
|
|
"4 2 NaN 2 False 1 True \n",
|
2024-02-11 11:47:58 +01:00
|
|
|
|
"\n",
|
2024-02-12 11:08:24 +01:00
|
|
|
|
" opt_in structure_id profession language ... total_amount nb_suppliers \\\n",
|
|
|
|
|
"0 False NaN NaN NaN ... 2686540.5 7.0 \n",
|
|
|
|
|
"1 False NaN NaN NaN ... 1435871.5 5.0 \n",
|
|
|
|
|
"2 False NaN NaN NaN ... 3248965.5 6.0 \n",
|
|
|
|
|
"3 False NaN NaN NaN ... 1459190.0 6.0 \n",
|
|
|
|
|
"4 True NaN NaN NaN ... 0.0 1.0 \n",
|
2024-02-11 17:59:45 +01:00
|
|
|
|
"\n",
|
2024-02-12 11:08:24 +01:00
|
|
|
|
" vente_internet_max purchase_date_min purchase_date_max \\\n",
|
|
|
|
|
"0 1.0 3262.190868 4.179306 \n",
|
|
|
|
|
"1 1.0 2502.715509 1408.715532 \n",
|
|
|
|
|
"2 1.0 3698.198229 5.221840 \n",
|
|
|
|
|
"3 1.0 3803.369792 0.146331 \n",
|
|
|
|
|
"4 0.0 1705.261192 1456.333715 \n",
|
2024-02-11 17:59:45 +01:00
|
|
|
|
"\n",
|
2024-02-12 11:08:24 +01:00
|
|
|
|
" time_between_purchase nb_tickets_internet name_event_types \\\n",
|
|
|
|
|
"0 3258.011562 51.0 offre muséale individuel \n",
|
|
|
|
|
"1 1093.999977 5.0 formule adhésion \n",
|
|
|
|
|
"2 3692.976389 2988.0 spectacle vivant \n",
|
|
|
|
|
"3 3803.223461 9.0 offre muséale groupe \n",
|
|
|
|
|
"4 248.927477 0.0 formule adhésion \n",
|
2024-02-11 17:59:45 +01:00
|
|
|
|
"\n",
|
2024-02-12 11:08:24 +01:00
|
|
|
|
" avg_amount nb_categories \n",
|
|
|
|
|
"0 6.150659 14.0 \n",
|
|
|
|
|
"1 6.439463 9.0 \n",
|
|
|
|
|
"2 7.762474 9.0 \n",
|
|
|
|
|
"3 4.452618 5.0 \n",
|
|
|
|
|
"4 6.439463 1.0 \n",
|
2024-02-11 17:59:45 +01:00
|
|
|
|
"\n",
|
2024-02-12 11:08:24 +01:00
|
|
|
|
"[5 rows x 38 columns]"
|
2024-02-11 11:47:58 +01:00
|
|
|
|
]
|
|
|
|
|
},
|
2024-02-12 11:08:24 +01:00
|
|
|
|
"execution_count": 244,
|
2024-02-11 11:47:58 +01:00
|
|
|
|
"metadata": {},
|
|
|
|
|
"output_type": "execute_result"
|
|
|
|
|
}
|
|
|
|
|
],
|
|
|
|
|
"source": [
|
2024-02-11 17:59:45 +01:00
|
|
|
|
"## Add customer information\n",
|
|
|
|
|
"df1_customer = (df1_customerplus_clean.merge(df1_tickets_kpi, how = \"left\", on='customer_id')\n",
|
|
|
|
|
" .sort_values(by='customer_id', ascending=True))\n",
|
2024-02-12 11:08:24 +01:00
|
|
|
|
"\n",
|
|
|
|
|
"df1_customer = df1_customer.merge(nb_cat, how='left', on=['customer_id', 'event_type_id'])\n",
|
2024-02-11 11:47:58 +01:00
|
|
|
|
"df1_customer.head()"
|
|
|
|
|
]
|
|
|
|
|
},
|
|
|
|
|
{
|
|
|
|
|
"cell_type": "code",
|
2024-02-12 11:08:24 +01:00
|
|
|
|
"execution_count": 245,
|
2024-02-11 17:59:45 +01:00
|
|
|
|
"id": "433921de-03ad-4024-9462-ecd267db1756",
|
2024-02-11 11:47:58 +01:00
|
|
|
|
"metadata": {},
|
|
|
|
|
"outputs": [
|
|
|
|
|
{
|
|
|
|
|
"data": {
|
|
|
|
|
"text/html": [
|
|
|
|
|
"<div>\n",
|
|
|
|
|
"<style scoped>\n",
|
|
|
|
|
" .dataframe tbody tr th:only-of-type {\n",
|
|
|
|
|
" vertical-align: middle;\n",
|
|
|
|
|
" }\n",
|
|
|
|
|
"\n",
|
|
|
|
|
" .dataframe tbody tr th {\n",
|
|
|
|
|
" vertical-align: top;\n",
|
|
|
|
|
" }\n",
|
|
|
|
|
"\n",
|
|
|
|
|
" .dataframe thead th {\n",
|
|
|
|
|
" text-align: right;\n",
|
|
|
|
|
" }\n",
|
|
|
|
|
"</style>\n",
|
|
|
|
|
"<table border=\"1\" class=\"dataframe\">\n",
|
|
|
|
|
" <thead>\n",
|
|
|
|
|
" <tr style=\"text-align: right;\">\n",
|
|
|
|
|
" <th></th>\n",
|
|
|
|
|
" <th>customer_id</th>\n",
|
|
|
|
|
" <th>birthdate</th>\n",
|
|
|
|
|
" <th>street_id</th>\n",
|
|
|
|
|
" <th>is_partner</th>\n",
|
|
|
|
|
" <th>gender</th>\n",
|
|
|
|
|
" <th>is_email_true</th>\n",
|
|
|
|
|
" <th>opt_in</th>\n",
|
|
|
|
|
" <th>structure_id</th>\n",
|
|
|
|
|
" <th>profession</th>\n",
|
|
|
|
|
" <th>language</th>\n",
|
|
|
|
|
" <th>...</th>\n",
|
2024-02-11 17:59:45 +01:00
|
|
|
|
" <th>purchase_date_min</th>\n",
|
|
|
|
|
" <th>purchase_date_max</th>\n",
|
|
|
|
|
" <th>time_between_purchase</th>\n",
|
|
|
|
|
" <th>nb_tickets_internet</th>\n",
|
|
|
|
|
" <th>name_event_types</th>\n",
|
|
|
|
|
" <th>avg_amount</th>\n",
|
2024-02-12 11:08:24 +01:00
|
|
|
|
" <th>nb_categories</th>\n",
|
2024-02-11 11:47:58 +01:00
|
|
|
|
" <th>nb_campaigns</th>\n",
|
|
|
|
|
" <th>nb_campaigns_opened</th>\n",
|
|
|
|
|
" <th>time_to_open</th>\n",
|
|
|
|
|
" </tr>\n",
|
|
|
|
|
" </thead>\n",
|
|
|
|
|
" <tbody>\n",
|
|
|
|
|
" <tr>\n",
|
|
|
|
|
" <th>0</th>\n",
|
2024-02-11 17:59:45 +01:00
|
|
|
|
" <td>1</td>\n",
|
2024-02-11 11:47:58 +01:00
|
|
|
|
" <td>NaN</td>\n",
|
|
|
|
|
" <td>2</td>\n",
|
|
|
|
|
" <td>False</td>\n",
|
2024-02-11 17:59:45 +01:00
|
|
|
|
" <td>2</td>\n",
|
2024-02-11 11:47:58 +01:00
|
|
|
|
" <td>True</td>\n",
|
2024-02-11 17:59:45 +01:00
|
|
|
|
" <td>False</td>\n",
|
2024-02-11 11:47:58 +01:00
|
|
|
|
" <td>NaN</td>\n",
|
|
|
|
|
" <td>NaN</td>\n",
|
|
|
|
|
" <td>NaN</td>\n",
|
|
|
|
|
" <td>...</td>\n",
|
2024-02-11 17:59:45 +01:00
|
|
|
|
" <td>3262.190868</td>\n",
|
|
|
|
|
" <td>4.179306</td>\n",
|
|
|
|
|
" <td>3258.011562</td>\n",
|
|
|
|
|
" <td>51.0</td>\n",
|
|
|
|
|
" <td>offre muséale individuel</td>\n",
|
|
|
|
|
" <td>6.150659</td>\n",
|
2024-02-12 11:08:24 +01:00
|
|
|
|
" <td>14.0</td>\n",
|
2024-02-11 11:47:58 +01:00
|
|
|
|
" <td>NaN</td>\n",
|
|
|
|
|
" <td>NaN</td>\n",
|
|
|
|
|
" <td>NaT</td>\n",
|
|
|
|
|
" </tr>\n",
|
|
|
|
|
" <tr>\n",
|
|
|
|
|
" <th>1</th>\n",
|
2024-02-11 17:59:45 +01:00
|
|
|
|
" <td>1</td>\n",
|
2024-02-11 11:47:58 +01:00
|
|
|
|
" <td>NaN</td>\n",
|
|
|
|
|
" <td>2</td>\n",
|
|
|
|
|
" <td>False</td>\n",
|
|
|
|
|
" <td>2</td>\n",
|
|
|
|
|
" <td>True</td>\n",
|
2024-02-11 17:59:45 +01:00
|
|
|
|
" <td>False</td>\n",
|
2024-02-11 11:47:58 +01:00
|
|
|
|
" <td>NaN</td>\n",
|
|
|
|
|
" <td>NaN</td>\n",
|
|
|
|
|
" <td>NaN</td>\n",
|
|
|
|
|
" <td>...</td>\n",
|
2024-02-11 17:59:45 +01:00
|
|
|
|
" <td>2502.715509</td>\n",
|
|
|
|
|
" <td>1408.715532</td>\n",
|
|
|
|
|
" <td>1093.999977</td>\n",
|
|
|
|
|
" <td>5.0</td>\n",
|
|
|
|
|
" <td>formule adhésion</td>\n",
|
|
|
|
|
" <td>6.439463</td>\n",
|
2024-02-12 11:08:24 +01:00
|
|
|
|
" <td>9.0</td>\n",
|
2024-02-11 11:47:58 +01:00
|
|
|
|
" <td>NaN</td>\n",
|
|
|
|
|
" <td>NaN</td>\n",
|
|
|
|
|
" <td>NaT</td>\n",
|
|
|
|
|
" </tr>\n",
|
|
|
|
|
" <tr>\n",
|
|
|
|
|
" <th>2</th>\n",
|
2024-02-11 17:59:45 +01:00
|
|
|
|
" <td>1</td>\n",
|
2024-02-11 11:47:58 +01:00
|
|
|
|
" <td>NaN</td>\n",
|
|
|
|
|
" <td>2</td>\n",
|
|
|
|
|
" <td>False</td>\n",
|
2024-02-11 17:59:45 +01:00
|
|
|
|
" <td>2</td>\n",
|
2024-02-11 11:47:58 +01:00
|
|
|
|
" <td>True</td>\n",
|
2024-02-11 17:59:45 +01:00
|
|
|
|
" <td>False</td>\n",
|
2024-02-11 11:47:58 +01:00
|
|
|
|
" <td>NaN</td>\n",
|
|
|
|
|
" <td>NaN</td>\n",
|
|
|
|
|
" <td>NaN</td>\n",
|
|
|
|
|
" <td>...</td>\n",
|
2024-02-11 17:59:45 +01:00
|
|
|
|
" <td>3698.198229</td>\n",
|
|
|
|
|
" <td>5.221840</td>\n",
|
|
|
|
|
" <td>3692.976389</td>\n",
|
|
|
|
|
" <td>2988.0</td>\n",
|
|
|
|
|
" <td>spectacle vivant</td>\n",
|
|
|
|
|
" <td>7.762474</td>\n",
|
2024-02-12 11:08:24 +01:00
|
|
|
|
" <td>9.0</td>\n",
|
2024-02-10 19:53:59 +01:00
|
|
|
|
" <td>NaN</td>\n",
|
|
|
|
|
" <td>NaN</td>\n",
|
|
|
|
|
" <td>NaT</td>\n",
|
|
|
|
|
" </tr>\n",
|
|
|
|
|
" <tr>\n",
|
|
|
|
|
" <th>3</th>\n",
|
2024-02-11 17:59:45 +01:00
|
|
|
|
" <td>1</td>\n",
|
2024-02-10 19:53:59 +01:00
|
|
|
|
" <td>NaN</td>\n",
|
|
|
|
|
" <td>2</td>\n",
|
|
|
|
|
" <td>False</td>\n",
|
|
|
|
|
" <td>2</td>\n",
|
|
|
|
|
" <td>True</td>\n",
|
2024-02-11 17:59:45 +01:00
|
|
|
|
" <td>False</td>\n",
|
2024-02-10 19:53:59 +01:00
|
|
|
|
" <td>NaN</td>\n",
|
|
|
|
|
" <td>NaN</td>\n",
|
|
|
|
|
" <td>NaN</td>\n",
|
2024-02-10 22:24:37 +01:00
|
|
|
|
" <td>...</td>\n",
|
2024-02-11 17:59:45 +01:00
|
|
|
|
" <td>3803.369792</td>\n",
|
|
|
|
|
" <td>0.146331</td>\n",
|
|
|
|
|
" <td>3803.223461</td>\n",
|
|
|
|
|
" <td>9.0</td>\n",
|
|
|
|
|
" <td>offre muséale groupe</td>\n",
|
|
|
|
|
" <td>4.452618</td>\n",
|
2024-02-12 11:08:24 +01:00
|
|
|
|
" <td>5.0</td>\n",
|
2024-02-10 19:53:59 +01:00
|
|
|
|
" <td>NaN</td>\n",
|
|
|
|
|
" <td>NaN</td>\n",
|
|
|
|
|
" <td>NaT</td>\n",
|
|
|
|
|
" </tr>\n",
|
|
|
|
|
" <tr>\n",
|
|
|
|
|
" <th>4</th>\n",
|
2024-02-11 17:59:45 +01:00
|
|
|
|
" <td>2</td>\n",
|
2024-02-10 19:53:59 +01:00
|
|
|
|
" <td>NaN</td>\n",
|
|
|
|
|
" <td>2</td>\n",
|
|
|
|
|
" <td>False</td>\n",
|
2024-02-11 17:59:45 +01:00
|
|
|
|
" <td>1</td>\n",
|
|
|
|
|
" <td>True</td>\n",
|
|
|
|
|
" <td>True</td>\n",
|
2024-02-10 19:53:59 +01:00
|
|
|
|
" <td>NaN</td>\n",
|
|
|
|
|
" <td>NaN</td>\n",
|
|
|
|
|
" <td>NaN</td>\n",
|
2024-02-10 22:24:37 +01:00
|
|
|
|
" <td>...</td>\n",
|
2024-02-11 17:59:45 +01:00
|
|
|
|
" <td>1705.261192</td>\n",
|
|
|
|
|
" <td>1456.333715</td>\n",
|
|
|
|
|
" <td>248.927477</td>\n",
|
|
|
|
|
" <td>0.0</td>\n",
|
|
|
|
|
" <td>formule adhésion</td>\n",
|
|
|
|
|
" <td>6.439463</td>\n",
|
2024-02-12 11:08:24 +01:00
|
|
|
|
" <td>1.0</td>\n",
|
2024-02-11 17:59:45 +01:00
|
|
|
|
" <td>4.0</td>\n",
|
2024-02-12 11:08:24 +01:00
|
|
|
|
" <td>NaN</td>\n",
|
2024-02-11 17:59:45 +01:00
|
|
|
|
" <td>NaT</td>\n",
|
2024-02-10 19:53:59 +01:00
|
|
|
|
" </tr>\n",
|
|
|
|
|
" </tbody>\n",
|
|
|
|
|
"</table>\n",
|
2024-02-12 11:08:24 +01:00
|
|
|
|
"<p>5 rows × 41 columns</p>\n",
|
2024-02-10 19:53:59 +01:00
|
|
|
|
"</div>"
|
|
|
|
|
],
|
|
|
|
|
"text/plain": [
|
|
|
|
|
" customer_id birthdate street_id is_partner gender is_email_true \\\n",
|
2024-02-11 17:59:45 +01:00
|
|
|
|
"0 1 NaN 2 False 2 True \n",
|
|
|
|
|
"1 1 NaN 2 False 2 True \n",
|
|
|
|
|
"2 1 NaN 2 False 2 True \n",
|
|
|
|
|
"3 1 NaN 2 False 2 True \n",
|
|
|
|
|
"4 2 NaN 2 False 1 True \n",
|
2024-02-10 19:53:59 +01:00
|
|
|
|
"\n",
|
2024-02-12 11:08:24 +01:00
|
|
|
|
" opt_in structure_id profession language ... purchase_date_min \\\n",
|
|
|
|
|
"0 False NaN NaN NaN ... 3262.190868 \n",
|
|
|
|
|
"1 False NaN NaN NaN ... 2502.715509 \n",
|
|
|
|
|
"2 False NaN NaN NaN ... 3698.198229 \n",
|
|
|
|
|
"3 False NaN NaN NaN ... 3803.369792 \n",
|
|
|
|
|
"4 True NaN NaN NaN ... 1705.261192 \n",
|
2024-02-10 19:53:59 +01:00
|
|
|
|
"\n",
|
2024-02-12 11:08:24 +01:00
|
|
|
|
" purchase_date_max time_between_purchase nb_tickets_internet \\\n",
|
|
|
|
|
"0 4.179306 3258.011562 51.0 \n",
|
|
|
|
|
"1 1408.715532 1093.999977 5.0 \n",
|
|
|
|
|
"2 5.221840 3692.976389 2988.0 \n",
|
|
|
|
|
"3 0.146331 3803.223461 9.0 \n",
|
|
|
|
|
"4 1456.333715 248.927477 0.0 \n",
|
2024-02-10 19:53:59 +01:00
|
|
|
|
"\n",
|
2024-02-12 11:08:24 +01:00
|
|
|
|
" name_event_types avg_amount nb_categories nb_campaigns \\\n",
|
|
|
|
|
"0 offre muséale individuel 6.150659 14.0 NaN \n",
|
|
|
|
|
"1 formule adhésion 6.439463 9.0 NaN \n",
|
|
|
|
|
"2 spectacle vivant 7.762474 9.0 NaN \n",
|
|
|
|
|
"3 offre muséale groupe 4.452618 5.0 NaN \n",
|
|
|
|
|
"4 formule adhésion 6.439463 1.0 4.0 \n",
|
2024-02-10 19:53:59 +01:00
|
|
|
|
"\n",
|
2024-02-11 17:59:45 +01:00
|
|
|
|
" nb_campaigns_opened time_to_open \n",
|
|
|
|
|
"0 NaN NaT \n",
|
|
|
|
|
"1 NaN NaT \n",
|
|
|
|
|
"2 NaN NaT \n",
|
|
|
|
|
"3 NaN NaT \n",
|
2024-02-12 11:08:24 +01:00
|
|
|
|
"4 NaN NaT \n",
|
2024-02-11 17:59:45 +01:00
|
|
|
|
"\n",
|
2024-02-12 11:08:24 +01:00
|
|
|
|
"[5 rows x 41 columns]"
|
2024-02-10 19:53:59 +01:00
|
|
|
|
]
|
|
|
|
|
},
|
2024-02-12 11:08:24 +01:00
|
|
|
|
"execution_count": 245,
|
2024-02-10 19:53:59 +01:00
|
|
|
|
"metadata": {},
|
|
|
|
|
"output_type": "execute_result"
|
|
|
|
|
}
|
|
|
|
|
],
|
2024-02-11 17:59:45 +01:00
|
|
|
|
"source": [
|
|
|
|
|
"# Add campaigns information\n",
|
|
|
|
|
"\n",
|
|
|
|
|
"df1_customer = df1_customer.merge(df1_campaigns_kpi, how='left', on='customer_id')\n",
|
|
|
|
|
"df1_customer.head()"
|
|
|
|
|
]
|
|
|
|
|
},
|
|
|
|
|
{
|
|
|
|
|
"cell_type": "code",
|
2024-02-12 11:08:24 +01:00
|
|
|
|
"execution_count": 246,
|
2024-02-11 17:59:45 +01:00
|
|
|
|
"id": "25e54131-6835-4e94-86d3-1a78520ed7bc",
|
|
|
|
|
"metadata": {},
|
|
|
|
|
"outputs": [],
|
|
|
|
|
"source": [
|
|
|
|
|
"## Exportation\n",
|
|
|
|
|
"\n",
|
|
|
|
|
"# Exportation vers 'projet-bdc2324-team1'\n",
|
|
|
|
|
"BUCKET_OUT = \"projet-bdc2324-team1\"\n",
|
|
|
|
|
"FILE_KEY_OUT_S3 = \"0_Temp/Company 1 - customer_event.csv\"\n",
|
|
|
|
|
"FILE_PATH_OUT_S3 = BUCKET_OUT + \"/\" + FILE_KEY_OUT_S3\n",
|
|
|
|
|
"\n",
|
|
|
|
|
"with fs.open(FILE_PATH_OUT_S3, 'w') as file_out:\n",
|
|
|
|
|
" df1_customer.to_csv(file_out, index = False)"
|
|
|
|
|
]
|
|
|
|
|
},
|
|
|
|
|
{
|
|
|
|
|
"cell_type": "markdown",
|
|
|
|
|
"id": "edae177c-1247-454d-b3d1-08fea37001f7",
|
|
|
|
|
"metadata": {},
|
|
|
|
|
"source": [
|
|
|
|
|
"## End of Alexis' work"
|
|
|
|
|
]
|
|
|
|
|
},
|
|
|
|
|
{
|
|
|
|
|
"cell_type": "code",
|
|
|
|
|
"execution_count": 46,
|
|
|
|
|
"id": "8710611c-7eb8-45ca-bdcc-009f4081f9e2",
|
|
|
|
|
"metadata": {},
|
|
|
|
|
"outputs": [],
|
2024-02-10 19:53:59 +01:00
|
|
|
|
"source": [
|
|
|
|
|
"# Fusion avec KPI campaigns liés au customer\n",
|
2024-02-11 11:47:58 +01:00
|
|
|
|
"#df1_customer = pd.merge(df1_customerplus_clean, df1_campaigns_kpi, on = 'customer_id', how = 'left')\n",
|
|
|
|
|
"#df1_customer.head()"
|
2024-02-10 19:53:59 +01:00
|
|
|
|
]
|
|
|
|
|
},
|
|
|
|
|
{
|
|
|
|
|
"cell_type": "code",
|
2024-02-11 17:59:45 +01:00
|
|
|
|
"execution_count": null,
|
2024-02-10 19:53:59 +01:00
|
|
|
|
"id": "a89fad43-ee68-4081-9384-3e9f08ec6a59",
|
|
|
|
|
"metadata": {},
|
2024-02-11 17:59:45 +01:00
|
|
|
|
"outputs": [],
|
|
|
|
|
"source": [
|
|
|
|
|
"df1_customer_product = pd.merge(df1_customer, nb_tickets, on = 'customer_id', how = 'left')\n",
|
|
|
|
|
"print(\"shape : \", df1_customer_product.shape)\n",
|
|
|
|
|
"df1_customer_product.head()"
|
|
|
|
|
]
|
|
|
|
|
},
|
|
|
|
|
{
|
|
|
|
|
"cell_type": "code",
|
|
|
|
|
"execution_count": null,
|
|
|
|
|
"id": "a19fec00-4ece-400c-937c-ce5cd8daccfd",
|
|
|
|
|
"metadata": {},
|
|
|
|
|
"outputs": [],
|
|
|
|
|
"source": [
|
|
|
|
|
"df1_customer_product.to_csv(\"customer_product.csv\", index = False)"
|
|
|
|
|
]
|
|
|
|
|
},
|
|
|
|
|
{
|
|
|
|
|
"cell_type": "markdown",
|
|
|
|
|
"id": "7c3211a5-a851-43bc-a1f0-b39d51857fb7",
|
|
|
|
|
"metadata": {},
|
|
|
|
|
"source": [
|
|
|
|
|
"# Fusion des bases locales"
|
|
|
|
|
]
|
|
|
|
|
},
|
|
|
|
|
{
|
|
|
|
|
"cell_type": "code",
|
|
|
|
|
"execution_count": 63,
|
|
|
|
|
"id": "46de1912-4a66-46e5-8b9e-7768b2d2723b",
|
|
|
|
|
"metadata": {},
|
|
|
|
|
"outputs": [],
|
|
|
|
|
"source": [
|
|
|
|
|
"# Fusion avec KPI liés au customer\n",
|
|
|
|
|
"df1_customer = pd.merge(df1_customerplus_clean, df1_campaigns_kpi, on = 'customer_id', how = 'left')\n",
|
|
|
|
|
"\n",
|
|
|
|
|
"# Fill NaN values\n",
|
|
|
|
|
"df1_customer[['nb_campaigns', 'nb_campaigns_opened']] = df1_customer[['nb_campaigns', 'nb_campaigns_opened']].fillna(0)"
|
|
|
|
|
]
|
|
|
|
|
},
|
|
|
|
|
{
|
|
|
|
|
"cell_type": "code",
|
|
|
|
|
"execution_count": 64,
|
|
|
|
|
"id": "d53825e4-6453-45bc-94f2-7b2504ec4afb",
|
|
|
|
|
"metadata": {},
|
2024-02-10 19:53:59 +01:00
|
|
|
|
"outputs": [
|
|
|
|
|
{
|
|
|
|
|
"data": {
|
|
|
|
|
"text/html": [
|
|
|
|
|
"<div>\n",
|
|
|
|
|
"<style scoped>\n",
|
|
|
|
|
" .dataframe tbody tr th:only-of-type {\n",
|
|
|
|
|
" vertical-align: middle;\n",
|
|
|
|
|
" }\n",
|
|
|
|
|
"\n",
|
|
|
|
|
" .dataframe tbody tr th {\n",
|
|
|
|
|
" vertical-align: top;\n",
|
|
|
|
|
" }\n",
|
|
|
|
|
"\n",
|
|
|
|
|
" .dataframe thead th {\n",
|
|
|
|
|
" text-align: right;\n",
|
|
|
|
|
" }\n",
|
|
|
|
|
"</style>\n",
|
|
|
|
|
"<table border=\"1\" class=\"dataframe\">\n",
|
|
|
|
|
" <thead>\n",
|
|
|
|
|
" <tr style=\"text-align: right;\">\n",
|
|
|
|
|
" <th></th>\n",
|
|
|
|
|
" <th>customer_id</th>\n",
|
|
|
|
|
" <th>birthdate</th>\n",
|
|
|
|
|
" <th>street_id</th>\n",
|
|
|
|
|
" <th>is_partner</th>\n",
|
|
|
|
|
" <th>gender</th>\n",
|
|
|
|
|
" <th>is_email_true</th>\n",
|
|
|
|
|
" <th>opt_in</th>\n",
|
|
|
|
|
" <th>structure_id</th>\n",
|
|
|
|
|
" <th>profession</th>\n",
|
|
|
|
|
" <th>language</th>\n",
|
2024-02-10 22:24:37 +01:00
|
|
|
|
" <th>...</th>\n",
|
2024-02-11 17:59:45 +01:00
|
|
|
|
" <th>average_ticket_basket</th>\n",
|
|
|
|
|
" <th>total_price</th>\n",
|
|
|
|
|
" <th>purchase_count</th>\n",
|
2024-02-10 19:53:59 +01:00
|
|
|
|
" <th>first_buying_date</th>\n",
|
|
|
|
|
" <th>country</th>\n",
|
|
|
|
|
" <th>age</th>\n",
|
|
|
|
|
" <th>tenant_id</th>\n",
|
|
|
|
|
" <th>nb_campaigns</th>\n",
|
|
|
|
|
" <th>nb_campaigns_opened</th>\n",
|
|
|
|
|
" <th>time_to_open</th>\n",
|
|
|
|
|
" </tr>\n",
|
|
|
|
|
" </thead>\n",
|
|
|
|
|
" <tbody>\n",
|
|
|
|
|
" <tr>\n",
|
|
|
|
|
" <th>0</th>\n",
|
|
|
|
|
" <td>12751</td>\n",
|
|
|
|
|
" <td>NaN</td>\n",
|
|
|
|
|
" <td>2</td>\n",
|
|
|
|
|
" <td>False</td>\n",
|
|
|
|
|
" <td>1</td>\n",
|
|
|
|
|
" <td>True</td>\n",
|
|
|
|
|
" <td>True</td>\n",
|
|
|
|
|
" <td>NaN</td>\n",
|
|
|
|
|
" <td>NaN</td>\n",
|
|
|
|
|
" <td>NaN</td>\n",
|
2024-02-10 22:24:37 +01:00
|
|
|
|
" <td>...</td>\n",
|
2024-02-11 17:59:45 +01:00
|
|
|
|
" <td>NaN</td>\n",
|
|
|
|
|
" <td>NaN</td>\n",
|
|
|
|
|
" <td>0</td>\n",
|
2024-02-10 19:53:59 +01:00
|
|
|
|
" <td>NaT</td>\n",
|
|
|
|
|
" <td>fr</td>\n",
|
|
|
|
|
" <td>NaN</td>\n",
|
|
|
|
|
" <td>1311</td>\n",
|
2024-02-11 17:59:45 +01:00
|
|
|
|
" <td>0.0</td>\n",
|
|
|
|
|
" <td>0.0</td>\n",
|
2024-02-10 19:53:59 +01:00
|
|
|
|
" <td>NaT</td>\n",
|
|
|
|
|
" </tr>\n",
|
|
|
|
|
" <tr>\n",
|
|
|
|
|
" <th>1</th>\n",
|
|
|
|
|
" <td>12825</td>\n",
|
|
|
|
|
" <td>NaN</td>\n",
|
|
|
|
|
" <td>2</td>\n",
|
|
|
|
|
" <td>False</td>\n",
|
|
|
|
|
" <td>2</td>\n",
|
|
|
|
|
" <td>True</td>\n",
|
|
|
|
|
" <td>True</td>\n",
|
|
|
|
|
" <td>NaN</td>\n",
|
|
|
|
|
" <td>NaN</td>\n",
|
|
|
|
|
" <td>NaN</td>\n",
|
2024-02-10 22:24:37 +01:00
|
|
|
|
" <td>...</td>\n",
|
2024-02-11 17:59:45 +01:00
|
|
|
|
" <td>NaN</td>\n",
|
|
|
|
|
" <td>NaN</td>\n",
|
|
|
|
|
" <td>0</td>\n",
|
2024-02-10 22:24:37 +01:00
|
|
|
|
" <td>NaT</td>\n",
|
|
|
|
|
" <td>fr</td>\n",
|
2024-02-10 19:53:59 +01:00
|
|
|
|
" <td>NaN</td>\n",
|
|
|
|
|
" <td>1311</td>\n",
|
2024-02-11 17:59:45 +01:00
|
|
|
|
" <td>0.0</td>\n",
|
|
|
|
|
" <td>0.0</td>\n",
|
2024-02-10 19:53:59 +01:00
|
|
|
|
" <td>NaT</td>\n",
|
|
|
|
|
" </tr>\n",
|
|
|
|
|
" <tr>\n",
|
|
|
|
|
" <th>2</th>\n",
|
|
|
|
|
" <td>11261</td>\n",
|
|
|
|
|
" <td>NaN</td>\n",
|
|
|
|
|
" <td>2</td>\n",
|
|
|
|
|
" <td>False</td>\n",
|
|
|
|
|
" <td>1</td>\n",
|
|
|
|
|
" <td>True</td>\n",
|
|
|
|
|
" <td>True</td>\n",
|
|
|
|
|
" <td>NaN</td>\n",
|
|
|
|
|
" <td>NaN</td>\n",
|
|
|
|
|
" <td>NaN</td>\n",
|
2024-02-10 22:24:37 +01:00
|
|
|
|
" <td>...</td>\n",
|
2024-02-11 17:59:45 +01:00
|
|
|
|
" <td>NaN</td>\n",
|
|
|
|
|
" <td>NaN</td>\n",
|
|
|
|
|
" <td>0</td>\n",
|
2024-02-10 19:53:59 +01:00
|
|
|
|
" <td>NaT</td>\n",
|
|
|
|
|
" <td>fr</td>\n",
|
|
|
|
|
" <td>NaN</td>\n",
|
|
|
|
|
" <td>1311</td>\n",
|
2024-02-11 17:59:45 +01:00
|
|
|
|
" <td>0.0</td>\n",
|
|
|
|
|
" <td>0.0</td>\n",
|
2024-02-10 19:53:59 +01:00
|
|
|
|
" <td>NaT</td>\n",
|
|
|
|
|
" </tr>\n",
|
|
|
|
|
" <tr>\n",
|
|
|
|
|
" <th>3</th>\n",
|
|
|
|
|
" <td>13071</td>\n",
|
|
|
|
|
" <td>NaN</td>\n",
|
|
|
|
|
" <td>2</td>\n",
|
|
|
|
|
" <td>False</td>\n",
|
|
|
|
|
" <td>2</td>\n",
|
|
|
|
|
" <td>True</td>\n",
|
|
|
|
|
" <td>True</td>\n",
|
|
|
|
|
" <td>NaN</td>\n",
|
|
|
|
|
" <td>NaN</td>\n",
|
|
|
|
|
" <td>NaN</td>\n",
|
2024-02-10 22:24:37 +01:00
|
|
|
|
" <td>...</td>\n",
|
2024-02-11 17:59:45 +01:00
|
|
|
|
" <td>NaN</td>\n",
|
|
|
|
|
" <td>NaN</td>\n",
|
|
|
|
|
" <td>0</td>\n",
|
2024-02-10 19:53:59 +01:00
|
|
|
|
" <td>NaT</td>\n",
|
|
|
|
|
" <td>fr</td>\n",
|
|
|
|
|
" <td>NaN</td>\n",
|
|
|
|
|
" <td>1311</td>\n",
|
2024-02-11 17:59:45 +01:00
|
|
|
|
" <td>0.0</td>\n",
|
|
|
|
|
" <td>0.0</td>\n",
|
2024-02-10 19:53:59 +01:00
|
|
|
|
" <td>NaT</td>\n",
|
|
|
|
|
" </tr>\n",
|
|
|
|
|
" <tr>\n",
|
|
|
|
|
" <th>4</th>\n",
|
|
|
|
|
" <td>653061</td>\n",
|
|
|
|
|
" <td>NaN</td>\n",
|
|
|
|
|
" <td>10</td>\n",
|
|
|
|
|
" <td>False</td>\n",
|
|
|
|
|
" <td>2</td>\n",
|
|
|
|
|
" <td>True</td>\n",
|
|
|
|
|
" <td>False</td>\n",
|
|
|
|
|
" <td>NaN</td>\n",
|
|
|
|
|
" <td>NaN</td>\n",
|
|
|
|
|
" <td>NaN</td>\n",
|
2024-02-10 22:24:37 +01:00
|
|
|
|
" <td>...</td>\n",
|
2024-02-11 17:59:45 +01:00
|
|
|
|
" <td>NaN</td>\n",
|
|
|
|
|
" <td>NaN</td>\n",
|
|
|
|
|
" <td>0</td>\n",
|
2024-02-10 19:53:59 +01:00
|
|
|
|
" <td>NaT</td>\n",
|
|
|
|
|
" <td>NaN</td>\n",
|
|
|
|
|
" <td>NaN</td>\n",
|
|
|
|
|
" <td>1311</td>\n",
|
|
|
|
|
" <td>80.0</td>\n",
|
|
|
|
|
" <td>2.0</td>\n",
|
|
|
|
|
" <td>0 days 19:53:02.500000</td>\n",
|
|
|
|
|
" </tr>\n",
|
|
|
|
|
" </tbody>\n",
|
|
|
|
|
"</table>\n",
|
2024-02-11 17:59:45 +01:00
|
|
|
|
"<p>5 rows × 28 columns</p>\n",
|
2024-02-10 19:53:59 +01:00
|
|
|
|
"</div>"
|
|
|
|
|
],
|
|
|
|
|
"text/plain": [
|
|
|
|
|
" customer_id birthdate street_id is_partner gender is_email_true \\\n",
|
|
|
|
|
"0 12751 NaN 2 False 1 True \n",
|
|
|
|
|
"1 12825 NaN 2 False 2 True \n",
|
|
|
|
|
"2 11261 NaN 2 False 1 True \n",
|
|
|
|
|
"3 13071 NaN 2 False 2 True \n",
|
|
|
|
|
"4 653061 NaN 10 False 2 True \n",
|
|
|
|
|
"\n",
|
2024-02-11 17:59:45 +01:00
|
|
|
|
" opt_in structure_id profession language ... average_ticket_basket \\\n",
|
|
|
|
|
"0 True NaN NaN NaN ... NaN \n",
|
|
|
|
|
"1 True NaN NaN NaN ... NaN \n",
|
|
|
|
|
"2 True NaN NaN NaN ... NaN \n",
|
|
|
|
|
"3 True NaN NaN NaN ... NaN \n",
|
|
|
|
|
"4 False NaN NaN NaN ... NaN \n",
|
|
|
|
|
"\n",
|
|
|
|
|
" total_price purchase_count first_buying_date country age tenant_id \\\n",
|
|
|
|
|
"0 NaN 0 NaT fr NaN 1311 \n",
|
|
|
|
|
"1 NaN 0 NaT fr NaN 1311 \n",
|
|
|
|
|
"2 NaN 0 NaT fr NaN 1311 \n",
|
|
|
|
|
"3 NaN 0 NaT fr NaN 1311 \n",
|
|
|
|
|
"4 NaN 0 NaT NaN NaN 1311 \n",
|
|
|
|
|
"\n",
|
|
|
|
|
" nb_campaigns nb_campaigns_opened time_to_open \n",
|
|
|
|
|
"0 0.0 0.0 NaT \n",
|
|
|
|
|
"1 0.0 0.0 NaT \n",
|
|
|
|
|
"2 0.0 0.0 NaT \n",
|
|
|
|
|
"3 0.0 0.0 NaT \n",
|
|
|
|
|
"4 80.0 2.0 0 days 19:53:02.500000 \n",
|
|
|
|
|
"\n",
|
|
|
|
|
"[5 rows x 28 columns]"
|
2024-02-10 19:53:59 +01:00
|
|
|
|
]
|
|
|
|
|
},
|
2024-02-11 17:59:45 +01:00
|
|
|
|
"execution_count": 64,
|
2024-02-10 19:53:59 +01:00
|
|
|
|
"metadata": {},
|
|
|
|
|
"output_type": "execute_result"
|
|
|
|
|
}
|
|
|
|
|
],
|
|
|
|
|
"source": [
|
2024-02-11 17:59:45 +01:00
|
|
|
|
"df1_customer.head()"
|
2024-02-07 23:28:55 +01:00
|
|
|
|
]
|
|
|
|
|
},
|
|
|
|
|
{
|
|
|
|
|
"cell_type": "code",
|
2024-02-11 17:59:45 +01:00
|
|
|
|
"execution_count": 67,
|
|
|
|
|
"id": "1e42a790-b215-4107-a969-85005da06ebd",
|
2024-02-05 12:51:35 +01:00
|
|
|
|
"metadata": {},
|
|
|
|
|
"outputs": [],
|
2024-02-06 22:09:08 +01:00
|
|
|
|
"source": [
|
2024-02-11 17:59:45 +01:00
|
|
|
|
"# Fusion avec KPI liés au comportement d'achat\n",
|
|
|
|
|
"df1_customer_product = pd.merge(df1_tickets_kpi, df1_customer, on = 'customer_id', how = 'outer')\n",
|
|
|
|
|
"\n",
|
|
|
|
|
"# Fill NaN values\n",
|
|
|
|
|
"df1_customer_product[['nb_tickets', 'nb_purchases', 'total_amount', 'nb_suppliers', 'vente_internet_max', 'nb_tickets_internet']] = df1_customer_product[['nb_tickets', 'nb_purchases', 'total_amount', 'nb_suppliers', 'vente_internet_max', 'nb_tickets_internet']].fillna(0)"
|
2024-02-07 23:28:55 +01:00
|
|
|
|
]
|
|
|
|
|
},
|
|
|
|
|
{
|
|
|
|
|
"cell_type": "code",
|
2024-02-11 17:59:45 +01:00
|
|
|
|
"execution_count": 66,
|
|
|
|
|
"id": "d950f24d-a5d1-4f1e-aeaa-ca826470365f",
|
2024-02-07 23:28:55 +01:00
|
|
|
|
"metadata": {},
|
2024-02-11 17:59:45 +01:00
|
|
|
|
"outputs": [
|
|
|
|
|
{
|
|
|
|
|
"data": {
|
|
|
|
|
"text/plain": [
|
|
|
|
|
"Index(['customer_id', 'event_type_id', 'nb_tickets', 'nb_purchases',\n",
|
|
|
|
|
" 'total_amount', 'nb_suppliers', 'vente_internet_max',\n",
|
|
|
|
|
" 'purchase_date_min', 'purchase_date_max', 'time_between_purchase',\n",
|
|
|
|
|
" 'nb_tickets_internet', 'name_event_types', 'avg_amount', 'birthdate',\n",
|
|
|
|
|
" 'street_id', 'is_partner', 'gender', 'is_email_true', 'opt_in',\n",
|
|
|
|
|
" 'structure_id', 'profession', 'language', 'mcp_contact_id',\n",
|
|
|
|
|
" 'last_buying_date', 'max_price', 'ticket_sum', 'average_price',\n",
|
|
|
|
|
" 'fidelity', 'average_purchase_delay', 'average_price_basket',\n",
|
|
|
|
|
" 'average_ticket_basket', 'total_price', 'purchase_count',\n",
|
|
|
|
|
" 'first_buying_date', 'country', 'age', 'tenant_id', 'nb_campaigns',\n",
|
|
|
|
|
" 'nb_campaigns_opened', 'time_to_open'],\n",
|
|
|
|
|
" dtype='object')"
|
|
|
|
|
]
|
|
|
|
|
},
|
|
|
|
|
"execution_count": 66,
|
|
|
|
|
"metadata": {},
|
|
|
|
|
"output_type": "execute_result"
|
|
|
|
|
}
|
|
|
|
|
],
|
2024-02-07 23:28:55 +01:00
|
|
|
|
"source": [
|
2024-02-11 17:59:45 +01:00
|
|
|
|
"df1_customer_product"
|
2024-02-07 23:28:55 +01:00
|
|
|
|
]
|
|
|
|
|
},
|
|
|
|
|
{
|
|
|
|
|
"cell_type": "code",
|
2024-02-11 17:59:45 +01:00
|
|
|
|
"execution_count": 68,
|
|
|
|
|
"id": "ebf6d843-dcc0-4e83-b063-94806c0bac17",
|
2024-02-07 23:28:55 +01:00
|
|
|
|
"metadata": {},
|
|
|
|
|
"outputs": [],
|
|
|
|
|
"source": [
|
2024-02-11 17:59:45 +01:00
|
|
|
|
"## Exportation\n",
|
|
|
|
|
"\n",
|
|
|
|
|
"# Exportation vers 'projet-bdc2324-team1'\n",
|
|
|
|
|
"BUCKET_OUT = \"projet-bdc2324-team1\"\n",
|
|
|
|
|
"FILE_KEY_OUT_S3 = \"1_Output/Company 1 - Segmentation base.csv\"\n",
|
|
|
|
|
"FILE_PATH_OUT_S3 = BUCKET_OUT + \"/\" + FILE_KEY_OUT_S3\n",
|
|
|
|
|
"\n",
|
|
|
|
|
"with fs.open(FILE_PATH_OUT_S3, 'w') as file_out:\n",
|
|
|
|
|
" df1_customer_product.to_csv(file_out, index = False)"
|
2024-02-06 22:09:08 +01:00
|
|
|
|
]
|
2024-02-04 16:02:50 +01:00
|
|
|
|
}
|
|
|
|
|
],
|
|
|
|
|
"metadata": {
|
|
|
|
|
"kernelspec": {
|
|
|
|
|
"display_name": "Python 3 (ipykernel)",
|
|
|
|
|
"language": "python",
|
|
|
|
|
"name": "python3"
|
|
|
|
|
},
|
|
|
|
|
"language_info": {
|
|
|
|
|
"codemirror_mode": {
|
|
|
|
|
"name": "ipython",
|
|
|
|
|
"version": 3
|
|
|
|
|
},
|
|
|
|
|
"file_extension": ".py",
|
|
|
|
|
"mimetype": "text/x-python",
|
|
|
|
|
"name": "python",
|
|
|
|
|
"nbconvert_exporter": "python",
|
|
|
|
|
"pygments_lexer": "ipython3",
|
2024-02-12 11:08:24 +01:00
|
|
|
|
"version": "3.11.6"
|
2024-02-04 16:02:50 +01:00
|
|
|
|
}
|
|
|
|
|
},
|
|
|
|
|
"nbformat": 4,
|
|
|
|
|
"nbformat_minor": 5
|
|
|
|
|
}
|