BDC-team-1/Notebook_AR.ipynb

6820 lines
232 KiB
Plaintext
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

{
"cells": [
{
"cell_type": "markdown",
"id": "455cc769-1b3b-4fef-b395-e74a988ceed3",
"metadata": {},
"source": [
"## Notebook Alexis"
]
},
{
"cell_type": "code",
"execution_count": 1,
"id": "20eeb149-6618-4ef2-9cfd-ff062950f36c",
"metadata": {},
"outputs": [],
"source": [
"import pandas as pd\n",
"import os\n",
"import s3fs"
]
},
{
"cell_type": "code",
"execution_count": 2,
"id": "30494c5e-9649-4fff-8708-617544188b20",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"['bdc2324-data/1',\n",
" 'bdc2324-data/10',\n",
" 'bdc2324-data/101',\n",
" 'bdc2324-data/11',\n",
" 'bdc2324-data/12',\n",
" 'bdc2324-data/13',\n",
" 'bdc2324-data/14',\n",
" 'bdc2324-data/2',\n",
" 'bdc2324-data/3',\n",
" 'bdc2324-data/4',\n",
" 'bdc2324-data/5',\n",
" 'bdc2324-data/6',\n",
" 'bdc2324-data/7',\n",
" 'bdc2324-data/8',\n",
" 'bdc2324-data/9']"
]
},
"execution_count": 2,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Create filesystem object\n",
"S3_ENDPOINT_URL = \"https://\" + os.environ[\"AWS_S3_ENDPOINT\"]\n",
"fs = s3fs.S3FileSystem(client_kwargs={'endpoint_url': S3_ENDPOINT_URL})\n",
"\n",
"BUCKET = \"bdc2324-data\"\n",
"fs.ls(BUCKET)"
]
},
{
"cell_type": "markdown",
"id": "2feffee9-9f23-4caa-8a01-9e4a93abbf5d",
"metadata": {},
"source": [
"### I. Analyse fichier 8"
]
},
{
"cell_type": "markdown",
"id": "f54ba449-2051-4acd-939d-d30abd5452fe",
"metadata": {},
"source": [
"This section describes the databases associated with company 8. "
]
},
{
"cell_type": "code",
"execution_count": 3,
"id": "f1cce705-46e1-42de-8e93-2ee15312d288",
"metadata": {},
"outputs": [],
"source": [
"directory_path = '8'"
]
},
{
"cell_type": "code",
"execution_count": 4,
"id": "82d4db0e-0cd5-49af-a4d3-f17f54b1c03c",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"bdc2324-data/8/8campaign_stats.csv\n",
"bdc2324-data/8/8campaigns.csv\n",
"bdc2324-data/8/8categories.csv\n",
"bdc2324-data/8/8countries.csv\n",
"bdc2324-data/8/8currencies.csv\n",
"bdc2324-data/8/8customer_target_mappings.csv\n",
"bdc2324-data/8/8customersplus.csv\n",
"bdc2324-data/8/8event_types.csv\n",
"bdc2324-data/8/8events.csv\n",
"bdc2324-data/8/8facilities.csv\n",
"bdc2324-data/8/8link_stats.csv\n",
"bdc2324-data/8/8pricing_formulas.csv\n",
"bdc2324-data/8/8product_packs.csv\n",
"bdc2324-data/8/8products.csv\n",
"bdc2324-data/8/8products_groups.csv\n",
"bdc2324-data/8/8purchases.csv\n",
"bdc2324-data/8/8representation_category_capacities.csv\n",
"bdc2324-data/8/8representations.csv\n",
"bdc2324-data/8/8seasons.csv\n",
"bdc2324-data/8/8suppliers.csv\n",
"bdc2324-data/8/8target_types.csv\n",
"bdc2324-data/8/8targets.csv\n",
"bdc2324-data/8/8tickets.csv\n",
"bdc2324-data/8/8type_of_categories.csv\n",
"bdc2324-data/8/8type_of_pricing_formulas.csv\n",
"bdc2324-data/8/8type_ofs.csv\n"
]
}
],
"source": [
"# check the files in the directory\n",
"\n",
"objects = fs.ls(f'{BUCKET}/{directory_path}')\n",
"\n",
"for file in objects:\n",
" print(file)"
]
},
{
"cell_type": "code",
"execution_count": 5,
"id": "65cb38ad-52ae-4266-85d8-c47d81b00283",
"metadata": {},
"outputs": [],
"source": [
"def display_databases(file_name):\n",
" \"\"\"\n",
" This function returns the file from s3 storage\n",
" \"\"\"\n",
" file_path = BUCKET + \"/\" + directory_path + \"/\" + file_name\n",
" print(\"File path : \", file_path)\n",
" with fs.open(file_path, mode=\"rb\") as file_in:\n",
" df = pd.read_csv(file_in, sep=\",\")\n",
" \n",
" print(\"Shape : \", df.shape)\n",
" return df\n",
" "
]
},
{
"cell_type": "markdown",
"id": "ddd545ef-7e9f-4696-962a-115294991641",
"metadata": {},
"source": [
"#### Lookt at campaigns files"
]
},
{
"cell_type": "code",
"execution_count": 6,
"id": "0214d30d-5f83-498f-867f-e67b5793b731",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"File path : bdc2324-data/8/8campaigns.csv\n",
"Shape : (1689, 11)\n"
]
},
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>id</th>\n",
" <th>name</th>\n",
" <th>service_id</th>\n",
" <th>created_at</th>\n",
" <th>updated_at</th>\n",
" <th>process_id</th>\n",
" <th>report_url</th>\n",
" <th>category</th>\n",
" <th>to_be_synced</th>\n",
" <th>identifier</th>\n",
" <th>sent_at</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>1</td>\n",
" <td>#LOUSFP RELANCE P'TITS LOU</td>\n",
" <td>1436</td>\n",
" <td>2022-02-01 15:22:53.564432+01:00</td>\n",
" <td>2022-02-01 15:22:53.564432+01:00</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>0</td>\n",
" <td>False</td>\n",
" <td>eaa32c96f620053cf442ad32258076b9</td>\n",
" <td>2022-01-31 00:00:00+01:00</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>2</td>\n",
" <td>#LOUSFP BRASSERIE ACHETEURS</td>\n",
" <td>1435</td>\n",
" <td>2022-02-01 15:22:53.572592+01:00</td>\n",
" <td>2022-02-01 15:22:53.572592+01:00</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>0</td>\n",
" <td>False</td>\n",
" <td>1f3202d820180a39f736f20fce790de8</td>\n",
" <td>2022-01-31 00:00:00+01:00</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>3</td>\n",
" <td>PRESSE. LOU/SF Paris - RDV et protocole</td>\n",
" <td>1433</td>\n",
" <td>2022-02-01 15:22:53.578426+01:00</td>\n",
" <td>2022-02-01 15:22:53.578426+01:00</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>0</td>\n",
" <td>False</td>\n",
" <td>b069b3415151fa7217e870017374de7c</td>\n",
" <td>2022-01-31 00:00:00+01:00</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>4</td>\n",
" <td>#LOUSFP ÉTUDIANTS</td>\n",
" <td>1432</td>\n",
" <td>2022-02-01 15:22:53.584235+01:00</td>\n",
" <td>2022-02-01 15:22:53.584235+01:00</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>0</td>\n",
" <td>False</td>\n",
" <td>56468d5607a5aaf1604ff5e15593b003</td>\n",
" <td>2022-01-27 00:00:00+01:00</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>5</td>\n",
" <td>#LOUSFP P'TITS LOU</td>\n",
" <td>1431</td>\n",
" <td>2022-02-01 15:22:53.590187+01:00</td>\n",
" <td>2022-02-01 15:22:53.590187+01:00</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>0</td>\n",
" <td>False</td>\n",
" <td>e11943a6031a0e6114ae69c257617980</td>\n",
" <td>2022-01-27 00:00:00+01:00</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" id name service_id \\\n",
"0 1 #LOUSFP RELANCE P'TITS LOU 1436 \n",
"1 2 #LOUSFP BRASSERIE ACHETEURS 1435 \n",
"2 3 PRESSE. LOU/SF Paris - RDV et protocole 1433 \n",
"3 4 #LOUSFP ÉTUDIANTS 1432 \n",
"4 5 #LOUSFP P'TITS LOU 1431 \n",
"\n",
" created_at updated_at \\\n",
"0 2022-02-01 15:22:53.564432+01:00 2022-02-01 15:22:53.564432+01:00 \n",
"1 2022-02-01 15:22:53.572592+01:00 2022-02-01 15:22:53.572592+01:00 \n",
"2 2022-02-01 15:22:53.578426+01:00 2022-02-01 15:22:53.578426+01:00 \n",
"3 2022-02-01 15:22:53.584235+01:00 2022-02-01 15:22:53.584235+01:00 \n",
"4 2022-02-01 15:22:53.590187+01:00 2022-02-01 15:22:53.590187+01:00 \n",
"\n",
" process_id report_url category to_be_synced \\\n",
"0 NaN NaN 0 False \n",
"1 NaN NaN 0 False \n",
"2 NaN NaN 0 False \n",
"3 NaN NaN 0 False \n",
"4 NaN NaN 0 False \n",
"\n",
" identifier sent_at \n",
"0 eaa32c96f620053cf442ad32258076b9 2022-01-31 00:00:00+01:00 \n",
"1 1f3202d820180a39f736f20fce790de8 2022-01-31 00:00:00+01:00 \n",
"2 b069b3415151fa7217e870017374de7c 2022-01-31 00:00:00+01:00 \n",
"3 56468d5607a5aaf1604ff5e15593b003 2022-01-27 00:00:00+01:00 \n",
"4 e11943a6031a0e6114ae69c257617980 2022-01-27 00:00:00+01:00 "
]
},
"execution_count": 6,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"campaigns = display_databases(\"8campaigns.csv\")\n",
"campaigns.head()"
]
},
{
"cell_type": "code",
"execution_count": 7,
"id": "e7982be4-2c42-4a91-be5a-329a999644cc",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"File path : bdc2324-data/8/8campaign_stats.csv\n",
"Shape : (2527083, 8)\n"
]
},
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>id</th>\n",
" <th>campaign_id</th>\n",
" <th>customer_id</th>\n",
" <th>opened_at</th>\n",
" <th>sent_at</th>\n",
" <th>delivered_at</th>\n",
" <th>created_at</th>\n",
" <th>updated_at</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>1</td>\n",
" <td>5</td>\n",
" <td>161410</td>\n",
" <td>2022-02-02 18:16:07+01:00</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>2022-02-02 17:16:08.616899+01:00</td>\n",
" <td>2022-02-02 17:16:08.623098+01:00</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>2</td>\n",
" <td>1</td>\n",
" <td>54228</td>\n",
" <td>2022-02-02 18:18:11+01:00</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>2022-02-02 17:18:12.030260+01:00</td>\n",
" <td>2022-02-02 17:18:12.036606+01:00</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>3</td>\n",
" <td>6</td>\n",
" <td>120794</td>\n",
" <td>2022-02-02 18:18:58+01:00</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>2022-02-02 17:19:00.129697+01:00</td>\n",
" <td>2022-02-02 17:19:00.134704+01:00</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>4</td>\n",
" <td>3</td>\n",
" <td>467025</td>\n",
" <td>2022-02-02 18:19:33+01:00</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>2022-02-02 17:19:34.023492+01:00</td>\n",
" <td>2022-02-02 17:19:34.027570+01:00</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>5</td>\n",
" <td>2</td>\n",
" <td>142106</td>\n",
" <td>2022-02-02 18:19:35+01:00</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>2022-02-02 17:19:36.553321+01:00</td>\n",
" <td>2022-02-02 17:19:36.557473+01:00</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" id campaign_id customer_id opened_at sent_at \\\n",
"0 1 5 161410 2022-02-02 18:16:07+01:00 NaN \n",
"1 2 1 54228 2022-02-02 18:18:11+01:00 NaN \n",
"2 3 6 120794 2022-02-02 18:18:58+01:00 NaN \n",
"3 4 3 467025 2022-02-02 18:19:33+01:00 NaN \n",
"4 5 2 142106 2022-02-02 18:19:35+01:00 NaN \n",
"\n",
" delivered_at created_at \\\n",
"0 NaN 2022-02-02 17:16:08.616899+01:00 \n",
"1 NaN 2022-02-02 17:18:12.030260+01:00 \n",
"2 NaN 2022-02-02 17:19:00.129697+01:00 \n",
"3 NaN 2022-02-02 17:19:34.023492+01:00 \n",
"4 NaN 2022-02-02 17:19:36.553321+01:00 \n",
"\n",
" updated_at \n",
"0 2022-02-02 17:16:08.623098+01:00 \n",
"1 2022-02-02 17:18:12.036606+01:00 \n",
"2 2022-02-02 17:19:00.134704+01:00 \n",
"3 2022-02-02 17:19:34.027570+01:00 \n",
"4 2022-02-02 17:19:36.557473+01:00 "
]
},
"execution_count": 7,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"campaign_stats = display_databases(\"8campaign_stats.csv\")\n",
"campaign_stats.head()"
]
},
{
"cell_type": "markdown",
"id": "e6512bc9-91f5-4fe4-a637-a4e84dc497a9",
"metadata": {},
"source": [
"#### Look at links files"
]
},
{
"cell_type": "markdown",
"id": "28e7c1fe-470f-4d84-87b8-a711a973500b",
"metadata": {},
"source": [
"There is no links file for these company. Only the link_stats file"
]
},
{
"cell_type": "code",
"execution_count": 8,
"id": "e973575b-4ed6-4b23-8024-f383ac82e87c",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"File path : bdc2324-data/8/8link_stats.csv\n",
"Shape : (108461, 6)\n"
]
},
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>id</th>\n",
" <th>clicked_at</th>\n",
" <th>link_id</th>\n",
" <th>customer_id</th>\n",
" <th>created_at</th>\n",
" <th>updated_at</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>1</td>\n",
" <td>2022-02-02 18:33:17+01:00</td>\n",
" <td>1</td>\n",
" <td>62137</td>\n",
" <td>2022-02-02 17:33:19.237759+01:00</td>\n",
" <td>2022-02-02 17:33:19.237759+01:00</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>2</td>\n",
" <td>2022-02-02 18:33:26+01:00</td>\n",
" <td>1</td>\n",
" <td>556048</td>\n",
" <td>2022-02-02 17:33:28.101943+01:00</td>\n",
" <td>2022-02-02 17:33:28.101943+01:00</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>3</td>\n",
" <td>2022-02-02 18:33:49+01:00</td>\n",
" <td>2</td>\n",
" <td>194456</td>\n",
" <td>2022-02-02 17:33:50.595125+01:00</td>\n",
" <td>2022-02-02 17:33:50.595125+01:00</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>4</td>\n",
" <td>2022-02-02 18:34:19+01:00</td>\n",
" <td>1</td>\n",
" <td>194456</td>\n",
" <td>2022-02-02 17:34:20.493986+01:00</td>\n",
" <td>2022-02-02 17:34:20.493986+01:00</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>5</td>\n",
" <td>2022-02-02 18:34:21+01:00</td>\n",
" <td>2</td>\n",
" <td>21571</td>\n",
" <td>2022-02-02 17:34:22.300427+01:00</td>\n",
" <td>2022-02-02 17:34:22.300427+01:00</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" id clicked_at link_id customer_id \\\n",
"0 1 2022-02-02 18:33:17+01:00 1 62137 \n",
"1 2 2022-02-02 18:33:26+01:00 1 556048 \n",
"2 3 2022-02-02 18:33:49+01:00 2 194456 \n",
"3 4 2022-02-02 18:34:19+01:00 1 194456 \n",
"4 5 2022-02-02 18:34:21+01:00 2 21571 \n",
"\n",
" created_at updated_at \n",
"0 2022-02-02 17:33:19.237759+01:00 2022-02-02 17:33:19.237759+01:00 \n",
"1 2022-02-02 17:33:28.101943+01:00 2022-02-02 17:33:28.101943+01:00 \n",
"2 2022-02-02 17:33:50.595125+01:00 2022-02-02 17:33:50.595125+01:00 \n",
"3 2022-02-02 17:34:20.493986+01:00 2022-02-02 17:34:20.493986+01:00 \n",
"4 2022-02-02 17:34:22.300427+01:00 2022-02-02 17:34:22.300427+01:00 "
]
},
"execution_count": 8,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"links_stats = display_databases(\"8link_stats.csv\")\n",
"links_stats.head()"
]
},
{
"cell_type": "markdown",
"id": "8dfcca1f-1323-413f-aa8d-3ee5ce2610a8",
"metadata": {},
"source": [
"#### Analyse Customersplus file"
]
},
{
"cell_type": "code",
"execution_count": 9,
"id": "3b523575-c779-451c-a12e-a36fb4ad232c",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"bdc2324-data/8/8customersplus.csv\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"/tmp/ipykernel_2473/2210053343.py:5: DtypeWarning: Columns (20) have mixed types. Specify dtype option on import or set low_memory=False.\n",
" customersplus = pd.read_csv(file_in, sep=\",\")\n"
]
},
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>id</th>\n",
" <th>lastname</th>\n",
" <th>firstname</th>\n",
" <th>birthdate</th>\n",
" <th>email</th>\n",
" <th>street_id</th>\n",
" <th>created_at</th>\n",
" <th>updated_at</th>\n",
" <th>civility</th>\n",
" <th>is_partner</th>\n",
" <th>...</th>\n",
" <th>preferred_category</th>\n",
" <th>preferred_supplier</th>\n",
" <th>preferred_formula</th>\n",
" <th>purchase_count</th>\n",
" <th>first_buying_date</th>\n",
" <th>last_visiting_date</th>\n",
" <th>zipcode</th>\n",
" <th>country</th>\n",
" <th>age</th>\n",
" <th>tenant_id</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>1411166</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>email1411166</td>\n",
" <td>1</td>\n",
" <td>2022-12-19 15:03:39.419371+01:00</td>\n",
" <td>2022-12-19 15:03:39.419371+01:00</td>\n",
" <td>NaN</td>\n",
" <td>False</td>\n",
" <td>...</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>0</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>fr</td>\n",
" <td>NaN</td>\n",
" <td>1594</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>478498</td>\n",
" <td>lastname478498</td>\n",
" <td>firstname478498</td>\n",
" <td>NaN</td>\n",
" <td>email478498</td>\n",
" <td>339167</td>\n",
" <td>2021-09-17 18:58:30.259053+02:00</td>\n",
" <td>2023-06-28 15:25:24.146689+02:00</td>\n",
" <td>NaN</td>\n",
" <td>False</td>\n",
" <td>...</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>0</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>1594</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>473678</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>email473678</td>\n",
" <td>339167</td>\n",
" <td>2021-09-17 18:44:04.119713+02:00</td>\n",
" <td>2021-09-17 18:44:04.124204+02:00</td>\n",
" <td>NaN</td>\n",
" <td>False</td>\n",
" <td>...</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>0</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>1594</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>475026</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>email475026</td>\n",
" <td>339167</td>\n",
" <td>2021-09-17 18:47:28.789618+02:00</td>\n",
" <td>2021-09-17 18:47:28.793958+02:00</td>\n",
" <td>NaN</td>\n",
" <td>False</td>\n",
" <td>...</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>0</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>1594</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>487146</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>email487146</td>\n",
" <td>339167</td>\n",
" <td>2021-09-17 19:10:24.070460+02:00</td>\n",
" <td>2021-09-17 19:10:24.076033+02:00</td>\n",
" <td>NaN</td>\n",
" <td>False</td>\n",
" <td>...</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>0</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>1594</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"<p>5 rows × 43 columns</p>\n",
"</div>"
],
"text/plain": [
" id lastname firstname birthdate email \\\n",
"0 1411166 NaN NaN NaN email1411166 \n",
"1 478498 lastname478498 firstname478498 NaN email478498 \n",
"2 473678 NaN NaN NaN email473678 \n",
"3 475026 NaN NaN NaN email475026 \n",
"4 487146 NaN NaN NaN email487146 \n",
"\n",
" street_id created_at \\\n",
"0 1 2022-12-19 15:03:39.419371+01:00 \n",
"1 339167 2021-09-17 18:58:30.259053+02:00 \n",
"2 339167 2021-09-17 18:44:04.119713+02:00 \n",
"3 339167 2021-09-17 18:47:28.789618+02:00 \n",
"4 339167 2021-09-17 19:10:24.070460+02:00 \n",
"\n",
" updated_at civility is_partner ... \\\n",
"0 2022-12-19 15:03:39.419371+01:00 NaN False ... \n",
"1 2023-06-28 15:25:24.146689+02:00 NaN False ... \n",
"2 2021-09-17 18:44:04.124204+02:00 NaN False ... \n",
"3 2021-09-17 18:47:28.793958+02:00 NaN False ... \n",
"4 2021-09-17 19:10:24.076033+02:00 NaN False ... \n",
"\n",
" preferred_category preferred_supplier preferred_formula purchase_count \\\n",
"0 NaN NaN NaN 0 \n",
"1 NaN NaN NaN 0 \n",
"2 NaN NaN NaN 0 \n",
"3 NaN NaN NaN 0 \n",
"4 NaN NaN NaN 0 \n",
"\n",
" first_buying_date last_visiting_date zipcode country age tenant_id \n",
"0 NaN NaN NaN fr NaN 1594 \n",
"1 NaN NaN NaN NaN NaN 1594 \n",
"2 NaN NaN NaN NaN NaN 1594 \n",
"3 NaN NaN NaN NaN NaN 1594 \n",
"4 NaN NaN NaN NaN NaN 1594 \n",
"\n",
"[5 rows x 43 columns]"
]
},
"execution_count": 9,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"file_name = \"8customersplus.csv\"\n",
"file_path = BUCKET + \"/\" + directory_path + \"/\" + file_name\n",
"print(file_path)\n",
"with fs.open(file_path, mode=\"rb\") as file_in:\n",
" customersplus = pd.read_csv(file_in, sep=\",\")\n",
"\n",
"customersplus.head()"
]
},
{
"cell_type": "markdown",
"id": "fe56785a-ed3c-4322-aafa-a630f97b836f",
"metadata": {},
"source": [
"#### Analyse Structures files"
]
},
{
"cell_type": "code",
"execution_count": 10,
"id": "87d801fc-d19a-4c45-9b21-9b6d7a8451fd",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"bdc2324-data/8/8structures.csv\n",
"No structures database\n"
]
}
],
"source": [
"file_name = \"8structures.csv\"\n",
"file_path = BUCKET + \"/\" + directory_path + \"/\" + file_name\n",
"print(file_path)\n",
"try:\n",
" with fs.open(file_path, mode=\"rb\") as file_in:\n",
" structures = pd.read_csv(file_in, sep=\",\")\n",
"except:\n",
" print(\"No structures database\")"
]
},
{
"cell_type": "markdown",
"id": "b8452558-2d32-459b-91e7-f6042345e465",
"metadata": {},
"source": [
"For Stade Français, there is no structures, tags and structure_tag_mapping databases"
]
},
{
"cell_type": "markdown",
"id": "285b1422-9ca9-4afd-b752-777a54aaa677",
"metadata": {},
"source": [
"#### Analyze Target databases"
]
},
{
"cell_type": "code",
"execution_count": 11,
"id": "b6e4c3ea-5ccf-4aec-bd2d-79a5a1194178",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"bdc2324-data/8/8customer_target_mappings.csv\n",
"Shape : (1449147, 7)\n"
]
},
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>id</th>\n",
" <th>customer_id</th>\n",
" <th>target_id</th>\n",
" <th>created_at</th>\n",
" <th>updated_at</th>\n",
" <th>name</th>\n",
" <th>extra_field</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>1</td>\n",
" <td>460062</td>\n",
" <td>68</td>\n",
" <td>2021-09-17 20:20:24.562734+02:00</td>\n",
" <td>2021-09-17 20:20:24.562734+02:00</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>2</td>\n",
" <td>460056</td>\n",
" <td>68</td>\n",
" <td>2021-09-17 20:20:24.610139+02:00</td>\n",
" <td>2021-09-17 20:20:24.610139+02:00</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>3</td>\n",
" <td>460051</td>\n",
" <td>65</td>\n",
" <td>2021-09-17 20:20:24.641381+02:00</td>\n",
" <td>2021-09-17 20:20:24.641381+02:00</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>4</td>\n",
" <td>460051</td>\n",
" <td>66</td>\n",
" <td>2021-09-17 20:20:24.672238+02:00</td>\n",
" <td>2021-09-17 20:20:24.672238+02:00</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>5</td>\n",
" <td>460049</td>\n",
" <td>71</td>\n",
" <td>2021-09-17 20:20:24.703110+02:00</td>\n",
" <td>2021-09-17 20:20:24.703110+02:00</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" id customer_id target_id created_at \\\n",
"0 1 460062 68 2021-09-17 20:20:24.562734+02:00 \n",
"1 2 460056 68 2021-09-17 20:20:24.610139+02:00 \n",
"2 3 460051 65 2021-09-17 20:20:24.641381+02:00 \n",
"3 4 460051 66 2021-09-17 20:20:24.672238+02:00 \n",
"4 5 460049 71 2021-09-17 20:20:24.703110+02:00 \n",
"\n",
" updated_at name extra_field \n",
"0 2021-09-17 20:20:24.562734+02:00 NaN NaN \n",
"1 2021-09-17 20:20:24.610139+02:00 NaN NaN \n",
"2 2021-09-17 20:20:24.641381+02:00 NaN NaN \n",
"3 2021-09-17 20:20:24.672238+02:00 NaN NaN \n",
"4 2021-09-17 20:20:24.703110+02:00 NaN NaN "
]
},
"execution_count": 11,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"file_name = \"8customer_target_mappings.csv\"\n",
"file_path = BUCKET + \"/\" + directory_path + \"/\" + file_name\n",
"print(file_path)\n",
"try:\n",
" with fs.open(file_path, mode=\"rb\") as file_in:\n",
" customer_targets = pd.read_csv(file_in, sep=\",\")\n",
" \n",
"except:\n",
" print(\"No such database in s3\")\n",
"\n",
"print(\"Shape : \", customer_targets.shape)\n",
"customer_targets.head()"
]
},
{
"cell_type": "code",
"execution_count": 12,
"id": "6e81a35c-3c6f-403d-9ebd-e8399ecd4263",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"bdc2324-data/8/8targets.csv\n",
"Shape : (331, 5)\n"
]
},
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>id</th>\n",
" <th>target_type_id</th>\n",
" <th>name</th>\n",
" <th>created_at</th>\n",
" <th>updated_at</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>ÉTUDIANTS (OPÉ PANIERS) 21-22</td>\n",
" <td>2021-09-17 18:10:40.879995+02:00</td>\n",
" <td>2021-09-17 18:10:40.879995+02:00</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>2</td>\n",
" <td>1</td>\n",
" <td>EFFECTIF + STAFF 21-22</td>\n",
" <td>2021-09-17 18:10:40.894758+02:00</td>\n",
" <td>2021-09-17 18:10:40.894758+02:00</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>3</td>\n",
" <td>1</td>\n",
" <td>Acheteurs LOU / USAP</td>\n",
" <td>2021-09-17 18:10:40.911969+02:00</td>\n",
" <td>2021-09-17 18:10:40.911969+02:00</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>4</td>\n",
" <td>1</td>\n",
" <td>Liste Compensation 21-22</td>\n",
" <td>2021-09-17 18:10:40.928796+02:00</td>\n",
" <td>2021-09-17 18:10:40.928796+02:00</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>5</td>\n",
" <td>1</td>\n",
" <td>Partenaires 21-22</td>\n",
" <td>2021-09-17 18:10:40.945476+02:00</td>\n",
" <td>2021-09-17 18:10:40.945476+02:00</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" id target_type_id name \\\n",
"0 1 1 ÉTUDIANTS (OPÉ PANIERS) 21-22 \n",
"1 2 1 EFFECTIF + STAFF 21-22 \n",
"2 3 1 Acheteurs LOU / USAP \n",
"3 4 1 Liste Compensation 21-22 \n",
"4 5 1 Partenaires 21-22 \n",
"\n",
" created_at updated_at \n",
"0 2021-09-17 18:10:40.879995+02:00 2021-09-17 18:10:40.879995+02:00 \n",
"1 2021-09-17 18:10:40.894758+02:00 2021-09-17 18:10:40.894758+02:00 \n",
"2 2021-09-17 18:10:40.911969+02:00 2021-09-17 18:10:40.911969+02:00 \n",
"3 2021-09-17 18:10:40.928796+02:00 2021-09-17 18:10:40.928796+02:00 \n",
"4 2021-09-17 18:10:40.945476+02:00 2021-09-17 18:10:40.945476+02:00 "
]
},
"execution_count": 12,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"file_name = \"8targets.csv\"\n",
"file_path = BUCKET + \"/\" + directory_path + \"/\" + file_name\n",
"print(file_path)\n",
"try:\n",
" with fs.open(file_path, mode=\"rb\") as file_in:\n",
" targets = pd.read_csv(file_in, sep=\",\")\n",
" \n",
"except:\n",
" print(\"No such database in s3\")\n",
"\n",
"print(\"Shape : \", targets.shape)\n",
"targets.head()"
]
},
{
"cell_type": "code",
"execution_count": 13,
"id": "85696d74-3b2f-4368-9045-44db5322b60d",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"bdc2324-data/8/8target_types.csv\n",
"Shape : (4, 6)\n"
]
},
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>id</th>\n",
" <th>is_import</th>\n",
" <th>name</th>\n",
" <th>created_at</th>\n",
" <th>updated_at</th>\n",
" <th>identifier</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>1</td>\n",
" <td>NaN</td>\n",
" <td>manual_static_filter</td>\n",
" <td>2021-09-17 18:10:40.864320+02:00</td>\n",
" <td>2021-09-17 18:10:40.864320+02:00</td>\n",
" <td>e34e3aa838a6eb4c41df6ed4444b796a</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>2</td>\n",
" <td>False</td>\n",
" <td>manual_dynamic_filter</td>\n",
" <td>2022-03-09 14:41:45.695407+01:00</td>\n",
" <td>2022-03-09 14:41:45.695407+01:00</td>\n",
" <td>e0f4b8693184850fefd6d2a38f10584e</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>3</td>\n",
" <td>False</td>\n",
" <td>manual_static_filter</td>\n",
" <td>2022-04-01 17:02:49.588910+02:00</td>\n",
" <td>2022-04-01 17:02:49.588910+02:00</td>\n",
" <td>fb27e81baa4debc6a4e1a8639c20e808</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>4</td>\n",
" <td>True</td>\n",
" <td>manual_import</td>\n",
" <td>2022-05-06 14:26:01.923160+02:00</td>\n",
" <td>2022-05-06 14:26:01.923160+02:00</td>\n",
" <td>12213df2ce68a624e4c0070521437bac</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" id is_import name created_at \\\n",
"0 1 NaN manual_static_filter 2021-09-17 18:10:40.864320+02:00 \n",
"1 2 False manual_dynamic_filter 2022-03-09 14:41:45.695407+01:00 \n",
"2 3 False manual_static_filter 2022-04-01 17:02:49.588910+02:00 \n",
"3 4 True manual_import 2022-05-06 14:26:01.923160+02:00 \n",
"\n",
" updated_at identifier \n",
"0 2021-09-17 18:10:40.864320+02:00 e34e3aa838a6eb4c41df6ed4444b796a \n",
"1 2022-03-09 14:41:45.695407+01:00 e0f4b8693184850fefd6d2a38f10584e \n",
"2 2022-04-01 17:02:49.588910+02:00 fb27e81baa4debc6a4e1a8639c20e808 \n",
"3 2022-05-06 14:26:01.923160+02:00 12213df2ce68a624e4c0070521437bac "
]
},
"execution_count": 13,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"file_name = \"8target_types.csv\"\n",
"file_path = BUCKET + \"/\" + directory_path + \"/\" + file_name\n",
"print(file_path)\n",
"try:\n",
" with fs.open(file_path, mode=\"rb\") as file_in:\n",
" target_types = pd.read_csv(file_in, sep=\",\")\n",
" \n",
"except:\n",
" print(\"No such database in s3\")\n",
"\n",
"print(\"Shape : \", target_types.shape)\n",
"target_types.head()"
]
},
{
"cell_type": "markdown",
"id": "cdc6416b-3deb-446c-8957-435745b93533",
"metadata": {},
"source": [
"#### Analyze consumption files"
]
},
{
"cell_type": "markdown",
"id": "f8622bd5-a5ab-403f-ab01-758aec879ee4",
"metadata": {},
"source": [
"Meaning consumptions.csv, suppliers.csv, tickets.csv and purchases.csv\n",
"\n",
"However, there is no consumptions.csv file"
]
},
{
"cell_type": "code",
"execution_count": 14,
"id": "7c57529b-2ffb-4039-9795-b27c6fbd54a4",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"File path : bdc2324-data/8/8purchases.csv\n",
"Shape : (975703, 7)\n"
]
},
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>id</th>\n",
" <th>purchase_date</th>\n",
" <th>customer_id</th>\n",
" <th>created_at</th>\n",
" <th>updated_at</th>\n",
" <th>number</th>\n",
" <th>identifier</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>119609</td>\n",
" <td>2017-09-09 15:39:45.913000+02:00</td>\n",
" <td>1149</td>\n",
" <td>2021-06-29 21:52:21.816195+02:00</td>\n",
" <td>2021-06-29 21:52:21.816195+02:00</td>\n",
" <td>193416</td>\n",
" <td>f2956e2d53321317e7c15c1cb992156c</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>119610</td>\n",
" <td>2017-09-09 15:39:46.033000+02:00</td>\n",
" <td>1149</td>\n",
" <td>2021-06-29 21:52:21.817846+02:00</td>\n",
" <td>2021-06-29 21:52:21.817846+02:00</td>\n",
" <td>193416</td>\n",
" <td>faabab441b2668a85bb484490b2166c3</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>5464</td>\n",
" <td>2017-07-24 19:44:11.923000+02:00</td>\n",
" <td>1251</td>\n",
" <td>2021-06-29 21:33:45.604224+02:00</td>\n",
" <td>2021-06-29 21:33:45.604224+02:00</td>\n",
" <td>184354</td>\n",
" <td>f63c69fa585ce4f91681f0d9ebeb770f</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>119613</td>\n",
" <td>2017-09-10 11:25:45.820000+02:00</td>\n",
" <td>12558</td>\n",
" <td>2021-06-29 21:52:21.822033+02:00</td>\n",
" <td>2021-06-29 21:52:21.822033+02:00</td>\n",
" <td>193462</td>\n",
" <td>ffce5fd8d2348eb6885d0ee9c7bd017c</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>1422860</td>\n",
" <td>2018-10-08 10:30:42.980000+02:00</td>\n",
" <td>17935</td>\n",
" <td>2021-07-16 04:20:55.347369+02:00</td>\n",
" <td>2021-07-16 04:20:55.347369+02:00</td>\n",
" <td>247459</td>\n",
" <td>193e41eae8ee078537107a569c0426ef</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" id purchase_date customer_id \\\n",
"0 119609 2017-09-09 15:39:45.913000+02:00 1149 \n",
"1 119610 2017-09-09 15:39:46.033000+02:00 1149 \n",
"2 5464 2017-07-24 19:44:11.923000+02:00 1251 \n",
"3 119613 2017-09-10 11:25:45.820000+02:00 12558 \n",
"4 1422860 2018-10-08 10:30:42.980000+02:00 17935 \n",
"\n",
" created_at updated_at number \\\n",
"0 2021-06-29 21:52:21.816195+02:00 2021-06-29 21:52:21.816195+02:00 193416 \n",
"1 2021-06-29 21:52:21.817846+02:00 2021-06-29 21:52:21.817846+02:00 193416 \n",
"2 2021-06-29 21:33:45.604224+02:00 2021-06-29 21:33:45.604224+02:00 184354 \n",
"3 2021-06-29 21:52:21.822033+02:00 2021-06-29 21:52:21.822033+02:00 193462 \n",
"4 2021-07-16 04:20:55.347369+02:00 2021-07-16 04:20:55.347369+02:00 247459 \n",
"\n",
" identifier \n",
"0 f2956e2d53321317e7c15c1cb992156c \n",
"1 faabab441b2668a85bb484490b2166c3 \n",
"2 f63c69fa585ce4f91681f0d9ebeb770f \n",
"3 ffce5fd8d2348eb6885d0ee9c7bd017c \n",
"4 193e41eae8ee078537107a569c0426ef "
]
},
"execution_count": 14,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"purchases = display_databases(\"8purchases.csv\")\n",
"purchases.head()"
]
},
{
"cell_type": "code",
"execution_count": 15,
"id": "903321fb-99f8-475d-b4a6-c70ec2efe190",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"File path : bdc2324-data/8/8tickets.csv\n",
"Shape : (2370152, 11)\n"
]
},
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>id</th>\n",
" <th>number</th>\n",
" <th>created_at</th>\n",
" <th>updated_at</th>\n",
" <th>purchase_id</th>\n",
" <th>product_id</th>\n",
" <th>is_from_subscription</th>\n",
" <th>type_of</th>\n",
" <th>supplier_id</th>\n",
" <th>barcode</th>\n",
" <th>identifier</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>254164</td>\n",
" <td>193416_763837_650_688_326212</td>\n",
" <td>2021-06-29 21:53:14.951871+02:00</td>\n",
" <td>2021-06-29 21:53:14.951871+02:00</td>\n",
" <td>119609</td>\n",
" <td>3334</td>\n",
" <td>False</td>\n",
" <td>1</td>\n",
" <td>2</td>\n",
" <td>NaN</td>\n",
" <td>9ec3b5617fc54512acf131aa5fa26870</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>254165</td>\n",
" <td>193416_763838_650_688_326236</td>\n",
" <td>2021-06-29 21:53:14.953717+02:00</td>\n",
" <td>2021-06-29 21:53:14.953717+02:00</td>\n",
" <td>119610</td>\n",
" <td>3334</td>\n",
" <td>False</td>\n",
" <td>1</td>\n",
" <td>2</td>\n",
" <td>NaN</td>\n",
" <td>b227c664e2574a919672683f5cc4c98e</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>254168</td>\n",
" <td>193462_763921_649_687_305676</td>\n",
" <td>2021-06-29 21:53:14.958207+02:00</td>\n",
" <td>2021-06-29 21:53:14.958207+02:00</td>\n",
" <td>119613</td>\n",
" <td>3432</td>\n",
" <td>False</td>\n",
" <td>1</td>\n",
" <td>2</td>\n",
" <td>NaN</td>\n",
" <td>28ac507ad84a30993bdfc0996fd2476b</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>254169</td>\n",
" <td>193462_763922_649_687_305653</td>\n",
" <td>2021-06-29 21:53:14.959681+02:00</td>\n",
" <td>2021-06-29 21:53:14.959681+02:00</td>\n",
" <td>119614</td>\n",
" <td>3268</td>\n",
" <td>False</td>\n",
" <td>1</td>\n",
" <td>2</td>\n",
" <td>NaN</td>\n",
" <td>131dbaeef23f5ac2271bf0266ce35476</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>254170</td>\n",
" <td>193462_763923_649_687_305630</td>\n",
" <td>2021-06-29 21:53:14.961157+02:00</td>\n",
" <td>2021-06-29 21:53:14.961157+02:00</td>\n",
" <td>119615</td>\n",
" <td>3268</td>\n",
" <td>False</td>\n",
" <td>1</td>\n",
" <td>2</td>\n",
" <td>NaN</td>\n",
" <td>1a6342ad2c213b626aa55e5374cd661a</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" id number created_at \\\n",
"0 254164 193416_763837_650_688_326212 2021-06-29 21:53:14.951871+02:00 \n",
"1 254165 193416_763838_650_688_326236 2021-06-29 21:53:14.953717+02:00 \n",
"2 254168 193462_763921_649_687_305676 2021-06-29 21:53:14.958207+02:00 \n",
"3 254169 193462_763922_649_687_305653 2021-06-29 21:53:14.959681+02:00 \n",
"4 254170 193462_763923_649_687_305630 2021-06-29 21:53:14.961157+02:00 \n",
"\n",
" updated_at purchase_id product_id \\\n",
"0 2021-06-29 21:53:14.951871+02:00 119609 3334 \n",
"1 2021-06-29 21:53:14.953717+02:00 119610 3334 \n",
"2 2021-06-29 21:53:14.958207+02:00 119613 3432 \n",
"3 2021-06-29 21:53:14.959681+02:00 119614 3268 \n",
"4 2021-06-29 21:53:14.961157+02:00 119615 3268 \n",
"\n",
" is_from_subscription type_of supplier_id barcode \\\n",
"0 False 1 2 NaN \n",
"1 False 1 2 NaN \n",
"2 False 1 2 NaN \n",
"3 False 1 2 NaN \n",
"4 False 1 2 NaN \n",
"\n",
" identifier \n",
"0 9ec3b5617fc54512acf131aa5fa26870 \n",
"1 b227c664e2574a919672683f5cc4c98e \n",
"2 28ac507ad84a30993bdfc0996fd2476b \n",
"3 131dbaeef23f5ac2271bf0266ce35476 \n",
"4 1a6342ad2c213b626aa55e5374cd661a "
]
},
"execution_count": 15,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"tickets = display_databases(\"8tickets.csv\")\n",
"tickets.head()"
]
},
{
"cell_type": "code",
"execution_count": 16,
"id": "243e6942-0233-4cd5-b32b-e005457131d2",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"File path : bdc2324-data/8/8suppliers.csv\n",
"Shape : (16, 9)\n"
]
},
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>id</th>\n",
" <th>name</th>\n",
" <th>manually_added</th>\n",
" <th>label</th>\n",
" <th>itr</th>\n",
" <th>updated_at</th>\n",
" <th>created_at</th>\n",
" <th>commission</th>\n",
" <th>identifier</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>152</td>\n",
" <td>plateformeceweb</td>\n",
" <td>False</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>2021-07-16 00:02:17.805193+02:00</td>\n",
" <td>2021-07-16 00:02:17.805193+02:00</td>\n",
" <td>NaN</td>\n",
" <td>0fc934f49bfa9f1f4e6ab7e2593b6839</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>6</td>\n",
" <td>accreditation annuelle</td>\n",
" <td>False</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>2021-06-29 21:33:14.138349+02:00</td>\n",
" <td>2021-06-29 21:33:14.138349+02:00</td>\n",
" <td>NaN</td>\n",
" <td>fe13238540e0ff293ec8aad29aeae6c3</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>68</td>\n",
" <td>abonnement parking</td>\n",
" <td>False</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>2021-06-29 22:10:31.167367+02:00</td>\n",
" <td>2021-06-29 22:10:31.167367+02:00</td>\n",
" <td>NaN</td>\n",
" <td>0f7defc52a97cdca533af74f4e6e5b1e</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>9</td>\n",
" <td>accreditation match</td>\n",
" <td>False</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>2021-06-29 21:33:14.142084+02:00</td>\n",
" <td>2021-06-29 21:33:14.142084+02:00</td>\n",
" <td>NaN</td>\n",
" <td>40e19a7c4824eaad298e0107ed7e3691</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>154</td>\n",
" <td>web lnr-lou</td>\n",
" <td>False</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>2021-07-16 00:02:17.806521+02:00</td>\n",
" <td>2021-07-16 00:02:17.806521+02:00</td>\n",
" <td>NaN</td>\n",
" <td>b144dd617807b02e0d9002fac6c61768</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" id name manually_added label itr \\\n",
"0 152 plateformeceweb False NaN NaN \n",
"1 6 accreditation annuelle False NaN NaN \n",
"2 68 abonnement parking False NaN NaN \n",
"3 9 accreditation match False NaN NaN \n",
"4 154 web lnr-lou False NaN NaN \n",
"\n",
" updated_at created_at \\\n",
"0 2021-07-16 00:02:17.805193+02:00 2021-07-16 00:02:17.805193+02:00 \n",
"1 2021-06-29 21:33:14.138349+02:00 2021-06-29 21:33:14.138349+02:00 \n",
"2 2021-06-29 22:10:31.167367+02:00 2021-06-29 22:10:31.167367+02:00 \n",
"3 2021-06-29 21:33:14.142084+02:00 2021-06-29 21:33:14.142084+02:00 \n",
"4 2021-07-16 00:02:17.806521+02:00 2021-07-16 00:02:17.806521+02:00 \n",
"\n",
" commission identifier \n",
"0 NaN 0fc934f49bfa9f1f4e6ab7e2593b6839 \n",
"1 NaN fe13238540e0ff293ec8aad29aeae6c3 \n",
"2 NaN 0f7defc52a97cdca533af74f4e6e5b1e \n",
"3 NaN 40e19a7c4824eaad298e0107ed7e3691 \n",
"4 NaN b144dd617807b02e0d9002fac6c61768 "
]
},
"execution_count": 16,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"suppliers = display_databases(\"8suppliers.csv\")\n",
"suppliers.head()"
]
},
{
"cell_type": "markdown",
"id": "fd8c876a-f0c5-4123-a422-c267af5f29b1",
"metadata": {},
"source": [
"#### Analyze product file"
]
},
{
"cell_type": "code",
"execution_count": 17,
"id": "6b82efce-1dee-4d89-8585-28c4ad477eef",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"File path : bdc2324-data/8/8products.csv\n",
"Shape : (45411, 14)\n"
]
},
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>id</th>\n",
" <th>amount</th>\n",
" <th>is_full_price</th>\n",
" <th>representation_id</th>\n",
" <th>pricing_formula_id</th>\n",
" <th>created_at</th>\n",
" <th>updated_at</th>\n",
" <th>category_id</th>\n",
" <th>apply_price</th>\n",
" <th>products_group_id</th>\n",
" <th>product_pack_id</th>\n",
" <th>extra_field</th>\n",
" <th>amount_consumption</th>\n",
" <th>identifier</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>90013</td>\n",
" <td>0.0</td>\n",
" <td>False</td>\n",
" <td>1961</td>\n",
" <td>912</td>\n",
" <td>2021-07-16 04:56:05.797551+02:00</td>\n",
" <td>2021-07-16 04:56:05.797551+02:00</td>\n",
" <td>34</td>\n",
" <td>0.0</td>\n",
" <td>87917</td>\n",
" <td>1</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>476e111175b1660688b7c13dade2b57e</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>662</td>\n",
" <td>0.0</td>\n",
" <td>False</td>\n",
" <td>11</td>\n",
" <td>29</td>\n",
" <td>2021-06-29 21:33:17.389201+02:00</td>\n",
" <td>2021-06-29 21:33:17.389201+02:00</td>\n",
" <td>16</td>\n",
" <td>0.0</td>\n",
" <td>640</td>\n",
" <td>1</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>2c765698e9bedd48e8a3fd27dc8dbc97</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>646</td>\n",
" <td>0.0</td>\n",
" <td>False</td>\n",
" <td>46</td>\n",
" <td>10</td>\n",
" <td>2021-06-29 21:33:17.366742+02:00</td>\n",
" <td>2021-06-29 21:33:17.366742+02:00</td>\n",
" <td>15</td>\n",
" <td>0.0</td>\n",
" <td>624</td>\n",
" <td>1</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>4e719148651fd7f175e3fb51bdb5d31b</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>5703</td>\n",
" <td>5.0</td>\n",
" <td>False</td>\n",
" <td>7</td>\n",
" <td>188</td>\n",
" <td>2021-06-29 21:52:09.374365+02:00</td>\n",
" <td>2021-06-29 21:52:09.374365+02:00</td>\n",
" <td>4</td>\n",
" <td>0.0</td>\n",
" <td>5540</td>\n",
" <td>1</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>e4d7beeb0a631e2e51e61951623ba9b1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>648</td>\n",
" <td>0.0</td>\n",
" <td>False</td>\n",
" <td>49</td>\n",
" <td>10</td>\n",
" <td>2021-06-29 21:33:17.369471+02:00</td>\n",
" <td>2021-06-29 21:33:17.369471+02:00</td>\n",
" <td>15</td>\n",
" <td>0.0</td>\n",
" <td>626</td>\n",
" <td>1</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>07a5dd9e125345b9458651ab73605255</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" id amount is_full_price representation_id pricing_formula_id \\\n",
"0 90013 0.0 False 1961 912 \n",
"1 662 0.0 False 11 29 \n",
"2 646 0.0 False 46 10 \n",
"3 5703 5.0 False 7 188 \n",
"4 648 0.0 False 49 10 \n",
"\n",
" created_at updated_at \\\n",
"0 2021-07-16 04:56:05.797551+02:00 2021-07-16 04:56:05.797551+02:00 \n",
"1 2021-06-29 21:33:17.389201+02:00 2021-06-29 21:33:17.389201+02:00 \n",
"2 2021-06-29 21:33:17.366742+02:00 2021-06-29 21:33:17.366742+02:00 \n",
"3 2021-06-29 21:52:09.374365+02:00 2021-06-29 21:52:09.374365+02:00 \n",
"4 2021-06-29 21:33:17.369471+02:00 2021-06-29 21:33:17.369471+02:00 \n",
"\n",
" category_id apply_price products_group_id product_pack_id extra_field \\\n",
"0 34 0.0 87917 1 NaN \n",
"1 16 0.0 640 1 NaN \n",
"2 15 0.0 624 1 NaN \n",
"3 4 0.0 5540 1 NaN \n",
"4 15 0.0 626 1 NaN \n",
"\n",
" amount_consumption identifier \n",
"0 NaN 476e111175b1660688b7c13dade2b57e \n",
"1 NaN 2c765698e9bedd48e8a3fd27dc8dbc97 \n",
"2 NaN 4e719148651fd7f175e3fb51bdb5d31b \n",
"3 NaN e4d7beeb0a631e2e51e61951623ba9b1 \n",
"4 NaN 07a5dd9e125345b9458651ab73605255 "
]
},
"execution_count": 17,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"products = display_databases(\"8products.csv\")\n",
"products.head()"
]
},
{
"cell_type": "markdown",
"id": "8ad143b2-2869-4bd2-982e-688498b98727",
"metadata": {},
"source": [
"#### Analyze pricing files"
]
},
{
"cell_type": "markdown",
"id": "9a54e9a5-801d-4000-9e76-e792edbf7e41",
"metadata": {},
"source": [
"These are pricing_formulas.csv and type_of_pricing_formulas.csv."
]
},
{
"cell_type": "code",
"execution_count": 18,
"id": "daf37bff-a26d-4ff5-ad50-c90f917164bd",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"File path : bdc2324-data/8/8pricing_formulas.csv\n",
"Shape : (516, 6)\n"
]
},
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>id</th>\n",
" <th>name</th>\n",
" <th>created_at</th>\n",
" <th>updated_at</th>\n",
" <th>extra_field</th>\n",
" <th>identifier</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>7</td>\n",
" <td>visite stade enfant</td>\n",
" <td>2021-06-29 21:33:14.160728+02:00</td>\n",
" <td>2021-06-29 21:33:14.160728+02:00</td>\n",
" <td>NaN</td>\n",
" <td>bbc80e5761a0ea325f6f6a5411752659</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>3229</td>\n",
" <td>tarif bloc etudiants</td>\n",
" <td>2021-07-16 04:20:46.684601+02:00</td>\n",
" <td>2021-09-03 16:44:46.096785+02:00</td>\n",
" <td>NaN</td>\n",
" <td>205122cc7e96d559330972b0ec0cf35a</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>42</td>\n",
" <td>invitation eiffage</td>\n",
" <td>2021-06-29 21:33:14.204483+02:00</td>\n",
" <td>2021-06-29 21:33:14.204483+02:00</td>\n",
" <td>NaN</td>\n",
" <td>e4e6365c02e2a7b01ebe2ce8ace624f2</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>4379</td>\n",
" <td>invitation offre speciale</td>\n",
" <td>2021-07-16 05:21:44.984893+02:00</td>\n",
" <td>2021-07-16 05:21:44.984893+02:00</td>\n",
" <td>NaN</td>\n",
" <td>307817b6205535a35915a64027ee161e</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>2641</td>\n",
" <td>prevente reabo enfant</td>\n",
" <td>2021-07-16 03:47:40.896805+02:00</td>\n",
" <td>2021-09-03 16:08:35.304298+02:00</td>\n",
" <td>NaN</td>\n",
" <td>478eb63c71ba35d8d3d64c8637dafdee</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" id name created_at \\\n",
"0 7 visite stade enfant 2021-06-29 21:33:14.160728+02:00 \n",
"1 3229 tarif bloc etudiants 2021-07-16 04:20:46.684601+02:00 \n",
"2 42 invitation eiffage 2021-06-29 21:33:14.204483+02:00 \n",
"3 4379 invitation offre speciale 2021-07-16 05:21:44.984893+02:00 \n",
"4 2641 prevente reabo enfant 2021-07-16 03:47:40.896805+02:00 \n",
"\n",
" updated_at extra_field \\\n",
"0 2021-06-29 21:33:14.160728+02:00 NaN \n",
"1 2021-09-03 16:44:46.096785+02:00 NaN \n",
"2 2021-06-29 21:33:14.204483+02:00 NaN \n",
"3 2021-07-16 05:21:44.984893+02:00 NaN \n",
"4 2021-09-03 16:08:35.304298+02:00 NaN \n",
"\n",
" identifier \n",
"0 bbc80e5761a0ea325f6f6a5411752659 \n",
"1 205122cc7e96d559330972b0ec0cf35a \n",
"2 e4e6365c02e2a7b01ebe2ce8ace624f2 \n",
"3 307817b6205535a35915a64027ee161e \n",
"4 478eb63c71ba35d8d3d64c8637dafdee "
]
},
"execution_count": 18,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"pricing_formulas = display_databases(\"8pricing_formulas.csv\")\n",
"pricing_formulas.head()"
]
},
{
"cell_type": "code",
"execution_count": 19,
"id": "cdb14488-b093-4b39-84fa-1c2b4576208f",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"File path : bdc2324-data/8/8type_of_pricing_formulas.csv\n",
"Shape : (103, 6)\n"
]
},
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>id</th>\n",
" <th>type_of_id</th>\n",
" <th>pricing_formula_id</th>\n",
" <th>created_at</th>\n",
" <th>updated_at</th>\n",
" <th>identifier</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>1</td>\n",
" <td>7</td>\n",
" <td>1021</td>\n",
" <td>2021-09-03 14:17:19.816110+02:00</td>\n",
" <td>2021-09-03 14:17:19.816110+02:00</td>\n",
" <td>41047fbeb7cd3e1cb2713c608d2f786d</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>2</td>\n",
" <td>7</td>\n",
" <td>4305</td>\n",
" <td>2021-09-03 14:17:19.848088+02:00</td>\n",
" <td>2021-09-03 14:17:19.848088+02:00</td>\n",
" <td>a62a4dad7d62738129244bbb5ede0747</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>3</td>\n",
" <td>7</td>\n",
" <td>4306</td>\n",
" <td>2021-09-03 14:17:19.864067+02:00</td>\n",
" <td>2021-09-03 14:17:19.864067+02:00</td>\n",
" <td>c3770373e09f55412068c447736d9da3</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>4</td>\n",
" <td>7</td>\n",
" <td>29</td>\n",
" <td>2021-09-03 14:17:19.880078+02:00</td>\n",
" <td>2021-09-03 14:17:19.880078+02:00</td>\n",
" <td>7b7b1242ae7a8c9eb66d35d8a4348ccd</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>5</td>\n",
" <td>8</td>\n",
" <td>10</td>\n",
" <td>2021-09-03 14:18:03.616081+02:00</td>\n",
" <td>2021-09-03 14:18:03.616081+02:00</td>\n",
" <td>0a2b941c46b31258c03b316aa064e86a</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" id type_of_id pricing_formula_id created_at \\\n",
"0 1 7 1021 2021-09-03 14:17:19.816110+02:00 \n",
"1 2 7 4305 2021-09-03 14:17:19.848088+02:00 \n",
"2 3 7 4306 2021-09-03 14:17:19.864067+02:00 \n",
"3 4 7 29 2021-09-03 14:17:19.880078+02:00 \n",
"4 5 8 10 2021-09-03 14:18:03.616081+02:00 \n",
"\n",
" updated_at identifier \n",
"0 2021-09-03 14:17:19.816110+02:00 41047fbeb7cd3e1cb2713c608d2f786d \n",
"1 2021-09-03 14:17:19.848088+02:00 a62a4dad7d62738129244bbb5ede0747 \n",
"2 2021-09-03 14:17:19.864067+02:00 c3770373e09f55412068c447736d9da3 \n",
"3 2021-09-03 14:17:19.880078+02:00 7b7b1242ae7a8c9eb66d35d8a4348ccd \n",
"4 2021-09-03 14:18:03.616081+02:00 0a2b941c46b31258c03b316aa064e86a "
]
},
"execution_count": 19,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"type_pricing_formulas = display_databases(\"8type_of_pricing_formulas.csv\")\n",
"type_pricing_formulas.head()"
]
},
{
"cell_type": "markdown",
"id": "a084297a-4fd7-4cda-b513-7704f4244a5c",
"metadata": {},
"source": [
"#### Analyze type of products"
]
},
{
"cell_type": "markdown",
"id": "76a67ea7-8720-441e-8973-23e5d105370e",
"metadata": {},
"source": [
"These are categories.csv and type_of_categories.csv."
]
},
{
"cell_type": "code",
"execution_count": 20,
"id": "6582694d-5339-4f33-a943-c73033121a90",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"File path : bdc2324-data/8/8categories.csv\n",
"Shape : (148, 7)\n"
]
},
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>id</th>\n",
" <th>name</th>\n",
" <th>created_at</th>\n",
" <th>updated_at</th>\n",
" <th>extra_field</th>\n",
" <th>quota</th>\n",
" <th>identifier</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>653</td>\n",
" <td>acces village implid</td>\n",
" <td>2021-07-16 00:04:37.181331+02:00</td>\n",
" <td>2021-07-16 00:04:37.181331+02:00</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>c447d053646a6503d3cd84d4798bf5b7</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>805</td>\n",
" <td>parking organisation</td>\n",
" <td>2021-07-16 01:54:15.822407+02:00</td>\n",
" <td>2021-07-16 01:54:15.822407+02:00</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>02bf9871964345f505ad305080daec36</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>809</td>\n",
" <td>rose rouge orange</td>\n",
" <td>2021-07-16 01:54:15.825345+02:00</td>\n",
" <td>2021-07-16 01:54:15.825345+02:00</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>31fb5b57bc1a2bcd5c155fb0d9e7c0dd</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>2183</td>\n",
" <td>2eme catégorie j.b. centrale</td>\n",
" <td>2021-07-16 04:37:25.446835+02:00</td>\n",
" <td>2021-07-16 04:37:25.446835+02:00</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>c9eb6651caaed42b809b3f4407a847c9</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>621</td>\n",
" <td>acces brasserie</td>\n",
" <td>2021-07-16 00:02:17.249701+02:00</td>\n",
" <td>2021-07-16 00:02:17.249701+02:00</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>349e6a59585d78d80d46acbc6a520c50</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" id name created_at \\\n",
"0 653 acces village implid 2021-07-16 00:04:37.181331+02:00 \n",
"1 805 parking organisation 2021-07-16 01:54:15.822407+02:00 \n",
"2 809 rose rouge orange 2021-07-16 01:54:15.825345+02:00 \n",
"3 2183 2eme catégorie j.b. centrale 2021-07-16 04:37:25.446835+02:00 \n",
"4 621 acces brasserie 2021-07-16 00:02:17.249701+02:00 \n",
"\n",
" updated_at extra_field quota \\\n",
"0 2021-07-16 00:04:37.181331+02:00 NaN NaN \n",
"1 2021-07-16 01:54:15.822407+02:00 NaN NaN \n",
"2 2021-07-16 01:54:15.825345+02:00 NaN NaN \n",
"3 2021-07-16 04:37:25.446835+02:00 NaN NaN \n",
"4 2021-07-16 00:02:17.249701+02:00 NaN NaN \n",
"\n",
" identifier \n",
"0 c447d053646a6503d3cd84d4798bf5b7 \n",
"1 02bf9871964345f505ad305080daec36 \n",
"2 31fb5b57bc1a2bcd5c155fb0d9e7c0dd \n",
"3 c9eb6651caaed42b809b3f4407a847c9 \n",
"4 349e6a59585d78d80d46acbc6a520c50 "
]
},
"execution_count": 20,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"categories = display_databases(\"8categories.csv\")\n",
"categories.head()"
]
},
{
"cell_type": "code",
"execution_count": 21,
"id": "589076df-1958-42de-9941-1aff9fa8536f",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"File path : bdc2324-data/8/8type_of_categories.csv\n",
"Shape : (6, 6)\n"
]
},
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>id</th>\n",
" <th>type_of_id</th>\n",
" <th>category_id</th>\n",
" <th>created_at</th>\n",
" <th>updated_at</th>\n",
" <th>identifier</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>2</td>\n",
" <td>2021-08-20 15:22:05.558209+02:00</td>\n",
" <td>2021-08-20 15:22:05.558209+02:00</td>\n",
" <td>af8fa6d57f6b19a7600a69e7771c7c3a</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>2</td>\n",
" <td>2</td>\n",
" <td>1</td>\n",
" <td>2021-09-02 17:29:32.582002+02:00</td>\n",
" <td>2021-09-02 17:29:32.582002+02:00</td>\n",
" <td>63718e7ad306912427758ddf988ad34f</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>3</td>\n",
" <td>3</td>\n",
" <td>3</td>\n",
" <td>2021-09-02 17:32:38.299733+02:00</td>\n",
" <td>2021-09-02 17:32:38.299733+02:00</td>\n",
" <td>5e147d4d90888df14c4584f5c6887c96</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>4</td>\n",
" <td>4</td>\n",
" <td>4</td>\n",
" <td>2021-09-02 17:35:04.748993+02:00</td>\n",
" <td>2021-09-02 17:35:04.748993+02:00</td>\n",
" <td>a9dfdc3f40b41e3018933c6167fc38a5</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>5</td>\n",
" <td>5</td>\n",
" <td>17</td>\n",
" <td>2021-09-02 17:35:37.396740+02:00</td>\n",
" <td>2021-09-02 17:35:37.396740+02:00</td>\n",
" <td>c05b0061d2a875adbc35d3dfa6a50a12</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" id type_of_id category_id created_at \\\n",
"0 1 1 2 2021-08-20 15:22:05.558209+02:00 \n",
"1 2 2 1 2021-09-02 17:29:32.582002+02:00 \n",
"2 3 3 3 2021-09-02 17:32:38.299733+02:00 \n",
"3 4 4 4 2021-09-02 17:35:04.748993+02:00 \n",
"4 5 5 17 2021-09-02 17:35:37.396740+02:00 \n",
"\n",
" updated_at identifier \n",
"0 2021-08-20 15:22:05.558209+02:00 af8fa6d57f6b19a7600a69e7771c7c3a \n",
"1 2021-09-02 17:29:32.582002+02:00 63718e7ad306912427758ddf988ad34f \n",
"2 2021-09-02 17:32:38.299733+02:00 5e147d4d90888df14c4584f5c6887c96 \n",
"3 2021-09-02 17:35:04.748993+02:00 a9dfdc3f40b41e3018933c6167fc38a5 \n",
"4 2021-09-02 17:35:37.396740+02:00 c05b0061d2a875adbc35d3dfa6a50a12 "
]
},
"execution_count": 21,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"type_categories = display_databases(\"8type_of_categories.csv\")\n",
"type_categories.head()"
]
},
{
"cell_type": "markdown",
"id": "3427b681-4c05-4e4e-9c2b-867ee789f98c",
"metadata": {},
"source": [
"#### Analyze type of representations"
]
},
{
"cell_type": "markdown",
"id": "9381e36b-090a-44c5-a29d-3ac4c9a4431e",
"metadata": {},
"source": [
"These are representation_category_capacities.csv, representations.csv and representation_types.csv.\n",
"\n",
"However, there is no representation_types.csv file for company 8."
]
},
{
"cell_type": "code",
"execution_count": 22,
"id": "6f06d72a-5725-4eee-8e4c-e9ef5820f346",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"File path : bdc2324-data/8/8representation_category_capacities.csv\n",
"Shape : (7378, 7)\n"
]
},
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>id</th>\n",
" <th>created_at</th>\n",
" <th>updated_at</th>\n",
" <th>representation_id</th>\n",
" <th>category_id</th>\n",
" <th>expected_filling</th>\n",
" <th>max_filling</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>561</td>\n",
" <td>2021-06-29 21:33:14.096827+02:00</td>\n",
" <td>2021-06-29 21:33:14.096827+02:00</td>\n",
" <td>17</td>\n",
" <td>37</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>571</td>\n",
" <td>2021-06-29 21:33:14.110047+02:00</td>\n",
" <td>2021-06-29 21:33:14.110047+02:00</td>\n",
" <td>14</td>\n",
" <td>39</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>9665</td>\n",
" <td>2021-07-16 00:02:17.736387+02:00</td>\n",
" <td>2021-07-16 00:02:17.736387+02:00</td>\n",
" <td>1887</td>\n",
" <td>8</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>383906</td>\n",
" <td>2023-03-04 02:55:01.585418+01:00</td>\n",
" <td>2023-03-04 02:55:01.585418+01:00</td>\n",
" <td>52729</td>\n",
" <td>476</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>393</td>\n",
" <td>2021-06-29 21:33:13.876766+02:00</td>\n",
" <td>2021-06-29 21:33:13.876766+02:00</td>\n",
" <td>9</td>\n",
" <td>23</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" id created_at updated_at \\\n",
"0 561 2021-06-29 21:33:14.096827+02:00 2021-06-29 21:33:14.096827+02:00 \n",
"1 571 2021-06-29 21:33:14.110047+02:00 2021-06-29 21:33:14.110047+02:00 \n",
"2 9665 2021-07-16 00:02:17.736387+02:00 2021-07-16 00:02:17.736387+02:00 \n",
"3 383906 2023-03-04 02:55:01.585418+01:00 2023-03-04 02:55:01.585418+01:00 \n",
"4 393 2021-06-29 21:33:13.876766+02:00 2021-06-29 21:33:13.876766+02:00 \n",
"\n",
" representation_id category_id expected_filling max_filling \n",
"0 17 37 NaN NaN \n",
"1 14 39 NaN NaN \n",
"2 1887 8 NaN NaN \n",
"3 52729 476 NaN NaN \n",
"4 9 23 NaN NaN "
]
},
"execution_count": 22,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"representation_category_capacities = display_databases(\"8representation_category_capacities.csv\")\n",
"representation_category_capacities.head()"
]
},
{
"cell_type": "code",
"execution_count": 23,
"id": "bd405913-033d-4f15-a5b9-103d577baaff",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"File path : bdc2324-data/8/8representations.csv\n",
"Shape : (1015, 16)\n"
]
},
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>id</th>\n",
" <th>serial</th>\n",
" <th>event_id</th>\n",
" <th>created_at</th>\n",
" <th>updated_at</th>\n",
" <th>start_date_time</th>\n",
" <th>open</th>\n",
" <th>satisfaction</th>\n",
" <th>end_date_time</th>\n",
" <th>name</th>\n",
" <th>is_display</th>\n",
" <th>representation_type_id</th>\n",
" <th>expected_filling</th>\n",
" <th>max_filling</th>\n",
" <th>extra_field</th>\n",
" <th>identifier</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>5903</td>\n",
" <td>NaN</td>\n",
" <td>5836</td>\n",
" <td>2021-07-16 05:16:57.419565+02:00</td>\n",
" <td>2021-07-16 05:16:57.419565+02:00</td>\n",
" <td>2019-08-24 18:00:00+02:00</td>\n",
" <td>True</td>\n",
" <td>NaN</td>\n",
" <td>1901-01-01 00:09:21+00:09</td>\n",
" <td>NaN</td>\n",
" <td>True</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>8009c34cae4e79e3781f16f3ceeab244</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>67133</td>\n",
" <td>NaN</td>\n",
" <td>65652</td>\n",
" <td>2023-09-27 02:21:36.573001+02:00</td>\n",
" <td>2023-09-27 02:21:36.573001+02:00</td>\n",
" <td>2023-10-04 10:30:00+02:00</td>\n",
" <td>True</td>\n",
" <td>NaN</td>\n",
" <td>1901-01-01 00:09:21+00:09</td>\n",
" <td>NaN</td>\n",
" <td>True</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>4e9d3fc8d1f7bf563dc586548fe6390e</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>1874</td>\n",
" <td>NaN</td>\n",
" <td>1826</td>\n",
" <td>2021-07-16 00:02:17.390274+02:00</td>\n",
" <td>2021-07-16 00:02:17.390274+02:00</td>\n",
" <td>2019-09-14 18:00:00+02:00</td>\n",
" <td>True</td>\n",
" <td>NaN</td>\n",
" <td>1901-01-01 00:09:21+00:09</td>\n",
" <td>NaN</td>\n",
" <td>True</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>19f666370c1fc781dff638c20ae04c8a</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>5904</td>\n",
" <td>NaN</td>\n",
" <td>5837</td>\n",
" <td>2021-07-16 05:16:57.420302+02:00</td>\n",
" <td>2021-07-16 05:16:57.420302+02:00</td>\n",
" <td>2019-09-01 17:05:00+02:00</td>\n",
" <td>True</td>\n",
" <td>NaN</td>\n",
" <td>1901-01-01 00:09:21+00:09</td>\n",
" <td>NaN</td>\n",
" <td>True</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>4221acd3f49179f5d0b292c15d1ab8e4</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>4165</td>\n",
" <td>NaN</td>\n",
" <td>4106</td>\n",
" <td>2021-07-16 03:53:05.929713+02:00</td>\n",
" <td>2021-07-16 03:53:05.929713+02:00</td>\n",
" <td>2018-10-14 14:00:00+02:00</td>\n",
" <td>True</td>\n",
" <td>NaN</td>\n",
" <td>1901-01-01 00:09:21+00:09</td>\n",
" <td>NaN</td>\n",
" <td>True</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>733104286519c0614b2d45470eb180a1</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" id serial event_id created_at \\\n",
"0 5903 NaN 5836 2021-07-16 05:16:57.419565+02:00 \n",
"1 67133 NaN 65652 2023-09-27 02:21:36.573001+02:00 \n",
"2 1874 NaN 1826 2021-07-16 00:02:17.390274+02:00 \n",
"3 5904 NaN 5837 2021-07-16 05:16:57.420302+02:00 \n",
"4 4165 NaN 4106 2021-07-16 03:53:05.929713+02:00 \n",
"\n",
" updated_at start_date_time open \\\n",
"0 2021-07-16 05:16:57.419565+02:00 2019-08-24 18:00:00+02:00 True \n",
"1 2023-09-27 02:21:36.573001+02:00 2023-10-04 10:30:00+02:00 True \n",
"2 2021-07-16 00:02:17.390274+02:00 2019-09-14 18:00:00+02:00 True \n",
"3 2021-07-16 05:16:57.420302+02:00 2019-09-01 17:05:00+02:00 True \n",
"4 2021-07-16 03:53:05.929713+02:00 2018-10-14 14:00:00+02:00 True \n",
"\n",
" satisfaction end_date_time name is_display \\\n",
"0 NaN 1901-01-01 00:09:21+00:09 NaN True \n",
"1 NaN 1901-01-01 00:09:21+00:09 NaN True \n",
"2 NaN 1901-01-01 00:09:21+00:09 NaN True \n",
"3 NaN 1901-01-01 00:09:21+00:09 NaN True \n",
"4 NaN 1901-01-01 00:09:21+00:09 NaN True \n",
"\n",
" representation_type_id expected_filling max_filling extra_field \\\n",
"0 NaN NaN NaN NaN \n",
"1 NaN NaN NaN NaN \n",
"2 NaN NaN NaN NaN \n",
"3 NaN NaN NaN NaN \n",
"4 NaN NaN NaN NaN \n",
"\n",
" identifier \n",
"0 8009c34cae4e79e3781f16f3ceeab244 \n",
"1 4e9d3fc8d1f7bf563dc586548fe6390e \n",
"2 19f666370c1fc781dff638c20ae04c8a \n",
"3 4221acd3f49179f5d0b292c15d1ab8e4 \n",
"4 733104286519c0614b2d45470eb180a1 "
]
},
"execution_count": 23,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"representations = display_databases(\"8representations.csv\")\n",
"representations.head()"
]
},
{
"cell_type": "code",
"execution_count": 24,
"id": "0f2c7ea3-6964-48fd-9411-17547b2c3a3f",
"metadata": {},
"outputs": [],
"source": [
"#representation_type = display_databases(\"8representation_types.csv\")"
]
},
{
"cell_type": "markdown",
"id": "a9b02406-2a69-4431-8d49-3c6bd6a5e1c7",
"metadata": {},
"source": [
"#### Analyze type of events"
]
},
{
"cell_type": "markdown",
"id": "1d554266-282c-4f64-9a0f-ddcf591ec912",
"metadata": {},
"source": [
"Meaning events.csv, event_types.csv, seasons.csv and facilities.csv"
]
},
{
"cell_type": "code",
"execution_count": 25,
"id": "cba22ee2-338d-4ce1-a1e8-829a11a94bcf",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"File path : bdc2324-data/8/8events.csv\n",
"Shape : (922, 12)\n"
]
},
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>id</th>\n",
" <th>created_at</th>\n",
" <th>updated_at</th>\n",
" <th>season_id</th>\n",
" <th>facility_id</th>\n",
" <th>name</th>\n",
" <th>event_type_id</th>\n",
" <th>manual_added</th>\n",
" <th>is_display</th>\n",
" <th>event_type_key_id</th>\n",
" <th>facility_key_id</th>\n",
" <th>identifier</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>41542</td>\n",
" <td>2022-10-29 02:54:32.756920+02:00</td>\n",
" <td>2022-10-29 02:57:35.511792+02:00</td>\n",
" <td>52</td>\n",
" <td>1</td>\n",
" <td>match lou feminin - lons</td>\n",
" <td>5588</td>\n",
" <td>False</td>\n",
" <td>True</td>\n",
" <td>5588</td>\n",
" <td>1</td>\n",
" <td>40cc5a346b1af4ee7108ac28b144fb77</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>21068</td>\n",
" <td>2021-12-17 03:43:53.166446+01:00</td>\n",
" <td>2021-12-17 03:46:40.346096+01:00</td>\n",
" <td>51</td>\n",
" <td>1</td>\n",
" <td>repas brasserie lou-racing</td>\n",
" <td>2310</td>\n",
" <td>False</td>\n",
" <td>True</td>\n",
" <td>2310</td>\n",
" <td>1</td>\n",
" <td>500b670b79aa592ecb06f4957800a752</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>59812</td>\n",
" <td>2023-05-26 01:45:54.321665+02:00</td>\n",
" <td>2023-05-26 01:46:01.571397+02:00</td>\n",
" <td>1501</td>\n",
" <td>2</td>\n",
" <td>parking match 2</td>\n",
" <td>10185</td>\n",
" <td>False</td>\n",
" <td>True</td>\n",
" <td>10185</td>\n",
" <td>2</td>\n",
" <td>d5f62ed879867b8b51ed7b85f1fc3ab0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>3424</td>\n",
" <td>2021-07-16 03:13:06.988358+02:00</td>\n",
" <td>2021-07-16 05:33:31.321933+02:00</td>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>rugby + hockey sur glace</td>\n",
" <td>5</td>\n",
" <td>False</td>\n",
" <td>True</td>\n",
" <td>5</td>\n",
" <td>1</td>\n",
" <td>822b47176c355a647aa2dbdf8dfbc594</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>21379</td>\n",
" <td>2021-12-23 02:37:22.948114+01:00</td>\n",
" <td>2021-12-23 02:38:20.726329+01:00</td>\n",
" <td>51</td>\n",
" <td>1</td>\n",
" <td>bloc des etudiants lou-racing</td>\n",
" <td>2562</td>\n",
" <td>False</td>\n",
" <td>True</td>\n",
" <td>2562</td>\n",
" <td>1</td>\n",
" <td>17b91f19c71ff6287ffc1f44af952576</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" id created_at updated_at \\\n",
"0 41542 2022-10-29 02:54:32.756920+02:00 2022-10-29 02:57:35.511792+02:00 \n",
"1 21068 2021-12-17 03:43:53.166446+01:00 2021-12-17 03:46:40.346096+01:00 \n",
"2 59812 2023-05-26 01:45:54.321665+02:00 2023-05-26 01:46:01.571397+02:00 \n",
"3 3424 2021-07-16 03:13:06.988358+02:00 2021-07-16 05:33:31.321933+02:00 \n",
"4 21379 2021-12-23 02:37:22.948114+01:00 2021-12-23 02:38:20.726329+01:00 \n",
"\n",
" season_id facility_id name event_type_id \\\n",
"0 52 1 match lou feminin - lons 5588 \n",
"1 51 1 repas brasserie lou-racing 2310 \n",
"2 1501 2 parking match 2 10185 \n",
"3 1 1 rugby + hockey sur glace 5 \n",
"4 51 1 bloc des etudiants lou-racing 2562 \n",
"\n",
" manual_added is_display event_type_key_id facility_key_id \\\n",
"0 False True 5588 1 \n",
"1 False True 2310 1 \n",
"2 False True 10185 2 \n",
"3 False True 5 1 \n",
"4 False True 2562 1 \n",
"\n",
" identifier \n",
"0 40cc5a346b1af4ee7108ac28b144fb77 \n",
"1 500b670b79aa592ecb06f4957800a752 \n",
"2 d5f62ed879867b8b51ed7b85f1fc3ab0 \n",
"3 822b47176c355a647aa2dbdf8dfbc594 \n",
"4 17b91f19c71ff6287ffc1f44af952576 "
]
},
"execution_count": 25,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"events = display_databases(\"8events.csv\")\n",
"events.head()"
]
},
{
"cell_type": "code",
"execution_count": 26,
"id": "3db00b9d-2187-4cb6-980d-8ac6ab9eb460",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"File path : bdc2324-data/8/8event_types.csv\n",
"Shape : (73, 6)\n"
]
},
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>id</th>\n",
" <th>name</th>\n",
" <th>created_at</th>\n",
" <th>updated_at</th>\n",
" <th>fidelity_delay</th>\n",
" <th>identifier</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>1</td>\n",
" <td>standard</td>\n",
" <td>2021-06-29 13:52:10.434850+02:00</td>\n",
" <td>2021-06-29 13:52:10.434850+02:00</td>\n",
" <td>36</td>\n",
" <td>c00f0c4675b91fb8b918e4079a0b1bac</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>11</td>\n",
" <td>ptit lou</td>\n",
" <td>2021-06-29 21:33:13.000743+02:00</td>\n",
" <td>2021-06-29 21:33:13.000743+02:00</td>\n",
" <td>36</td>\n",
" <td>dedd3579bc13b3ed7a90277247d9944b</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>274</td>\n",
" <td>parking 19-20</td>\n",
" <td>2021-07-16 00:02:17.225410+02:00</td>\n",
" <td>2021-07-16 00:02:17.225410+02:00</td>\n",
" <td>36</td>\n",
" <td>0d348caeec0b66f9d4987dfbe30e1e8b</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>129</td>\n",
" <td>events 2018-2019</td>\n",
" <td>2021-06-30 01:35:18.110429+02:00</td>\n",
" <td>2021-06-30 01:35:18.110429+02:00</td>\n",
" <td>36</td>\n",
" <td>65eb39ddf8f79d28d93c2f2c53118f50</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>10</td>\n",
" <td>accreditations 2017-2018</td>\n",
" <td>2021-06-29 21:33:12.999510+02:00</td>\n",
" <td>2021-06-29 21:33:12.999510+02:00</td>\n",
" <td>36</td>\n",
" <td>732cfdcf2065fa0005faf42793ddd76c</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" id name created_at \\\n",
"0 1 standard 2021-06-29 13:52:10.434850+02:00 \n",
"1 11 ptit lou 2021-06-29 21:33:13.000743+02:00 \n",
"2 274 parking 19-20 2021-07-16 00:02:17.225410+02:00 \n",
"3 129 events 2018-2019 2021-06-30 01:35:18.110429+02:00 \n",
"4 10 accreditations 2017-2018 2021-06-29 21:33:12.999510+02:00 \n",
"\n",
" updated_at fidelity_delay \\\n",
"0 2021-06-29 13:52:10.434850+02:00 36 \n",
"1 2021-06-29 21:33:13.000743+02:00 36 \n",
"2 2021-07-16 00:02:17.225410+02:00 36 \n",
"3 2021-06-30 01:35:18.110429+02:00 36 \n",
"4 2021-06-29 21:33:12.999510+02:00 36 \n",
"\n",
" identifier \n",
"0 c00f0c4675b91fb8b918e4079a0b1bac \n",
"1 dedd3579bc13b3ed7a90277247d9944b \n",
"2 0d348caeec0b66f9d4987dfbe30e1e8b \n",
"3 65eb39ddf8f79d28d93c2f2c53118f50 \n",
"4 732cfdcf2065fa0005faf42793ddd76c "
]
},
"execution_count": 26,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"event_types = display_databases(\"8event_types.csv\")\n",
"event_types.head()"
]
},
{
"cell_type": "code",
"execution_count": 27,
"id": "cba0ee58-6280-45fe-99b3-0be09db5922b",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"File path : bdc2324-data/8/8seasons.csv\n",
"Shape : (16, 6)\n"
]
},
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>id</th>\n",
" <th>name</th>\n",
" <th>created_at</th>\n",
" <th>updated_at</th>\n",
" <th>start_date_time</th>\n",
" <th>identifier</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>1501</td>\n",
" <td>saison 2023-2024</td>\n",
" <td>2022-06-25 03:07:31.209270+02:00</td>\n",
" <td>2022-06-25 03:07:31.209270+02:00</td>\n",
" <td>NaN</td>\n",
" <td>71f5c069ce45c5e933dcc37c22507fbf</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>1194</td>\n",
" <td>saison 2049-2050</td>\n",
" <td>2022-02-17 03:24:23.942691+01:00</td>\n",
" <td>2022-02-17 03:24:23.942691+01:00</td>\n",
" <td>NaN</td>\n",
" <td>44e20620bbc5926db2e295d38b606afd</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>2</td>\n",
" <td>saison 2016-2017</td>\n",
" <td>2021-06-29 21:33:00.702563+02:00</td>\n",
" <td>2021-06-29 21:33:00.702563+02:00</td>\n",
" <td>NaN</td>\n",
" <td>f9cf989d4f49300220df67ef93aa2294</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>47</td>\n",
" <td>saison 2018-2019</td>\n",
" <td>2021-06-30 01:35:15.156097+02:00</td>\n",
" <td>2021-06-30 01:35:15.156097+02:00</td>\n",
" <td>NaN</td>\n",
" <td>eec50c35fbf8593b364ced287335d90c</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>100</td>\n",
" <td>saison 2010-2011</td>\n",
" <td>2021-07-16 00:23:27.607648+02:00</td>\n",
" <td>2021-07-16 00:23:27.607648+02:00</td>\n",
" <td>NaN</td>\n",
" <td>7ccc51049a85e0df9b80662e45b6ddb8</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" id name created_at \\\n",
"0 1501 saison 2023-2024 2022-06-25 03:07:31.209270+02:00 \n",
"1 1194 saison 2049-2050 2022-02-17 03:24:23.942691+01:00 \n",
"2 2 saison 2016-2017 2021-06-29 21:33:00.702563+02:00 \n",
"3 47 saison 2018-2019 2021-06-30 01:35:15.156097+02:00 \n",
"4 100 saison 2010-2011 2021-07-16 00:23:27.607648+02:00 \n",
"\n",
" updated_at start_date_time \\\n",
"0 2022-06-25 03:07:31.209270+02:00 NaN \n",
"1 2022-02-17 03:24:23.942691+01:00 NaN \n",
"2 2021-06-29 21:33:00.702563+02:00 NaN \n",
"3 2021-06-30 01:35:15.156097+02:00 NaN \n",
"4 2021-07-16 00:23:27.607648+02:00 NaN \n",
"\n",
" identifier \n",
"0 71f5c069ce45c5e933dcc37c22507fbf \n",
"1 44e20620bbc5926db2e295d38b606afd \n",
"2 f9cf989d4f49300220df67ef93aa2294 \n",
"3 eec50c35fbf8593b364ced287335d90c \n",
"4 7ccc51049a85e0df9b80662e45b6ddb8 "
]
},
"execution_count": 27,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"seasons = display_databases(\"8seasons.csv\")\n",
"seasons.head()"
]
},
{
"cell_type": "code",
"execution_count": 28,
"id": "6fa82fd7-d6d3-4857-af24-ea573b1129d0",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"File path : bdc2324-data/8/8facilities.csv\n",
"Shape : (5, 7)\n"
]
},
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>id</th>\n",
" <th>name</th>\n",
" <th>created_at</th>\n",
" <th>updated_at</th>\n",
" <th>street_id</th>\n",
" <th>fixed_capacity</th>\n",
" <th>identifier</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>74</td>\n",
" <td>plan pour campagne d'abo 2011/2012</td>\n",
" <td>2021-07-16 00:23:30.337698+02:00</td>\n",
" <td>2021-07-16 00:23:30.337698+02:00</td>\n",
" <td>1</td>\n",
" <td>NaN</td>\n",
" <td>2e1d25d5f7e46e23c734fe0e4951390e</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>3</td>\n",
" <td>accreditation</td>\n",
" <td>2021-06-29 21:33:13.018552+02:00</td>\n",
" <td>2021-06-29 21:33:13.018552+02:00</td>\n",
" <td>1</td>\n",
" <td>NaN</td>\n",
" <td>da37a04e592cbd344142730ce05a6887</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>4</td>\n",
" <td>organisation match exterieur</td>\n",
" <td>2021-06-29 21:33:13.019878+02:00</td>\n",
" <td>2021-06-29 21:33:13.019878+02:00</td>\n",
" <td>1</td>\n",
" <td>NaN</td>\n",
" <td>8f9ee8c2e954585f7c68096d7f1cf4f1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>2</td>\n",
" <td>parking matmut stadium</td>\n",
" <td>2021-06-29 21:33:13.017165+02:00</td>\n",
" <td>2021-06-29 21:33:13.017165+02:00</td>\n",
" <td>1</td>\n",
" <td>NaN</td>\n",
" <td>aeab282982ea738674dbf5c3763a0be0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>1</td>\n",
" <td>matmut stadium</td>\n",
" <td>2021-06-29 21:33:13.004560+02:00</td>\n",
" <td>2021-06-29 21:33:13.004560+02:00</td>\n",
" <td>1</td>\n",
" <td>NaN</td>\n",
" <td>89feffd283ebdabdc3b81fb62ea4f6f0</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" id name created_at \\\n",
"0 74 plan pour campagne d'abo 2011/2012 2021-07-16 00:23:30.337698+02:00 \n",
"1 3 accreditation 2021-06-29 21:33:13.018552+02:00 \n",
"2 4 organisation match exterieur 2021-06-29 21:33:13.019878+02:00 \n",
"3 2 parking matmut stadium 2021-06-29 21:33:13.017165+02:00 \n",
"4 1 matmut stadium 2021-06-29 21:33:13.004560+02:00 \n",
"\n",
" updated_at street_id fixed_capacity \\\n",
"0 2021-07-16 00:23:30.337698+02:00 1 NaN \n",
"1 2021-06-29 21:33:13.018552+02:00 1 NaN \n",
"2 2021-06-29 21:33:13.019878+02:00 1 NaN \n",
"3 2021-06-29 21:33:13.017165+02:00 1 NaN \n",
"4 2021-06-29 21:33:13.004560+02:00 1 NaN \n",
"\n",
" identifier \n",
"0 2e1d25d5f7e46e23c734fe0e4951390e \n",
"1 da37a04e592cbd344142730ce05a6887 \n",
"2 8f9ee8c2e954585f7c68096d7f1cf4f1 \n",
"3 aeab282982ea738674dbf5c3763a0be0 \n",
"4 89feffd283ebdabdc3b81fb62ea4f6f0 "
]
},
"execution_count": 28,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"facilities = display_databases(\"8facilities.csv\")\n",
"facilities.head()"
]
},
{
"cell_type": "markdown",
"id": "c7467d41-0ded-465d-bb08-15be914a166b",
"metadata": {},
"source": [
"#### Analyze annexe databases"
]
},
{
"cell_type": "markdown",
"id": "17e9e334-0ae4-48d8-bed5-b50b4af49d5b",
"metadata": {},
"source": [
"Meaning contributions.csv, contribution_sites.csv, currencies.csv, countries.csv and type_ofs.csv"
]
},
{
"cell_type": "markdown",
"id": "d3ec1040-48b2-40bb-8947-920ddb4589f3",
"metadata": {},
"source": [
"## II. Identify Common Datasets"
]
},
{
"cell_type": "markdown",
"id": "ec528a8a-df38-48e2-a1be-4a1459a80a1e",
"metadata": {},
"source": [
"From the analysis of company 8, we notice that some databases do not exist. Therefore, in order to build a uniform database for all companies, we should first identify the databases that are common to all companies."
]
},
{
"cell_type": "code",
"execution_count": 29,
"id": "c240b811-48a6-4501-9e70-bc51d69e3ac4",
"metadata": {},
"outputs": [],
"source": [
"## Build a dictionary mapping each company id to the names of its datasets\n",
"## (file names without the leading company id, e.g. '8events.csv' -> 'events.csv')\n",
"\n",
"companies = fs.ls(BUCKET)\n",
"companies_database = {}\n",
"\n",
"for company in companies:\n",
"    company_id = company.split('/')[-1]\n",
"    file_names = [file.split('/')[-1] for file in fs.ls(company)]\n",
"    # Strip the company id only where it prefixes the file name: str.replace would\n",
"    # also delete any later occurrence of the id inside the dataset name\n",
"    companies_database[company_id] = [\n",
"        name[len(company_id):] if name.startswith(company_id) else name\n",
"        for name in file_names\n",
"    ]\n"
]
},
{
"cell_type": "code",
"execution_count": 30,
"id": "54057367-9df9-42f4-aa07-bf524bb76462",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Number of databases : 30\n"
]
}
],
"source": [
"# Take the dataset list of the company that has the most datasets as the reference list\n",
"\n",
"all_database = max(companies_database.values(), key=len)\n",
"print(\"Number of databases : \",len(all_database))"
]
},
{
"cell_type": "code",
"execution_count": 31,
"id": "63914e20-9efc-4088-877b-edab5f225d00",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"30\n",
"23\n"
]
}
],
"source": [
"## Keep only the datasets that every company provides\n",
"\n",
"data_in_common = set(all_database)\n",
"\n",
"print(len(data_in_common))\n",
"\n",
"for company_datasets in companies_database.values():\n",
"    # Plain set intersection: same result as A - (A symmetric-difference B), stated directly\n",
"    data_in_common = data_in_common.intersection(company_datasets)\n",
"\n",
"print(len(data_in_common))"
]
},
{
"cell_type": "markdown",
"id": "676d8536-7d8c-4075-a357-b8d06e501ca8",
"metadata": {},
"source": [
"## Create Universal database"
]
},
{
"cell_type": "markdown",
"id": "7e460fbe-5067-4998-a1a8-9e3d07401750",
"metadata": {},
"source": [
"We will first create a procedure to clean the datasets of a company and then merge them. Hence, we will be able to replicate this procedure for all companies and create a universal database.\n",
"\n",
"Let's first build this procedure for company 1 and the datasets belonging to the products theme."
]
},
{
"cell_type": "code",
"execution_count": 32,
"id": "590a132a-4f57-4ea3-a282-2ef913e4b753",
"metadata": {},
"outputs": [],
"source": [
"directory_path = '1'"
]
},
{
"cell_type": "code",
"execution_count": 33,
"id": "0fbebfb7-a827-46b1-890b-86c9def7cdbb",
"metadata": {},
"outputs": [],
"source": [
"theme_products = [\"products.csv\" ,\"categories.csv\", \"type_of_categories.csv\"]"
]
},
{
"cell_type": "code",
"execution_count": 34,
"id": "b8aa5f8f-845e-4ee5-b80d-38b7061a94a2",
"metadata": {},
"outputs": [],
"source": [
"def remove_horodates(df):\n",
"    \"\"\"\n",
"    Return a copy of df without the timestamp (horodate) columns created_at and updated_at.\n",
"\n",
"    Parameters\n",
"    ----------\n",
"    df : pandas.DataFrame\n",
"        Dataset to clean; it is left unmodified.\n",
"\n",
"    Returns\n",
"    -------\n",
"    pandas.DataFrame\n",
"        New frame in which created_at / updated_at are removed when present.\n",
"    \"\"\"\n",
"    # errors='ignore' tolerates datasets lacking one of the columns and makes the call\n",
"    # idempotent: re-running a cell on an already cleaned frame no longer raises KeyError\n",
"    return df.drop(columns=[\"created_at\", \"updated_at\"], errors=\"ignore\")"
]
},
{
"cell_type": "code",
"execution_count": 35,
"id": "2c478213-09ae-44ef-8c7c-125bcb571642",
"metadata": {},
"outputs": [],
"source": [
"def order_columns_id(df):\n",
"    \"\"\"\n",
"    Return df with its identifier columns moved to the front, to make the dataset easier to read.\n",
"\n",
"    A column counts as an identifier column when one of its underscore-separated\n",
"    parts is exactly 'id' or 'identifier' (e.g. id, category_id, identifier).\n",
"    The relative order of the columns is preserved inside both groups.\n",
"\n",
"    Parameters\n",
"    ----------\n",
"    df : pandas.DataFrame\n",
"        Dataset whose columns must be reordered; it is left unmodified.\n",
"\n",
"    Returns\n",
"    -------\n",
"    pandas.DataFrame\n",
"        Same columns as df, identifier columns first.\n",
"    \"\"\"\n",
"    id_tokens = {'id', 'identifier'}\n",
"\n",
"    def is_id_column(col):\n",
"        # Compare whole name parts: a plain substring test on 'id' also captures\n",
"        # unrelated columns such as fidelity_delay\n",
"        return not id_tokens.isdisjoint(str(col).split('_'))\n",
"\n",
"    id_columns = [col for col in df.columns if is_id_column(col)]\n",
"    remaining_col = [col for col in df.columns if not is_id_column(col)]\n",
"    return df[id_columns + remaining_col]"
]
},
{
"cell_type": "code",
"execution_count": 36,
"id": "327e44b0-eb99-4022-b4ca-79548072f0f0",
"metadata": {},
"outputs": [],
"source": [
"def percent_na(df):\n",
"    \"\"\"\n",
"    Compute, for every column of df, the share of missing values expressed in percent.\n",
"\n",
"    The result is a pandas Series indexed by column name.\n",
"    \"\"\"\n",
"    missing_counts = df.isnull().sum()\n",
"    n_rows = len(df)\n",
"    return missing_counts * 100 / n_rows"
]
},
{
"cell_type": "code",
"execution_count": 37,
"id": "10926def-267f-4e86-b2c9-72e27ff9a9df",
"metadata": {},
"outputs": [],
"source": [
"def process_df(df):\n",
"    \"\"\"\n",
"    Apply the cleaning steps to one dataset and print a short report along the way.\n",
"\n",
"    Steps: remove_horodates, then order_columns_id. The column count, the column\n",
"    names and the result of percent_na are printed; the cleaned frame is returned.\n",
"    \"\"\"\n",
"    without_timestamps = remove_horodates(df)\n",
"    print(\"Number of columns : \", len(without_timestamps.columns))\n",
"    reordered = order_columns_id(without_timestamps)\n",
"    print(\"Columns : \", reordered.columns)\n",
"    print(\"Percent of NA for each column : \", percent_na(reordered))\n",
"    return reordered"
]
},
{
"cell_type": "markdown",
"id": "98ac02cb-5295-47ca-99c6-99e622c5f388",
"metadata": {},
"source": [
"#### Deep analysis of products.csv"
]
},
{
"cell_type": "code",
"execution_count": 38,
"id": "862a7658-0602-4d94-bb58-d23774c00d32",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"File path : bdc2324-data/1/1products.csv\n",
"Shape : (94803, 14)\n",
"Number of columns : 14\n"
]
},
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>id</th>\n",
" <th>amount</th>\n",
" <th>is_full_price</th>\n",
" <th>representation_id</th>\n",
" <th>pricing_formula_id</th>\n",
" <th>created_at</th>\n",
" <th>updated_at</th>\n",
" <th>category_id</th>\n",
" <th>apply_price</th>\n",
" <th>products_group_id</th>\n",
" <th>product_pack_id</th>\n",
" <th>extra_field</th>\n",
" <th>amount_consumption</th>\n",
" <th>identifier</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>10682</td>\n",
" <td>9.0</td>\n",
" <td>False</td>\n",
" <td>914</td>\n",
" <td>114</td>\n",
" <td>2020-09-03 14:09:43.119798+02:00</td>\n",
" <td>2020-09-03 14:09:43.119798+02:00</td>\n",
" <td>41</td>\n",
" <td>0.0</td>\n",
" <td>10655</td>\n",
" <td>1</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>35c88f2db8a63d7474e46eb8ca9260e7</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>478</td>\n",
" <td>9.5</td>\n",
" <td>False</td>\n",
" <td>273</td>\n",
" <td>131</td>\n",
" <td>2020-09-03 13:21:22.711773+02:00</td>\n",
" <td>2020-09-03 13:21:22.711773+02:00</td>\n",
" <td>1</td>\n",
" <td>0.0</td>\n",
" <td>471</td>\n",
" <td>1</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>8a179671ab198e570e6a104c4451379f</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>20873</td>\n",
" <td>11.5</td>\n",
" <td>False</td>\n",
" <td>275</td>\n",
" <td>137</td>\n",
" <td>2020-09-03 14:46:33.589030+02:00</td>\n",
" <td>2020-09-03 14:46:33.589030+02:00</td>\n",
" <td>1</td>\n",
" <td>0.0</td>\n",
" <td>20825</td>\n",
" <td>1</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>ee83779ce29e67ad251e40234b426d6a</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>157142</td>\n",
" <td>8.0</td>\n",
" <td>False</td>\n",
" <td>82519</td>\n",
" <td>9</td>\n",
" <td>2022-01-28 19:29:23.525722+01:00</td>\n",
" <td>2022-01-28 19:29:23.525722+01:00</td>\n",
" <td>5</td>\n",
" <td>0.0</td>\n",
" <td>156773</td>\n",
" <td>1</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>d865383579314b791aa4bcf3fb418f17</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>1341</td>\n",
" <td>8.5</td>\n",
" <td>False</td>\n",
" <td>9</td>\n",
" <td>93</td>\n",
" <td>2020-09-03 13:29:30.773089+02:00</td>\n",
" <td>2020-09-03 13:29:30.773089+02:00</td>\n",
" <td>1</td>\n",
" <td>0.0</td>\n",
" <td>1175</td>\n",
" <td>1</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>f1c4689bc47dee6f60b56d74b593dd46</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" id amount is_full_price representation_id pricing_formula_id \\\n",
"0 10682 9.0 False 914 114 \n",
"1 478 9.5 False 273 131 \n",
"2 20873 11.5 False 275 137 \n",
"3 157142 8.0 False 82519 9 \n",
"4 1341 8.5 False 9 93 \n",
"\n",
" created_at updated_at \\\n",
"0 2020-09-03 14:09:43.119798+02:00 2020-09-03 14:09:43.119798+02:00 \n",
"1 2020-09-03 13:21:22.711773+02:00 2020-09-03 13:21:22.711773+02:00 \n",
"2 2020-09-03 14:46:33.589030+02:00 2020-09-03 14:46:33.589030+02:00 \n",
"3 2022-01-28 19:29:23.525722+01:00 2022-01-28 19:29:23.525722+01:00 \n",
"4 2020-09-03 13:29:30.773089+02:00 2020-09-03 13:29:30.773089+02:00 \n",
"\n",
" category_id apply_price products_group_id product_pack_id extra_field \\\n",
"0 41 0.0 10655 1 NaN \n",
"1 1 0.0 471 1 NaN \n",
"2 1 0.0 20825 1 NaN \n",
"3 5 0.0 156773 1 NaN \n",
"4 1 0.0 1175 1 NaN \n",
"\n",
" amount_consumption identifier \n",
"0 NaN 35c88f2db8a63d7474e46eb8ca9260e7 \n",
"1 NaN 8a179671ab198e570e6a104c4451379f \n",
"2 NaN ee83779ce29e67ad251e40234b426d6a \n",
"3 NaN d865383579314b791aa4bcf3fb418f17 \n",
"4 NaN f1c4689bc47dee6f60b56d74b593dd46 "
]
},
"execution_count": 38,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"products = display_databases(\"1products.csv\")\n",
"print(\"Number of columns : \", len(products.columns))\n",
"products.head()"
]
},
{
"cell_type": "code",
"execution_count": 39,
"id": "f0db8c51-2792-4d49-9b1a-d98ce0d9ea28",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Number of columns : 12\n",
"Columns : Index(['id', 'representation_id', 'pricing_formula_id', 'category_id',\n",
" 'products_group_id', 'product_pack_id', 'identifier', 'amount',\n",
" 'is_full_price', 'apply_price', 'extra_field', 'amount_consumption'],\n",
" dtype='object')\n"
]
},
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>id</th>\n",
" <th>representation_id</th>\n",
" <th>pricing_formula_id</th>\n",
" <th>category_id</th>\n",
" <th>products_group_id</th>\n",
" <th>product_pack_id</th>\n",
" <th>identifier</th>\n",
" <th>amount</th>\n",
" <th>is_full_price</th>\n",
" <th>apply_price</th>\n",
" <th>extra_field</th>\n",
" <th>amount_consumption</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>10682</td>\n",
" <td>914</td>\n",
" <td>114</td>\n",
" <td>41</td>\n",
" <td>10655</td>\n",
" <td>1</td>\n",
" <td>35c88f2db8a63d7474e46eb8ca9260e7</td>\n",
" <td>9.0</td>\n",
" <td>False</td>\n",
" <td>0.0</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>478</td>\n",
" <td>273</td>\n",
" <td>131</td>\n",
" <td>1</td>\n",
" <td>471</td>\n",
" <td>1</td>\n",
" <td>8a179671ab198e570e6a104c4451379f</td>\n",
" <td>9.5</td>\n",
" <td>False</td>\n",
" <td>0.0</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>20873</td>\n",
" <td>275</td>\n",
" <td>137</td>\n",
" <td>1</td>\n",
" <td>20825</td>\n",
" <td>1</td>\n",
" <td>ee83779ce29e67ad251e40234b426d6a</td>\n",
" <td>11.5</td>\n",
" <td>False</td>\n",
" <td>0.0</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>157142</td>\n",
" <td>82519</td>\n",
" <td>9</td>\n",
" <td>5</td>\n",
" <td>156773</td>\n",
" <td>1</td>\n",
" <td>d865383579314b791aa4bcf3fb418f17</td>\n",
" <td>8.0</td>\n",
" <td>False</td>\n",
" <td>0.0</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>1341</td>\n",
" <td>9</td>\n",
" <td>93</td>\n",
" <td>1</td>\n",
" <td>1175</td>\n",
" <td>1</td>\n",
" <td>f1c4689bc47dee6f60b56d74b593dd46</td>\n",
" <td>8.5</td>\n",
" <td>False</td>\n",
" <td>0.0</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" id representation_id pricing_formula_id category_id \\\n",
"0 10682 914 114 41 \n",
"1 478 273 131 1 \n",
"2 20873 275 137 1 \n",
"3 157142 82519 9 5 \n",
"4 1341 9 93 1 \n",
"\n",
" products_group_id product_pack_id identifier \\\n",
"0 10655 1 35c88f2db8a63d7474e46eb8ca9260e7 \n",
"1 471 1 8a179671ab198e570e6a104c4451379f \n",
"2 20825 1 ee83779ce29e67ad251e40234b426d6a \n",
"3 156773 1 d865383579314b791aa4bcf3fb418f17 \n",
"4 1175 1 f1c4689bc47dee6f60b56d74b593dd46 \n",
"\n",
" amount is_full_price apply_price extra_field amount_consumption \n",
"0 9.0 False 0.0 NaN NaN \n",
"1 9.5 False 0.0 NaN NaN \n",
"2 11.5 False 0.0 NaN NaN \n",
"3 8.0 False 0.0 NaN NaN \n",
"4 8.5 False 0.0 NaN NaN "
]
},
"execution_count": 39,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"products = remove_horodates(products)\n",
"print(\"Number of columns : \", len(products.columns))\n",
"products = order_columns_id(products)\n",
"print(\"Columns : \", products.columns)\n",
"products.head()"
]
},
{
"cell_type": "code",
"execution_count": 40,
"id": "a383474f-7da9-422c-bb69-3f0cc0b7053f",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"id int64\n",
"representation_id int64\n",
"pricing_formula_id int64\n",
"category_id int64\n",
"products_group_id int64\n",
"product_pack_id int64\n",
"identifier object\n",
"amount float64\n",
"is_full_price bool\n",
"apply_price float64\n",
"extra_field float64\n",
"amount_consumption float64\n",
"dtype: object\n"
]
}
],
"source": [
"print(products.dtypes)"
]
},
{
"cell_type": "code",
"execution_count": 41,
"id": "460749ac-aa26-4216-8667-518546f72f72",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"id 0.0\n",
"representation_id 0.0\n",
"pricing_formula_id 0.0\n",
"category_id 0.0\n",
"products_group_id 0.0\n",
"product_pack_id 0.0\n",
"identifier 0.0\n",
"amount 0.0\n",
"is_full_price 0.0\n",
"apply_price 0.0\n",
"extra_field 100.0\n",
"amount_consumption 100.0\n",
"dtype: float64\n"
]
}
],
"source": [
"percent_missing = products.isna().sum() * 100 / len(products)\n",
"print(percent_missing)"
]
},
{
"cell_type": "markdown",
"id": "ebcb48ab-adad-42e5-b5d7-7275771cd200",
"metadata": {},
"source": [
"#### Deep analysis of categories.csv"
]
},
{
"cell_type": "code",
"execution_count": 42,
"id": "3efce2b6-2d2f-4da9-98ed-1aae17da624c",
"metadata": {},
"outputs": [],
"source": [
"name_dataset = '1categories.csv'"
]
},
{
"cell_type": "code",
"execution_count": 43,
"id": "38aa39fd-58af-4fb8-98f2-4269dbaf35de",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"File path : bdc2324-data/1/1categories.csv\n",
"Shape : (27, 7)\n",
"Number of columns : 7\n"
]
},
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>id</th>\n",
" <th>name</th>\n",
" <th>created_at</th>\n",
" <th>updated_at</th>\n",
" <th>extra_field</th>\n",
" <th>quota</th>\n",
" <th>identifier</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>30</td>\n",
" <td>en nb entrées gr</td>\n",
" <td>2020-09-03 13:21:20.019202+02:00</td>\n",
" <td>2020-09-03 13:21:20.019202+02:00</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>849ab2791a14f5fc2bb4d87ab2b78bf6</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>16</td>\n",
" <td>indiv activité enfant</td>\n",
" <td>2020-09-03 13:11:23.306968+02:00</td>\n",
" <td>2020-09-03 13:11:23.306968+02:00</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>425fd2f01984cc4ba030c1be98f42c33</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>39</td>\n",
" <td>indiv activité gr</td>\n",
" <td>2020-09-03 13:21:20.029901+02:00</td>\n",
" <td>2020-09-03 13:21:20.029901+02:00</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>9244dd3738788db0d22a5d0afe687b69</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>1108</td>\n",
" <td>groupe forfait adulte</td>\n",
" <td>2020-09-19 02:06:43.145697+02:00</td>\n",
" <td>2020-09-19 02:06:43.145697+02:00</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>3edda20c877a93b5ff883827238eb711</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>6</td>\n",
" <td>groupe forfait entrées tr</td>\n",
" <td>2020-09-03 13:11:23.264997+02:00</td>\n",
" <td>2020-09-03 13:11:23.264997+02:00</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>ff48df4b2dd5a14116bf4d280b31621e</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" id name created_at \\\n",
"0 30 en nb entrées gr 2020-09-03 13:21:20.019202+02:00 \n",
"1 16 indiv activité enfant 2020-09-03 13:11:23.306968+02:00 \n",
"2 39 indiv activité gr 2020-09-03 13:21:20.029901+02:00 \n",
"3 1108 groupe forfait adulte 2020-09-19 02:06:43.145697+02:00 \n",
"4 6 groupe forfait entrées tr 2020-09-03 13:11:23.264997+02:00 \n",
"\n",
" updated_at extra_field quota \\\n",
"0 2020-09-03 13:21:20.019202+02:00 NaN NaN \n",
"1 2020-09-03 13:11:23.306968+02:00 NaN NaN \n",
"2 2020-09-03 13:21:20.029901+02:00 NaN NaN \n",
"3 2020-09-19 02:06:43.145697+02:00 NaN NaN \n",
"4 2020-09-03 13:11:23.264997+02:00 NaN NaN \n",
"\n",
" identifier \n",
"0 849ab2791a14f5fc2bb4d87ab2b78bf6 \n",
"1 425fd2f01984cc4ba030c1be98f42c33 \n",
"2 9244dd3738788db0d22a5d0afe687b69 \n",
"3 3edda20c877a93b5ff883827238eb711 \n",
"4 ff48df4b2dd5a14116bf4d280b31621e "
]
},
"execution_count": 43,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df = display_databases(name_dataset)\n",
"print(\"Number of columns : \", len(df.columns))\n",
"df.head()"
]
},
{
"cell_type": "code",
"execution_count": 44,
"id": "99eb6d14-8b4b-4d55-8fc7-ddf2726096f4",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Number of columns : 5\n",
"Columns : Index(['id', 'identifier', 'name', 'extra_field', 'quota'], dtype='object')\n",
"Percent of NA for each column : id 0.000000\n",
"identifier 0.000000\n",
"name 3.703704\n",
"extra_field 100.000000\n",
"quota 100.000000\n",
"dtype: float64\n"
]
},
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>id</th>\n",
" <th>identifier</th>\n",
" <th>name</th>\n",
" <th>extra_field</th>\n",
" <th>quota</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>30</td>\n",
" <td>849ab2791a14f5fc2bb4d87ab2b78bf6</td>\n",
" <td>en nb entrées gr</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>16</td>\n",
" <td>425fd2f01984cc4ba030c1be98f42c33</td>\n",
" <td>indiv activité enfant</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>39</td>\n",
" <td>9244dd3738788db0d22a5d0afe687b69</td>\n",
" <td>indiv activité gr</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>1108</td>\n",
" <td>3edda20c877a93b5ff883827238eb711</td>\n",
" <td>groupe forfait adulte</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>6</td>\n",
" <td>ff48df4b2dd5a14116bf4d280b31621e</td>\n",
" <td>groupe forfait entrées tr</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" id identifier name \\\n",
"0 30 849ab2791a14f5fc2bb4d87ab2b78bf6 en nb entrées gr \n",
"1 16 425fd2f01984cc4ba030c1be98f42c33 indiv activité enfant \n",
"2 39 9244dd3738788db0d22a5d0afe687b69 indiv activité gr \n",
"3 1108 3edda20c877a93b5ff883827238eb711 groupe forfait adulte \n",
"4 6 ff48df4b2dd5a14116bf4d280b31621e groupe forfait entrées tr \n",
"\n",
" extra_field quota \n",
"0 NaN NaN \n",
"1 NaN NaN \n",
"2 NaN NaN \n",
"3 NaN NaN \n",
"4 NaN NaN "
]
},
"execution_count": 44,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df = process_df(df)\n",
"df.head()"
]
},
{
"cell_type": "code",
"execution_count": 45,
"id": "c5f39cc9-dff8-452c-9a3e-9f7df81a8a19",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"id int64\n",
"identifier object\n",
"name object\n",
"extra_field float64\n",
"quota float64\n",
"dtype: object"
]
},
"execution_count": 45,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df.dtypes"
]
},
{
"cell_type": "markdown",
"id": "c4cb0b37-2262-45c0-97be-b12c503016e3",
"metadata": {},
"source": [
"#### Deep analysis of type_of_categories.csv"
]
},
{
"cell_type": "markdown",
"id": "3b4a3af9-ed12-43ec-b17e-fd425b238265",
"metadata": {},
"source": [
"#### Deep analysis of representation_category_capacities.csv"
]
},
{
"cell_type": "markdown",
"id": "135966fb-aab1-48d7-bb4c-39a53ee643ca",
"metadata": {},
"source": [
"#### Deep analysis of representations.csv"
]
},
{
"cell_type": "markdown",
"id": "b480f39f-d5c7-4ded-8f64-ea8ac31f5db5",
"metadata": {},
"source": [
"#### Deep analysis of events.csv"
]
},
{
"cell_type": "code",
"execution_count": 46,
"id": "2d52d6da-cca5-4abd-be05-2f00fd3eca8e",
"metadata": {},
"outputs": [],
"source": [
"name_dataset = '1events.csv'"
]
},
{
"cell_type": "code",
"execution_count": 47,
"id": "6cab507d-8b11-404d-9286-5cc205228af9",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"File path : bdc2324-data/1/1events.csv\n",
"Shape : (1232, 12)\n",
"Number of columns : 12\n"
]
},
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>id</th>\n",
" <th>created_at</th>\n",
" <th>updated_at</th>\n",
" <th>season_id</th>\n",
" <th>facility_id</th>\n",
" <th>name</th>\n",
" <th>event_type_id</th>\n",
" <th>manual_added</th>\n",
" <th>is_display</th>\n",
" <th>event_type_key_id</th>\n",
" <th>facility_key_id</th>\n",
" <th>identifier</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>192</td>\n",
" <td>2020-09-03 13:36:42.216991+02:00</td>\n",
" <td>2021-11-02 15:06:40.663219+01:00</td>\n",
" <td>16</td>\n",
" <td>1</td>\n",
" <td>frontières</td>\n",
" <td>4</td>\n",
" <td>False</td>\n",
" <td>True</td>\n",
" <td>4</td>\n",
" <td>1</td>\n",
" <td>c1cecd093146068fd57896e254e98170</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>30329</td>\n",
" <td>2023-11-04 02:50:34.602462+01:00</td>\n",
" <td>2023-11-04 02:52:26.138154+01:00</td>\n",
" <td>2767</td>\n",
" <td>1</td>\n",
" <td>visite guidée une autre histoire du monde (1h00)</td>\n",
" <td>5</td>\n",
" <td>False</td>\n",
" <td>True</td>\n",
" <td>5</td>\n",
" <td>1</td>\n",
" <td>f510a6710878d7aca36e71c54abab525</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>161</td>\n",
" <td>2020-09-03 13:29:27.944002+02:00</td>\n",
" <td>2021-11-02 15:06:40.652026+01:00</td>\n",
" <td>16</td>\n",
" <td>1</td>\n",
" <td>visite contée les chercheurs d'or indiv</td>\n",
" <td>2</td>\n",
" <td>False</td>\n",
" <td>True</td>\n",
" <td>2</td>\n",
" <td>1</td>\n",
" <td>21177fa9acad1ae2b1f595690fb853d3</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>5957</td>\n",
" <td>2021-07-31 11:16:42.575583+02:00</td>\n",
" <td>2021-11-02 15:06:40.663219+01:00</td>\n",
" <td>582</td>\n",
" <td>1</td>\n",
" <td>we dreamt of utopia and we woke up screaming.</td>\n",
" <td>4</td>\n",
" <td>False</td>\n",
" <td>True</td>\n",
" <td>4</td>\n",
" <td>1</td>\n",
" <td>962601f1eb153d45d49437f8fe839f7f</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>8337</td>\n",
" <td>2021-08-17 13:40:34.111923+02:00</td>\n",
" <td>2021-11-02 15:06:40.663219+01:00</td>\n",
" <td>582</td>\n",
" <td>1</td>\n",
" <td>jeff koons épisodes 4</td>\n",
" <td>4</td>\n",
" <td>False</td>\n",
" <td>True</td>\n",
" <td>4</td>\n",
" <td>1</td>\n",
" <td>bfa22f5a2364a2dacfc45cca1c8d3215</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" id created_at updated_at \\\n",
"0 192 2020-09-03 13:36:42.216991+02:00 2021-11-02 15:06:40.663219+01:00 \n",
"1 30329 2023-11-04 02:50:34.602462+01:00 2023-11-04 02:52:26.138154+01:00 \n",
"2 161 2020-09-03 13:29:27.944002+02:00 2021-11-02 15:06:40.652026+01:00 \n",
"3 5957 2021-07-31 11:16:42.575583+02:00 2021-11-02 15:06:40.663219+01:00 \n",
"4 8337 2021-08-17 13:40:34.111923+02:00 2021-11-02 15:06:40.663219+01:00 \n",
"\n",
" season_id facility_id name \\\n",
"0 16 1 frontières \n",
"1 2767 1 visite guidée une autre histoire du monde (1h00) \n",
"2 16 1 visite contée les chercheurs d'or indiv \n",
"3 582 1 we dreamt of utopia and we woke up screaming. \n",
"4 582 1 jeff koons épisodes 4 \n",
"\n",
" event_type_id manual_added is_display event_type_key_id \\\n",
"0 4 False True 4 \n",
"1 5 False True 5 \n",
"2 2 False True 2 \n",
"3 4 False True 4 \n",
"4 4 False True 4 \n",
"\n",
" facility_key_id identifier \n",
"0 1 c1cecd093146068fd57896e254e98170 \n",
"1 1 f510a6710878d7aca36e71c54abab525 \n",
"2 1 21177fa9acad1ae2b1f595690fb853d3 \n",
"3 1 962601f1eb153d45d49437f8fe839f7f \n",
"4 1 bfa22f5a2364a2dacfc45cca1c8d3215 "
]
},
"execution_count": 47,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df = display_databases(name_dataset)\n",
"print(\"Number of columns : \", len(df.columns))\n",
"df.head()"
]
},
{
"cell_type": "code",
"execution_count": 48,
"id": "9fe57873-8108-44c9-b8a5-f58d3cbb6d17",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Number of columns : 10\n",
"Columns : Index(['id', 'season_id', 'facility_id', 'event_type_id', 'event_type_key_id',\n",
" 'facility_key_id', 'identifier', 'name', 'manual_added', 'is_display'],\n",
" dtype='object')\n",
"Percent of NA for each column : id 0.000000\n",
"season_id 0.000000\n",
"facility_id 0.000000\n",
"event_type_id 0.000000\n",
"event_type_key_id 0.000000\n",
"facility_key_id 0.000000\n",
"identifier 0.000000\n",
"name 0.974026\n",
"manual_added 0.000000\n",
"is_display 0.000000\n",
"dtype: float64\n"
]
},
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>id</th>\n",
" <th>season_id</th>\n",
" <th>facility_id</th>\n",
" <th>event_type_id</th>\n",
" <th>event_type_key_id</th>\n",
" <th>facility_key_id</th>\n",
" <th>identifier</th>\n",
" <th>name</th>\n",
" <th>manual_added</th>\n",
" <th>is_display</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>192</td>\n",
" <td>16</td>\n",
" <td>1</td>\n",
" <td>4</td>\n",
" <td>4</td>\n",
" <td>1</td>\n",
" <td>c1cecd093146068fd57896e254e98170</td>\n",
" <td>frontières</td>\n",
" <td>False</td>\n",
" <td>True</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>30329</td>\n",
" <td>2767</td>\n",
" <td>1</td>\n",
" <td>5</td>\n",
" <td>5</td>\n",
" <td>1</td>\n",
" <td>f510a6710878d7aca36e71c54abab525</td>\n",
" <td>visite guidée une autre histoire du monde (1h00)</td>\n",
" <td>False</td>\n",
" <td>True</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>161</td>\n",
" <td>16</td>\n",
" <td>1</td>\n",
" <td>2</td>\n",
" <td>2</td>\n",
" <td>1</td>\n",
" <td>21177fa9acad1ae2b1f595690fb853d3</td>\n",
" <td>visite contée les chercheurs d'or indiv</td>\n",
" <td>False</td>\n",
" <td>True</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>5957</td>\n",
" <td>582</td>\n",
" <td>1</td>\n",
" <td>4</td>\n",
" <td>4</td>\n",
" <td>1</td>\n",
" <td>962601f1eb153d45d49437f8fe839f7f</td>\n",
" <td>we dreamt of utopia and we woke up screaming.</td>\n",
" <td>False</td>\n",
" <td>True</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>8337</td>\n",
" <td>582</td>\n",
" <td>1</td>\n",
" <td>4</td>\n",
" <td>4</td>\n",
" <td>1</td>\n",
" <td>bfa22f5a2364a2dacfc45cca1c8d3215</td>\n",
" <td>jeff koons épisodes 4</td>\n",
" <td>False</td>\n",
" <td>True</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" id season_id facility_id event_type_id event_type_key_id \\\n",
"0 192 16 1 4 4 \n",
"1 30329 2767 1 5 5 \n",
"2 161 16 1 2 2 \n",
"3 5957 582 1 4 4 \n",
"4 8337 582 1 4 4 \n",
"\n",
" facility_key_id identifier \\\n",
"0 1 c1cecd093146068fd57896e254e98170 \n",
"1 1 f510a6710878d7aca36e71c54abab525 \n",
"2 1 21177fa9acad1ae2b1f595690fb853d3 \n",
"3 1 962601f1eb153d45d49437f8fe839f7f \n",
"4 1 bfa22f5a2364a2dacfc45cca1c8d3215 \n",
"\n",
" name manual_added is_display \n",
"0 frontières False True \n",
"1 visite guidée une autre histoire du monde (1h00) False True \n",
"2 visite contée les chercheurs d'or indiv False True \n",
"3 we dreamt of utopia and we woke up screaming. False True \n",
"4 jeff koons épisodes 4 False True "
]
},
"execution_count": 48,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df = process_df(df)\n",
"df.head()"
]
},
{
"cell_type": "code",
"execution_count": 49,
"id": "7fd9e5bd-baac-4b3b-9ffb-5a9baa18399b",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"id int64\n",
"season_id int64\n",
"facility_id int64\n",
"event_type_id int64\n",
"event_type_key_id int64\n",
"facility_key_id int64\n",
"identifier object\n",
"name object\n",
"manual_added bool\n",
"is_display bool\n",
"dtype: object"
]
},
"execution_count": 49,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df.dtypes"
]
},
{
"cell_type": "markdown",
"id": "24186efa-5908-4b03-bf52-96415fc8bd54",
"metadata": {},
"source": [
"#### Deep analysis of event_types.csv"
]
},
{
"cell_type": "code",
"execution_count": 50,
"id": "90ab62d4-a086-4469-961c-67eefb375388",
"metadata": {},
"outputs": [],
"source": [
"name_dataset = '1event_types.csv'"
]
},
{
"cell_type": "code",
"execution_count": 51,
"id": "58db1751-fd56-4c28-b49e-bc8235bb0dc8",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"File path : bdc2324-data/1/1event_types.csv\n",
"Shape : (9, 6)\n",
"Number of columns : 6\n"
]
},
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>id</th>\n",
" <th>name</th>\n",
" <th>created_at</th>\n",
" <th>updated_at</th>\n",
" <th>fidelity_delay</th>\n",
" <th>identifier</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>1</td>\n",
" <td>standard</td>\n",
" <td>2020-09-03 12:24:22.574262+02:00</td>\n",
" <td>2020-09-03 12:24:22.574262+02:00</td>\n",
" <td>36</td>\n",
" <td>c00f0c4675b91fb8b918e4079a0b1bac</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>66</td>\n",
" <td>package</td>\n",
" <td>2020-09-03 14:05:04.648137+02:00</td>\n",
" <td>2020-09-03 14:05:04.648137+02:00</td>\n",
" <td>36</td>\n",
" <td>efe90a8e604a7c840e88d03a67f6b7d8</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>83</td>\n",
" <td>guide multimédias</td>\n",
" <td>2020-09-03 14:15:17.252539+02:00</td>\n",
" <td>2020-09-03 14:15:17.252539+02:00</td>\n",
" <td>36</td>\n",
" <td>ee14c62b3b9f6c7dd5401685a18e4460</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>3</td>\n",
" <td>non défini</td>\n",
" <td>2020-09-03 13:11:23.117024+02:00</td>\n",
" <td>2020-09-03 13:11:23.117024+02:00</td>\n",
" <td>36</td>\n",
" <td>52ff3466787b4d538407372e5f7afe0f</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>2723</td>\n",
" <td>NaN</td>\n",
" <td>2021-12-22 09:45:47.715105+01:00</td>\n",
" <td>2021-12-22 09:45:47.715105+01:00</td>\n",
" <td>36</td>\n",
" <td>d41d8cd98f00b204e9800998ecf8427e</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" id name created_at \\\n",
"0 1 standard 2020-09-03 12:24:22.574262+02:00 \n",
"1 66 package 2020-09-03 14:05:04.648137+02:00 \n",
"2 83 guide multimédias 2020-09-03 14:15:17.252539+02:00 \n",
"3 3 non défini 2020-09-03 13:11:23.117024+02:00 \n",
"4 2723 NaN 2021-12-22 09:45:47.715105+01:00 \n",
"\n",
" updated_at fidelity_delay \\\n",
"0 2020-09-03 12:24:22.574262+02:00 36 \n",
"1 2020-09-03 14:05:04.648137+02:00 36 \n",
"2 2020-09-03 14:15:17.252539+02:00 36 \n",
"3 2020-09-03 13:11:23.117024+02:00 36 \n",
"4 2021-12-22 09:45:47.715105+01:00 36 \n",
"\n",
" identifier \n",
"0 c00f0c4675b91fb8b918e4079a0b1bac \n",
"1 efe90a8e604a7c840e88d03a67f6b7d8 \n",
"2 ee14c62b3b9f6c7dd5401685a18e4460 \n",
"3 52ff3466787b4d538407372e5f7afe0f \n",
"4 d41d8cd98f00b204e9800998ecf8427e "
]
},
"execution_count": 51,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df = display_databases(name_dataset)\n",
"print(\"Number of columns : \", len(df.columns))\n",
"df.head()"
]
},
{
"cell_type": "code",
"execution_count": 52,
"id": "ac93382c-0b5f-462d-8021-0dd1e7201b8c",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Number of columns : 4\n",
"Columns : Index(['id', 'fidelity_delay', 'identifier', 'name'], dtype='object')\n",
"Percent of NA for each column : id 0.000000\n",
"fidelity_delay 0.000000\n",
"identifier 0.000000\n",
"name 11.111111\n",
"dtype: float64\n"
]
},
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>id</th>\n",
" <th>fidelity_delay</th>\n",
" <th>identifier</th>\n",
" <th>name</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>1</td>\n",
" <td>36</td>\n",
" <td>c00f0c4675b91fb8b918e4079a0b1bac</td>\n",
" <td>standard</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>66</td>\n",
" <td>36</td>\n",
" <td>efe90a8e604a7c840e88d03a67f6b7d8</td>\n",
" <td>package</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>83</td>\n",
" <td>36</td>\n",
" <td>ee14c62b3b9f6c7dd5401685a18e4460</td>\n",
" <td>guide multimédias</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>3</td>\n",
" <td>36</td>\n",
" <td>52ff3466787b4d538407372e5f7afe0f</td>\n",
" <td>non défini</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>2723</td>\n",
" <td>36</td>\n",
" <td>d41d8cd98f00b204e9800998ecf8427e</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" id fidelity_delay identifier name\n",
"0 1 36 c00f0c4675b91fb8b918e4079a0b1bac standard\n",
"1 66 36 efe90a8e604a7c840e88d03a67f6b7d8 package\n",
"2 83 36 ee14c62b3b9f6c7dd5401685a18e4460 guide multimédias\n",
"3 3 36 52ff3466787b4d538407372e5f7afe0f non défini\n",
"4 2723 36 d41d8cd98f00b204e9800998ecf8427e NaN"
]
},
"execution_count": 52,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df = process_df(df)\n",
"df.head()"
]
},
{
"cell_type": "code",
"execution_count": 53,
"id": "18cbd630-3c7d-49e1-932b-9460badf3758",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"id int64\n",
"fidelity_delay int64\n",
"identifier object\n",
"name object\n",
"dtype: object"
]
},
"execution_count": 53,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df.dtypes"
]
},
{
"cell_type": "markdown",
"id": "5847a441-31b9-4802-a5ae-90d8c6d6e153",
"metadata": {},
"source": [
"#### Deep analysis of seasons.csv"
]
},
{
"cell_type": "code",
"execution_count": 54,
"id": "ae544dcc-f23d-4216-bb5b-597cc1b3765e",
"metadata": {},
"outputs": [],
"source": [
"name_dataset = '1seasons.csv'"
]
},
{
"cell_type": "code",
"execution_count": 55,
"id": "1ac97963-9208-4329-be41-d71a5797487f",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"File path : bdc2324-data/1/1seasons.csv\n",
"Shape : (13, 6)\n",
"Number of columns : 6\n"
]
},
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>id</th>\n",
" <th>name</th>\n",
" <th>created_at</th>\n",
" <th>updated_at</th>\n",
" <th>start_date_time</th>\n",
" <th>identifier</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>943</td>\n",
" <td>2013</td>\n",
" <td>2021-07-29 08:55:33.282607+02:00</td>\n",
" <td>2021-07-29 08:55:33.282607+02:00</td>\n",
" <td>NaN</td>\n",
" <td>8038da89e49ac5eabb489cfc6cea9fc1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>129</td>\n",
" <td>2014</td>\n",
" <td>2020-09-03 15:13:08.105567+02:00</td>\n",
" <td>2020-09-03 15:13:08.105567+02:00</td>\n",
" <td>NaN</td>\n",
" <td>cee8d6b7ce52554fd70354e37bbf44a2</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>3</td>\n",
" <td>2015</td>\n",
" <td>2020-09-03 13:11:19.405037+02:00</td>\n",
" <td>2020-09-03 13:11:19.405037+02:00</td>\n",
" <td>NaN</td>\n",
" <td>65d2ea03425887a717c435081cfc5dbb</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>2</td>\n",
" <td>2016</td>\n",
" <td>2020-09-03 13:11:19.401001+02:00</td>\n",
" <td>2020-09-03 13:11:19.401001+02:00</td>\n",
" <td>NaN</td>\n",
" <td>95192c98732387165bf8e396c0f2dad2</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>4</td>\n",
" <td>2017</td>\n",
" <td>2020-09-03 13:11:19.409005+02:00</td>\n",
" <td>2020-09-03 13:11:19.409005+02:00</td>\n",
" <td>NaN</td>\n",
" <td>8d8818c8e140c64c743113f563cf750f</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" id name created_at \\\n",
"0 943 2013 2021-07-29 08:55:33.282607+02:00 \n",
"1 129 2014 2020-09-03 15:13:08.105567+02:00 \n",
"2 3 2015 2020-09-03 13:11:19.405037+02:00 \n",
"3 2 2016 2020-09-03 13:11:19.401001+02:00 \n",
"4 4 2017 2020-09-03 13:11:19.409005+02:00 \n",
"\n",
" updated_at start_date_time \\\n",
"0 2021-07-29 08:55:33.282607+02:00 NaN \n",
"1 2020-09-03 15:13:08.105567+02:00 NaN \n",
"2 2020-09-03 13:11:19.405037+02:00 NaN \n",
"3 2020-09-03 13:11:19.401001+02:00 NaN \n",
"4 2020-09-03 13:11:19.409005+02:00 NaN \n",
"\n",
" identifier \n",
"0 8038da89e49ac5eabb489cfc6cea9fc1 \n",
"1 cee8d6b7ce52554fd70354e37bbf44a2 \n",
"2 65d2ea03425887a717c435081cfc5dbb \n",
"3 95192c98732387165bf8e396c0f2dad2 \n",
"4 8d8818c8e140c64c743113f563cf750f "
]
},
"execution_count": 55,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df = display_databases(name_dataset)\n",
"print(\"Number of columns : \", len(df.columns))\n",
"df.head()"
]
},
{
"cell_type": "code",
"execution_count": 56,
"id": "b4593d46-105c-47dd-aa71-babd8e63e65b",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Number of columns : 4\n",
"Columns : Index(['id', 'identifier', 'name', 'start_date_time'], dtype='object')\n",
"Percent of NA for each column : id 0.000000\n",
"identifier 0.000000\n",
"name 7.692308\n",
"start_date_time 100.000000\n",
"dtype: float64\n"
]
},
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>id</th>\n",
" <th>identifier</th>\n",
" <th>name</th>\n",
" <th>start_date_time</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>943</td>\n",
" <td>8038da89e49ac5eabb489cfc6cea9fc1</td>\n",
" <td>2013</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>129</td>\n",
" <td>cee8d6b7ce52554fd70354e37bbf44a2</td>\n",
" <td>2014</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>3</td>\n",
" <td>65d2ea03425887a717c435081cfc5dbb</td>\n",
" <td>2015</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>2</td>\n",
" <td>95192c98732387165bf8e396c0f2dad2</td>\n",
" <td>2016</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>4</td>\n",
" <td>8d8818c8e140c64c743113f563cf750f</td>\n",
" <td>2017</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" id identifier name start_date_time\n",
"0 943 8038da89e49ac5eabb489cfc6cea9fc1 2013 NaN\n",
"1 129 cee8d6b7ce52554fd70354e37bbf44a2 2014 NaN\n",
"2 3 65d2ea03425887a717c435081cfc5dbb 2015 NaN\n",
"3 2 95192c98732387165bf8e396c0f2dad2 2016 NaN\n",
"4 4 8d8818c8e140c64c743113f563cf750f 2017 NaN"
]
},
"execution_count": 56,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df = process_df(df)\n",
"df.head()"
]
},
{
"cell_type": "code",
"execution_count": 57,
"id": "5d3b096d-8e73-4514-94e5-f2dcd4d0a89c",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"id int64\n",
"identifier object\n",
"name object\n",
"start_date_time float64\n",
"dtype: object"
]
},
"execution_count": 57,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df.dtypes"
]
},
{
"cell_type": "markdown",
"id": "a7b00bc7-eae6-457c-ac68-a4a55a6d1c8c",
"metadata": {},
"source": [
"#### Deep Analysis of facilities.csv"
]
},
{
"cell_type": "code",
"execution_count": 58,
"id": "d95ef015-d44c-4353-8761-771b910d21c9",
"metadata": {},
"outputs": [],
"source": [
"name_dataset = '1facilities.csv'"
]
},
{
"cell_type": "code",
"execution_count": 59,
"id": "ef5fe794-8df7-4f27-8554-ecdc4074ac0b",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"File path : bdc2324-data/1/1facilities.csv\n",
"Shape : (2, 7)\n",
"Number of columns : 7\n"
]
},
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>id</th>\n",
" <th>name</th>\n",
" <th>created_at</th>\n",
" <th>updated_at</th>\n",
" <th>street_id</th>\n",
" <th>fixed_capacity</th>\n",
" <th>identifier</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>2</td>\n",
" <td>non défini</td>\n",
" <td>2020-09-03 13:16:35.293111+02:00</td>\n",
" <td>2020-09-03 13:16:35.293111+02:00</td>\n",
" <td>2</td>\n",
" <td>NaN</td>\n",
" <td>52ff3466787b4d538407372e5f7afe0f</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>1</td>\n",
" <td>mucem</td>\n",
" <td>2020-09-03 13:11:23.133059+02:00</td>\n",
" <td>2020-09-03 13:11:23.133059+02:00</td>\n",
" <td>1</td>\n",
" <td>NaN</td>\n",
" <td>702bd76fe3dd5dbcf118a6965a946f54</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" id name created_at \\\n",
"0 2 non défini 2020-09-03 13:16:35.293111+02:00 \n",
"1 1 mucem 2020-09-03 13:11:23.133059+02:00 \n",
"\n",
" updated_at street_id fixed_capacity \\\n",
"0 2020-09-03 13:16:35.293111+02:00 2 NaN \n",
"1 2020-09-03 13:11:23.133059+02:00 1 NaN \n",
"\n",
" identifier \n",
"0 52ff3466787b4d538407372e5f7afe0f \n",
"1 702bd76fe3dd5dbcf118a6965a946f54 "
]
},
"execution_count": 59,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Load the raw facilities file and preview its first rows\n",
"df = display_databases(name_dataset)\n",
"print(\"Number of columns : \", df.shape[1])\n",
"df.head()"
]
},
{
"cell_type": "code",
"execution_count": 60,
"id": "e3621201-fab9-49fd-95c1-0b9d5da76e50",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Number of columns : 5\n",
"Columns : Index(['id', 'street_id', 'identifier', 'name', 'fixed_capacity'], dtype='object')\n",
"Percent of NA for each column : id 0.0\n",
"street_id 0.0\n",
"identifier 0.0\n",
"name 0.0\n",
"fixed_capacity 100.0\n",
"dtype: float64\n"
]
},
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>id</th>\n",
" <th>street_id</th>\n",
" <th>identifier</th>\n",
" <th>name</th>\n",
" <th>fixed_capacity</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>2</td>\n",
" <td>2</td>\n",
" <td>52ff3466787b4d538407372e5f7afe0f</td>\n",
" <td>non défini</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>702bd76fe3dd5dbcf118a6965a946f54</td>\n",
" <td>mucem</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" id street_id identifier name fixed_capacity\n",
"0 2 2 52ff3466787b4d538407372e5f7afe0f non défini NaN\n",
"1 1 1 702bd76fe3dd5dbcf118a6965a946f54 mucem NaN"
]
},
"execution_count": 60,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df = process_df(df)\n",
"df.head()"
]
},
{
"cell_type": "code",
"execution_count": 61,
"id": "1b198b92-8654-4531-a0dd-8f2e01c2e6c1",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"id int64\n",
"street_id int64\n",
"identifier object\n",
"name object\n",
"fixed_capacity float64\n",
"dtype: object"
]
},
"execution_count": 61,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df.dtypes"
]
},
{
"cell_type": "markdown",
"id": "ab5c4c2d-3e04-457d-a183-e173df89b650",
"metadata": {},
"source": [
"## Merge"
]
},
{
"cell_type": "code",
"execution_count": 62,
"id": "43576244-c8cf-4ca0-b056-7aea1fbf0bc7",
"metadata": {},
"outputs": [],
"source": [
"def process_df_2(df):\n",
"    \"\"\"Run the two shared cleaning helpers on ``df`` and log the resulting layout.\n",
"\n",
"    ``remove_horodates`` is applied first, then ``order_columns_id``; the column\n",
"    count after the first step and the column index after the second are printed.\n",
"\n",
"    Returns the frame produced by ``order_columns_id``.\n",
"    \"\"\"\n",
"    trimmed = remove_horodates(df)\n",
"    print(\"Number of columns : \", len(trimmed.columns))\n",
"    reordered = order_columns_id(trimmed)\n",
"    print(\"Columns : \", reordered.columns)\n",
"    return reordered"
]
},
{
"cell_type": "code",
"execution_count": 63,
"id": "0fad097e-474c-4af7-b1e1-7d8dda3f09ea",
"metadata": {},
"outputs": [],
"source": [
"def load_dataset(name):\n",
"    \"\"\"Load the dataset called ``name`` and prepare it for the merges below.\n",
"\n",
"    The frame returned by ``display_databases`` is cleaned with ``process_df_2``;\n",
"    the ``identifier`` column is then removed when the table has one.\n",
"    Note: columns that contain only NA values are not removed here.\n",
"    \"\"\"\n",
"    dataset = process_df_2(display_databases(name))\n",
"    # Nothing more to do for tables that carry no 'identifier' column\n",
"    if 'identifier' not in dataset.columns:\n",
"        return dataset\n",
"    return dataset.drop(columns = 'identifier')"
]
},
{
"cell_type": "markdown",
"id": "b60034ef-fdd6-4640-a012-cf74c17b333f",
"metadata": {},
"source": [
"### Products Table"
]
},
{
"cell_type": "code",
"execution_count": 64,
"id": "6213b1eb-c5f8-49dd-ab69-366542380e80",
"metadata": {},
"outputs": [],
"source": [
"def create_products_table():\n",
"    \"\"\"Build the product-level table from products, categories and type_of_categories.\n",
"\n",
"    Products are left-joined to categories (``category_id`` -> categories ``id``),\n",
"    then to type_of_categories on ``category_id``. The categories' own id is\n",
"    dropped and the result is passed through ``order_columns_id``.\n",
"    \"\"\"\n",
"    # Step 1: attach the category attributes to every product\n",
"    print(\"first merge products and categories\")\n",
"    products = load_dataset(\"1products.csv\")\n",
"    categories = load_dataset(\"1categories.csv\")\n",
"    with_categories = (\n",
"        products\n",
"        .merge(categories, how='left', left_on='category_id', right_on='id',\n",
"               suffixes=('_products', '_categories'))\n",
"        .rename(columns={\"name\": \"name_categories\"})\n",
"    )\n",
"\n",
"    # Step 2: attach the category type; the id of type_of_categories is dropped first\n",
"    print(\"Second merge products_theme and type of categories\")\n",
"    category_types = load_dataset(\"1type_of_categories.csv\").drop(columns='id')\n",
"    with_types = with_categories.merge(category_types, how='left',\n",
"                                       left_on='category_id', right_on='category_id')\n",
"\n",
"    # Index cleaning\n",
"    return order_columns_id(with_types.drop(columns=['id_categories']))"
]
},
{
"cell_type": "code",
"execution_count": 65,
"id": "b853e020-f73d-44e8-b086-e5548ce21011",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"first merge products and categories\n",
"File path : bdc2324-data/1/1products.csv\n",
"Shape : (94803, 14)\n",
"Number of columns : 12\n",
"Columns : Index(['id', 'representation_id', 'pricing_formula_id', 'category_id',\n",
" 'products_group_id', 'product_pack_id', 'identifier', 'amount',\n",
" 'is_full_price', 'apply_price', 'extra_field', 'amount_consumption'],\n",
" dtype='object')\n",
"File path : bdc2324-data/1/1categories.csv\n",
"Shape : (27, 7)\n",
"Number of columns : 5\n",
"Columns : Index(['id', 'identifier', 'name', 'extra_field', 'quota'], dtype='object')\n",
"Second merge products_theme and type of categories\n",
"File path : bdc2324-data/1/1type_of_categories.csv\n",
"Shape : (5, 6)\n",
"Number of columns : 4\n",
"Columns : Index(['id', 'type_of_id', 'category_id', 'identifier'], dtype='object')\n"
]
},
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>id_products</th>\n",
" <th>representation_id</th>\n",
" <th>pricing_formula_id</th>\n",
" <th>category_id</th>\n",
" <th>products_group_id</th>\n",
" <th>product_pack_id</th>\n",
" <th>type_of_id</th>\n",
" <th>amount</th>\n",
" <th>is_full_price</th>\n",
" <th>apply_price</th>\n",
" <th>extra_field_products</th>\n",
" <th>amount_consumption</th>\n",
" <th>name_categories</th>\n",
" <th>extra_field_categories</th>\n",
" <th>quota</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>10682</td>\n",
" <td>914</td>\n",
" <td>114</td>\n",
" <td>41</td>\n",
" <td>10655</td>\n",
" <td>1</td>\n",
" <td>NaN</td>\n",
" <td>9.0</td>\n",
" <td>False</td>\n",
" <td>0.0</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>indiv activité tr</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>478</td>\n",
" <td>273</td>\n",
" <td>131</td>\n",
" <td>1</td>\n",
" <td>471</td>\n",
" <td>1</td>\n",
" <td>12.0</td>\n",
" <td>9.5</td>\n",
" <td>False</td>\n",
" <td>0.0</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>indiv entrées tp</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>20873</td>\n",
" <td>275</td>\n",
" <td>137</td>\n",
" <td>1</td>\n",
" <td>20825</td>\n",
" <td>1</td>\n",
" <td>12.0</td>\n",
" <td>11.5</td>\n",
" <td>False</td>\n",
" <td>0.0</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>indiv entrées tp</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>157142</td>\n",
" <td>82519</td>\n",
" <td>9</td>\n",
" <td>5</td>\n",
" <td>156773</td>\n",
" <td>1</td>\n",
" <td>NaN</td>\n",
" <td>8.0</td>\n",
" <td>False</td>\n",
" <td>0.0</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>indiv entrées tr</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>1341</td>\n",
" <td>9</td>\n",
" <td>93</td>\n",
" <td>1</td>\n",
" <td>1175</td>\n",
" <td>1</td>\n",
" <td>12.0</td>\n",
" <td>8.5</td>\n",
" <td>False</td>\n",
" <td>0.0</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>indiv entrées tp</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" id_products representation_id pricing_formula_id category_id \\\n",
"0 10682 914 114 41 \n",
"1 478 273 131 1 \n",
"2 20873 275 137 1 \n",
"3 157142 82519 9 5 \n",
"4 1341 9 93 1 \n",
"\n",
" products_group_id product_pack_id type_of_id amount is_full_price \\\n",
"0 10655 1 NaN 9.0 False \n",
"1 471 1 12.0 9.5 False \n",
"2 20825 1 12.0 11.5 False \n",
"3 156773 1 NaN 8.0 False \n",
"4 1175 1 12.0 8.5 False \n",
"\n",
" apply_price extra_field_products amount_consumption name_categories \\\n",
"0 0.0 NaN NaN indiv activité tr \n",
"1 0.0 NaN NaN indiv entrées tp \n",
"2 0.0 NaN NaN indiv entrées tp \n",
"3 0.0 NaN NaN indiv entrées tr \n",
"4 0.0 NaN NaN indiv entrées tp \n",
"\n",
" extra_field_categories quota \n",
"0 NaN NaN \n",
"1 NaN NaN \n",
"2 NaN NaN \n",
"3 NaN NaN \n",
"4 NaN NaN "
]
},
"execution_count": 65,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"products_theme = create_products_table()\n",
"products_theme.head()"
]
},
{
"cell_type": "markdown",
"id": "8bd7b7ab-fd04-48d2-898b-48c5815457f3",
"metadata": {},
"source": [
"### Events Table"
]
},
{
"cell_type": "code",
"execution_count": 103,
"id": "6ed0ad20-8315-4112-9a85-10e5f04ef852",
"metadata": {},
"outputs": [],
"source": [
"def create_events_table():\n",
"    \"\"\"Build the event-level table from events, seasons, event types and facilities.\n",
"\n",
"    Each lookup table is left-joined onto events through its foreign key\n",
"    (``season_id``, ``event_type_id``, ``facility_id``). The events' own id is\n",
"    exposed as ``event_id``; the ids of the lookup tables are dropped. The result\n",
"    is passed through ``order_columns_id`` before being returned.\n",
"    \"\"\"\n",
"    # Step 1: events + seasons\n",
"    print(\"first merge events and seasons : \")\n",
"    events = load_dataset(\"1events.csv\")\n",
"    seasons = load_dataset(\"1seasons.csv\")\n",
"    merged = events.merge(seasons, how='left', left_on='season_id', right_on='id',\n",
"                          suffixes=('_events', '_seasons'))\n",
"\n",
"    # Step 2: + event types\n",
"    print(\"Secondly merge events_theme and event_types : \")\n",
"    event_types = load_dataset(\"1event_types.csv\")\n",
"    merged = (\n",
"        merged\n",
"        .merge(event_types, how='left', left_on='event_type_id', right_on='id',\n",
"               suffixes=('_events', '_event_type'))\n",
"        .rename(columns={\"name\": \"name_event_types\"})\n",
"        .drop(columns='id')\n",
"    )\n",
"\n",
"    # Step 3: + facilities\n",
"    print(\"thirdly merge events_theme and facilities : \")\n",
"    facilities = load_dataset(\"1facilities.csv\")\n",
"    # NOTE(review): 'name_facilties' is misspelled, but the column name is carried\n",
"    # into the tables built later in the notebook, so it is kept unchanged here.\n",
"    merged = (\n",
"        merged\n",
"        .merge(facilities, how='left', left_on='facility_id', right_on='id',\n",
"               suffixes=('_events', '_facility'))\n",
"        .rename(columns={\"name\": \"name_facilties\", \"id_events\": \"event_id\"})\n",
"        .drop(columns='id')\n",
"    )\n",
"\n",
"    # Index cleaning\n",
"    return order_columns_id(merged.drop(columns=['id_seasons']))"
]
},
{
"cell_type": "code",
"execution_count": 104,
"id": "98ef0636-8c45-4a23-a62a-1fbe1544f8ce",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"first merge events and seasons : \n",
"File path : bdc2324-data/1/1events.csv\n",
"Shape : (1232, 12)\n",
"Number of columns : 10\n",
"Columns : Index(['id', 'season_id', 'facility_id', 'event_type_id', 'event_type_key_id',\n",
" 'facility_key_id', 'identifier', 'name', 'manual_added', 'is_display'],\n",
" dtype='object')\n",
"File path : bdc2324-data/1/1seasons.csv\n",
"Shape : (13, 6)\n",
"Number of columns : 4\n",
"Columns : Index(['id', 'identifier', 'name', 'start_date_time'], dtype='object')\n",
"Secondly merge events_theme and event_types : \n",
"File path : bdc2324-data/1/1event_types.csv\n",
"Shape : (9, 6)\n",
"Number of columns : 4\n",
"Columns : Index(['id', 'fidelity_delay', 'identifier', 'name'], dtype='object')\n",
"thirdly merge events_theme and facilities : \n",
"File path : bdc2324-data/1/1facilities.csv\n",
"Shape : (2, 7)\n",
"Number of columns : 5\n",
"Columns : Index(['id', 'street_id', 'identifier', 'name', 'fixed_capacity'], dtype='object')\n"
]
},
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>event_id</th>\n",
" <th>season_id</th>\n",
" <th>facility_id</th>\n",
" <th>event_type_id</th>\n",
" <th>event_type_key_id</th>\n",
" <th>facility_key_id</th>\n",
" <th>fidelity_delay</th>\n",
" <th>street_id</th>\n",
" <th>name_events</th>\n",
" <th>manual_added</th>\n",
" <th>is_display</th>\n",
" <th>name_seasons</th>\n",
" <th>start_date_time</th>\n",
" <th>name_event_types</th>\n",
" <th>name_facilties</th>\n",
" <th>fixed_capacity</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>192</td>\n",
" <td>16</td>\n",
" <td>1</td>\n",
" <td>4</td>\n",
" <td>4</td>\n",
" <td>1</td>\n",
" <td>36</td>\n",
" <td>1</td>\n",
" <td>frontières</td>\n",
" <td>False</td>\n",
" <td>True</td>\n",
" <td>2018</td>\n",
" <td>NaN</td>\n",
" <td>spectacle vivant</td>\n",
" <td>mucem</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>30329</td>\n",
" <td>2767</td>\n",
" <td>1</td>\n",
" <td>5</td>\n",
" <td>5</td>\n",
" <td>1</td>\n",
" <td>36</td>\n",
" <td>1</td>\n",
" <td>visite guidée une autre histoire du monde (1h00)</td>\n",
" <td>False</td>\n",
" <td>True</td>\n",
" <td>2023</td>\n",
" <td>NaN</td>\n",
" <td>offre muséale groupe</td>\n",
" <td>mucem</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>161</td>\n",
" <td>16</td>\n",
" <td>1</td>\n",
" <td>2</td>\n",
" <td>2</td>\n",
" <td>1</td>\n",
" <td>36</td>\n",
" <td>1</td>\n",
" <td>visite contée les chercheurs d'or indiv</td>\n",
" <td>False</td>\n",
" <td>True</td>\n",
" <td>2018</td>\n",
" <td>NaN</td>\n",
" <td>offre muséale individuel</td>\n",
" <td>mucem</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>5957</td>\n",
" <td>582</td>\n",
" <td>1</td>\n",
" <td>4</td>\n",
" <td>4</td>\n",
" <td>1</td>\n",
" <td>36</td>\n",
" <td>1</td>\n",
" <td>we dreamt of utopia and we woke up screaming.</td>\n",
" <td>False</td>\n",
" <td>True</td>\n",
" <td>2021</td>\n",
" <td>NaN</td>\n",
" <td>spectacle vivant</td>\n",
" <td>mucem</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>8337</td>\n",
" <td>582</td>\n",
" <td>1</td>\n",
" <td>4</td>\n",
" <td>4</td>\n",
" <td>1</td>\n",
" <td>36</td>\n",
" <td>1</td>\n",
" <td>jeff koons épisodes 4</td>\n",
" <td>False</td>\n",
" <td>True</td>\n",
" <td>2021</td>\n",
" <td>NaN</td>\n",
" <td>spectacle vivant</td>\n",
" <td>mucem</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" event_id season_id facility_id event_type_id event_type_key_id \\\n",
"0 192 16 1 4 4 \n",
"1 30329 2767 1 5 5 \n",
"2 161 16 1 2 2 \n",
"3 5957 582 1 4 4 \n",
"4 8337 582 1 4 4 \n",
"\n",
" facility_key_id fidelity_delay street_id \\\n",
"0 1 36 1 \n",
"1 1 36 1 \n",
"2 1 36 1 \n",
"3 1 36 1 \n",
"4 1 36 1 \n",
"\n",
" name_events manual_added is_display \\\n",
"0 frontières False True \n",
"1 visite guidée une autre histoire du monde (1h00) False True \n",
"2 visite contée les chercheurs d'or indiv False True \n",
"3 we dreamt of utopia and we woke up screaming. False True \n",
"4 jeff koons épisodes 4 False True \n",
"\n",
" name_seasons start_date_time name_event_types name_facilties \\\n",
"0 2018 NaN spectacle vivant mucem \n",
"1 2023 NaN offre muséale groupe mucem \n",
"2 2018 NaN offre muséale individuel mucem \n",
"3 2021 NaN spectacle vivant mucem \n",
"4 2021 NaN spectacle vivant mucem \n",
"\n",
" fixed_capacity \n",
"0 NaN \n",
"1 NaN \n",
"2 NaN \n",
"3 NaN \n",
"4 NaN "
]
},
"execution_count": 104,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"events_theme= create_events_table()\n",
"events_theme.head()"
]
},
{
"cell_type": "markdown",
"id": "4ad5b680-bb27-4f86-a5f3-7ff4fd1be96a",
"metadata": {},
"source": [
"### Representations Table"
]
},
{
"cell_type": "code",
"execution_count": 105,
"id": "481dddd6-80a8-4b9e-a05e-ed06fa3ed7a6",
"metadata": {},
"outputs": [],
"source": [
"def create_representations_table():\n",
"    \"\"\"Build the representation-level table with its per-category capacities.\n",
"\n",
"    Representations are left-joined to representation_category_capacities\n",
"    (representations ``id`` -> capacities ``representation_id``), so a\n",
"    representation appears once per capacity row it has. Overlapping column names\n",
"    get the ``_representation`` / ``_representation_cap`` suffixes. The result is\n",
"    passed through ``order_columns_id`` before being returned.\n",
"    \"\"\"\n",
"    representations = load_dataset(\"1representations.csv\")\n",
"    representations_capacity = load_dataset(\"1representation_category_capacities.csv\")\n",
"\n",
"    representations_theme = representations.merge(representations_capacity, how='left',\n",
"                                                  left_on='id', right_on='representation_id',\n",
"                                                  suffixes=('_representation', '_representation_cap'))\n",
"    # With a left join, a representation without any capacity row has NaN in the\n",
"    # right-hand key. Copy the left-hand key into 'representation_id' before it is\n",
"    # dropped so that no row loses its identifier (same values when every row matches).\n",
"    representations_theme[\"representation_id\"] = representations_theme[\"id_representation\"]\n",
"\n",
"    # index cleaning\n",
"    representations_theme = representations_theme.drop(columns = [\"id_representation\"])\n",
"    representations_theme = order_columns_id(representations_theme)\n",
"    return representations_theme"
]
},
{
"cell_type": "code",
"execution_count": 106,
"id": "677f4ed8-ef58-45f2-9056-ede0898c6a64",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"File path : bdc2324-data/1/1representations.csv\n",
"Shape : (36095, 16)\n",
"Number of columns : 14\n",
"Columns : Index(['id', 'event_id', 'representation_type_id', 'identifier', 'serial',\n",
" 'start_date_time', 'open', 'satisfaction', 'end_date_time', 'name',\n",
" 'is_display', 'expected_filling', 'max_filling', 'extra_field'],\n",
" dtype='object')\n",
"File path : bdc2324-data/1/1representation_category_capacities.csv\n",
"Shape : (65241, 7)\n",
"Number of columns : 5\n",
"Columns : Index(['id', 'representation_id', 'category_id', 'expected_filling',\n",
" 'max_filling'],\n",
" dtype='object')\n"
]
},
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>event_id</th>\n",
" <th>representation_type_id</th>\n",
" <th>id_representation_cap</th>\n",
" <th>representation_id</th>\n",
" <th>category_id</th>\n",
" <th>serial</th>\n",
" <th>start_date_time</th>\n",
" <th>open</th>\n",
" <th>satisfaction</th>\n",
" <th>end_date_time</th>\n",
" <th>name</th>\n",
" <th>is_display</th>\n",
" <th>expected_filling_representation</th>\n",
" <th>max_filling_representation</th>\n",
" <th>extra_field</th>\n",
" <th>expected_filling_representation_cap</th>\n",
" <th>max_filling_representation_cap</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>12384</td>\n",
" <td>NaN</td>\n",
" <td>123058</td>\n",
" <td>84820</td>\n",
" <td>2</td>\n",
" <td>NaN</td>\n",
" <td>2018-09-26 15:15:00+02:00</td>\n",
" <td>True</td>\n",
" <td>NaN</td>\n",
" <td>1901-01-01 00:09:21+00:09</td>\n",
" <td>NaN</td>\n",
" <td>True</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>37</td>\n",
" <td>NaN</td>\n",
" <td>2514</td>\n",
" <td>269</td>\n",
" <td>2</td>\n",
" <td>NaN</td>\n",
" <td>2016-04-27 17:00:00+02:00</td>\n",
" <td>True</td>\n",
" <td>NaN</td>\n",
" <td>2016-04-27 18:00:00+02:00</td>\n",
" <td>NaN</td>\n",
" <td>True</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>37</td>\n",
" <td>NaN</td>\n",
" <td>384</td>\n",
" <td>269</td>\n",
" <td>5</td>\n",
" <td>NaN</td>\n",
" <td>2016-04-27 17:00:00+02:00</td>\n",
" <td>True</td>\n",
" <td>NaN</td>\n",
" <td>2016-04-27 18:00:00+02:00</td>\n",
" <td>NaN</td>\n",
" <td>True</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>37</td>\n",
" <td>NaN</td>\n",
" <td>2515</td>\n",
" <td>269</td>\n",
" <td>10</td>\n",
" <td>NaN</td>\n",
" <td>2016-04-27 17:00:00+02:00</td>\n",
" <td>True</td>\n",
" <td>NaN</td>\n",
" <td>2016-04-27 18:00:00+02:00</td>\n",
" <td>NaN</td>\n",
" <td>True</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>37</td>\n",
" <td>NaN</td>\n",
" <td>383</td>\n",
" <td>269</td>\n",
" <td>1</td>\n",
" <td>NaN</td>\n",
" <td>2016-04-27 17:00:00+02:00</td>\n",
" <td>True</td>\n",
" <td>NaN</td>\n",
" <td>2016-04-27 18:00:00+02:00</td>\n",
" <td>NaN</td>\n",
" <td>True</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" event_id representation_type_id id_representation_cap representation_id \\\n",
"0 12384 NaN 123058 84820 \n",
"1 37 NaN 2514 269 \n",
"2 37 NaN 384 269 \n",
"3 37 NaN 2515 269 \n",
"4 37 NaN 383 269 \n",
"\n",
" category_id serial start_date_time open satisfaction \\\n",
"0 2 NaN 2018-09-26 15:15:00+02:00 True NaN \n",
"1 2 NaN 2016-04-27 17:00:00+02:00 True NaN \n",
"2 5 NaN 2016-04-27 17:00:00+02:00 True NaN \n",
"3 10 NaN 2016-04-27 17:00:00+02:00 True NaN \n",
"4 1 NaN 2016-04-27 17:00:00+02:00 True NaN \n",
"\n",
" end_date_time name is_display \\\n",
"0 1901-01-01 00:09:21+00:09 NaN True \n",
"1 2016-04-27 18:00:00+02:00 NaN True \n",
"2 2016-04-27 18:00:00+02:00 NaN True \n",
"3 2016-04-27 18:00:00+02:00 NaN True \n",
"4 2016-04-27 18:00:00+02:00 NaN True \n",
"\n",
" expected_filling_representation max_filling_representation extra_field \\\n",
"0 NaN NaN NaN \n",
"1 NaN NaN NaN \n",
"2 NaN NaN NaN \n",
"3 NaN NaN NaN \n",
"4 NaN NaN NaN \n",
"\n",
" expected_filling_representation_cap max_filling_representation_cap \n",
"0 NaN NaN \n",
"1 NaN NaN \n",
"2 NaN NaN \n",
"3 NaN NaN \n",
"4 NaN NaN "
]
},
"execution_count": 106,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"representation_theme = create_representations_table()\n",
"representation_theme.head()"
]
},
{
"cell_type": "markdown",
"id": "71c26a38-6818-42df-8aee-0135681a5563",
"metadata": {},
"source": [
"## Uniform Products theme database"
]
},
{
"cell_type": "code",
"execution_count": 107,
"id": "b26f4e7e-134d-4e32-a615-4b0e6bb80b25",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Products theme columns : Index(['id_products', 'representation_id', 'pricing_formula_id', 'category_id',\n",
" 'products_group_id', 'product_pack_id', 'type_of_id', 'amount',\n",
" 'is_full_price', 'apply_price', 'extra_field_products',\n",
" 'amount_consumption', 'name_categories', 'extra_field_categories',\n",
" 'quota'],\n",
" dtype='object')\n",
"\n",
" Representation theme columns : Index(['event_id', 'representation_type_id', 'id_representation_cap',\n",
" 'representation_id', 'category_id', 'serial', 'start_date_time', 'open',\n",
" 'satisfaction', 'end_date_time', 'name', 'is_display',\n",
" 'expected_filling_representation', 'max_filling_representation',\n",
" 'extra_field', 'expected_filling_representation_cap',\n",
" 'max_filling_representation_cap'],\n",
" dtype='object')\n",
"\n",
" Events theme columns : Index(['event_id', 'season_id', 'facility_id', 'event_type_id',\n",
" 'event_type_key_id', 'facility_key_id', 'fidelity_delay', 'street_id',\n",
" 'name_events', 'manual_added', 'is_display', 'name_seasons',\n",
" 'start_date_time', 'name_event_types', 'name_facilties',\n",
" 'fixed_capacity'],\n",
" dtype='object')\n"
]
}
],
"source": [
"# List the columns of the three themed tables side by side before merging them\n",
"for _label, _frame in (\n",
"    (\"Products theme columns : \", products_theme),\n",
"    (\"\\n Representation theme columns : \", representation_theme),\n",
"    (\"\\n Events theme columns : \", events_theme),\n",
"):\n",
"    print(_label, _frame.columns)"
]
},
{
"cell_type": "code",
"execution_count": 115,
"id": "d40b1e3b-b1f3-4915-8ebc-6bb7856da42a",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>id_products</th>\n",
" <th>representation_id</th>\n",
" <th>pricing_formula_id</th>\n",
" <th>category_id</th>\n",
" <th>products_group_id</th>\n",
" <th>product_pack_id</th>\n",
" <th>type_of_id</th>\n",
" <th>amount</th>\n",
" <th>is_full_price</th>\n",
" <th>apply_price</th>\n",
" <th>...</th>\n",
" <th>open</th>\n",
" <th>satisfaction</th>\n",
" <th>end_date_time</th>\n",
" <th>name</th>\n",
" <th>is_display</th>\n",
" <th>expected_filling_representation</th>\n",
" <th>max_filling_representation</th>\n",
" <th>extra_field</th>\n",
" <th>expected_filling_representation_cap</th>\n",
" <th>max_filling_representation_cap</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>10682</td>\n",
" <td>914</td>\n",
" <td>114</td>\n",
" <td>41</td>\n",
" <td>10655</td>\n",
" <td>1</td>\n",
" <td>NaN</td>\n",
" <td>9.0</td>\n",
" <td>False</td>\n",
" <td>0.0</td>\n",
" <td>...</td>\n",
" <td>True</td>\n",
" <td>NaN</td>\n",
" <td>2017-11-19 16:30:00+01:00</td>\n",
" <td>NaN</td>\n",
" <td>True</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>478</td>\n",
" <td>273</td>\n",
" <td>131</td>\n",
" <td>1</td>\n",
" <td>471</td>\n",
" <td>1</td>\n",
" <td>12.0</td>\n",
" <td>9.5</td>\n",
" <td>False</td>\n",
" <td>0.0</td>\n",
" <td>...</td>\n",
" <td>True</td>\n",
" <td>NaN</td>\n",
" <td>2016-04-28 16:00:00+02:00</td>\n",
" <td>NaN</td>\n",
" <td>True</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>20873</td>\n",
" <td>275</td>\n",
" <td>137</td>\n",
" <td>1</td>\n",
" <td>20825</td>\n",
" <td>1</td>\n",
" <td>12.0</td>\n",
" <td>11.5</td>\n",
" <td>False</td>\n",
" <td>0.0</td>\n",
" <td>...</td>\n",
" <td>True</td>\n",
" <td>NaN</td>\n",
" <td>2016-04-28 14:00:00+02:00</td>\n",
" <td>NaN</td>\n",
" <td>True</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>157142</td>\n",
" <td>82519</td>\n",
" <td>9</td>\n",
" <td>5</td>\n",
" <td>156773</td>\n",
" <td>1</td>\n",
" <td>NaN</td>\n",
" <td>8.0</td>\n",
" <td>False</td>\n",
" <td>0.0</td>\n",
" <td>...</td>\n",
" <td>True</td>\n",
" <td>NaN</td>\n",
" <td>1901-01-01 00:09:21+00:09</td>\n",
" <td>NaN</td>\n",
" <td>True</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>1341</td>\n",
" <td>9</td>\n",
" <td>93</td>\n",
" <td>1</td>\n",
" <td>1175</td>\n",
" <td>1</td>\n",
" <td>12.0</td>\n",
" <td>8.5</td>\n",
" <td>False</td>\n",
" <td>0.0</td>\n",
" <td>...</td>\n",
" <td>True</td>\n",
" <td>NaN</td>\n",
" <td>1901-01-01 00:09:21+00:09</td>\n",
" <td>NaN</td>\n",
" <td>True</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"<p>5 rows × 30 columns</p>\n",
"</div>"
],
"text/plain": [
" id_products representation_id pricing_formula_id category_id \\\n",
"0 10682 914 114 41 \n",
"1 478 273 131 1 \n",
"2 20873 275 137 1 \n",
"3 157142 82519 9 5 \n",
"4 1341 9 93 1 \n",
"\n",
" products_group_id product_pack_id type_of_id amount is_full_price \\\n",
"0 10655 1 NaN 9.0 False \n",
"1 471 1 12.0 9.5 False \n",
"2 20825 1 12.0 11.5 False \n",
"3 156773 1 NaN 8.0 False \n",
"4 1175 1 12.0 8.5 False \n",
"\n",
" apply_price ... open satisfaction end_date_time name \\\n",
"0 0.0 ... True NaN 2017-11-19 16:30:00+01:00 NaN \n",
"1 0.0 ... True NaN 2016-04-28 16:00:00+02:00 NaN \n",
"2 0.0 ... True NaN 2016-04-28 14:00:00+02:00 NaN \n",
"3 0.0 ... True NaN 1901-01-01 00:09:21+00:09 NaN \n",
"4 0.0 ... True NaN 1901-01-01 00:09:21+00:09 NaN \n",
"\n",
" is_display expected_filling_representation max_filling_representation \\\n",
"0 True NaN NaN \n",
"1 True NaN NaN \n",
"2 True NaN NaN \n",
"3 True NaN NaN \n",
"4 True NaN NaN \n",
"\n",
" extra_field expected_filling_representation_cap \\\n",
"0 NaN NaN \n",
"1 NaN NaN \n",
"2 NaN NaN \n",
"3 NaN NaN \n",
"4 NaN NaN \n",
"\n",
" max_filling_representation_cap \n",
"0 NaN \n",
"1 NaN \n",
"2 NaN \n",
"3 NaN \n",
"4 NaN \n",
"\n",
"[5 rows x 30 columns]"
]
},
"execution_count": 115,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Attach the representation attributes to each product; rows are matched on the\n",
"# (representation_id, category_id) pair and unmatched products are kept (left join)\n",
"products_global = pd.merge(products_theme, representation_theme,\n",
"                           on=[\"representation_id\", \"category_id\"], how='left')\n",
"products_global.head()"
]
},
{
"cell_type": "code",
"execution_count": 116,
"id": "78d75a08-e959-429c-847a-7d70a2804806",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>id_products</th>\n",
" <th>representation_id</th>\n",
" <th>pricing_formula_id</th>\n",
" <th>category_id</th>\n",
" <th>products_group_id</th>\n",
" <th>product_pack_id</th>\n",
" <th>type_of_id</th>\n",
" <th>event_id</th>\n",
" <th>representation_type_id</th>\n",
" <th>id_representation_cap</th>\n",
" <th>...</th>\n",
" <th>expected_filling_representation_cap</th>\n",
" <th>max_filling_representation_cap</th>\n",
" <th>name_events</th>\n",
" <th>manual_added</th>\n",
" <th>is_display_event</th>\n",
" <th>name_seasons</th>\n",
" <th>start_date_time_event</th>\n",
" <th>name_event_types</th>\n",
" <th>name_facilties</th>\n",
" <th>fixed_capacity</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>10682</td>\n",
" <td>914</td>\n",
" <td>114</td>\n",
" <td>41</td>\n",
" <td>10655</td>\n",
" <td>1</td>\n",
" <td>NaN</td>\n",
" <td>132</td>\n",
" <td>NaN</td>\n",
" <td>8789</td>\n",
" <td>...</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>visite-jeu \"le classico des minots\" (1h30)</td>\n",
" <td>False</td>\n",
" <td>True</td>\n",
" <td>2017</td>\n",
" <td>NaN</td>\n",
" <td>offre muséale individuel</td>\n",
" <td>mucem</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>478</td>\n",
" <td>273</td>\n",
" <td>131</td>\n",
" <td>1</td>\n",
" <td>471</td>\n",
" <td>1</td>\n",
" <td>12.0</td>\n",
" <td>37</td>\n",
" <td>NaN</td>\n",
" <td>390</td>\n",
" <td>...</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>billet mucem picasso</td>\n",
" <td>False</td>\n",
" <td>True</td>\n",
" <td>2016</td>\n",
" <td>NaN</td>\n",
" <td>offre muséale individuel</td>\n",
" <td>mucem</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>20873</td>\n",
" <td>275</td>\n",
" <td>137</td>\n",
" <td>1</td>\n",
" <td>20825</td>\n",
" <td>1</td>\n",
" <td>12.0</td>\n",
" <td>37</td>\n",
" <td>NaN</td>\n",
" <td>395</td>\n",
" <td>...</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>billet mucem picasso</td>\n",
" <td>False</td>\n",
" <td>True</td>\n",
" <td>2016</td>\n",
" <td>NaN</td>\n",
" <td>offre muséale individuel</td>\n",
" <td>mucem</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>157142</td>\n",
" <td>82519</td>\n",
" <td>9</td>\n",
" <td>5</td>\n",
" <td>156773</td>\n",
" <td>1</td>\n",
" <td>NaN</td>\n",
" <td>12365</td>\n",
" <td>NaN</td>\n",
" <td>120199</td>\n",
" <td>...</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>False</td>\n",
" <td>False</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>offre muséale individuel</td>\n",
" <td>mucem</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>1341</td>\n",
" <td>9</td>\n",
" <td>93</td>\n",
" <td>1</td>\n",
" <td>1175</td>\n",
" <td>1</td>\n",
" <td>12.0</td>\n",
" <td>8</td>\n",
" <td>NaN</td>\n",
" <td>21</td>\n",
" <td>...</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>non défini</td>\n",
" <td>False</td>\n",
" <td>True</td>\n",
" <td>2017</td>\n",
" <td>NaN</td>\n",
" <td>non défini</td>\n",
" <td>mucem</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"<p>5 rows × 45 columns</p>\n",
"</div>"
],
"text/plain": [
" id_products representation_id pricing_formula_id category_id \\\n",
"0 10682 914 114 41 \n",
"1 478 273 131 1 \n",
"2 20873 275 137 1 \n",
"3 157142 82519 9 5 \n",
"4 1341 9 93 1 \n",
"\n",
" products_group_id product_pack_id type_of_id event_id \\\n",
"0 10655 1 NaN 132 \n",
"1 471 1 12.0 37 \n",
"2 20825 1 12.0 37 \n",
"3 156773 1 NaN 12365 \n",
"4 1175 1 12.0 8 \n",
"\n",
" representation_type_id id_representation_cap ... \\\n",
"0 NaN 8789 ... \n",
"1 NaN 390 ... \n",
"2 NaN 395 ... \n",
"3 NaN 120199 ... \n",
"4 NaN 21 ... \n",
"\n",
" expected_filling_representation_cap max_filling_representation_cap \\\n",
"0 NaN NaN \n",
"1 NaN NaN \n",
"2 NaN NaN \n",
"3 NaN NaN \n",
"4 NaN NaN \n",
"\n",
" name_events manual_added is_display_event \\\n",
"0 visite-jeu \"le classico des minots\" (1h30) False True \n",
"1 billet mucem picasso False True \n",
"2 billet mucem picasso False True \n",
"3 NaN False False \n",
"4 non défini False True \n",
"\n",
" name_seasons start_date_time_event name_event_types \\\n",
"0 2017 NaN offre muséale individuel \n",
"1 2016 NaN offre muséale individuel \n",
"2 2016 NaN offre muséale individuel \n",
"3 NaN NaN offre muséale individuel \n",
"4 2017 NaN non défini \n",
"\n",
" name_facilties fixed_capacity \n",
"0 mucem NaN \n",
"1 mucem NaN \n",
"2 mucem NaN \n",
"3 mucem NaN \n",
"4 mucem NaN \n",
"\n",
"[5 rows x 45 columns]"
]
},
"execution_count": 116,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Left-join the event-level table onto the product rows using the shared 'event_id' key.\n",
"# how='left' keeps every row of products_global; non-key columns present in both frames\n",
"# are disambiguated with the '_representation' / '_event' suffixes.\n",
"products_global = products_global.merge(\n",
"    events_theme,\n",
"    on='event_id',\n",
"    how='left',\n",
"    suffixes=(\"_representation\", \"_event\"),\n",
")\n",
"\n",
"# order_columns_id is defined earlier in the notebook; presumably it moves the id\n",
"# columns to the front of the frame -- TODO confirm against its definition.\n",
"products_global = order_columns_id(products_global)\n",
"\n",
"# Preview the first rows of the enriched table.\n",
"products_global.head()"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "b18f6428-90e0-4b1b-9b8d-bad995fb6c98",
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.10.13"
}
},
"nbformat": 4,
"nbformat_minor": 5
}