BDC-team-1/Clean-Notebook.ipynb
2024-01-02 21:45:25 +00:00

5327 lines
206 KiB
Plaintext
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

{
"cells": [
{
"cell_type": "markdown",
"id": "56b3d44e-1e3f-4726-9916-0f9af107860e",
"metadata": {},
"source": [
"# Business Data Challenge - Team 1"
]
},
{
"cell_type": "code",
"execution_count": 1,
"id": "15103481-8d74-404c-aa09-7601fe7730da",
"metadata": {},
"outputs": [],
"source": [
"import pandas as pd\n",
"import numpy as np"
]
},
{
"cell_type": "markdown",
"id": "c3bb0d13-34b2-4e1c-9985-468cd87c5a0e",
"metadata": {},
"source": [
"Configuration de l'accès aux données"
]
},
{
"cell_type": "code",
"execution_count": 2,
"id": "5d83bb1a-d341-446e-91f6-1c428607f6d4",
"metadata": {},
"outputs": [],
"source": [
"import os\n",
"import s3fs\n",
"# Create filesystem object\n",
"S3_ENDPOINT_URL = \"https://\" + os.environ[\"AWS_S3_ENDPOINT\"]\n",
"fs = s3fs.S3FileSystem(client_kwargs={'endpoint_url': S3_ENDPOINT_URL})"
]
},
{
"cell_type": "markdown",
"id": "f99da24f-0d93-4618-92bc-3ba81dc0445c",
"metadata": {},
"source": [
"## Exemple sur bdc2324-data/11"
]
},
{
"cell_type": "code",
"execution_count": 3,
"id": "699664b9-eee4-4f8d-a207-e524526560c5",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"['bdc2324-data/2/2campaign_stats.csv',\n",
" 'bdc2324-data/2/2campaigns.csv',\n",
" 'bdc2324-data/2/2categories.csv',\n",
" 'bdc2324-data/2/2contribution_sites.csv',\n",
" 'bdc2324-data/2/2contributions.csv',\n",
" 'bdc2324-data/2/2countries.csv',\n",
" 'bdc2324-data/2/2currencies.csv',\n",
" 'bdc2324-data/2/2customer_target_mappings.csv',\n",
" 'bdc2324-data/2/2customersplus.csv',\n",
" 'bdc2324-data/2/2event_types.csv',\n",
" 'bdc2324-data/2/2events.csv',\n",
" 'bdc2324-data/2/2facilities.csv',\n",
" 'bdc2324-data/2/2link_stats.csv',\n",
" 'bdc2324-data/2/2pricing_formulas.csv',\n",
" 'bdc2324-data/2/2product_packs.csv',\n",
" 'bdc2324-data/2/2products.csv',\n",
" 'bdc2324-data/2/2products_groups.csv',\n",
" 'bdc2324-data/2/2purchases.csv',\n",
" 'bdc2324-data/2/2representation_category_capacities.csv',\n",
" 'bdc2324-data/2/2representations.csv',\n",
" 'bdc2324-data/2/2seasons.csv',\n",
" 'bdc2324-data/2/2structure_tag_mappings.csv',\n",
" 'bdc2324-data/2/2suppliers.csv',\n",
" 'bdc2324-data/2/2tags.csv',\n",
" 'bdc2324-data/2/2target_types.csv',\n",
" 'bdc2324-data/2/2targets.csv',\n",
" 'bdc2324-data/2/2tickets.csv']"
]
},
"execution_count": 3,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"BUCKET = \"bdc2324-data/2\"\n",
"fs.ls(BUCKET)"
]
},
{
"cell_type": "markdown",
"id": "779da86b-ac61-4c61-88d2-fa1c0c19efce",
"metadata": {},
"source": [
"## Type de client au globale"
]
},
{
"cell_type": "code",
"execution_count": 4,
"id": "7c89d25f-ee42-4478-9ff0-ee64b781d5c8",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Index(['id', 'customer_id', 'target_id', 'created_at', 'updated_at', 'name',\n",
" 'extra_field'],\n",
" dtype='object')\n",
"(124302, 7)\n",
"<class 'pandas.core.frame.DataFrame'>\n",
"RangeIndex: 124302 entries, 0 to 124301\n",
"Data columns (total 7 columns):\n",
" # Column Non-Null Count Dtype \n",
"--- ------ -------------- ----- \n",
" 0 id 124302 non-null int64 \n",
" 1 customer_id 124302 non-null int64 \n",
" 2 target_id 124302 non-null int64 \n",
" 3 created_at 124296 non-null object \n",
" 4 updated_at 124296 non-null object \n",
" 5 name 0 non-null float64\n",
" 6 extra_field 0 non-null float64\n",
"dtypes: float64(2), int64(3), object(2)\n",
"memory usage: 6.6+ MB\n"
]
}
],
"source": [
"# Client\n",
"FILE_PATH_S3 = 'bdc2324-data/11/11customer_target_mappings.csv'\n",
"\n",
"with fs.open(FILE_PATH_S3, mode=\"rb\") as file_in:\n",
" customer_target_mappings = pd.read_csv(file_in, sep=\",\")\n",
"\n",
"print(customer_target_mappings.columns)\n",
"print(customer_target_mappings.shape)\n",
"customer_target_mappings.info()"
]
},
{
"cell_type": "code",
"execution_count": 4,
"id": "d22aa131-5069-43d4-a42e-24f38cc7240d",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Index(['id', 'customer_id', 'target_id', 'created_at', 'updated_at', 'name',\n",
" 'extra_field'],\n",
" dtype='object')\n",
"(124302, 7)\n",
"<class 'pandas.core.frame.DataFrame'>\n",
"RangeIndex: 124302 entries, 0 to 124301\n",
"Data columns (total 7 columns):\n",
" # Column Non-Null Count Dtype \n",
"--- ------ -------------- ----- \n",
" 0 id 124302 non-null int64 \n",
" 1 customer_id 124302 non-null int64 \n",
" 2 target_id 124302 non-null int64 \n",
" 3 created_at 124296 non-null object \n",
" 4 updated_at 124296 non-null object \n",
" 5 name 0 non-null float64\n",
" 6 extra_field 0 non-null float64\n",
"dtypes: float64(2), int64(3), object(2)\n",
"memory usage: 6.6+ MB\n"
]
}
],
"source": [
"# Segmentation existante\n",
"FILE_PATH_S3 = 'bdc2324-data/11/11customer_target_mappings.csv'\n",
"\n",
"with fs.open(FILE_PATH_S3, mode=\"rb\") as file_in:\n",
" customer_target_mappings = pd.read_csv(file_in, sep=\",\")\n",
"\n",
"print(customer_target_mappings.columns)\n",
"print(customer_target_mappings.shape)\n",
"customer_target_mappings.info()"
]
},
{
"cell_type": "code",
"execution_count": 13,
"id": "967b20e2-5a30-4724-989f-b9e39c7c67e7",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>id</th>\n",
" <th>customer_id</th>\n",
" <th>target_id</th>\n",
" <th>created_at</th>\n",
" <th>updated_at</th>\n",
" <th>name</th>\n",
" <th>extra_field</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>793889</td>\n",
" <td>344151</td>\n",
" <td>101</td>\n",
" <td>2022-09-29 17:55:41.083666+02:00</td>\n",
" <td>2022-09-29 17:55:41.083666+02:00</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>793890</td>\n",
" <td>344152</td>\n",
" <td>101</td>\n",
" <td>2022-09-29 19:16:07.252114+02:00</td>\n",
" <td>2022-09-29 19:16:07.252114+02:00</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>793891</td>\n",
" <td>344153</td>\n",
" <td>101</td>\n",
" <td>2022-09-29 19:55:10.443450+02:00</td>\n",
" <td>2022-09-29 19:55:10.443450+02:00</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>793892</td>\n",
" <td>344154</td>\n",
" <td>101</td>\n",
" <td>2022-09-29 20:16:08.269407+02:00</td>\n",
" <td>2022-09-29 20:16:08.269407+02:00</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>793893</td>\n",
" <td>344155</td>\n",
" <td>101</td>\n",
" <td>2022-09-29 21:03:40.541998+02:00</td>\n",
" <td>2022-09-29 21:03:40.541998+02:00</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>...</th>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>124297</th>\n",
" <td>742001</td>\n",
" <td>329855</td>\n",
" <td>101</td>\n",
" <td>2022-07-11 18:17:09.607162+02:00</td>\n",
" <td>2022-07-11 18:17:09.607162+02:00</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>124298</th>\n",
" <td>742002</td>\n",
" <td>329856</td>\n",
" <td>101</td>\n",
" <td>2022-07-11 18:44:45.636248+02:00</td>\n",
" <td>2022-07-11 18:44:45.636248+02:00</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>124299</th>\n",
" <td>742000</td>\n",
" <td>329854</td>\n",
" <td>101</td>\n",
" <td>2022-07-11 17:46:48.914507+02:00</td>\n",
" <td>2022-07-11 17:46:48.914507+02:00</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>124300</th>\n",
" <td>742003</td>\n",
" <td>329857</td>\n",
" <td>134</td>\n",
" <td>2022-07-11 18:44:55.915889+02:00</td>\n",
" <td>2022-07-11 18:44:55.915889+02:00</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>124301</th>\n",
" <td>741996</td>\n",
" <td>329850</td>\n",
" <td>101</td>\n",
" <td>2022-07-11 16:52:37.227487+02:00</td>\n",
" <td>2022-07-11 16:52:37.227487+02:00</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"<p>124302 rows × 7 columns</p>\n",
"</div>"
],
"text/plain": [
" id customer_id target_id created_at \\\n",
"0 793889 344151 101 2022-09-29 17:55:41.083666+02:00 \n",
"1 793890 344152 101 2022-09-29 19:16:07.252114+02:00 \n",
"2 793891 344153 101 2022-09-29 19:55:10.443450+02:00 \n",
"3 793892 344154 101 2022-09-29 20:16:08.269407+02:00 \n",
"4 793893 344155 101 2022-09-29 21:03:40.541998+02:00 \n",
"... ... ... ... ... \n",
"124297 742001 329855 101 2022-07-11 18:17:09.607162+02:00 \n",
"124298 742002 329856 101 2022-07-11 18:44:45.636248+02:00 \n",
"124299 742000 329854 101 2022-07-11 17:46:48.914507+02:00 \n",
"124300 742003 329857 134 2022-07-11 18:44:55.915889+02:00 \n",
"124301 741996 329850 101 2022-07-11 16:52:37.227487+02:00 \n",
"\n",
" updated_at name extra_field \n",
"0 2022-09-29 17:55:41.083666+02:00 NaN NaN \n",
"1 2022-09-29 19:16:07.252114+02:00 NaN NaN \n",
"2 2022-09-29 19:55:10.443450+02:00 NaN NaN \n",
"3 2022-09-29 20:16:08.269407+02:00 NaN NaN \n",
"4 2022-09-29 21:03:40.541998+02:00 NaN NaN \n",
"... ... ... ... \n",
"124297 2022-07-11 18:17:09.607162+02:00 NaN NaN \n",
"124298 2022-07-11 18:44:45.636248+02:00 NaN NaN \n",
"124299 2022-07-11 17:46:48.914507+02:00 NaN NaN \n",
"124300 2022-07-11 18:44:55.915889+02:00 NaN NaN \n",
"124301 2022-07-11 16:52:37.227487+02:00 NaN NaN \n",
"\n",
"[124302 rows x 7 columns]"
]
},
"execution_count": 13,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"customer_target_mappings"
]
},
{
"cell_type": "code",
"execution_count": 26,
"id": "c4b6bdcc-9f13-449b-9a8b-c5ca794637be",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"array([nan])"
]
},
"execution_count": 26,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"customer_target_mappings['extra_field'].unique()"
]
},
{
"cell_type": "code",
"execution_count": 27,
"id": "47bc8453-0693-4838-8bd8-4d800a82c496",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"array([nan])"
]
},
"execution_count": 27,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"customer_target_mappings['name'].unique()"
]
},
{
"cell_type": "code",
"execution_count": 11,
"id": "ab3f937b-ef62-499a-8ee2-d47d1d988ace",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Index(['id', 'is_import', 'name', 'created_at', 'updated_at', 'identifier'], dtype='object')\n",
"(4, 6)\n",
"<class 'pandas.core.frame.DataFrame'>\n",
"RangeIndex: 4 entries, 0 to 3\n",
"Data columns (total 6 columns):\n",
" # Column Non-Null Count Dtype \n",
"--- ------ -------------- ----- \n",
" 0 id 4 non-null int64 \n",
" 1 is_import 4 non-null bool \n",
" 2 name 4 non-null object\n",
" 3 created_at 4 non-null object\n",
" 4 updated_at 4 non-null object\n",
" 5 identifier 4 non-null object\n",
"dtypes: bool(1), int64(1), object(4)\n",
"memory usage: 292.0+ bytes\n"
]
}
],
"source": [
"# Segmentation existante\n",
"FILE_PATH_S3 = 'bdc2324-data/11/11target_types.csv'\n",
"\n",
"with fs.open(FILE_PATH_S3, mode=\"rb\") as file_in:\n",
" target_types = pd.read_csv(file_in, sep=\",\")\n",
"\n",
"print(target_types.columns)\n",
"print(target_types.shape)\n",
"target_types.info()"
]
},
{
"cell_type": "code",
"execution_count": 12,
"id": "b70488b9-38fc-40a8-9e2f-3330b3f9eef5",
"metadata": {
"scrolled": true
},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>id</th>\n",
" <th>is_import</th>\n",
" <th>name</th>\n",
" <th>created_at</th>\n",
" <th>updated_at</th>\n",
" <th>identifier</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>1</td>\n",
" <td>False</td>\n",
" <td>manual_static_filter</td>\n",
" <td>2021-04-29 13:42:14.111085+02:00</td>\n",
" <td>2021-04-29 13:42:14.111085+02:00</td>\n",
" <td>fb27e81baa4debc6a4e1a8639c20e808</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>3</td>\n",
" <td>True</td>\n",
" <td>manual_structure</td>\n",
" <td>2021-05-07 15:20:00.626650+02:00</td>\n",
" <td>2021-05-07 15:20:00.626650+02:00</td>\n",
" <td>382bca214204a2d3462f5ec2728d5d1e</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>6</td>\n",
" <td>False</td>\n",
" <td>manual_dynamic_filter</td>\n",
" <td>2021-09-09 14:27:47.641302+02:00</td>\n",
" <td>2021-09-09 14:27:47.641302+02:00</td>\n",
" <td>e0f4b8693184850fefd6d2a38f10584e</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>2</td>\n",
" <td>True</td>\n",
" <td>manual_import</td>\n",
" <td>2021-04-29 13:49:30.107110+02:00</td>\n",
" <td>2021-04-29 13:49:30.107110+02:00</td>\n",
" <td>12213df2ce68a624e4c0070521437bac</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" id is_import name created_at \\\n",
"0 1 False manual_static_filter 2021-04-29 13:42:14.111085+02:00 \n",
"1 3 True manual_structure 2021-05-07 15:20:00.626650+02:00 \n",
"2 6 False manual_dynamic_filter 2021-09-09 14:27:47.641302+02:00 \n",
"3 2 True manual_import 2021-04-29 13:49:30.107110+02:00 \n",
"\n",
" updated_at identifier \n",
"0 2021-04-29 13:42:14.111085+02:00 fb27e81baa4debc6a4e1a8639c20e808 \n",
"1 2021-05-07 15:20:00.626650+02:00 382bca214204a2d3462f5ec2728d5d1e \n",
"2 2021-09-09 14:27:47.641302+02:00 e0f4b8693184850fefd6d2a38f10584e \n",
"3 2021-04-29 13:49:30.107110+02:00 12213df2ce68a624e4c0070521437bac "
]
},
"execution_count": 12,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"target_types"
]
},
{
"cell_type": "code",
"execution_count": 17,
"id": "8dd74e87-97c2-493d-b19f-971b684078d3",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Index(['id', 'name', 'created_at', 'updated_at', 'identifier'], dtype='object')\n",
"(20, 5)\n",
"<class 'pandas.core.frame.DataFrame'>\n",
"RangeIndex: 20 entries, 0 to 19\n",
"Data columns (total 5 columns):\n",
" # Column Non-Null Count Dtype \n",
"--- ------ -------------- ----- \n",
" 0 id 20 non-null int64 \n",
" 1 name 19 non-null object\n",
" 2 created_at 20 non-null object\n",
" 3 updated_at 20 non-null object\n",
" 4 identifier 20 non-null object\n",
"dtypes: int64(1), object(4)\n",
"memory usage: 928.0+ bytes\n"
]
}
],
"source": [
"# Tags = clients\n",
"FILE_PATH_S3 = 'bdc2324-data/11/11tags.csv'\n",
"\n",
"with fs.open(FILE_PATH_S3, mode=\"rb\") as file_in:\n",
" tags = pd.read_csv(file_in, sep=\",\")\n",
"\n",
"print(tags.columns)\n",
"print(tags.shape)\n",
"tags.info()"
]
},
{
"cell_type": "code",
"execution_count": 18,
"id": "91d54732-666c-4250-ba91-5c9b83d4712a",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>id</th>\n",
" <th>name</th>\n",
" <th>created_at</th>\n",
" <th>updated_at</th>\n",
" <th>identifier</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>2</td>\n",
" <td>ens-écoles</td>\n",
" <td>2021-05-07 15:24:19.808501+02:00</td>\n",
" <td>2021-05-07 15:24:19.808501+02:00</td>\n",
" <td>b6a360c5f84595940c5774f13fd39cc3</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>1</td>\n",
" <td>NaN</td>\n",
" <td>2021-05-07 15:24:19.805589+02:00</td>\n",
" <td>2021-05-07 15:24:19.805589+02:00</td>\n",
" <td>d41d8cd98f00b204e9800998ecf8427e</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>4</td>\n",
" <td>ecoles primaires rennes</td>\n",
" <td>2021-05-07 15:29:06.388415+02:00</td>\n",
" <td>2021-05-07 15:29:06.388415+02:00</td>\n",
" <td>ca8649dd64c240d118f60b07d11a7053</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>5</td>\n",
" <td>Angers Nantes Opéra</td>\n",
" <td>2023-01-27 15:59:58.187557+01:00</td>\n",
" <td>2023-01-27 15:59:58.187557+01:00</td>\n",
" <td>f8f500f937fe312542399299cdc13f7e</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>6</td>\n",
" <td>Opéras</td>\n",
" <td>2023-01-27 16:03:59.654938+01:00</td>\n",
" <td>2023-01-27 16:03:59.654938+01:00</td>\n",
" <td>22eb2c616983ec7b54a093f84b230505</td>\n",
" </tr>\n",
" <tr>\n",
" <th>5</th>\n",
" <td>7</td>\n",
" <td>Ministère de la Culture</td>\n",
" <td>2023-01-30 11:22:29.636813+01:00</td>\n",
" <td>2023-01-30 11:22:29.636813+01:00</td>\n",
" <td>1b8c5c08fde000d90905a3d14af7763d</td>\n",
" </tr>\n",
" <tr>\n",
" <th>6</th>\n",
" <td>8</td>\n",
" <td>Orchestres</td>\n",
" <td>2023-01-30 11:33:56.392799+01:00</td>\n",
" <td>2023-01-30 11:33:56.392799+01:00</td>\n",
" <td>7c2aee0c80642d7e325a450f2dec45e5</td>\n",
" </tr>\n",
" <tr>\n",
" <th>7</th>\n",
" <td>9</td>\n",
" <td>Cooperative</td>\n",
" <td>2023-01-31 14:44:38.471146+01:00</td>\n",
" <td>2023-01-31 14:44:38.471146+01:00</td>\n",
" <td>6c88c36ffaab88d255865aa3111d7686</td>\n",
" </tr>\n",
" <tr>\n",
" <th>8</th>\n",
" <td>10</td>\n",
" <td>Théâtres</td>\n",
" <td>2023-01-31 14:45:17.804428+01:00</td>\n",
" <td>2023-01-31 14:45:17.804428+01:00</td>\n",
" <td>b2c19672df82021702b79482c8cda85a</td>\n",
" </tr>\n",
" <tr>\n",
" <th>9</th>\n",
" <td>11</td>\n",
" <td>La co[opera]tive</td>\n",
" <td>2023-02-16 17:11:35.004478+01:00</td>\n",
" <td>2023-02-16 17:11:35.004478+01:00</td>\n",
" <td>5dbaa3a1f278c0fcf981d447ad20957a</td>\n",
" </tr>\n",
" <tr>\n",
" <th>10</th>\n",
" <td>12</td>\n",
" <td>Ville de Rennes</td>\n",
" <td>2023-02-16 17:37:13.816196+01:00</td>\n",
" <td>2023-02-16 17:37:13.816196+01:00</td>\n",
" <td>bc483d04d9c3a08f167a3ce64366ca72</td>\n",
" </tr>\n",
" <tr>\n",
" <th>11</th>\n",
" <td>13</td>\n",
" <td>Ensembles en résidence</td>\n",
" <td>2023-02-16 17:55:54.877374+01:00</td>\n",
" <td>2023-02-16 17:55:54.877374+01:00</td>\n",
" <td>e70635e771de13268dccf02bb2abfaf9</td>\n",
" </tr>\n",
" <tr>\n",
" <th>12</th>\n",
" <td>14</td>\n",
" <td>Ministère</td>\n",
" <td>2023-02-17 11:17:54.429462+01:00</td>\n",
" <td>2023-02-17 11:17:54.429462+01:00</td>\n",
" <td>a3f0582853fd19f5b57e3651f8a20e7a</td>\n",
" </tr>\n",
" <tr>\n",
" <th>13</th>\n",
" <td>15</td>\n",
" <td>Rennes métropole</td>\n",
" <td>2023-02-17 11:53:24.490786+01:00</td>\n",
" <td>2023-02-17 11:53:24.490786+01:00</td>\n",
" <td>e98b8db5941b96c29c353b6f2f502055</td>\n",
" </tr>\n",
" <tr>\n",
" <th>14</th>\n",
" <td>16</td>\n",
" <td>Ville de Rennes - équipements culturels</td>\n",
" <td>2023-02-17 12:00:10.649104+01:00</td>\n",
" <td>2023-02-17 12:00:10.649104+01:00</td>\n",
" <td>a44edffc7edb852982efa7f4aa6d0e25</td>\n",
" </tr>\n",
" <tr>\n",
" <th>15</th>\n",
" <td>17</td>\n",
" <td>Structures culturelles rennaises</td>\n",
" <td>2023-02-17 12:05:55.583016+01:00</td>\n",
" <td>2023-02-17 12:05:55.583016+01:00</td>\n",
" <td>241550517e4e3b1c926e9aeab0f621cd</td>\n",
" </tr>\n",
" <tr>\n",
" <th>16</th>\n",
" <td>18</td>\n",
" <td>Université Rennes 2</td>\n",
" <td>2023-02-17 14:23:44.832959+01:00</td>\n",
" <td>2023-02-17 14:23:44.832959+01:00</td>\n",
" <td>4057c5cee51c4e10aa819f0cf48adc3f</td>\n",
" </tr>\n",
" <tr>\n",
" <th>17</th>\n",
" <td>19</td>\n",
" <td>Centres chorégraphiques nationaux</td>\n",
" <td>2023-02-17 15:29:41.827321+01:00</td>\n",
" <td>2023-02-17 15:29:41.827321+01:00</td>\n",
" <td>41e75941dfb766365498d917abe0102f</td>\n",
" </tr>\n",
" <tr>\n",
" <th>18</th>\n",
" <td>20</td>\n",
" <td>Télévision</td>\n",
" <td>2023-02-17 15:46:13.746092+01:00</td>\n",
" <td>2023-02-17 15:46:13.746092+01:00</td>\n",
" <td>36d6409c539dd79c1f3af8c5948603eb</td>\n",
" </tr>\n",
" <tr>\n",
" <th>19</th>\n",
" <td>21</td>\n",
" <td>structures culturelles nationales</td>\n",
" <td>2023-02-17 15:56:00.555722+01:00</td>\n",
" <td>2023-02-17 15:56:00.555722+01:00</td>\n",
" <td>5311cf7e42aac53289e1c4a338d5cfa4</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" id name \\\n",
"0 2 ens-écoles \n",
"1 1 NaN \n",
"2 4 ecoles primaires rennes \n",
"3 5 Angers Nantes Opéra \n",
"4 6 Opéras \n",
"5 7 Ministère de la Culture \n",
"6 8 Orchestres \n",
"7 9 Cooperative \n",
"8 10 Théâtres \n",
"9 11 La co[opera]tive \n",
"10 12 Ville de Rennes \n",
"11 13 Ensembles en résidence \n",
"12 14 Ministère \n",
"13 15 Rennes métropole \n",
"14 16 Ville de Rennes - équipements culturels \n",
"15 17 Structures culturelles rennaises \n",
"16 18 Université Rennes 2 \n",
"17 19 Centres chorégraphiques nationaux \n",
"18 20 Télévision \n",
"19 21 structures culturelles nationales \n",
"\n",
" created_at updated_at \\\n",
"0 2021-05-07 15:24:19.808501+02:00 2021-05-07 15:24:19.808501+02:00 \n",
"1 2021-05-07 15:24:19.805589+02:00 2021-05-07 15:24:19.805589+02:00 \n",
"2 2021-05-07 15:29:06.388415+02:00 2021-05-07 15:29:06.388415+02:00 \n",
"3 2023-01-27 15:59:58.187557+01:00 2023-01-27 15:59:58.187557+01:00 \n",
"4 2023-01-27 16:03:59.654938+01:00 2023-01-27 16:03:59.654938+01:00 \n",
"5 2023-01-30 11:22:29.636813+01:00 2023-01-30 11:22:29.636813+01:00 \n",
"6 2023-01-30 11:33:56.392799+01:00 2023-01-30 11:33:56.392799+01:00 \n",
"7 2023-01-31 14:44:38.471146+01:00 2023-01-31 14:44:38.471146+01:00 \n",
"8 2023-01-31 14:45:17.804428+01:00 2023-01-31 14:45:17.804428+01:00 \n",
"9 2023-02-16 17:11:35.004478+01:00 2023-02-16 17:11:35.004478+01:00 \n",
"10 2023-02-16 17:37:13.816196+01:00 2023-02-16 17:37:13.816196+01:00 \n",
"11 2023-02-16 17:55:54.877374+01:00 2023-02-16 17:55:54.877374+01:00 \n",
"12 2023-02-17 11:17:54.429462+01:00 2023-02-17 11:17:54.429462+01:00 \n",
"13 2023-02-17 11:53:24.490786+01:00 2023-02-17 11:53:24.490786+01:00 \n",
"14 2023-02-17 12:00:10.649104+01:00 2023-02-17 12:00:10.649104+01:00 \n",
"15 2023-02-17 12:05:55.583016+01:00 2023-02-17 12:05:55.583016+01:00 \n",
"16 2023-02-17 14:23:44.832959+01:00 2023-02-17 14:23:44.832959+01:00 \n",
"17 2023-02-17 15:29:41.827321+01:00 2023-02-17 15:29:41.827321+01:00 \n",
"18 2023-02-17 15:46:13.746092+01:00 2023-02-17 15:46:13.746092+01:00 \n",
"19 2023-02-17 15:56:00.555722+01:00 2023-02-17 15:56:00.555722+01:00 \n",
"\n",
" identifier \n",
"0 b6a360c5f84595940c5774f13fd39cc3 \n",
"1 d41d8cd98f00b204e9800998ecf8427e \n",
"2 ca8649dd64c240d118f60b07d11a7053 \n",
"3 f8f500f937fe312542399299cdc13f7e \n",
"4 22eb2c616983ec7b54a093f84b230505 \n",
"5 1b8c5c08fde000d90905a3d14af7763d \n",
"6 7c2aee0c80642d7e325a450f2dec45e5 \n",
"7 6c88c36ffaab88d255865aa3111d7686 \n",
"8 b2c19672df82021702b79482c8cda85a \n",
"9 5dbaa3a1f278c0fcf981d447ad20957a \n",
"10 bc483d04d9c3a08f167a3ce64366ca72 \n",
"11 e70635e771de13268dccf02bb2abfaf9 \n",
"12 a3f0582853fd19f5b57e3651f8a20e7a \n",
"13 e98b8db5941b96c29c353b6f2f502055 \n",
"14 a44edffc7edb852982efa7f4aa6d0e25 \n",
"15 241550517e4e3b1c926e9aeab0f621cd \n",
"16 4057c5cee51c4e10aa819f0cf48adc3f \n",
"17 41e75941dfb766365498d917abe0102f \n",
"18 36d6409c539dd79c1f3af8c5948603eb \n",
"19 5311cf7e42aac53289e1c4a338d5cfa4 "
]
},
"execution_count": 18,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"tags"
]
},
{
"cell_type": "code",
"execution_count": 19,
"id": "4cc9f444-b7e6-4ee5-8ce8-64c63ab7825a",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Index(['id', 'structure_id', 'tag_id', 'created_at', 'updated_at'], dtype='object')\n",
"(179, 5)\n",
"<class 'pandas.core.frame.DataFrame'>\n",
"RangeIndex: 179 entries, 0 to 178\n",
"Data columns (total 5 columns):\n",
" # Column Non-Null Count Dtype \n",
"--- ------ -------------- ----- \n",
" 0 id 179 non-null int64 \n",
" 1 structure_id 179 non-null int64 \n",
" 2 tag_id 179 non-null int64 \n",
" 3 created_at 179 non-null object\n",
" 4 updated_at 179 non-null object\n",
"dtypes: int64(3), object(2)\n",
"memory usage: 7.1+ KB\n"
]
}
],
"source": [
"# Structure = clients\n",
"FILE_PATH_S3 = 'bdc2324-data/11/11structure_tag_mappings.csv'\n",
"\n",
"with fs.open(FILE_PATH_S3, mode=\"rb\") as file_in:\n",
" structure_tag_mappings = pd.read_csv(file_in, sep=\",\")\n",
"\n",
"print(structure_tag_mappings.columns)\n",
"print(structure_tag_mappings.shape)\n",
"structure_tag_mappings.info()"
]
},
{
"cell_type": "code",
"execution_count": 20,
"id": "dcf776df-5c8e-4972-b2c1-b41291ba7e66",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>id</th>\n",
" <th>structure_id</th>\n",
" <th>tag_id</th>\n",
" <th>created_at</th>\n",
" <th>updated_at</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>123</td>\n",
" <td>187</td>\n",
" <td>6</td>\n",
" <td>2023-01-27 16:03:59.680222+01:00</td>\n",
" <td>2023-01-27 16:03:59.680222+01:00</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>2</td>\n",
" <td>2</td>\n",
" <td>2</td>\n",
" <td>2021-05-07 15:24:19.872895+02:00</td>\n",
" <td>2021-05-07 15:24:19.872895+02:00</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>3</td>\n",
" <td>3</td>\n",
" <td>2</td>\n",
" <td>2021-05-07 15:24:19.873830+02:00</td>\n",
" <td>2021-05-07 15:24:19.873830+02:00</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>4</td>\n",
" <td>4</td>\n",
" <td>2</td>\n",
" <td>2021-05-07 15:24:19.874628+02:00</td>\n",
" <td>2021-05-07 15:24:19.874628+02:00</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>5</td>\n",
" <td>5</td>\n",
" <td>2</td>\n",
" <td>2021-05-07 15:24:19.875421+02:00</td>\n",
" <td>2021-05-07 15:24:19.875421+02:00</td>\n",
" </tr>\n",
" <tr>\n",
" <th>...</th>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>174</th>\n",
" <td>184</td>\n",
" <td>236</td>\n",
" <td>10</td>\n",
" <td>2023-02-17 16:35:25.041114+01:00</td>\n",
" <td>2023-02-17 16:35:25.041114+01:00</td>\n",
" </tr>\n",
" <tr>\n",
" <th>175</th>\n",
" <td>185</td>\n",
" <td>237</td>\n",
" <td>17</td>\n",
" <td>2023-02-17 16:39:10.799478+01:00</td>\n",
" <td>2023-02-17 16:39:10.799478+01:00</td>\n",
" </tr>\n",
" <tr>\n",
" <th>176</th>\n",
" <td>186</td>\n",
" <td>238</td>\n",
" <td>19</td>\n",
" <td>2023-02-17 16:53:21.098690+01:00</td>\n",
" <td>2023-02-17 16:53:21.098690+01:00</td>\n",
" </tr>\n",
" <tr>\n",
" <th>177</th>\n",
" <td>187</td>\n",
" <td>239</td>\n",
" <td>10</td>\n",
" <td>2023-02-17 16:57:42.623481+01:00</td>\n",
" <td>2023-02-17 16:57:42.623481+01:00</td>\n",
" </tr>\n",
" <tr>\n",
" <th>178</th>\n",
" <td>188</td>\n",
" <td>240</td>\n",
" <td>10</td>\n",
" <td>2023-02-17 16:59:22.067723+01:00</td>\n",
" <td>2023-02-17 16:59:22.067723+01:00</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"<p>179 rows × 5 columns</p>\n",
"</div>"
],
"text/plain": [
" id structure_id tag_id created_at \\\n",
"0 123 187 6 2023-01-27 16:03:59.680222+01:00 \n",
"1 2 2 2 2021-05-07 15:24:19.872895+02:00 \n",
"2 3 3 2 2021-05-07 15:24:19.873830+02:00 \n",
"3 4 4 2 2021-05-07 15:24:19.874628+02:00 \n",
"4 5 5 2 2021-05-07 15:24:19.875421+02:00 \n",
".. ... ... ... ... \n",
"174 184 236 10 2023-02-17 16:35:25.041114+01:00 \n",
"175 185 237 17 2023-02-17 16:39:10.799478+01:00 \n",
"176 186 238 19 2023-02-17 16:53:21.098690+01:00 \n",
"177 187 239 10 2023-02-17 16:57:42.623481+01:00 \n",
"178 188 240 10 2023-02-17 16:59:22.067723+01:00 \n",
"\n",
" updated_at \n",
"0 2023-01-27 16:03:59.680222+01:00 \n",
"1 2021-05-07 15:24:19.872895+02:00 \n",
"2 2021-05-07 15:24:19.873830+02:00 \n",
"3 2021-05-07 15:24:19.874628+02:00 \n",
"4 2021-05-07 15:24:19.875421+02:00 \n",
".. ... \n",
"174 2023-02-17 16:35:25.041114+01:00 \n",
"175 2023-02-17 16:39:10.799478+01:00 \n",
"176 2023-02-17 16:53:21.098690+01:00 \n",
"177 2023-02-17 16:57:42.623481+01:00 \n",
"178 2023-02-17 16:59:22.067723+01:00 \n",
"\n",
"[179 rows x 5 columns]"
]
},
"execution_count": 20,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"structure_tag_mappings"
]
},
{
"cell_type": "code",
"execution_count": 24,
"id": "41bf1529-5a7c-409e-9791-2024c08c11f0",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Index(['id', 'lastname', 'firstname', 'birthdate', 'email', 'street_id',\n",
" 'created_at', 'updated_at', 'civility', 'is_partner', 'extra',\n",
" 'deleted_at', 'reference', 'gender', 'is_email_true', 'extra_field',\n",
" 'identifier', 'opt_in', 'structure_id', 'note', 'profession',\n",
" 'language', 'mcp_contact_id', 'need_reload', 'last_buying_date',\n",
" 'max_price', 'ticket_sum', 'average_price', 'fidelity',\n",
" 'average_purchase_delay', 'average_price_basket',\n",
" 'average_ticket_basket', 'total_price', 'preferred_category',\n",
" 'preferred_supplier', 'preferred_formula', 'purchase_count',\n",
" 'first_buying_date', 'last_visiting_date', 'zipcode', 'country', 'age',\n",
" 'tenant_id'],\n",
" dtype='object')\n",
"(71307, 43)\n",
"<class 'pandas.core.frame.DataFrame'>\n",
"RangeIndex: 71307 entries, 0 to 71306\n",
"Data columns (total 43 columns):\n",
" # Column Non-Null Count Dtype \n",
"--- ------ -------------- ----- \n",
" 0 id 71307 non-null int64 \n",
" 1 lastname 41045 non-null object \n",
" 2 firstname 39140 non-null object \n",
" 3 birthdate 18174 non-null object \n",
" 4 email 58203 non-null object \n",
" 5 street_id 71307 non-null int64 \n",
" 6 created_at 71307 non-null object \n",
" 7 updated_at 71307 non-null object \n",
" 8 civility 0 non-null float64\n",
" 9 is_partner 71307 non-null bool \n",
" 10 extra 0 non-null float64\n",
" 11 deleted_at 0 non-null float64\n",
" 12 reference 0 non-null float64\n",
" 13 gender 71307 non-null int64 \n",
" 14 is_email_true 71307 non-null bool \n",
" 15 extra_field 0 non-null float64\n",
" 16 identifier 71307 non-null object \n",
" 17 opt_in 71307 non-null bool \n",
" 18 structure_id 616 non-null float64\n",
" 19 note 451 non-null object \n",
" 20 profession 812 non-null object \n",
" 21 language 0 non-null float64\n",
" 22 mcp_contact_id 22417 non-null float64\n",
" 23 need_reload 71307 non-null bool \n",
" 24 last_buying_date 34040 non-null object \n",
" 25 max_price 34040 non-null float64\n",
" 26 ticket_sum 71307 non-null int64 \n",
" 27 average_price 68694 non-null float64\n",
" 28 fidelity 71307 non-null int64 \n",
" 29 average_purchase_delay 34040 non-null float64\n",
" 30 average_price_basket 34040 non-null float64\n",
" 31 average_ticket_basket 34040 non-null float64\n",
" 32 total_price 36653 non-null float64\n",
" 33 preferred_category 0 non-null float64\n",
" 34 preferred_supplier 0 non-null float64\n",
" 35 preferred_formula 0 non-null float64\n",
" 36 purchase_count 71307 non-null int64 \n",
" 37 first_buying_date 34040 non-null object \n",
" 38 last_visiting_date 0 non-null float64\n",
" 39 zipcode 33756 non-null object \n",
" 40 country 39910 non-null object \n",
" 41 age 18174 non-null float64\n",
" 42 tenant_id 71307 non-null int64 \n",
"dtypes: bool(4), float64(19), int64(7), object(13)\n",
"memory usage: 21.5+ MB\n"
]
}
],
"source": [
"# Tags = clients\n",
"FILE_PATH_S3 = 'bdc2324-data/11/11customersplus.csv'\n",
"\n",
"with fs.open(FILE_PATH_S3, mode=\"rb\") as file_in:\n",
" customersplus = pd.read_csv(file_in, sep=\",\")\n",
"\n",
"print(customersplus.columns)\n",
"print(customersplus.shape)\n",
"customersplus.info()"
]
},
{
"cell_type": "code",
"execution_count": 25,
"id": "948a0b2b-8d1c-4afb-802e-670d67dd8c20",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>id</th>\n",
" <th>lastname</th>\n",
" <th>firstname</th>\n",
" <th>birthdate</th>\n",
" <th>email</th>\n",
" <th>street_id</th>\n",
" <th>created_at</th>\n",
" <th>updated_at</th>\n",
" <th>civility</th>\n",
" <th>is_partner</th>\n",
" <th>...</th>\n",
" <th>preferred_category</th>\n",
" <th>preferred_supplier</th>\n",
" <th>preferred_formula</th>\n",
" <th>purchase_count</th>\n",
" <th>first_buying_date</th>\n",
" <th>last_visiting_date</th>\n",
" <th>zipcode</th>\n",
" <th>country</th>\n",
" <th>age</th>\n",
" <th>tenant_id</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>286834</td>\n",
" <td>lastname286834</td>\n",
" <td>firstname286834</td>\n",
" <td>NaN</td>\n",
" <td>email286834</td>\n",
" <td>6</td>\n",
" <td>2022-05-19 10:09:09.361137+02:00</td>\n",
" <td>2022-05-19 10:09:09.361137+02:00</td>\n",
" <td>NaN</td>\n",
" <td>False</td>\n",
" <td>...</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>0</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>fr</td>\n",
" <td>NaN</td>\n",
" <td>1556</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>330695</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>email330695</td>\n",
" <td>1</td>\n",
" <td>2022-07-16 04:10:34.135134+02:00</td>\n",
" <td>2022-07-16 04:10:34.156704+02:00</td>\n",
" <td>NaN</td>\n",
" <td>False</td>\n",
" <td>...</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>0</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>1556</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>330978</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>email330978</td>\n",
" <td>1</td>\n",
" <td>2022-07-21 22:14:09.811721+02:00</td>\n",
" <td>2022-07-21 22:14:09.836051+02:00</td>\n",
" <td>NaN</td>\n",
" <td>False</td>\n",
" <td>...</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>0</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>1556</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>338697</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>email338697</td>\n",
" <td>1</td>\n",
" <td>2022-09-15 19:02:03.950536+02:00</td>\n",
" <td>2022-09-15 19:02:03.985642+02:00</td>\n",
" <td>NaN</td>\n",
" <td>False</td>\n",
" <td>...</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>0</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>1556</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>338726</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>email338726</td>\n",
" <td>1</td>\n",
" <td>2022-09-16 01:24:40.719882+02:00</td>\n",
" <td>2022-09-16 01:24:40.742753+02:00</td>\n",
" <td>NaN</td>\n",
" <td>False</td>\n",
" <td>...</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>0</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>1556</td>\n",
" </tr>\n",
" <tr>\n",
" <th>...</th>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>71302</th>\n",
" <td>27105</td>\n",
" <td>lastname27105</td>\n",
" <td>firstname27105</td>\n",
" <td>1957-01-26</td>\n",
" <td>email27105</td>\n",
" <td>205024</td>\n",
" <td>2021-04-22 15:12:59.986534+02:00</td>\n",
" <td>2023-09-12 18:59:31.613235+02:00</td>\n",
" <td>NaN</td>\n",
" <td>False</td>\n",
" <td>...</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>2</td>\n",
" <td>2018-12-31 18:56:57+01:00</td>\n",
" <td>NaN</td>\n",
" <td>35700</td>\n",
" <td>fr</td>\n",
" <td>66.0</td>\n",
" <td>1556</td>\n",
" </tr>\n",
" <tr>\n",
" <th>71303</th>\n",
" <td>27108</td>\n",
" <td>lastname27108</td>\n",
" <td>firstname27108</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>205024</td>\n",
" <td>2021-04-22 15:12:59.989197+02:00</td>\n",
" <td>2023-09-12 18:27:34.380843+02:00</td>\n",
" <td>NaN</td>\n",
" <td>False</td>\n",
" <td>...</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>6</td>\n",
" <td>2015-12-29 14:51:46+01:00</td>\n",
" <td>NaN</td>\n",
" <td>35700</td>\n",
" <td>fr</td>\n",
" <td>NaN</td>\n",
" <td>1556</td>\n",
" </tr>\n",
" <tr>\n",
" <th>71304</th>\n",
" <td>27110</td>\n",
" <td>lastname27110</td>\n",
" <td>firstname27110</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>6</td>\n",
" <td>2021-04-22 15:12:59.991029+02:00</td>\n",
" <td>2022-04-14 11:41:33.738500+02:00</td>\n",
" <td>NaN</td>\n",
" <td>False</td>\n",
" <td>...</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>1</td>\n",
" <td>2018-12-31 19:12:59+01:00</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>fr</td>\n",
" <td>NaN</td>\n",
" <td>1556</td>\n",
" </tr>\n",
" <tr>\n",
" <th>71305</th>\n",
" <td>10607</td>\n",
" <td>lastname10607</td>\n",
" <td>firstname10607</td>\n",
" <td>1963-01-04</td>\n",
" <td>email10607</td>\n",
" <td>313332</td>\n",
" <td>2021-04-22 14:56:45.742226+02:00</td>\n",
" <td>2023-09-12 17:55:17.723195+02:00</td>\n",
" <td>NaN</td>\n",
" <td>False</td>\n",
" <td>...</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>26</td>\n",
" <td>2015-10-10 14:11:21+02:00</td>\n",
" <td>NaN</td>\n",
" <td>35850</td>\n",
" <td>fr</td>\n",
" <td>60.0</td>\n",
" <td>1556</td>\n",
" </tr>\n",
" <tr>\n",
" <th>71306</th>\n",
" <td>19095</td>\n",
" <td>lastname19095</td>\n",
" <td>firstname19095</td>\n",
" <td>1979-07-16</td>\n",
" <td>email19095</td>\n",
" <td>6</td>\n",
" <td>2021-04-22 15:06:30.120537+02:00</td>\n",
" <td>2023-09-12 18:27:36.904104+02:00</td>\n",
" <td>NaN</td>\n",
" <td>False</td>\n",
" <td>...</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>2</td>\n",
" <td>2019-05-19 21:18:36+02:00</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>fr</td>\n",
" <td>44.0</td>\n",
" <td>1556</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"<p>71307 rows × 43 columns</p>\n",
"</div>"
],
"text/plain": [
" id lastname firstname birthdate email \\\n",
"0 286834 lastname286834 firstname286834 NaN email286834 \n",
"1 330695 NaN NaN NaN email330695 \n",
"2 330978 NaN NaN NaN email330978 \n",
"3 338697 NaN NaN NaN email338697 \n",
"4 338726 NaN NaN NaN email338726 \n",
"... ... ... ... ... ... \n",
"71302 27105 lastname27105 firstname27105 1957-01-26 email27105 \n",
"71303 27108 lastname27108 firstname27108 NaN NaN \n",
"71304 27110 lastname27110 firstname27110 NaN NaN \n",
"71305 10607 lastname10607 firstname10607 1963-01-04 email10607 \n",
"71306 19095 lastname19095 firstname19095 1979-07-16 email19095 \n",
"\n",
" street_id created_at \\\n",
"0 6 2022-05-19 10:09:09.361137+02:00 \n",
"1 1 2022-07-16 04:10:34.135134+02:00 \n",
"2 1 2022-07-21 22:14:09.811721+02:00 \n",
"3 1 2022-09-15 19:02:03.950536+02:00 \n",
"4 1 2022-09-16 01:24:40.719882+02:00 \n",
"... ... ... \n",
"71302 205024 2021-04-22 15:12:59.986534+02:00 \n",
"71303 205024 2021-04-22 15:12:59.989197+02:00 \n",
"71304 6 2021-04-22 15:12:59.991029+02:00 \n",
"71305 313332 2021-04-22 14:56:45.742226+02:00 \n",
"71306 6 2021-04-22 15:06:30.120537+02:00 \n",
"\n",
" updated_at civility is_partner ... \\\n",
"0 2022-05-19 10:09:09.361137+02:00 NaN False ... \n",
"1 2022-07-16 04:10:34.156704+02:00 NaN False ... \n",
"2 2022-07-21 22:14:09.836051+02:00 NaN False ... \n",
"3 2022-09-15 19:02:03.985642+02:00 NaN False ... \n",
"4 2022-09-16 01:24:40.742753+02:00 NaN False ... \n",
"... ... ... ... ... \n",
"71302 2023-09-12 18:59:31.613235+02:00 NaN False ... \n",
"71303 2023-09-12 18:27:34.380843+02:00 NaN False ... \n",
"71304 2022-04-14 11:41:33.738500+02:00 NaN False ... \n",
"71305 2023-09-12 17:55:17.723195+02:00 NaN False ... \n",
"71306 2023-09-12 18:27:36.904104+02:00 NaN False ... \n",
"\n",
" preferred_category preferred_supplier preferred_formula \\\n",
"0 NaN NaN NaN \n",
"1 NaN NaN NaN \n",
"2 NaN NaN NaN \n",
"3 NaN NaN NaN \n",
"4 NaN NaN NaN \n",
"... ... ... ... \n",
"71302 NaN NaN NaN \n",
"71303 NaN NaN NaN \n",
"71304 NaN NaN NaN \n",
"71305 NaN NaN NaN \n",
"71306 NaN NaN NaN \n",
"\n",
" purchase_count first_buying_date last_visiting_date zipcode \\\n",
"0 0 NaN NaN NaN \n",
"1 0 NaN NaN NaN \n",
"2 0 NaN NaN NaN \n",
"3 0 NaN NaN NaN \n",
"4 0 NaN NaN NaN \n",
"... ... ... ... ... \n",
"71302 2 2018-12-31 18:56:57+01:00 NaN 35700 \n",
"71303 6 2015-12-29 14:51:46+01:00 NaN 35700 \n",
"71304 1 2018-12-31 19:12:59+01:00 NaN NaN \n",
"71305 26 2015-10-10 14:11:21+02:00 NaN 35850 \n",
"71306 2 2019-05-19 21:18:36+02:00 NaN NaN \n",
"\n",
" country age tenant_id \n",
"0 fr NaN 1556 \n",
"1 NaN NaN 1556 \n",
"2 NaN NaN 1556 \n",
"3 NaN NaN 1556 \n",
"4 NaN NaN 1556 \n",
"... ... ... ... \n",
"71302 fr 66.0 1556 \n",
"71303 fr NaN 1556 \n",
"71304 fr NaN 1556 \n",
"71305 fr 60.0 1556 \n",
"71306 fr 44.0 1556 \n",
"\n",
"[71307 rows x 43 columns]"
]
},
"execution_count": 25,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"customersplus"
]
},
{
"cell_type": "code",
"execution_count": 6,
"id": "c40c44a0-e7c2-4ad1-b700-0d6ea05d62b2",
"metadata": {},
"outputs": [],
"source": [
"# But : lier les caractéristiques socio-demo et les comportements d'achat\n"
]
},
{
"cell_type": "code",
"execution_count": 29,
"id": "8259ae6c-353f-43a6-add3-f974fac6e5d4",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Index(['id', 'number', 'created_at', 'updated_at', 'purchase_id', 'product_id',\n",
" 'is_from_subscription', 'type_of', 'supplier_id', 'barcode',\n",
" 'identifier'],\n",
" dtype='object')\n",
"(318969, 11)\n",
"<class 'pandas.core.frame.DataFrame'>\n",
"RangeIndex: 318969 entries, 0 to 318968\n",
"Data columns (total 11 columns):\n",
" # Column Non-Null Count Dtype \n",
"--- ------ -------------- ----- \n",
" 0 id 318969 non-null int64 \n",
" 1 number 318969 non-null object \n",
" 2 created_at 318969 non-null object \n",
" 3 updated_at 318969 non-null object \n",
" 4 purchase_id 318969 non-null int64 \n",
" 5 product_id 318969 non-null int64 \n",
" 6 is_from_subscription 318969 non-null bool \n",
" 7 type_of 318969 non-null int64 \n",
" 8 supplier_id 318969 non-null int64 \n",
" 9 barcode 0 non-null float64\n",
" 10 identifier 318969 non-null object \n",
"dtypes: bool(1), float64(1), int64(5), object(4)\n",
"memory usage: 24.6+ MB\n"
]
}
],
"source": [
"# tickets\n",
"FILE_PATH_S3 = 'bdc2324-data/11/11tickets.csv'\n",
"\n",
"with fs.open(FILE_PATH_S3, mode=\"rb\") as file_in:\n",
" tickets = pd.read_csv(file_in, sep=\",\")\n",
"\n",
"print(tickets.columns)\n",
"print(tickets.shape)\n",
"tickets.info()"
]
},
{
"cell_type": "code",
"execution_count": 30,
"id": "f54830cb-1f95-4f71-9b04-358c745fb454",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>id</th>\n",
" <th>number</th>\n",
" <th>created_at</th>\n",
" <th>updated_at</th>\n",
" <th>purchase_id</th>\n",
" <th>product_id</th>\n",
" <th>is_from_subscription</th>\n",
" <th>type_of</th>\n",
" <th>supplier_id</th>\n",
" <th>barcode</th>\n",
" <th>identifier</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>2119081</td>\n",
" <td>1433_136_212_68356</td>\n",
" <td>2023-09-12 17:42:45.396336+02:00</td>\n",
" <td>2023-09-12 17:42:45.396336+02:00</td>\n",
" <td>861764</td>\n",
" <td>209879</td>\n",
" <td>False</td>\n",
" <td>1</td>\n",
" <td>1702</td>\n",
" <td>NaN</td>\n",
" <td>f694c255855ce5643c6fcc7fed5e9237</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>2119082</td>\n",
" <td>1433_136_194_68356</td>\n",
" <td>2023-09-12 17:42:45.409056+02:00</td>\n",
" <td>2023-09-12 17:42:45.409056+02:00</td>\n",
" <td>861763</td>\n",
" <td>209879</td>\n",
" <td>False</td>\n",
" <td>1</td>\n",
" <td>1702</td>\n",
" <td>NaN</td>\n",
" <td>838d6101db2fc8bc80536d8b91b49859</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>2119083</td>\n",
" <td>33158_158_343_68357</td>\n",
" <td>2023-09-12 17:42:45.409824+02:00</td>\n",
" <td>2023-09-12 17:42:45.409824+02:00</td>\n",
" <td>861769</td>\n",
" <td>209880</td>\n",
" <td>False</td>\n",
" <td>1</td>\n",
" <td>1702</td>\n",
" <td>NaN</td>\n",
" <td>8a8d938d66a4dc57bcb44c2773c6fdfa</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>2119084</td>\n",
" <td>33158_158_297_68357</td>\n",
" <td>2023-09-12 17:42:45.410447+02:00</td>\n",
" <td>2023-09-12 17:42:45.410447+02:00</td>\n",
" <td>861767</td>\n",
" <td>209880</td>\n",
" <td>False</td>\n",
" <td>1</td>\n",
" <td>1702</td>\n",
" <td>NaN</td>\n",
" <td>b7a3dd0794c0957c942d45b8913e5b96</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>2119085</td>\n",
" <td>33158_158_318_68357</td>\n",
" <td>2023-09-12 17:42:45.411059+02:00</td>\n",
" <td>2023-09-12 17:42:45.411059+02:00</td>\n",
" <td>861768</td>\n",
" <td>209880</td>\n",
" <td>False</td>\n",
" <td>1</td>\n",
" <td>1702</td>\n",
" <td>NaN</td>\n",
" <td>d7ea7e443581ebe520dd13f6cad31af7</td>\n",
" </tr>\n",
" <tr>\n",
" <th>...</th>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>318964</th>\n",
" <td>2564021</td>\n",
" <td>44247_204_239_89278</td>\n",
" <td>2023-09-12 18:59:48.750953+02:00</td>\n",
" <td>2023-09-12 18:59:48.750953+02:00</td>\n",
" <td>1244281</td>\n",
" <td>210158</td>\n",
" <td>False</td>\n",
" <td>1</td>\n",
" <td>1702</td>\n",
" <td>NaN</td>\n",
" <td>82c9af8b2167f7ac34a5e834242b0239</td>\n",
" </tr>\n",
" <tr>\n",
" <th>318965</th>\n",
" <td>2564022</td>\n",
" <td>44247_204_299_89278</td>\n",
" <td>2023-09-12 18:59:48.751441+02:00</td>\n",
" <td>2023-09-12 18:59:48.751441+02:00</td>\n",
" <td>1244284</td>\n",
" <td>210158</td>\n",
" <td>False</td>\n",
" <td>1</td>\n",
" <td>1702</td>\n",
" <td>NaN</td>\n",
" <td>235e8e608f066cb72949bbd397d0a76f</td>\n",
" </tr>\n",
" <tr>\n",
" <th>318966</th>\n",
" <td>2564023</td>\n",
" <td>44247_204_259_89278</td>\n",
" <td>2023-09-12 18:59:48.751924+02:00</td>\n",
" <td>2023-09-12 18:59:48.751924+02:00</td>\n",
" <td>1244282</td>\n",
" <td>210158</td>\n",
" <td>False</td>\n",
" <td>1</td>\n",
" <td>1702</td>\n",
" <td>NaN</td>\n",
" <td>ec22fa828931f030f7e79a4cc5478c4b</td>\n",
" </tr>\n",
" <tr>\n",
" <th>318967</th>\n",
" <td>2564024</td>\n",
" <td>44247_204_279_89278</td>\n",
" <td>2023-09-12 18:59:48.752425+02:00</td>\n",
" <td>2023-09-12 18:59:48.752425+02:00</td>\n",
" <td>1244283</td>\n",
" <td>210158</td>\n",
" <td>False</td>\n",
" <td>1</td>\n",
" <td>1702</td>\n",
" <td>NaN</td>\n",
" <td>31ec4deaf718e04caf193e1ff8d621ef</td>\n",
" </tr>\n",
" <tr>\n",
" <th>318968</th>\n",
" <td>2513156</td>\n",
" <td>4854_178_2847_89170</td>\n",
" <td>2023-09-12 18:52:20.331807+02:00</td>\n",
" <td>2023-09-12 18:59:48.752904+02:00</td>\n",
" <td>1244285</td>\n",
" <td>261922</td>\n",
" <td>False</td>\n",
" <td>3</td>\n",
" <td>1702</td>\n",
" <td>NaN</td>\n",
" <td>48aef9efab29bfb1537656908863bcc1</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"<p>318969 rows × 11 columns</p>\n",
"</div>"
],
"text/plain": [
" id number created_at \\\n",
"0 2119081 1433_136_212_68356 2023-09-12 17:42:45.396336+02:00 \n",
"1 2119082 1433_136_194_68356 2023-09-12 17:42:45.409056+02:00 \n",
"2 2119083 33158_158_343_68357 2023-09-12 17:42:45.409824+02:00 \n",
"3 2119084 33158_158_297_68357 2023-09-12 17:42:45.410447+02:00 \n",
"4 2119085 33158_158_318_68357 2023-09-12 17:42:45.411059+02:00 \n",
"... ... ... ... \n",
"318964 2564021 44247_204_239_89278 2023-09-12 18:59:48.750953+02:00 \n",
"318965 2564022 44247_204_299_89278 2023-09-12 18:59:48.751441+02:00 \n",
"318966 2564023 44247_204_259_89278 2023-09-12 18:59:48.751924+02:00 \n",
"318967 2564024 44247_204_279_89278 2023-09-12 18:59:48.752425+02:00 \n",
"318968 2513156 4854_178_2847_89170 2023-09-12 18:52:20.331807+02:00 \n",
"\n",
" updated_at purchase_id product_id \\\n",
"0 2023-09-12 17:42:45.396336+02:00 861764 209879 \n",
"1 2023-09-12 17:42:45.409056+02:00 861763 209879 \n",
"2 2023-09-12 17:42:45.409824+02:00 861769 209880 \n",
"3 2023-09-12 17:42:45.410447+02:00 861767 209880 \n",
"4 2023-09-12 17:42:45.411059+02:00 861768 209880 \n",
"... ... ... ... \n",
"318964 2023-09-12 18:59:48.750953+02:00 1244281 210158 \n",
"318965 2023-09-12 18:59:48.751441+02:00 1244284 210158 \n",
"318966 2023-09-12 18:59:48.751924+02:00 1244282 210158 \n",
"318967 2023-09-12 18:59:48.752425+02:00 1244283 210158 \n",
"318968 2023-09-12 18:59:48.752904+02:00 1244285 261922 \n",
"\n",
" is_from_subscription type_of supplier_id barcode \\\n",
"0 False 1 1702 NaN \n",
"1 False 1 1702 NaN \n",
"2 False 1 1702 NaN \n",
"3 False 1 1702 NaN \n",
"4 False 1 1702 NaN \n",
"... ... ... ... ... \n",
"318964 False 1 1702 NaN \n",
"318965 False 1 1702 NaN \n",
"318966 False 1 1702 NaN \n",
"318967 False 1 1702 NaN \n",
"318968 False 3 1702 NaN \n",
"\n",
" identifier \n",
"0 f694c255855ce5643c6fcc7fed5e9237 \n",
"1 838d6101db2fc8bc80536d8b91b49859 \n",
"2 8a8d938d66a4dc57bcb44c2773c6fdfa \n",
"3 b7a3dd0794c0957c942d45b8913e5b96 \n",
"4 d7ea7e443581ebe520dd13f6cad31af7 \n",
"... ... \n",
"318964 82c9af8b2167f7ac34a5e834242b0239 \n",
"318965 235e8e608f066cb72949bbd397d0a76f \n",
"318966 ec22fa828931f030f7e79a4cc5478c4b \n",
"318967 31ec4deaf718e04caf193e1ff8d621ef \n",
"318968 48aef9efab29bfb1537656908863bcc1 \n",
"\n",
"[318969 rows x 11 columns]"
]
},
"execution_count": 30,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"tickets"
]
},
{
"cell_type": "code",
"execution_count": 33,
"id": "ad743347-33d1-41f0-852d-f9e6354f82ed",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"array([1, 3, 0])"
]
},
"execution_count": 33,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"tickets['type_of'].unique()"
]
},
{
"cell_type": "markdown",
"id": "b88808fe-3b4e-49ed-9885-d52910b6f211",
"metadata": {},
"source": [
"## Types d'évenement et client"
]
},
{
"cell_type": "code",
"execution_count": 6,
"id": "ecb03a47-1418-4fb1-8c78-cd222d38b7fd",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Index(['id', 'created_at', 'updated_at', 'season_id', 'facility_id', 'name',\n",
" 'event_type_id', 'manual_added', 'is_display', 'event_type_key_id',\n",
" 'facility_key_id', 'identifier'],\n",
" dtype='object')\n",
"(403, 12)\n",
"<class 'pandas.core.frame.DataFrame'>\n",
"RangeIndex: 403 entries, 0 to 402\n",
"Data columns (total 12 columns):\n",
" # Column Non-Null Count Dtype \n",
"--- ------ -------------- ----- \n",
" 0 id 403 non-null int64 \n",
" 1 created_at 403 non-null object\n",
" 2 updated_at 403 non-null object\n",
" 3 season_id 403 non-null int64 \n",
" 4 facility_id 403 non-null int64 \n",
" 5 name 403 non-null object\n",
" 6 event_type_id 403 non-null int64 \n",
" 7 manual_added 403 non-null bool \n",
" 8 is_display 403 non-null bool \n",
" 9 event_type_key_id 403 non-null int64 \n",
" 10 facility_key_id 403 non-null int64 \n",
" 11 identifier 403 non-null object\n",
"dtypes: bool(2), int64(6), object(4)\n",
"memory usage: 32.4+ KB\n"
]
}
],
"source": [
"# Evenement = events.csv\n",
"FILE_PATH_S3 = 'bdc2324-data/11/11events.csv'\n",
"\n",
"with fs.open(FILE_PATH_S3, mode=\"rb\") as file_in:\n",
" events = pd.read_csv(file_in, sep=\",\")\n",
"\n",
"print(events.columns)\n",
"print(events.shape)\n",
"events.info()"
]
},
{
"cell_type": "code",
"execution_count": 7,
"id": "19706610-9e90-4e6f-8bd0-da124b87cff7",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>id</th>\n",
" <th>created_at</th>\n",
" <th>updated_at</th>\n",
" <th>season_id</th>\n",
" <th>facility_id</th>\n",
" <th>name</th>\n",
" <th>event_type_id</th>\n",
" <th>manual_added</th>\n",
" <th>is_display</th>\n",
" <th>event_type_key_id</th>\n",
" <th>facility_key_id</th>\n",
" <th>identifier</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>20367</td>\n",
" <td>2023-09-13 03:42:45.214293+02:00</td>\n",
" <td>2023-09-13 03:54:30.086969+02:00</td>\n",
" <td>1865</td>\n",
" <td>1054</td>\n",
" <td>marelle</td>\n",
" <td>1055</td>\n",
" <td>False</td>\n",
" <td>True</td>\n",
" <td>1055</td>\n",
" <td>1054</td>\n",
" <td>26d1e9a4acad18b9cf79244334c86c93</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>20371</td>\n",
" <td>2023-09-13 03:42:45.218728+02:00</td>\n",
" <td>2023-09-13 03:54:30.103943+02:00</td>\n",
" <td>1865</td>\n",
" <td>1054</td>\n",
" <td>dialogues</td>\n",
" <td>1055</td>\n",
" <td>False</td>\n",
" <td>True</td>\n",
" <td>1055</td>\n",
" <td>1054</td>\n",
" <td>60356fc5e8ed6c9c1be9c5ec67e77766</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>20570</td>\n",
" <td>2023-10-05 04:48:29.374504+02:00</td>\n",
" <td>2023-10-05 04:48:36.562528+02:00</td>\n",
" <td>1865</td>\n",
" <td>1054</td>\n",
" <td>les grandes epopees</td>\n",
" <td>1055</td>\n",
" <td>False</td>\n",
" <td>True</td>\n",
" <td>1055</td>\n",
" <td>1054</td>\n",
" <td>f8ab088e06252bf34e1b12ad2ce1a403</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>20757</td>\n",
" <td>2023-11-01 03:55:20.846196+01:00</td>\n",
" <td>2023-11-01 03:55:28.412457+01:00</td>\n",
" <td>1865</td>\n",
" <td>1054</td>\n",
" <td>scolaire marelle</td>\n",
" <td>1055</td>\n",
" <td>False</td>\n",
" <td>True</td>\n",
" <td>1055</td>\n",
" <td>1054</td>\n",
" <td>447fa80f9a793b7587bb85ebbda6442c</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>20364</td>\n",
" <td>2023-09-13 03:42:45.196791+02:00</td>\n",
" <td>2023-09-13 03:54:30.075456+02:00</td>\n",
" <td>1865</td>\n",
" <td>1054</td>\n",
" <td>le couronnement de poppee</td>\n",
" <td>1055</td>\n",
" <td>False</td>\n",
" <td>True</td>\n",
" <td>1055</td>\n",
" <td>1054</td>\n",
" <td>3b37f5d2cd354cbc422868621ac7ebc2</td>\n",
" </tr>\n",
" <tr>\n",
" <th>...</th>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>398</th>\n",
" <td>15603</td>\n",
" <td>2023-09-12 17:42:25.327618+02:00</td>\n",
" <td>2023-09-12 19:00:00.893400+02:00</td>\n",
" <td>1706</td>\n",
" <td>1054</td>\n",
" <td>marelle</td>\n",
" <td>1055</td>\n",
" <td>False</td>\n",
" <td>True</td>\n",
" <td>1055</td>\n",
" <td>1054</td>\n",
" <td>fde88b72fb82b1fe42fbbfbfc3d6b4d3</td>\n",
" </tr>\n",
" <tr>\n",
" <th>399</th>\n",
" <td>15621</td>\n",
" <td>2023-09-12 17:42:25.335792+02:00</td>\n",
" <td>2023-09-12 19:00:00.899622+02:00</td>\n",
" <td>1708</td>\n",
" <td>1054</td>\n",
" <td>cartes d'adhesion</td>\n",
" <td>1055</td>\n",
" <td>False</td>\n",
" <td>True</td>\n",
" <td>1055</td>\n",
" <td>1054</td>\n",
" <td>051b96aad2b720bad4450a59ed7dfbf6</td>\n",
" </tr>\n",
" <tr>\n",
" <th>400</th>\n",
" <td>15740</td>\n",
" <td>2023-09-12 17:47:05.112101+02:00</td>\n",
" <td>2023-09-12 19:00:00.906123+02:00</td>\n",
" <td>1711</td>\n",
" <td>1054</td>\n",
" <td>repetition le medecin malgre lui</td>\n",
" <td>1055</td>\n",
" <td>False</td>\n",
" <td>True</td>\n",
" <td>1055</td>\n",
" <td>1054</td>\n",
" <td>addd6885bea5ddf60ec3539dfc3e79e8</td>\n",
" </tr>\n",
" <tr>\n",
" <th>401</th>\n",
" <td>15520</td>\n",
" <td>2023-09-12 17:42:25.290280+02:00</td>\n",
" <td>2023-09-12 19:00:00.835625+02:00</td>\n",
" <td>1708</td>\n",
" <td>1054</td>\n",
" <td>opera au village</td>\n",
" <td>1055</td>\n",
" <td>False</td>\n",
" <td>True</td>\n",
" <td>1055</td>\n",
" <td>1054</td>\n",
" <td>94f250d10d4a56358ceab23b384439ff</td>\n",
" </tr>\n",
" <tr>\n",
" <th>402</th>\n",
" <td>15439</td>\n",
" <td>2023-09-12 17:42:25.252747+02:00</td>\n",
" <td>2023-09-12 19:00:00.735990+02:00</td>\n",
" <td>1708</td>\n",
" <td>1054</td>\n",
" <td>florilege</td>\n",
" <td>1055</td>\n",
" <td>False</td>\n",
" <td>True</td>\n",
" <td>1055</td>\n",
" <td>1054</td>\n",
" <td>4f015946bcbd856aa573cadb7ac42b9f</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"<p>403 rows × 12 columns</p>\n",
"</div>"
],
"text/plain": [
" id created_at \\\n",
"0 20367 2023-09-13 03:42:45.214293+02:00 \n",
"1 20371 2023-09-13 03:42:45.218728+02:00 \n",
"2 20570 2023-10-05 04:48:29.374504+02:00 \n",
"3 20757 2023-11-01 03:55:20.846196+01:00 \n",
"4 20364 2023-09-13 03:42:45.196791+02:00 \n",
".. ... ... \n",
"398 15603 2023-09-12 17:42:25.327618+02:00 \n",
"399 15621 2023-09-12 17:42:25.335792+02:00 \n",
"400 15740 2023-09-12 17:47:05.112101+02:00 \n",
"401 15520 2023-09-12 17:42:25.290280+02:00 \n",
"402 15439 2023-09-12 17:42:25.252747+02:00 \n",
"\n",
" updated_at season_id facility_id \\\n",
"0 2023-09-13 03:54:30.086969+02:00 1865 1054 \n",
"1 2023-09-13 03:54:30.103943+02:00 1865 1054 \n",
"2 2023-10-05 04:48:36.562528+02:00 1865 1054 \n",
"3 2023-11-01 03:55:28.412457+01:00 1865 1054 \n",
"4 2023-09-13 03:54:30.075456+02:00 1865 1054 \n",
".. ... ... ... \n",
"398 2023-09-12 19:00:00.893400+02:00 1706 1054 \n",
"399 2023-09-12 19:00:00.899622+02:00 1708 1054 \n",
"400 2023-09-12 19:00:00.906123+02:00 1711 1054 \n",
"401 2023-09-12 19:00:00.835625+02:00 1708 1054 \n",
"402 2023-09-12 19:00:00.735990+02:00 1708 1054 \n",
"\n",
" name event_type_id manual_added \\\n",
"0 marelle 1055 False \n",
"1 dialogues 1055 False \n",
"2 les grandes epopees 1055 False \n",
"3 scolaire marelle 1055 False \n",
"4 le couronnement de poppee 1055 False \n",
".. ... ... ... \n",
"398 marelle 1055 False \n",
"399 cartes d'adhesion 1055 False \n",
"400 repetition le medecin malgre lui 1055 False \n",
"401 opera au village 1055 False \n",
"402 florilege 1055 False \n",
"\n",
" is_display event_type_key_id facility_key_id \\\n",
"0 True 1055 1054 \n",
"1 True 1055 1054 \n",
"2 True 1055 1054 \n",
"3 True 1055 1054 \n",
"4 True 1055 1054 \n",
".. ... ... ... \n",
"398 True 1055 1054 \n",
"399 True 1055 1054 \n",
"400 True 1055 1054 \n",
"401 True 1055 1054 \n",
"402 True 1055 1054 \n",
"\n",
" identifier \n",
"0 26d1e9a4acad18b9cf79244334c86c93 \n",
"1 60356fc5e8ed6c9c1be9c5ec67e77766 \n",
"2 f8ab088e06252bf34e1b12ad2ce1a403 \n",
"3 447fa80f9a793b7587bb85ebbda6442c \n",
"4 3b37f5d2cd354cbc422868621ac7ebc2 \n",
".. ... \n",
"398 fde88b72fb82b1fe42fbbfbfc3d6b4d3 \n",
"399 051b96aad2b720bad4450a59ed7dfbf6 \n",
"400 addd6885bea5ddf60ec3539dfc3e79e8 \n",
"401 94f250d10d4a56358ceab23b384439ff \n",
"402 4f015946bcbd856aa573cadb7ac42b9f \n",
"\n",
"[403 rows x 12 columns]"
]
},
"execution_count": 7,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"events"
]
},
{
"cell_type": "code",
"execution_count": 15,
"id": "6cb04679-26e7-4ed8-bfc1-42285da96374",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"357"
]
},
"execution_count": 15,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"events['name'].nunique()"
]
},
{
"cell_type": "code",
"execution_count": 16,
"id": "c10297e8-a8f9-45f9-8553-17e3fdb6f8c1",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Index(['id', 'serial', 'event_id', 'created_at', 'updated_at',\n",
" 'start_date_time', 'open', 'satisfaction', 'end_date_time', 'name',\n",
" 'is_display', 'representation_type_id', 'expected_filling',\n",
" 'max_filling', 'extra_field', 'identifier'],\n",
" dtype='object')\n",
"(996, 16)\n",
"<class 'pandas.core.frame.DataFrame'>\n",
"RangeIndex: 996 entries, 0 to 995\n",
"Data columns (total 16 columns):\n",
" # Column Non-Null Count Dtype \n",
"--- ------ -------------- ----- \n",
" 0 id 996 non-null int64 \n",
" 1 serial 0 non-null float64\n",
" 2 event_id 996 non-null int64 \n",
" 3 created_at 996 non-null object \n",
" 4 updated_at 996 non-null object \n",
" 5 start_date_time 996 non-null object \n",
" 6 open 996 non-null bool \n",
" 7 satisfaction 0 non-null float64\n",
" 8 end_date_time 996 non-null object \n",
" 9 name 0 non-null float64\n",
" 10 is_display 996 non-null bool \n",
" 11 representation_type_id 0 non-null float64\n",
" 12 expected_filling 24 non-null float64\n",
" 13 max_filling 24 non-null float64\n",
" 14 extra_field 0 non-null float64\n",
" 15 identifier 996 non-null object \n",
"dtypes: bool(2), float64(7), int64(2), object(5)\n",
"memory usage: 111.0+ KB\n"
]
}
],
"source": [
"# Représentation des évenements = representations.csv\n",
"FILE_PATH_S3 = 'bdc2324-data/11/11representations.csv'\n",
"\n",
"with fs.open(FILE_PATH_S3, mode=\"rb\") as file_in:\n",
" representations = pd.read_csv(file_in, sep=\",\")\n",
"\n",
"print(representations.columns)\n",
"print(representations.shape)\n",
"representations.info()"
]
},
{
"cell_type": "code",
"execution_count": 17,
"id": "41ef6a1b-e99e-4c73-a2ae-ba7d438d90c2",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>id</th>\n",
" <th>serial</th>\n",
" <th>event_id</th>\n",
" <th>created_at</th>\n",
" <th>updated_at</th>\n",
" <th>start_date_time</th>\n",
" <th>open</th>\n",
" <th>satisfaction</th>\n",
" <th>end_date_time</th>\n",
" <th>name</th>\n",
" <th>is_display</th>\n",
" <th>representation_type_id</th>\n",
" <th>expected_filling</th>\n",
" <th>max_filling</th>\n",
" <th>extra_field</th>\n",
" <th>identifier</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>44351</td>\n",
" <td>NaN</td>\n",
" <td>20371</td>\n",
" <td>2023-09-13 03:42:45.245879+02:00</td>\n",
" <td>2023-09-13 03:42:45.245879+02:00</td>\n",
" <td>2023-12-21 20:00:00+01:00</td>\n",
" <td>True</td>\n",
" <td>NaN</td>\n",
" <td>1901-01-01 00:09:21+00:09</td>\n",
" <td>NaN</td>\n",
" <td>True</td>\n",
" <td>NaN</td>\n",
" <td>550.0</td>\n",
" <td>550.0</td>\n",
" <td>NaN</td>\n",
" <td>33520762e8cc28982e3841cbc2be8ce2</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>45497</td>\n",
" <td>NaN</td>\n",
" <td>20757</td>\n",
" <td>2023-11-01 03:55:20.875712+01:00</td>\n",
" <td>2023-11-01 03:55:20.875712+01:00</td>\n",
" <td>2023-11-28 10:00:00+01:00</td>\n",
" <td>True</td>\n",
" <td>NaN</td>\n",
" <td>1901-01-01 00:09:21+00:09</td>\n",
" <td>NaN</td>\n",
" <td>True</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>5c34b84e3d11276e0995d984c94cd28d</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>44383</td>\n",
" <td>NaN</td>\n",
" <td>20383</td>\n",
" <td>2023-09-13 10:41:08.964302+02:00</td>\n",
" <td>2023-09-13 10:41:08.964302+02:00</td>\n",
" <td>2023-06-04 17:00:00+02:00</td>\n",
" <td>True</td>\n",
" <td>NaN</td>\n",
" <td>1901-01-01 00:09:21+00:09</td>\n",
" <td>NaN</td>\n",
" <td>True</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>bf3c65a1dfefbd747dcc2360e6887eac</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>44384</td>\n",
" <td>NaN</td>\n",
" <td>20383</td>\n",
" <td>2023-09-13 10:41:08.972401+02:00</td>\n",
" <td>2023-09-13 10:41:08.972401+02:00</td>\n",
" <td>2023-06-03 17:30:00+02:00</td>\n",
" <td>True</td>\n",
" <td>NaN</td>\n",
" <td>1901-01-01 00:09:21+00:09</td>\n",
" <td>NaN</td>\n",
" <td>True</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>b0e69ae8b78ebab3066aac83de22d239</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>44385</td>\n",
" <td>NaN</td>\n",
" <td>20384</td>\n",
" <td>2023-09-13 10:41:08.973290+02:00</td>\n",
" <td>2023-09-13 10:41:08.973290+02:00</td>\n",
" <td>2023-06-03 16:15:00+02:00</td>\n",
" <td>True</td>\n",
" <td>NaN</td>\n",
" <td>1901-01-01 00:09:21+00:09</td>\n",
" <td>NaN</td>\n",
" <td>True</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>9fb91c8b1cf9e444111c511e212ac5c1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>...</th>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>991</th>\n",
" <td>33894</td>\n",
" <td>NaN</td>\n",
" <td>15647</td>\n",
" <td>2023-09-12 17:42:25.564297+02:00</td>\n",
" <td>2023-09-12 17:42:25.564297+02:00</td>\n",
" <td>2022-11-08 20:00:00+01:00</td>\n",
" <td>True</td>\n",
" <td>NaN</td>\n",
" <td>1901-01-01 00:09:21+00:09</td>\n",
" <td>NaN</td>\n",
" <td>True</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>44bbcecfd007ceaad05805391beccabb</td>\n",
" </tr>\n",
" <tr>\n",
" <th>992</th>\n",
" <td>33873</td>\n",
" <td>NaN</td>\n",
" <td>15640</td>\n",
" <td>2023-09-12 17:42:25.554863+02:00</td>\n",
" <td>2023-09-12 17:42:25.554863+02:00</td>\n",
" <td>2022-11-14 20:00:00+01:00</td>\n",
" <td>True</td>\n",
" <td>NaN</td>\n",
" <td>1901-01-01 00:09:21+00:09</td>\n",
" <td>NaN</td>\n",
" <td>True</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>151edbec8e0a3cd80071038e857f3493</td>\n",
" </tr>\n",
" <tr>\n",
" <th>993</th>\n",
" <td>33610</td>\n",
" <td>NaN</td>\n",
" <td>15520</td>\n",
" <td>2023-09-12 17:42:25.442979+02:00</td>\n",
" <td>2023-09-12 17:42:25.442979+02:00</td>\n",
" <td>2023-06-19 18:00:00+02:00</td>\n",
" <td>True</td>\n",
" <td>NaN</td>\n",
" <td>1901-01-01 00:09:21+00:09</td>\n",
" <td>NaN</td>\n",
" <td>True</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>9e9e38d527427e1b6f67e0c3f12b82fc</td>\n",
" </tr>\n",
" <tr>\n",
" <th>994</th>\n",
" <td>33953</td>\n",
" <td>NaN</td>\n",
" <td>15520</td>\n",
" <td>2023-09-12 17:42:25.590746+02:00</td>\n",
" <td>2023-09-12 17:42:25.590746+02:00</td>\n",
" <td>2023-06-19 20:00:00+02:00</td>\n",
" <td>True</td>\n",
" <td>NaN</td>\n",
" <td>1901-01-01 00:09:21+00:09</td>\n",
" <td>NaN</td>\n",
" <td>True</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>7bf0978aabb6cac1bb4cd2784afb2b6b</td>\n",
" </tr>\n",
" <tr>\n",
" <th>995</th>\n",
" <td>33639</td>\n",
" <td>NaN</td>\n",
" <td>15533</td>\n",
" <td>2023-09-12 17:42:25.455708+02:00</td>\n",
" <td>2023-09-12 17:42:25.455708+02:00</td>\n",
" <td>2023-04-15 17:30:00+02:00</td>\n",
" <td>True</td>\n",
" <td>NaN</td>\n",
" <td>1901-01-01 00:09:21+00:09</td>\n",
" <td>NaN</td>\n",
" <td>True</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>fae68f1e09710ec8747957af6e22f61d</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"<p>996 rows × 16 columns</p>\n",
"</div>"
],
"text/plain": [
" id serial event_id created_at \\\n",
"0 44351 NaN 20371 2023-09-13 03:42:45.245879+02:00 \n",
"1 45497 NaN 20757 2023-11-01 03:55:20.875712+01:00 \n",
"2 44383 NaN 20383 2023-09-13 10:41:08.964302+02:00 \n",
"3 44384 NaN 20383 2023-09-13 10:41:08.972401+02:00 \n",
"4 44385 NaN 20384 2023-09-13 10:41:08.973290+02:00 \n",
".. ... ... ... ... \n",
"991 33894 NaN 15647 2023-09-12 17:42:25.564297+02:00 \n",
"992 33873 NaN 15640 2023-09-12 17:42:25.554863+02:00 \n",
"993 33610 NaN 15520 2023-09-12 17:42:25.442979+02:00 \n",
"994 33953 NaN 15520 2023-09-12 17:42:25.590746+02:00 \n",
"995 33639 NaN 15533 2023-09-12 17:42:25.455708+02:00 \n",
"\n",
" updated_at start_date_time open \\\n",
"0 2023-09-13 03:42:45.245879+02:00 2023-12-21 20:00:00+01:00 True \n",
"1 2023-11-01 03:55:20.875712+01:00 2023-11-28 10:00:00+01:00 True \n",
"2 2023-09-13 10:41:08.964302+02:00 2023-06-04 17:00:00+02:00 True \n",
"3 2023-09-13 10:41:08.972401+02:00 2023-06-03 17:30:00+02:00 True \n",
"4 2023-09-13 10:41:08.973290+02:00 2023-06-03 16:15:00+02:00 True \n",
".. ... ... ... \n",
"991 2023-09-12 17:42:25.564297+02:00 2022-11-08 20:00:00+01:00 True \n",
"992 2023-09-12 17:42:25.554863+02:00 2022-11-14 20:00:00+01:00 True \n",
"993 2023-09-12 17:42:25.442979+02:00 2023-06-19 18:00:00+02:00 True \n",
"994 2023-09-12 17:42:25.590746+02:00 2023-06-19 20:00:00+02:00 True \n",
"995 2023-09-12 17:42:25.455708+02:00 2023-04-15 17:30:00+02:00 True \n",
"\n",
" satisfaction end_date_time name is_display \\\n",
"0 NaN 1901-01-01 00:09:21+00:09 NaN True \n",
"1 NaN 1901-01-01 00:09:21+00:09 NaN True \n",
"2 NaN 1901-01-01 00:09:21+00:09 NaN True \n",
"3 NaN 1901-01-01 00:09:21+00:09 NaN True \n",
"4 NaN 1901-01-01 00:09:21+00:09 NaN True \n",
".. ... ... ... ... \n",
"991 NaN 1901-01-01 00:09:21+00:09 NaN True \n",
"992 NaN 1901-01-01 00:09:21+00:09 NaN True \n",
"993 NaN 1901-01-01 00:09:21+00:09 NaN True \n",
"994 NaN 1901-01-01 00:09:21+00:09 NaN True \n",
"995 NaN 1901-01-01 00:09:21+00:09 NaN True \n",
"\n",
" representation_type_id expected_filling max_filling extra_field \\\n",
"0 NaN 550.0 550.0 NaN \n",
"1 NaN NaN NaN NaN \n",
"2 NaN NaN NaN NaN \n",
"3 NaN NaN NaN NaN \n",
"4 NaN NaN NaN NaN \n",
".. ... ... ... ... \n",
"991 NaN NaN NaN NaN \n",
"992 NaN NaN NaN NaN \n",
"993 NaN NaN NaN NaN \n",
"994 NaN NaN NaN NaN \n",
"995 NaN NaN NaN NaN \n",
"\n",
" identifier \n",
"0 33520762e8cc28982e3841cbc2be8ce2 \n",
"1 5c34b84e3d11276e0995d984c94cd28d \n",
"2 bf3c65a1dfefbd747dcc2360e6887eac \n",
"3 b0e69ae8b78ebab3066aac83de22d239 \n",
"4 9fb91c8b1cf9e444111c511e212ac5c1 \n",
".. ... \n",
"991 44bbcecfd007ceaad05805391beccabb \n",
"992 151edbec8e0a3cd80071038e857f3493 \n",
"993 9e9e38d527427e1b6f67e0c3f12b82fc \n",
"994 7bf0978aabb6cac1bb4cd2784afb2b6b \n",
"995 fae68f1e09710ec8747957af6e22f61d \n",
"\n",
"[996 rows x 16 columns]"
]
},
"execution_count": 17,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"representations"
]
},
{
"cell_type": "code",
"execution_count": 18,
"id": "ae6cdad3-2184-4ae7-928c-2f8bd7769a5b",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Index(['id', 'amount', 'is_full_price', 'representation_id',\n",
" 'pricing_formula_id', 'created_at', 'updated_at', 'category_id',\n",
" 'apply_price', 'products_group_id', 'product_pack_id', 'extra_field',\n",
" 'amount_consumption', 'identifier'],\n",
" dtype='object')\n",
"(14648, 14)\n",
"<class 'pandas.core.frame.DataFrame'>\n",
"RangeIndex: 14648 entries, 0 to 14647\n",
"Data columns (total 14 columns):\n",
" # Column Non-Null Count Dtype \n",
"--- ------ -------------- ----- \n",
" 0 id 14648 non-null int64 \n",
" 1 amount 14648 non-null float64\n",
" 2 is_full_price 14648 non-null bool \n",
" 3 representation_id 14648 non-null int64 \n",
" 4 pricing_formula_id 14648 non-null int64 \n",
" 5 created_at 14648 non-null object \n",
" 6 updated_at 14648 non-null object \n",
" 7 category_id 14648 non-null int64 \n",
" 8 apply_price 14648 non-null float64\n",
" 9 products_group_id 14648 non-null int64 \n",
" 10 product_pack_id 14648 non-null int64 \n",
" 11 extra_field 0 non-null float64\n",
" 12 amount_consumption 0 non-null float64\n",
" 13 identifier 14648 non-null object \n",
"dtypes: bool(1), float64(4), int64(6), object(3)\n",
"memory usage: 1.5+ MB\n"
]
}
],
"source": [
"# Produits vendues = products.csv\n",
"FILE_PATH_S3 = 'bdc2324-data/11/11products.csv'\n",
"\n",
"with fs.open(FILE_PATH_S3, mode=\"rb\") as file_in:\n",
" products = pd.read_csv(file_in, sep=\",\")\n",
"\n",
"print(products.columns)\n",
"print(products.shape)\n",
"products.info()"
]
},
{
"cell_type": "code",
"execution_count": 19,
"id": "34f1825d-148a-4a6e-88d6-61449fee3ee4",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>id</th>\n",
" <th>amount</th>\n",
" <th>is_full_price</th>\n",
" <th>representation_id</th>\n",
" <th>pricing_formula_id</th>\n",
" <th>created_at</th>\n",
" <th>updated_at</th>\n",
" <th>category_id</th>\n",
" <th>apply_price</th>\n",
" <th>products_group_id</th>\n",
" <th>product_pack_id</th>\n",
" <th>extra_field</th>\n",
" <th>amount_consumption</th>\n",
" <th>identifier</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>268325</td>\n",
" <td>18.0</td>\n",
" <td>False</td>\n",
" <td>44332</td>\n",
" <td>20477</td>\n",
" <td>2023-09-13 03:42:45.415594+02:00</td>\n",
" <td>2023-09-13 03:42:45.415594+02:00</td>\n",
" <td>4972</td>\n",
" <td>0.0</td>\n",
" <td>268108</td>\n",
" <td>1</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>b823bbea3ba837da2ef8efaf1287272d</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>274118</td>\n",
" <td>36.8</td>\n",
" <td>False</td>\n",
" <td>44340</td>\n",
" <td>20502</td>\n",
" <td>2023-10-25 03:26:57.430694+02:00</td>\n",
" <td>2023-10-25 03:26:57.430694+02:00</td>\n",
" <td>4969</td>\n",
" <td>0.0</td>\n",
" <td>273901</td>\n",
" <td>1</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>81e8b7991f6948e3ef7cfe5011d13532</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>268338</td>\n",
" <td>39.1</td>\n",
" <td>False</td>\n",
" <td>44340</td>\n",
" <td>20497</td>\n",
" <td>2023-09-13 03:42:45.430942+02:00</td>\n",
" <td>2023-09-13 03:42:45.430942+02:00</td>\n",
" <td>4969</td>\n",
" <td>0.0</td>\n",
" <td>268121</td>\n",
" <td>1</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>be8bc0399db4d04aefa9f44afd4d5efa</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>209883</td>\n",
" <td>0.0</td>\n",
" <td>False</td>\n",
" <td>33443</td>\n",
" <td>20475</td>\n",
" <td>2023-09-12 17:42:27.595998+02:00</td>\n",
" <td>2023-09-12 17:42:27.595998+02:00</td>\n",
" <td>4970</td>\n",
" <td>0.0</td>\n",
" <td>209706</td>\n",
" <td>1</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>01a9eea5f8ad53491faa864bfac44183</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>268326</td>\n",
" <td>63.0</td>\n",
" <td>False</td>\n",
" <td>44333</td>\n",
" <td>20477</td>\n",
" <td>2023-09-13 03:42:45.417283+02:00</td>\n",
" <td>2023-09-13 03:42:45.417283+02:00</td>\n",
" <td>4969</td>\n",
" <td>0.0</td>\n",
" <td>268109</td>\n",
" <td>1</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>781a917ecfdabb14169701d7b143bbe4</td>\n",
" </tr>\n",
" <tr>\n",
" <th>...</th>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>14643</th>\n",
" <td>217878</td>\n",
" <td>33.6</td>\n",
" <td>False</td>\n",
" <td>33919</td>\n",
" <td>20489</td>\n",
" <td>2023-09-12 17:51:11.572882+02:00</td>\n",
" <td>2023-09-12 17:51:11.572882+02:00</td>\n",
" <td>4971</td>\n",
" <td>0.0</td>\n",
" <td>217695</td>\n",
" <td>1</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>82bba69321466069411b3023343b44a4</td>\n",
" </tr>\n",
" <tr>\n",
" <th>14644</th>\n",
" <td>268315</td>\n",
" <td>10.0</td>\n",
" <td>False</td>\n",
" <td>33919</td>\n",
" <td>20504</td>\n",
" <td>2023-09-12 18:59:29.995176+02:00</td>\n",
" <td>2023-09-12 18:59:29.995176+02:00</td>\n",
" <td>4969</td>\n",
" <td>0.0</td>\n",
" <td>268098</td>\n",
" <td>1</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>eae56a8eb0a4315c5713b2053103d595</td>\n",
" </tr>\n",
" <tr>\n",
" <th>14645</th>\n",
" <td>210148</td>\n",
" <td>5.0</td>\n",
" <td>False</td>\n",
" <td>33531</td>\n",
" <td>20473</td>\n",
" <td>2023-09-12 17:42:27.733260+02:00</td>\n",
" <td>2023-09-12 17:42:27.733260+02:00</td>\n",
" <td>4975</td>\n",
" <td>0.0</td>\n",
" <td>209971</td>\n",
" <td>1</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>449f86c1ef2b478d3389f7d0e27d0e6b</td>\n",
" </tr>\n",
" <tr>\n",
" <th>14646</th>\n",
" <td>212054</td>\n",
" <td>30.0</td>\n",
" <td>False</td>\n",
" <td>33810</td>\n",
" <td>20473</td>\n",
" <td>2023-09-12 17:42:28.724681+02:00</td>\n",
" <td>2023-09-12 17:42:28.724681+02:00</td>\n",
" <td>4972</td>\n",
" <td>0.0</td>\n",
" <td>211876</td>\n",
" <td>1</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>2090203e2c0b58ea8f505089faee6d62</td>\n",
" </tr>\n",
" <tr>\n",
" <th>14647</th>\n",
" <td>261922</td>\n",
" <td>21.0</td>\n",
" <td>False</td>\n",
" <td>33766</td>\n",
" <td>20488</td>\n",
" <td>2023-09-12 18:52:00.519838+02:00</td>\n",
" <td>2023-09-12 18:52:00.519838+02:00</td>\n",
" <td>4972</td>\n",
" <td>0.0</td>\n",
" <td>261709</td>\n",
" <td>1</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>9139ee36a92bed766ae95372cca77336</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"<p>14648 rows × 14 columns</p>\n",
"</div>"
],
"text/plain": [
" id amount is_full_price representation_id pricing_formula_id \\\n",
"0 268325 18.0 False 44332 20477 \n",
"1 274118 36.8 False 44340 20502 \n",
"2 268338 39.1 False 44340 20497 \n",
"3 209883 0.0 False 33443 20475 \n",
"4 268326 63.0 False 44333 20477 \n",
"... ... ... ... ... ... \n",
"14643 217878 33.6 False 33919 20489 \n",
"14644 268315 10.0 False 33919 20504 \n",
"14645 210148 5.0 False 33531 20473 \n",
"14646 212054 30.0 False 33810 20473 \n",
"14647 261922 21.0 False 33766 20488 \n",
"\n",
" created_at updated_at \\\n",
"0 2023-09-13 03:42:45.415594+02:00 2023-09-13 03:42:45.415594+02:00 \n",
"1 2023-10-25 03:26:57.430694+02:00 2023-10-25 03:26:57.430694+02:00 \n",
"2 2023-09-13 03:42:45.430942+02:00 2023-09-13 03:42:45.430942+02:00 \n",
"3 2023-09-12 17:42:27.595998+02:00 2023-09-12 17:42:27.595998+02:00 \n",
"4 2023-09-13 03:42:45.417283+02:00 2023-09-13 03:42:45.417283+02:00 \n",
"... ... ... \n",
"14643 2023-09-12 17:51:11.572882+02:00 2023-09-12 17:51:11.572882+02:00 \n",
"14644 2023-09-12 18:59:29.995176+02:00 2023-09-12 18:59:29.995176+02:00 \n",
"14645 2023-09-12 17:42:27.733260+02:00 2023-09-12 17:42:27.733260+02:00 \n",
"14646 2023-09-12 17:42:28.724681+02:00 2023-09-12 17:42:28.724681+02:00 \n",
"14647 2023-09-12 18:52:00.519838+02:00 2023-09-12 18:52:00.519838+02:00 \n",
"\n",
" category_id apply_price products_group_id product_pack_id \\\n",
"0 4972 0.0 268108 1 \n",
"1 4969 0.0 273901 1 \n",
"2 4969 0.0 268121 1 \n",
"3 4970 0.0 209706 1 \n",
"4 4969 0.0 268109 1 \n",
"... ... ... ... ... \n",
"14643 4971 0.0 217695 1 \n",
"14644 4969 0.0 268098 1 \n",
"14645 4975 0.0 209971 1 \n",
"14646 4972 0.0 211876 1 \n",
"14647 4972 0.0 261709 1 \n",
"\n",
" extra_field amount_consumption identifier \n",
"0 NaN NaN b823bbea3ba837da2ef8efaf1287272d \n",
"1 NaN NaN 81e8b7991f6948e3ef7cfe5011d13532 \n",
"2 NaN NaN be8bc0399db4d04aefa9f44afd4d5efa \n",
"3 NaN NaN 01a9eea5f8ad53491faa864bfac44183 \n",
"4 NaN NaN 781a917ecfdabb14169701d7b143bbe4 \n",
"... ... ... ... \n",
"14643 NaN NaN 82bba69321466069411b3023343b44a4 \n",
"14644 NaN NaN eae56a8eb0a4315c5713b2053103d595 \n",
"14645 NaN NaN 449f86c1ef2b478d3389f7d0e27d0e6b \n",
"14646 NaN NaN 2090203e2c0b58ea8f505089faee6d62 \n",
"14647 NaN NaN 9139ee36a92bed766ae95372cca77336 \n",
"\n",
"[14648 rows x 14 columns]"
]
},
"execution_count": 19,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"products"
]
},
{
"cell_type": "code",
"execution_count": 20,
"id": "6735b338-26b5-479d-825d-677ea533dad5",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Index(['id', 'name', 'created_at', 'updated_at', 'street_id', 'fixed_capacity',\n",
" 'identifier'],\n",
" dtype='object')\n",
"(1, 7)\n",
"<class 'pandas.core.frame.DataFrame'>\n",
"RangeIndex: 1 entries, 0 to 0\n",
"Data columns (total 7 columns):\n",
" # Column Non-Null Count Dtype \n",
"--- ------ -------------- ----- \n",
" 0 id 1 non-null int64 \n",
" 1 name 0 non-null float64\n",
" 2 created_at 1 non-null object \n",
" 3 updated_at 1 non-null object \n",
" 4 street_id 1 non-null int64 \n",
" 5 fixed_capacity 0 non-null float64\n",
" 6 identifier 1 non-null object \n",
"dtypes: float64(2), int64(2), object(3)\n",
"memory usage: 184.0+ bytes\n"
]
}
],
"source": [
"# Lieu = facilities.csv\n",
"FILE_PATH_S3 = 'bdc2324-data/11/11facilities.csv'\n",
"\n",
"with fs.open(FILE_PATH_S3, mode=\"rb\") as file_in:\n",
" facilities = pd.read_csv(file_in, sep=\",\")\n",
"\n",
"print(facilities.columns)\n",
"print(facilities.shape)\n",
"facilities.info()"
]
},
{
"cell_type": "code",
"execution_count": 21,
"id": "428b86c2-50f4-42a5-9bbb-a17ffe820bf9",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>id</th>\n",
" <th>name</th>\n",
" <th>created_at</th>\n",
" <th>updated_at</th>\n",
" <th>street_id</th>\n",
" <th>fixed_capacity</th>\n",
" <th>identifier</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>1054</td>\n",
" <td>NaN</td>\n",
" <td>2023-09-12 17:42:25.223064+02:00</td>\n",
" <td>2023-09-12 17:42:25.223064+02:00</td>\n",
" <td>1</td>\n",
" <td>NaN</td>\n",
" <td>d41d8cd98f00b204e9800998ecf8427e</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" id name created_at \\\n",
"0 1054 NaN 2023-09-12 17:42:25.223064+02:00 \n",
"\n",
" updated_at street_id fixed_capacity \\\n",
"0 2023-09-12 17:42:25.223064+02:00 1 NaN \n",
"\n",
" identifier \n",
"0 d41d8cd98f00b204e9800998ecf8427e "
]
},
"execution_count": 21,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"facilities"
]
},
{
"cell_type": "code",
"execution_count": 22,
"id": "f6b26ad5-a4cc-4219-a0b0-406d9b025458",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Index(['id', 'name', 'created_at', 'updated_at', 'start_date_time',\n",
" 'identifier'],\n",
" dtype='object')\n",
"(9, 6)\n",
"<class 'pandas.core.frame.DataFrame'>\n",
"RangeIndex: 9 entries, 0 to 8\n",
"Data columns (total 6 columns):\n",
" # Column Non-Null Count Dtype \n",
"--- ------ -------------- ----- \n",
" 0 id 9 non-null int64 \n",
" 1 name 9 non-null object \n",
" 2 created_at 9 non-null object \n",
" 3 updated_at 9 non-null object \n",
" 4 start_date_time 0 non-null float64\n",
" 5 identifier 9 non-null object \n",
"dtypes: float64(1), int64(1), object(4)\n",
"memory usage: 560.0+ bytes\n"
]
}
],
"source": [
"# Saisons = seasons.csv période sur deux années consécutives\n",
"FILE_PATH_S3 = 'bdc2324-data/11/11seasons.csv'\n",
"\n",
"with fs.open(FILE_PATH_S3, mode=\"rb\") as file_in:\n",
" seasons = pd.read_csv(file_in, sep=\",\")\n",
"\n",
"print(seasons.columns)\n",
"print(seasons.shape)\n",
"seasons.info()"
]
},
{
"cell_type": "code",
"execution_count": 24,
"id": "75c8c0ef-4ff5-45b1-a791-8ba2e9a4437e",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"array(['saison 2023-2024', 'saison 2021-2022', 'saison 2015-2016',\n",
" 'saison 2016-2017', 'saison 2017-2018', 'saison 2018-2019',\n",
" 'saison 2020-2021', 'saison 2019-2020', 'saison 2022-2023'],\n",
" dtype=object)"
]
},
"execution_count": 24,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"seasons['name'].unique()"
]
},
{
"cell_type": "code",
"execution_count": 25,
"id": "cd0d10df-10cc-4f75-8b88-35f676c91f5b",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Index(['id', 'purchase_date', 'customer_id', 'created_at', 'updated_at',\n",
" 'number', 'identifier'],\n",
" dtype='object')\n",
"(410695, 7)\n",
"<class 'pandas.core.frame.DataFrame'>\n",
"RangeIndex: 410695 entries, 0 to 410694\n",
"Data columns (total 7 columns):\n",
" # Column Non-Null Count Dtype \n",
"--- ------ -------------- ----- \n",
" 0 id 410695 non-null int64 \n",
" 1 purchase_date 410695 non-null object \n",
" 2 customer_id 410695 non-null int64 \n",
" 3 created_at 410695 non-null object \n",
" 4 updated_at 410695 non-null object \n",
" 5 number 0 non-null float64\n",
" 6 identifier 410695 non-null object \n",
"dtypes: float64(1), int64(2), object(4)\n",
"memory usage: 21.9+ MB\n"
]
}
],
"source": [
"# Achats = purchases.csv \n",
"FILE_PATH_S3 = 'bdc2324-data/11/11purchases.csv'\n",
"\n",
"with fs.open(FILE_PATH_S3, mode=\"rb\") as file_in:\n",
" purchases = pd.read_csv(file_in, sep=\",\")\n",
"\n",
"print(purchases.columns)\n",
"print(purchases.shape)\n",
"purchases.info()"
]
},
{
"cell_type": "code",
"execution_count": 28,
"id": "8f986fdb-ca37-4cbb-b526-2a6d0ce7ca2c",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>id</th>\n",
" <th>purchase_date</th>\n",
" <th>customer_id</th>\n",
" <th>created_at</th>\n",
" <th>updated_at</th>\n",
" <th>number</th>\n",
" <th>identifier</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>861761</td>\n",
" <td>2019-03-01 16:28:49+01:00</td>\n",
" <td>4966</td>\n",
" <td>2023-09-12 17:42:37.564150+02:00</td>\n",
" <td>2023-09-12 17:42:37.564150+02:00</td>\n",
" <td>NaN</td>\n",
" <td>d20eb0c3a7efec0bbe338dee40dc3378</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>861762</td>\n",
" <td>2019-03-01 16:29:11+01:00</td>\n",
" <td>4966</td>\n",
" <td>2023-09-12 17:42:37.571159+02:00</td>\n",
" <td>2023-09-12 17:42:37.571159+02:00</td>\n",
" <td>NaN</td>\n",
" <td>cff3abfc018517bce5ccfc58f5cacf40</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>861763</td>\n",
" <td>2019-03-01 16:29:17+01:00</td>\n",
" <td>4966</td>\n",
" <td>2023-09-12 17:42:37.571646+02:00</td>\n",
" <td>2023-09-12 17:42:37.571646+02:00</td>\n",
" <td>NaN</td>\n",
" <td>e1155cf26b34f792bdb23e49244d7264</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>861764</td>\n",
" <td>2019-03-01 16:29:19+01:00</td>\n",
" <td>4966</td>\n",
" <td>2023-09-12 17:42:37.572063+02:00</td>\n",
" <td>2023-09-12 17:42:37.572063+02:00</td>\n",
" <td>NaN</td>\n",
" <td>e8b95cc6a1a8b103ffa39755ce3bfc4d</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>861765</td>\n",
" <td>2019-03-01 16:32:08+01:00</td>\n",
" <td>405994</td>\n",
" <td>2023-09-12 17:42:37.572470+02:00</td>\n",
" <td>2023-09-12 17:42:37.572470+02:00</td>\n",
" <td>NaN</td>\n",
" <td>1b763278914f1309e357abe5033a3f0f</td>\n",
" </tr>\n",
" <tr>\n",
" <th>...</th>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>410690</th>\n",
" <td>1285964</td>\n",
" <td>2023-10-21 21:46:41+02:00</td>\n",
" <td>517309</td>\n",
" <td>2023-10-23 03:43:16.457501+02:00</td>\n",
" <td>2023-10-23 03:43:16.457501+02:00</td>\n",
" <td>NaN</td>\n",
" <td>72c4e90c2b151dcffc87b19ea8a0c4f1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>410691</th>\n",
" <td>1285965</td>\n",
" <td>2023-10-21 21:47:07+02:00</td>\n",
" <td>517309</td>\n",
" <td>2023-10-23 03:43:16.458458+02:00</td>\n",
" <td>2023-10-23 03:43:16.458458+02:00</td>\n",
" <td>NaN</td>\n",
" <td>ee65532087132145daa6154fbae050ea</td>\n",
" </tr>\n",
" <tr>\n",
" <th>410692</th>\n",
" <td>1285966</td>\n",
" <td>2023-10-21 21:47:20+02:00</td>\n",
" <td>517309</td>\n",
" <td>2023-10-23 03:43:16.458811+02:00</td>\n",
" <td>2023-10-23 03:43:16.458811+02:00</td>\n",
" <td>NaN</td>\n",
" <td>7e825dd352bc6a11ab81cb8068e325e6</td>\n",
" </tr>\n",
" <tr>\n",
" <th>410693</th>\n",
" <td>1285967</td>\n",
" <td>2023-10-21 23:07:06+02:00</td>\n",
" <td>399969</td>\n",
" <td>2023-10-23 03:43:16.459738+02:00</td>\n",
" <td>2023-10-23 03:43:16.459738+02:00</td>\n",
" <td>NaN</td>\n",
" <td>fdb92627a48d6ba8fa817d60a83dbea8</td>\n",
" </tr>\n",
" <tr>\n",
" <th>410694</th>\n",
" <td>1285968</td>\n",
" <td>2023-10-21 23:07:39+02:00</td>\n",
" <td>399969</td>\n",
" <td>2023-10-23 03:43:16.462409+02:00</td>\n",
" <td>2023-10-23 03:43:16.462409+02:00</td>\n",
" <td>NaN</td>\n",
" <td>e9dbaff4f7037a5b0efa11263584dfad</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"<p>410695 rows × 7 columns</p>\n",
"</div>"
],
"text/plain": [
" id purchase_date customer_id \\\n",
"0 861761 2019-03-01 16:28:49+01:00 4966 \n",
"1 861762 2019-03-01 16:29:11+01:00 4966 \n",
"2 861763 2019-03-01 16:29:17+01:00 4966 \n",
"3 861764 2019-03-01 16:29:19+01:00 4966 \n",
"4 861765 2019-03-01 16:32:08+01:00 405994 \n",
"... ... ... ... \n",
"410690 1285964 2023-10-21 21:46:41+02:00 517309 \n",
"410691 1285965 2023-10-21 21:47:07+02:00 517309 \n",
"410692 1285966 2023-10-21 21:47:20+02:00 517309 \n",
"410693 1285967 2023-10-21 23:07:06+02:00 399969 \n",
"410694 1285968 2023-10-21 23:07:39+02:00 399969 \n",
"\n",
" created_at updated_at \\\n",
"0 2023-09-12 17:42:37.564150+02:00 2023-09-12 17:42:37.564150+02:00 \n",
"1 2023-09-12 17:42:37.571159+02:00 2023-09-12 17:42:37.571159+02:00 \n",
"2 2023-09-12 17:42:37.571646+02:00 2023-09-12 17:42:37.571646+02:00 \n",
"3 2023-09-12 17:42:37.572063+02:00 2023-09-12 17:42:37.572063+02:00 \n",
"4 2023-09-12 17:42:37.572470+02:00 2023-09-12 17:42:37.572470+02:00 \n",
"... ... ... \n",
"410690 2023-10-23 03:43:16.457501+02:00 2023-10-23 03:43:16.457501+02:00 \n",
"410691 2023-10-23 03:43:16.458458+02:00 2023-10-23 03:43:16.458458+02:00 \n",
"410692 2023-10-23 03:43:16.458811+02:00 2023-10-23 03:43:16.458811+02:00 \n",
"410693 2023-10-23 03:43:16.459738+02:00 2023-10-23 03:43:16.459738+02:00 \n",
"410694 2023-10-23 03:43:16.462409+02:00 2023-10-23 03:43:16.462409+02:00 \n",
"\n",
" number identifier \n",
"0 NaN d20eb0c3a7efec0bbe338dee40dc3378 \n",
"1 NaN cff3abfc018517bce5ccfc58f5cacf40 \n",
"2 NaN e1155cf26b34f792bdb23e49244d7264 \n",
"3 NaN e8b95cc6a1a8b103ffa39755ce3bfc4d \n",
"4 NaN 1b763278914f1309e357abe5033a3f0f \n",
"... ... ... \n",
"410690 NaN 72c4e90c2b151dcffc87b19ea8a0c4f1 \n",
"410691 NaN ee65532087132145daa6154fbae050ea \n",
"410692 NaN 7e825dd352bc6a11ab81cb8068e325e6 \n",
"410693 NaN fdb92627a48d6ba8fa817d60a83dbea8 \n",
"410694 NaN e9dbaff4f7037a5b0efa11263584dfad \n",
"\n",
"[410695 rows x 7 columns]"
]
},
"execution_count": 28,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"purchases"
]
},
{
"cell_type": "code",
"execution_count": 42,
"id": "281c48da-e1a0-4298-b2e6-81f9fc6461aa",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>id_x</th>\n",
" <th>purchase_date</th>\n",
" <th>customer_id</th>\n",
" <th>created_at_x</th>\n",
" <th>updated_at_x</th>\n",
" <th>number_x</th>\n",
" <th>identifier_x</th>\n",
" <th>id_y</th>\n",
" <th>number_y</th>\n",
" <th>created_at_y</th>\n",
" <th>updated_at_y</th>\n",
" <th>purchase_id</th>\n",
" <th>product_id</th>\n",
" <th>is_from_subscription</th>\n",
" <th>type_of</th>\n",
" <th>supplier_id</th>\n",
" <th>barcode</th>\n",
" <th>identifier_y</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>861763</td>\n",
" <td>2019-03-01 16:29:17+01:00</td>\n",
" <td>4966</td>\n",
" <td>2023-09-12 17:42:37.571646+02:00</td>\n",
" <td>2023-09-12 17:42:37.571646+02:00</td>\n",
" <td>NaN</td>\n",
" <td>e1155cf26b34f792bdb23e49244d7264</td>\n",
" <td>2119082</td>\n",
" <td>1433_136_194_68356</td>\n",
" <td>2023-09-12 17:42:45.409056+02:00</td>\n",
" <td>2023-09-12 17:42:45.409056+02:00</td>\n",
" <td>861763</td>\n",
" <td>209879</td>\n",
" <td>False</td>\n",
" <td>1</td>\n",
" <td>1702</td>\n",
" <td>NaN</td>\n",
" <td>838d6101db2fc8bc80536d8b91b49859</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>861764</td>\n",
" <td>2019-03-01 16:29:19+01:00</td>\n",
" <td>4966</td>\n",
" <td>2023-09-12 17:42:37.572063+02:00</td>\n",
" <td>2023-09-12 17:42:37.572063+02:00</td>\n",
" <td>NaN</td>\n",
" <td>e8b95cc6a1a8b103ffa39755ce3bfc4d</td>\n",
" <td>2119081</td>\n",
" <td>1433_136_212_68356</td>\n",
" <td>2023-09-12 17:42:45.396336+02:00</td>\n",
" <td>2023-09-12 17:42:45.396336+02:00</td>\n",
" <td>861764</td>\n",
" <td>209879</td>\n",
" <td>False</td>\n",
" <td>1</td>\n",
" <td>1702</td>\n",
" <td>NaN</td>\n",
" <td>f694c255855ce5643c6fcc7fed5e9237</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>861767</td>\n",
" <td>2019-03-01 16:33:01+01:00</td>\n",
" <td>405994</td>\n",
" <td>2023-09-12 17:42:37.573280+02:00</td>\n",
" <td>2023-09-12 17:42:37.573280+02:00</td>\n",
" <td>NaN</td>\n",
" <td>6edb259b88fc6f6ae82ede82defaef92</td>\n",
" <td>2119084</td>\n",
" <td>33158_158_297_68357</td>\n",
" <td>2023-09-12 17:42:45.410447+02:00</td>\n",
" <td>2023-09-12 17:42:45.410447+02:00</td>\n",
" <td>861767</td>\n",
" <td>209880</td>\n",
" <td>False</td>\n",
" <td>1</td>\n",
" <td>1702</td>\n",
" <td>NaN</td>\n",
" <td>b7a3dd0794c0957c942d45b8913e5b96</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>861768</td>\n",
" <td>2019-03-01 16:33:03+01:00</td>\n",
" <td>405994</td>\n",
" <td>2023-09-12 17:42:37.573646+02:00</td>\n",
" <td>2023-09-12 17:42:37.573646+02:00</td>\n",
" <td>NaN</td>\n",
" <td>5d3fcb50784bada3731a967ddc9fbba8</td>\n",
" <td>2119085</td>\n",
" <td>33158_158_318_68357</td>\n",
" <td>2023-09-12 17:42:45.411059+02:00</td>\n",
" <td>2023-09-12 17:42:45.411059+02:00</td>\n",
" <td>861768</td>\n",
" <td>209880</td>\n",
" <td>False</td>\n",
" <td>1</td>\n",
" <td>1702</td>\n",
" <td>NaN</td>\n",
" <td>d7ea7e443581ebe520dd13f6cad31af7</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>861769</td>\n",
" <td>2019-03-01 16:33:06+01:00</td>\n",
" <td>405994</td>\n",
" <td>2023-09-12 17:42:37.574034+02:00</td>\n",
" <td>2023-09-12 17:42:37.574034+02:00</td>\n",
" <td>NaN</td>\n",
" <td>5516d19b2331db9ad0b11f7e70299575</td>\n",
" <td>2119083</td>\n",
" <td>33158_158_343_68357</td>\n",
" <td>2023-09-12 17:42:45.409824+02:00</td>\n",
" <td>2023-09-12 17:42:45.409824+02:00</td>\n",
" <td>861769</td>\n",
" <td>209880</td>\n",
" <td>False</td>\n",
" <td>1</td>\n",
" <td>1702</td>\n",
" <td>NaN</td>\n",
" <td>8a8d938d66a4dc57bcb44c2773c6fdfa</td>\n",
" </tr>\n",
" <tr>\n",
" <th>...</th>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>318964</th>\n",
" <td>1285206</td>\n",
" <td>2023-10-19 22:14:55+02:00</td>\n",
" <td>354233</td>\n",
" <td>2023-10-21 04:50:44.397308+02:00</td>\n",
" <td>2023-10-21 04:50:44.397308+02:00</td>\n",
" <td>NaN</td>\n",
" <td>819dd5c8b312ee583335f32f481d782a</td>\n",
" <td>2597564</td>\n",
" <td>70649_398_403_168652</td>\n",
" <td>2023-10-21 04:50:44.991960+02:00</td>\n",
" <td>2023-10-21 04:50:44.991960+02:00</td>\n",
" <td>1285206</td>\n",
" <td>270350</td>\n",
" <td>False</td>\n",
" <td>1</td>\n",
" <td>1702</td>\n",
" <td>NaN</td>\n",
" <td>56c452c39089f658ed74a06c96b78725</td>\n",
" </tr>\n",
" <tr>\n",
" <th>318965</th>\n",
" <td>1285209</td>\n",
" <td>2023-10-19 22:59:26+02:00</td>\n",
" <td>517001</td>\n",
" <td>2023-10-21 04:50:44.399870+02:00</td>\n",
" <td>2023-10-21 04:50:44.399870+02:00</td>\n",
" <td>NaN</td>\n",
" <td>ef79fbeb3b80de3529df9c65cb4d4ca2</td>\n",
" <td>2597565</td>\n",
" <td>89203_398_1187_168656</td>\n",
" <td>2023-10-21 04:50:44.993354+02:00</td>\n",
" <td>2023-10-21 04:50:44.993354+02:00</td>\n",
" <td>1285209</td>\n",
" <td>268450</td>\n",
" <td>False</td>\n",
" <td>1</td>\n",
" <td>1702</td>\n",
" <td>NaN</td>\n",
" <td>5ef9912e7b533b8a1b2685db538df7d3</td>\n",
" </tr>\n",
" <tr>\n",
" <th>318966</th>\n",
" <td>1285209</td>\n",
" <td>2023-10-19 22:59:26+02:00</td>\n",
" <td>517001</td>\n",
" <td>2023-10-21 04:50:44.399870+02:00</td>\n",
" <td>2023-10-21 04:50:44.399870+02:00</td>\n",
" <td>NaN</td>\n",
" <td>ef79fbeb3b80de3529df9c65cb4d4ca2</td>\n",
" <td>2597566</td>\n",
" <td>89203_398_1232_168655</td>\n",
" <td>2023-10-21 04:50:44.994301+02:00</td>\n",
" <td>2023-10-21 04:50:44.994301+02:00</td>\n",
" <td>1285209</td>\n",
" <td>272403</td>\n",
" <td>False</td>\n",
" <td>1</td>\n",
" <td>1702</td>\n",
" <td>NaN</td>\n",
" <td>9742a56e9ffbdfb0a31a541dc5ccb889</td>\n",
" </tr>\n",
" <tr>\n",
" <th>318967</th>\n",
" <td>1285209</td>\n",
" <td>2023-10-19 22:59:26+02:00</td>\n",
" <td>517001</td>\n",
" <td>2023-10-21 04:50:44.399870+02:00</td>\n",
" <td>2023-10-21 04:50:44.399870+02:00</td>\n",
" <td>NaN</td>\n",
" <td>ef79fbeb3b80de3529df9c65cb4d4ca2</td>\n",
" <td>2597567</td>\n",
" <td>89203_398_1211_168655</td>\n",
" <td>2023-10-21 04:50:44.995318+02:00</td>\n",
" <td>2023-10-21 04:50:44.995318+02:00</td>\n",
" <td>1285209</td>\n",
" <td>272403</td>\n",
" <td>False</td>\n",
" <td>1</td>\n",
" <td>1702</td>\n",
" <td>NaN</td>\n",
" <td>56a9e032281d7a9c004da644818839cc</td>\n",
" </tr>\n",
" <tr>\n",
" <th>318968</th>\n",
" <td>1285966</td>\n",
" <td>2023-10-21 21:47:20+02:00</td>\n",
" <td>517309</td>\n",
" <td>2023-10-23 03:43:16.458811+02:00</td>\n",
" <td>2023-10-23 03:43:16.458811+02:00</td>\n",
" <td>NaN</td>\n",
" <td>7e825dd352bc6a11ab81cb8068e325e6</td>\n",
" <td>2598260</td>\n",
" <td>89257_401_2652_168793</td>\n",
" <td>2023-10-23 03:43:16.856244+02:00</td>\n",
" <td>2023-10-23 03:43:16.856244+02:00</td>\n",
" <td>1285966</td>\n",
" <td>268428</td>\n",
" <td>False</td>\n",
" <td>1</td>\n",
" <td>1702</td>\n",
" <td>NaN</td>\n",
" <td>86d6c0c2720435206078ac4bbf4f74f1</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"<p>318969 rows × 18 columns</p>\n",
"</div>"
],
"text/plain": [
" id_x purchase_date customer_id \\\n",
"0 861763 2019-03-01 16:29:17+01:00 4966 \n",
"1 861764 2019-03-01 16:29:19+01:00 4966 \n",
"2 861767 2019-03-01 16:33:01+01:00 405994 \n",
"3 861768 2019-03-01 16:33:03+01:00 405994 \n",
"4 861769 2019-03-01 16:33:06+01:00 405994 \n",
"... ... ... ... \n",
"318964 1285206 2023-10-19 22:14:55+02:00 354233 \n",
"318965 1285209 2023-10-19 22:59:26+02:00 517001 \n",
"318966 1285209 2023-10-19 22:59:26+02:00 517001 \n",
"318967 1285209 2023-10-19 22:59:26+02:00 517001 \n",
"318968 1285966 2023-10-21 21:47:20+02:00 517309 \n",
"\n",
" created_at_x updated_at_x \\\n",
"0 2023-09-12 17:42:37.571646+02:00 2023-09-12 17:42:37.571646+02:00 \n",
"1 2023-09-12 17:42:37.572063+02:00 2023-09-12 17:42:37.572063+02:00 \n",
"2 2023-09-12 17:42:37.573280+02:00 2023-09-12 17:42:37.573280+02:00 \n",
"3 2023-09-12 17:42:37.573646+02:00 2023-09-12 17:42:37.573646+02:00 \n",
"4 2023-09-12 17:42:37.574034+02:00 2023-09-12 17:42:37.574034+02:00 \n",
"... ... ... \n",
"318964 2023-10-21 04:50:44.397308+02:00 2023-10-21 04:50:44.397308+02:00 \n",
"318965 2023-10-21 04:50:44.399870+02:00 2023-10-21 04:50:44.399870+02:00 \n",
"318966 2023-10-21 04:50:44.399870+02:00 2023-10-21 04:50:44.399870+02:00 \n",
"318967 2023-10-21 04:50:44.399870+02:00 2023-10-21 04:50:44.399870+02:00 \n",
"318968 2023-10-23 03:43:16.458811+02:00 2023-10-23 03:43:16.458811+02:00 \n",
"\n",
" number_x identifier_x id_y \\\n",
"0 NaN e1155cf26b34f792bdb23e49244d7264 2119082 \n",
"1 NaN e8b95cc6a1a8b103ffa39755ce3bfc4d 2119081 \n",
"2 NaN 6edb259b88fc6f6ae82ede82defaef92 2119084 \n",
"3 NaN 5d3fcb50784bada3731a967ddc9fbba8 2119085 \n",
"4 NaN 5516d19b2331db9ad0b11f7e70299575 2119083 \n",
"... ... ... ... \n",
"318964 NaN 819dd5c8b312ee583335f32f481d782a 2597564 \n",
"318965 NaN ef79fbeb3b80de3529df9c65cb4d4ca2 2597565 \n",
"318966 NaN ef79fbeb3b80de3529df9c65cb4d4ca2 2597566 \n",
"318967 NaN ef79fbeb3b80de3529df9c65cb4d4ca2 2597567 \n",
"318968 NaN 7e825dd352bc6a11ab81cb8068e325e6 2598260 \n",
"\n",
" number_y created_at_y \\\n",
"0 1433_136_194_68356 2023-09-12 17:42:45.409056+02:00 \n",
"1 1433_136_212_68356 2023-09-12 17:42:45.396336+02:00 \n",
"2 33158_158_297_68357 2023-09-12 17:42:45.410447+02:00 \n",
"3 33158_158_318_68357 2023-09-12 17:42:45.411059+02:00 \n",
"4 33158_158_343_68357 2023-09-12 17:42:45.409824+02:00 \n",
"... ... ... \n",
"318964 70649_398_403_168652 2023-10-21 04:50:44.991960+02:00 \n",
"318965 89203_398_1187_168656 2023-10-21 04:50:44.993354+02:00 \n",
"318966 89203_398_1232_168655 2023-10-21 04:50:44.994301+02:00 \n",
"318967 89203_398_1211_168655 2023-10-21 04:50:44.995318+02:00 \n",
"318968 89257_401_2652_168793 2023-10-23 03:43:16.856244+02:00 \n",
"\n",
" updated_at_y purchase_id product_id \\\n",
"0 2023-09-12 17:42:45.409056+02:00 861763 209879 \n",
"1 2023-09-12 17:42:45.396336+02:00 861764 209879 \n",
"2 2023-09-12 17:42:45.410447+02:00 861767 209880 \n",
"3 2023-09-12 17:42:45.411059+02:00 861768 209880 \n",
"4 2023-09-12 17:42:45.409824+02:00 861769 209880 \n",
"... ... ... ... \n",
"318964 2023-10-21 04:50:44.991960+02:00 1285206 270350 \n",
"318965 2023-10-21 04:50:44.993354+02:00 1285209 268450 \n",
"318966 2023-10-21 04:50:44.994301+02:00 1285209 272403 \n",
"318967 2023-10-21 04:50:44.995318+02:00 1285209 272403 \n",
"318968 2023-10-23 03:43:16.856244+02:00 1285966 268428 \n",
"\n",
" is_from_subscription type_of supplier_id barcode \\\n",
"0 False 1 1702 NaN \n",
"1 False 1 1702 NaN \n",
"2 False 1 1702 NaN \n",
"3 False 1 1702 NaN \n",
"4 False 1 1702 NaN \n",
"... ... ... ... ... \n",
"318964 False 1 1702 NaN \n",
"318965 False 1 1702 NaN \n",
"318966 False 1 1702 NaN \n",
"318967 False 1 1702 NaN \n",
"318968 False 1 1702 NaN \n",
"\n",
" identifier_y \n",
"0 838d6101db2fc8bc80536d8b91b49859 \n",
"1 f694c255855ce5643c6fcc7fed5e9237 \n",
"2 b7a3dd0794c0957c942d45b8913e5b96 \n",
"3 d7ea7e443581ebe520dd13f6cad31af7 \n",
"4 8a8d938d66a4dc57bcb44c2773c6fdfa \n",
"... ... \n",
"318964 56c452c39089f658ed74a06c96b78725 \n",
"318965 5ef9912e7b533b8a1b2685db538df7d3 \n",
"318966 9742a56e9ffbdfb0a31a541dc5ccb889 \n",
"318967 56a9e032281d7a9c004da644818839cc \n",
"318968 86d6c0c2720435206078ac4bbf4f74f1 \n",
"\n",
"[318969 rows x 18 columns]"
]
},
"execution_count": 42,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Inner join of purchases and tickets (purchases.id == tickets.purchase_id), shown with every column of both tables.\n",
"purchases.merge(tickets, left_on='id', right_on='purchase_id', how='inner')"
]
},
{
"cell_type": "code",
"execution_count": 43,
"id": "e8f340b3-7519-47e7-a8bb-c8d1b68ca683",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>id_x</th>\n",
" <th>customer_id</th>\n",
" <th>product_id</th>\n",
" <th>purchase_date</th>\n",
" <th>type_of</th>\n",
" <th>is_from_subscription</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>861763</td>\n",
" <td>4966</td>\n",
" <td>209879</td>\n",
" <td>2019-03-01 16:29:17+01:00</td>\n",
" <td>1</td>\n",
" <td>False</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>861764</td>\n",
" <td>4966</td>\n",
" <td>209879</td>\n",
" <td>2019-03-01 16:29:19+01:00</td>\n",
" <td>1</td>\n",
" <td>False</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>861767</td>\n",
" <td>405994</td>\n",
" <td>209880</td>\n",
" <td>2019-03-01 16:33:01+01:00</td>\n",
" <td>1</td>\n",
" <td>False</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>861768</td>\n",
" <td>405994</td>\n",
" <td>209880</td>\n",
" <td>2019-03-01 16:33:03+01:00</td>\n",
" <td>1</td>\n",
" <td>False</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>861769</td>\n",
" <td>405994</td>\n",
" <td>209880</td>\n",
" <td>2019-03-01 16:33:06+01:00</td>\n",
" <td>1</td>\n",
" <td>False</td>\n",
" </tr>\n",
" <tr>\n",
" <th>...</th>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>318964</th>\n",
" <td>1285206</td>\n",
" <td>354233</td>\n",
" <td>270350</td>\n",
" <td>2023-10-19 22:14:55+02:00</td>\n",
" <td>1</td>\n",
" <td>False</td>\n",
" </tr>\n",
" <tr>\n",
" <th>318965</th>\n",
" <td>1285209</td>\n",
" <td>517001</td>\n",
" <td>268450</td>\n",
" <td>2023-10-19 22:59:26+02:00</td>\n",
" <td>1</td>\n",
" <td>False</td>\n",
" </tr>\n",
" <tr>\n",
" <th>318966</th>\n",
" <td>1285209</td>\n",
" <td>517001</td>\n",
" <td>272403</td>\n",
" <td>2023-10-19 22:59:26+02:00</td>\n",
" <td>1</td>\n",
" <td>False</td>\n",
" </tr>\n",
" <tr>\n",
" <th>318967</th>\n",
" <td>1285209</td>\n",
" <td>517001</td>\n",
" <td>272403</td>\n",
" <td>2023-10-19 22:59:26+02:00</td>\n",
" <td>1</td>\n",
" <td>False</td>\n",
" </tr>\n",
" <tr>\n",
" <th>318968</th>\n",
" <td>1285966</td>\n",
" <td>517309</td>\n",
" <td>268428</td>\n",
" <td>2023-10-21 21:47:20+02:00</td>\n",
" <td>1</td>\n",
" <td>False</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"<p>318969 rows × 6 columns</p>\n",
"</div>"
],
"text/plain": [
" id_x customer_id product_id purchase_date type_of \\\n",
"0 861763 4966 209879 2019-03-01 16:29:17+01:00 1 \n",
"1 861764 4966 209879 2019-03-01 16:29:19+01:00 1 \n",
"2 861767 405994 209880 2019-03-01 16:33:01+01:00 1 \n",
"3 861768 405994 209880 2019-03-01 16:33:03+01:00 1 \n",
"4 861769 405994 209880 2019-03-01 16:33:06+01:00 1 \n",
"... ... ... ... ... ... \n",
"318964 1285206 354233 270350 2023-10-19 22:14:55+02:00 1 \n",
"318965 1285209 517001 268450 2023-10-19 22:59:26+02:00 1 \n",
"318966 1285209 517001 272403 2023-10-19 22:59:26+02:00 1 \n",
"318967 1285209 517001 272403 2023-10-19 22:59:26+02:00 1 \n",
"318968 1285966 517309 268428 2023-10-21 21:47:20+02:00 1 \n",
"\n",
" is_from_subscription \n",
"0 False \n",
"1 False \n",
"2 False \n",
"3 False \n",
"4 False \n",
"... ... \n",
"318964 False \n",
"318965 False \n",
"318966 False \n",
"318967 False \n",
"318968 False \n",
"\n",
"[318969 rows x 6 columns]"
]
},
"execution_count": 43,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Link customers to events: attach each ticket to the purchase it belongs to.\n",
"# 'id_x' is the purchases 'id' column (pandas adds the suffix because tickets also has an 'id').\n",
"merge_1 = purchases.merge(\n",
"    tickets, left_on='id', right_on='purchase_id', how='inner'\n",
")[['id_x', 'customer_id', 'product_id', 'purchase_date', 'type_of', 'is_from_subscription']]\n",
"merge_1"
]
},
{
"cell_type": "code",
"execution_count": 46,
"id": "a598b86c-4128-4e5c-ae38-52689f755fd5",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>id_x</th>\n",
" <th>customer_id</th>\n",
" <th>representation_id</th>\n",
" <th>purchase_date</th>\n",
" <th>type_of</th>\n",
" <th>is_from_subscription</th>\n",
" <th>amount</th>\n",
" <th>is_full_price</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>1249847</td>\n",
" <td>7634</td>\n",
" <td>44332</td>\n",
" <td>2023-09-03 18:43:56+02:00</td>\n",
" <td>1</td>\n",
" <td>False</td>\n",
" <td>18.0</td>\n",
" <td>False</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>1249847</td>\n",
" <td>7634</td>\n",
" <td>44332</td>\n",
" <td>2023-09-03 18:43:56+02:00</td>\n",
" <td>1</td>\n",
" <td>False</td>\n",
" <td>18.0</td>\n",
" <td>False</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>1252665</td>\n",
" <td>426962</td>\n",
" <td>44332</td>\n",
" <td>2023-07-06 12:13:08+02:00</td>\n",
" <td>1</td>\n",
" <td>False</td>\n",
" <td>18.0</td>\n",
" <td>False</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>1252665</td>\n",
" <td>426962</td>\n",
" <td>44332</td>\n",
" <td>2023-07-06 12:13:08+02:00</td>\n",
" <td>1</td>\n",
" <td>False</td>\n",
" <td>18.0</td>\n",
" <td>False</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>1252671</td>\n",
" <td>426731</td>\n",
" <td>44332</td>\n",
" <td>2023-07-06 13:10:07+02:00</td>\n",
" <td>3</td>\n",
" <td>False</td>\n",
" <td>18.0</td>\n",
" <td>False</td>\n",
" </tr>\n",
" <tr>\n",
" <th>...</th>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>318964</th>\n",
" <td>1212797</td>\n",
" <td>11092</td>\n",
" <td>33810</td>\n",
" <td>2018-11-28 13:44:22+01:00</td>\n",
" <td>1</td>\n",
" <td>False</td>\n",
" <td>30.0</td>\n",
" <td>False</td>\n",
" </tr>\n",
" <tr>\n",
" <th>318965</th>\n",
" <td>1213476</td>\n",
" <td>25851</td>\n",
" <td>33810</td>\n",
" <td>2018-12-28 16:53:36+01:00</td>\n",
" <td>1</td>\n",
" <td>False</td>\n",
" <td>30.0</td>\n",
" <td>False</td>\n",
" </tr>\n",
" <tr>\n",
" <th>318966</th>\n",
" <td>1226039</td>\n",
" <td>26314</td>\n",
" <td>33810</td>\n",
" <td>2018-12-29 16:30:47+01:00</td>\n",
" <td>1</td>\n",
" <td>False</td>\n",
" <td>30.0</td>\n",
" <td>False</td>\n",
" </tr>\n",
" <tr>\n",
" <th>318967</th>\n",
" <td>1244276</td>\n",
" <td>3104</td>\n",
" <td>33810</td>\n",
" <td>2018-12-31 19:54:09+01:00</td>\n",
" <td>1</td>\n",
" <td>False</td>\n",
" <td>30.0</td>\n",
" <td>False</td>\n",
" </tr>\n",
" <tr>\n",
" <th>318968</th>\n",
" <td>1244285</td>\n",
" <td>86</td>\n",
" <td>33766</td>\n",
" <td>2019-12-31 13:02:47+01:00</td>\n",
" <td>3</td>\n",
" <td>False</td>\n",
" <td>21.0</td>\n",
" <td>False</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"<p>318969 rows × 8 columns</p>\n",
"</div>"
],
"text/plain": [
" id_x customer_id representation_id purchase_date \\\n",
"0 1249847 7634 44332 2023-09-03 18:43:56+02:00 \n",
"1 1249847 7634 44332 2023-09-03 18:43:56+02:00 \n",
"2 1252665 426962 44332 2023-07-06 12:13:08+02:00 \n",
"3 1252665 426962 44332 2023-07-06 12:13:08+02:00 \n",
"4 1252671 426731 44332 2023-07-06 13:10:07+02:00 \n",
"... ... ... ... ... \n",
"318964 1212797 11092 33810 2018-11-28 13:44:22+01:00 \n",
"318965 1213476 25851 33810 2018-12-28 16:53:36+01:00 \n",
"318966 1226039 26314 33810 2018-12-29 16:30:47+01:00 \n",
"318967 1244276 3104 33810 2018-12-31 19:54:09+01:00 \n",
"318968 1244285 86 33766 2019-12-31 13:02:47+01:00 \n",
"\n",
" type_of is_from_subscription amount is_full_price \n",
"0 1 False 18.0 False \n",
"1 1 False 18.0 False \n",
"2 1 False 18.0 False \n",
"3 1 False 18.0 False \n",
"4 3 False 18.0 False \n",
"... ... ... ... ... \n",
"318964 1 False 30.0 False \n",
"318965 1 False 30.0 False \n",
"318966 1 False 30.0 False \n",
"318967 1 False 30.0 False \n",
"318968 3 False 21.0 False \n",
"\n",
"[318969 rows x 8 columns]"
]
},
"execution_count": 46,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Attach product attributes to every ticket row (products.id == merge_1.product_id);\n",
"# representation_id, amount and is_full_price are the columns contributed by products.\n",
"merge_2 = products.merge(\n",
"    merge_1, left_on='id', right_on='product_id', how='inner'\n",
")[['id_x', 'customer_id', 'representation_id', 'purchase_date', 'type_of', 'is_from_subscription', 'amount', 'is_full_price']]\n",
"merge_2"
]
},
{
"cell_type": "code",
"execution_count": 48,
"id": "9d394f79-2615-448e-8ebd-074e225f1584",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>id</th>\n",
" <th>serial</th>\n",
" <th>event_id</th>\n",
" <th>created_at</th>\n",
" <th>updated_at</th>\n",
" <th>start_date_time</th>\n",
" <th>open</th>\n",
" <th>satisfaction</th>\n",
" <th>end_date_time</th>\n",
" <th>name</th>\n",
" <th>...</th>\n",
" <th>extra_field</th>\n",
" <th>identifier</th>\n",
" <th>id_x</th>\n",
" <th>customer_id</th>\n",
" <th>representation_id</th>\n",
" <th>purchase_date</th>\n",
" <th>type_of</th>\n",
" <th>is_from_subscription</th>\n",
" <th>amount</th>\n",
" <th>is_full_price</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>44351</td>\n",
" <td>NaN</td>\n",
" <td>20371</td>\n",
" <td>2023-09-13 03:42:45.245879+02:00</td>\n",
" <td>2023-09-13 03:42:45.245879+02:00</td>\n",
" <td>2023-12-21 20:00:00+01:00</td>\n",
" <td>True</td>\n",
" <td>NaN</td>\n",
" <td>1901-01-01 00:09:21+00:09</td>\n",
" <td>NaN</td>\n",
" <td>...</td>\n",
" <td>NaN</td>\n",
" <td>33520762e8cc28982e3841cbc2be8ce2</td>\n",
" <td>1293590</td>\n",
" <td>627</td>\n",
" <td>44351</td>\n",
" <td>2023-11-08 12:25:21+01:00</td>\n",
" <td>0</td>\n",
" <td>False</td>\n",
" <td>22.4</td>\n",
" <td>False</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>44351</td>\n",
" <td>NaN</td>\n",
" <td>20371</td>\n",
" <td>2023-09-13 03:42:45.245879+02:00</td>\n",
" <td>2023-09-13 03:42:45.245879+02:00</td>\n",
" <td>2023-12-21 20:00:00+01:00</td>\n",
" <td>True</td>\n",
" <td>NaN</td>\n",
" <td>1901-01-01 00:09:21+00:09</td>\n",
" <td>NaN</td>\n",
" <td>...</td>\n",
" <td>NaN</td>\n",
" <td>33520762e8cc28982e3841cbc2be8ce2</td>\n",
" <td>1293590</td>\n",
" <td>627</td>\n",
" <td>44351</td>\n",
" <td>2023-11-08 12:25:21+01:00</td>\n",
" <td>0</td>\n",
" <td>False</td>\n",
" <td>22.4</td>\n",
" <td>False</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>44351</td>\n",
" <td>NaN</td>\n",
" <td>20371</td>\n",
" <td>2023-09-13 03:42:45.245879+02:00</td>\n",
" <td>2023-09-13 03:42:45.245879+02:00</td>\n",
" <td>2023-12-21 20:00:00+01:00</td>\n",
" <td>True</td>\n",
" <td>NaN</td>\n",
" <td>1901-01-01 00:09:21+00:09</td>\n",
" <td>NaN</td>\n",
" <td>...</td>\n",
" <td>NaN</td>\n",
" <td>33520762e8cc28982e3841cbc2be8ce2</td>\n",
" <td>1293590</td>\n",
" <td>627</td>\n",
" <td>44351</td>\n",
" <td>2023-11-08 12:25:21+01:00</td>\n",
" <td>0</td>\n",
" <td>False</td>\n",
" <td>22.4</td>\n",
" <td>False</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>44351</td>\n",
" <td>NaN</td>\n",
" <td>20371</td>\n",
" <td>2023-09-13 03:42:45.245879+02:00</td>\n",
" <td>2023-09-13 03:42:45.245879+02:00</td>\n",
" <td>2023-12-21 20:00:00+01:00</td>\n",
" <td>True</td>\n",
" <td>NaN</td>\n",
" <td>1901-01-01 00:09:21+00:09</td>\n",
" <td>NaN</td>\n",
" <td>...</td>\n",
" <td>NaN</td>\n",
" <td>33520762e8cc28982e3841cbc2be8ce2</td>\n",
" <td>1293590</td>\n",
" <td>627</td>\n",
" <td>44351</td>\n",
" <td>2023-11-08 12:25:21+01:00</td>\n",
" <td>0</td>\n",
" <td>False</td>\n",
" <td>22.4</td>\n",
" <td>False</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>44351</td>\n",
" <td>NaN</td>\n",
" <td>20371</td>\n",
" <td>2023-09-13 03:42:45.245879+02:00</td>\n",
" <td>2023-09-13 03:42:45.245879+02:00</td>\n",
" <td>2023-12-21 20:00:00+01:00</td>\n",
" <td>True</td>\n",
" <td>NaN</td>\n",
" <td>1901-01-01 00:09:21+00:09</td>\n",
" <td>NaN</td>\n",
" <td>...</td>\n",
" <td>NaN</td>\n",
" <td>33520762e8cc28982e3841cbc2be8ce2</td>\n",
" <td>1293590</td>\n",
" <td>627</td>\n",
" <td>44351</td>\n",
" <td>2023-11-08 12:25:21+01:00</td>\n",
" <td>0</td>\n",
" <td>False</td>\n",
" <td>22.4</td>\n",
" <td>False</td>\n",
" </tr>\n",
" <tr>\n",
" <th>...</th>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>318964</th>\n",
" <td>33639</td>\n",
" <td>NaN</td>\n",
" <td>15533</td>\n",
" <td>2023-09-12 17:42:25.455708+02:00</td>\n",
" <td>2023-09-12 17:42:25.455708+02:00</td>\n",
" <td>2023-04-15 17:30:00+02:00</td>\n",
" <td>True</td>\n",
" <td>NaN</td>\n",
" <td>1901-01-01 00:09:21+00:09</td>\n",
" <td>NaN</td>\n",
" <td>...</td>\n",
" <td>NaN</td>\n",
" <td>fae68f1e09710ec8747957af6e22f61d</td>\n",
" <td>1183026</td>\n",
" <td>15258</td>\n",
" <td>33639</td>\n",
" <td>2023-03-26 16:09:31+02:00</td>\n",
" <td>1</td>\n",
" <td>False</td>\n",
" <td>0.0</td>\n",
" <td>False</td>\n",
" </tr>\n",
" <tr>\n",
" <th>318965</th>\n",
" <td>33639</td>\n",
" <td>NaN</td>\n",
" <td>15533</td>\n",
" <td>2023-09-12 17:42:25.455708+02:00</td>\n",
" <td>2023-09-12 17:42:25.455708+02:00</td>\n",
" <td>2023-04-15 17:30:00+02:00</td>\n",
" <td>True</td>\n",
" <td>NaN</td>\n",
" <td>1901-01-01 00:09:21+00:09</td>\n",
" <td>NaN</td>\n",
" <td>...</td>\n",
" <td>NaN</td>\n",
" <td>fae68f1e09710ec8747957af6e22f61d</td>\n",
" <td>1183026</td>\n",
" <td>15258</td>\n",
" <td>33639</td>\n",
" <td>2023-03-26 16:09:31+02:00</td>\n",
" <td>1</td>\n",
" <td>False</td>\n",
" <td>0.0</td>\n",
" <td>False</td>\n",
" </tr>\n",
" <tr>\n",
" <th>318966</th>\n",
" <td>33639</td>\n",
" <td>NaN</td>\n",
" <td>15533</td>\n",
" <td>2023-09-12 17:42:25.455708+02:00</td>\n",
" <td>2023-09-12 17:42:25.455708+02:00</td>\n",
" <td>2023-04-15 17:30:00+02:00</td>\n",
" <td>True</td>\n",
" <td>NaN</td>\n",
" <td>1901-01-01 00:09:21+00:09</td>\n",
" <td>NaN</td>\n",
" <td>...</td>\n",
" <td>NaN</td>\n",
" <td>fae68f1e09710ec8747957af6e22f61d</td>\n",
" <td>1183026</td>\n",
" <td>15258</td>\n",
" <td>33639</td>\n",
" <td>2023-03-26 16:09:31+02:00</td>\n",
" <td>1</td>\n",
" <td>False</td>\n",
" <td>0.0</td>\n",
" <td>False</td>\n",
" </tr>\n",
" <tr>\n",
" <th>318967</th>\n",
" <td>33639</td>\n",
" <td>NaN</td>\n",
" <td>15533</td>\n",
" <td>2023-09-12 17:42:25.455708+02:00</td>\n",
" <td>2023-09-12 17:42:25.455708+02:00</td>\n",
" <td>2023-04-15 17:30:00+02:00</td>\n",
" <td>True</td>\n",
" <td>NaN</td>\n",
" <td>1901-01-01 00:09:21+00:09</td>\n",
" <td>NaN</td>\n",
" <td>...</td>\n",
" <td>NaN</td>\n",
" <td>fae68f1e09710ec8747957af6e22f61d</td>\n",
" <td>1194433</td>\n",
" <td>412831</td>\n",
" <td>33639</td>\n",
" <td>2023-03-27 17:38:59+02:00</td>\n",
" <td>1</td>\n",
" <td>False</td>\n",
" <td>0.0</td>\n",
" <td>False</td>\n",
" </tr>\n",
" <tr>\n",
" <th>318968</th>\n",
" <td>33639</td>\n",
" <td>NaN</td>\n",
" <td>15533</td>\n",
" <td>2023-09-12 17:42:25.455708+02:00</td>\n",
" <td>2023-09-12 17:42:25.455708+02:00</td>\n",
" <td>2023-04-15 17:30:00+02:00</td>\n",
" <td>True</td>\n",
" <td>NaN</td>\n",
" <td>1901-01-01 00:09:21+00:09</td>\n",
" <td>NaN</td>\n",
" <td>...</td>\n",
" <td>NaN</td>\n",
" <td>fae68f1e09710ec8747957af6e22f61d</td>\n",
" <td>1194433</td>\n",
" <td>412831</td>\n",
" <td>33639</td>\n",
" <td>2023-03-27 17:38:59+02:00</td>\n",
" <td>1</td>\n",
" <td>False</td>\n",
" <td>0.0</td>\n",
" <td>False</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"<p>318969 rows × 24 columns</p>\n",
"</div>"
],
"text/plain": [
" id serial event_id created_at \\\n",
"0 44351 NaN 20371 2023-09-13 03:42:45.245879+02:00 \n",
"1 44351 NaN 20371 2023-09-13 03:42:45.245879+02:00 \n",
"2 44351 NaN 20371 2023-09-13 03:42:45.245879+02:00 \n",
"3 44351 NaN 20371 2023-09-13 03:42:45.245879+02:00 \n",
"4 44351 NaN 20371 2023-09-13 03:42:45.245879+02:00 \n",
"... ... ... ... ... \n",
"318964 33639 NaN 15533 2023-09-12 17:42:25.455708+02:00 \n",
"318965 33639 NaN 15533 2023-09-12 17:42:25.455708+02:00 \n",
"318966 33639 NaN 15533 2023-09-12 17:42:25.455708+02:00 \n",
"318967 33639 NaN 15533 2023-09-12 17:42:25.455708+02:00 \n",
"318968 33639 NaN 15533 2023-09-12 17:42:25.455708+02:00 \n",
"\n",
" updated_at start_date_time open \\\n",
"0 2023-09-13 03:42:45.245879+02:00 2023-12-21 20:00:00+01:00 True \n",
"1 2023-09-13 03:42:45.245879+02:00 2023-12-21 20:00:00+01:00 True \n",
"2 2023-09-13 03:42:45.245879+02:00 2023-12-21 20:00:00+01:00 True \n",
"3 2023-09-13 03:42:45.245879+02:00 2023-12-21 20:00:00+01:00 True \n",
"4 2023-09-13 03:42:45.245879+02:00 2023-12-21 20:00:00+01:00 True \n",
"... ... ... ... \n",
"318964 2023-09-12 17:42:25.455708+02:00 2023-04-15 17:30:00+02:00 True \n",
"318965 2023-09-12 17:42:25.455708+02:00 2023-04-15 17:30:00+02:00 True \n",
"318966 2023-09-12 17:42:25.455708+02:00 2023-04-15 17:30:00+02:00 True \n",
"318967 2023-09-12 17:42:25.455708+02:00 2023-04-15 17:30:00+02:00 True \n",
"318968 2023-09-12 17:42:25.455708+02:00 2023-04-15 17:30:00+02:00 True \n",
"\n",
" satisfaction end_date_time name ... extra_field \\\n",
"0 NaN 1901-01-01 00:09:21+00:09 NaN ... NaN \n",
"1 NaN 1901-01-01 00:09:21+00:09 NaN ... NaN \n",
"2 NaN 1901-01-01 00:09:21+00:09 NaN ... NaN \n",
"3 NaN 1901-01-01 00:09:21+00:09 NaN ... NaN \n",
"4 NaN 1901-01-01 00:09:21+00:09 NaN ... NaN \n",
"... ... ... ... ... ... \n",
"318964 NaN 1901-01-01 00:09:21+00:09 NaN ... NaN \n",
"318965 NaN 1901-01-01 00:09:21+00:09 NaN ... NaN \n",
"318966 NaN 1901-01-01 00:09:21+00:09 NaN ... NaN \n",
"318967 NaN 1901-01-01 00:09:21+00:09 NaN ... NaN \n",
"318968 NaN 1901-01-01 00:09:21+00:09 NaN ... NaN \n",
"\n",
" identifier id_x customer_id \\\n",
"0 33520762e8cc28982e3841cbc2be8ce2 1293590 627 \n",
"1 33520762e8cc28982e3841cbc2be8ce2 1293590 627 \n",
"2 33520762e8cc28982e3841cbc2be8ce2 1293590 627 \n",
"3 33520762e8cc28982e3841cbc2be8ce2 1293590 627 \n",
"4 33520762e8cc28982e3841cbc2be8ce2 1293590 627 \n",
"... ... ... ... \n",
"318964 fae68f1e09710ec8747957af6e22f61d 1183026 15258 \n",
"318965 fae68f1e09710ec8747957af6e22f61d 1183026 15258 \n",
"318966 fae68f1e09710ec8747957af6e22f61d 1183026 15258 \n",
"318967 fae68f1e09710ec8747957af6e22f61d 1194433 412831 \n",
"318968 fae68f1e09710ec8747957af6e22f61d 1194433 412831 \n",
"\n",
" representation_id purchase_date type_of \\\n",
"0 44351 2023-11-08 12:25:21+01:00 0 \n",
"1 44351 2023-11-08 12:25:21+01:00 0 \n",
"2 44351 2023-11-08 12:25:21+01:00 0 \n",
"3 44351 2023-11-08 12:25:21+01:00 0 \n",
"4 44351 2023-11-08 12:25:21+01:00 0 \n",
"... ... ... ... \n",
"318964 33639 2023-03-26 16:09:31+02:00 1 \n",
"318965 33639 2023-03-26 16:09:31+02:00 1 \n",
"318966 33639 2023-03-26 16:09:31+02:00 1 \n",
"318967 33639 2023-03-27 17:38:59+02:00 1 \n",
"318968 33639 2023-03-27 17:38:59+02:00 1 \n",
"\n",
" is_from_subscription amount is_full_price \n",
"0 False 22.4 False \n",
"1 False 22.4 False \n",
"2 False 22.4 False \n",
"3 False 22.4 False \n",
"4 False 22.4 False \n",
"... ... ... ... \n",
"318964 False 0.0 False \n",
"318965 False 0.0 False \n",
"318966 False 0.0 False \n",
"318967 False 0.0 False \n",
"318968 False 0.0 False \n",
"\n",
"[318969 rows x 24 columns]"
]
},
"execution_count": 48,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Inner join of representations and merge_2 (representations.id == merge_2.representation_id), shown with every column.\n",
"representations.merge(merge_2, left_on='id', right_on='representation_id', how='inner')"
]
},
{
"cell_type": "code",
"execution_count": 49,
"id": "63bcbfad-fa20-425a-881f-ca9aa212c419",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>id_x</th>\n",
" <th>customer_id</th>\n",
" <th>event_id</th>\n",
" <th>purchase_date</th>\n",
" <th>type_of</th>\n",
" <th>is_from_subscription</th>\n",
" <th>amount</th>\n",
" <th>is_full_price</th>\n",
" <th>start_date_time</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>1293590</td>\n",
" <td>627</td>\n",
" <td>20371</td>\n",
" <td>2023-11-08 12:25:21+01:00</td>\n",
" <td>0</td>\n",
" <td>False</td>\n",
" <td>22.4</td>\n",
" <td>False</td>\n",
" <td>2023-12-21 20:00:00+01:00</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>1293590</td>\n",
" <td>627</td>\n",
" <td>20371</td>\n",
" <td>2023-11-08 12:25:21+01:00</td>\n",
" <td>0</td>\n",
" <td>False</td>\n",
" <td>22.4</td>\n",
" <td>False</td>\n",
" <td>2023-12-21 20:00:00+01:00</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>1293590</td>\n",
" <td>627</td>\n",
" <td>20371</td>\n",
" <td>2023-11-08 12:25:21+01:00</td>\n",
" <td>0</td>\n",
" <td>False</td>\n",
" <td>22.4</td>\n",
" <td>False</td>\n",
" <td>2023-12-21 20:00:00+01:00</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>1293590</td>\n",
" <td>627</td>\n",
" <td>20371</td>\n",
" <td>2023-11-08 12:25:21+01:00</td>\n",
" <td>0</td>\n",
" <td>False</td>\n",
" <td>22.4</td>\n",
" <td>False</td>\n",
" <td>2023-12-21 20:00:00+01:00</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>1293590</td>\n",
" <td>627</td>\n",
" <td>20371</td>\n",
" <td>2023-11-08 12:25:21+01:00</td>\n",
" <td>0</td>\n",
" <td>False</td>\n",
" <td>22.4</td>\n",
" <td>False</td>\n",
" <td>2023-12-21 20:00:00+01:00</td>\n",
" </tr>\n",
" <tr>\n",
" <th>...</th>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>318964</th>\n",
" <td>1183026</td>\n",
" <td>15258</td>\n",
" <td>15533</td>\n",
" <td>2023-03-26 16:09:31+02:00</td>\n",
" <td>1</td>\n",
" <td>False</td>\n",
" <td>0.0</td>\n",
" <td>False</td>\n",
" <td>2023-04-15 17:30:00+02:00</td>\n",
" </tr>\n",
" <tr>\n",
" <th>318965</th>\n",
" <td>1183026</td>\n",
" <td>15258</td>\n",
" <td>15533</td>\n",
" <td>2023-03-26 16:09:31+02:00</td>\n",
" <td>1</td>\n",
" <td>False</td>\n",
" <td>0.0</td>\n",
" <td>False</td>\n",
" <td>2023-04-15 17:30:00+02:00</td>\n",
" </tr>\n",
" <tr>\n",
" <th>318966</th>\n",
" <td>1183026</td>\n",
" <td>15258</td>\n",
" <td>15533</td>\n",
" <td>2023-03-26 16:09:31+02:00</td>\n",
" <td>1</td>\n",
" <td>False</td>\n",
" <td>0.0</td>\n",
" <td>False</td>\n",
" <td>2023-04-15 17:30:00+02:00</td>\n",
" </tr>\n",
" <tr>\n",
" <th>318967</th>\n",
" <td>1194433</td>\n",
" <td>412831</td>\n",
" <td>15533</td>\n",
" <td>2023-03-27 17:38:59+02:00</td>\n",
" <td>1</td>\n",
" <td>False</td>\n",
" <td>0.0</td>\n",
" <td>False</td>\n",
" <td>2023-04-15 17:30:00+02:00</td>\n",
" </tr>\n",
" <tr>\n",
" <th>318968</th>\n",
" <td>1194433</td>\n",
" <td>412831</td>\n",
" <td>15533</td>\n",
" <td>2023-03-27 17:38:59+02:00</td>\n",
" <td>1</td>\n",
" <td>False</td>\n",
" <td>0.0</td>\n",
" <td>False</td>\n",
" <td>2023-04-15 17:30:00+02:00</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"<p>318969 rows × 9 columns</p>\n",
"</div>"
],
"text/plain": [
" id_x customer_id event_id purchase_date type_of \\\n",
"0 1293590 627 20371 2023-11-08 12:25:21+01:00 0 \n",
"1 1293590 627 20371 2023-11-08 12:25:21+01:00 0 \n",
"2 1293590 627 20371 2023-11-08 12:25:21+01:00 0 \n",
"3 1293590 627 20371 2023-11-08 12:25:21+01:00 0 \n",
"4 1293590 627 20371 2023-11-08 12:25:21+01:00 0 \n",
"... ... ... ... ... ... \n",
"318964 1183026 15258 15533 2023-03-26 16:09:31+02:00 1 \n",
"318965 1183026 15258 15533 2023-03-26 16:09:31+02:00 1 \n",
"318966 1183026 15258 15533 2023-03-26 16:09:31+02:00 1 \n",
"318967 1194433 412831 15533 2023-03-27 17:38:59+02:00 1 \n",
"318968 1194433 412831 15533 2023-03-27 17:38:59+02:00 1 \n",
"\n",
" is_from_subscription amount is_full_price start_date_time \n",
"0 False 22.4 False 2023-12-21 20:00:00+01:00 \n",
"1 False 22.4 False 2023-12-21 20:00:00+01:00 \n",
"2 False 22.4 False 2023-12-21 20:00:00+01:00 \n",
"3 False 22.4 False 2023-12-21 20:00:00+01:00 \n",
"4 False 22.4 False 2023-12-21 20:00:00+01:00 \n",
"... ... ... ... ... \n",
"318964 False 0.0 False 2023-04-15 17:30:00+02:00 \n",
"318965 False 0.0 False 2023-04-15 17:30:00+02:00 \n",
"318966 False 0.0 False 2023-04-15 17:30:00+02:00 \n",
"318967 False 0.0 False 2023-04-15 17:30:00+02:00 \n",
"318968 False 0.0 False 2023-04-15 17:30:00+02:00 \n",
"\n",
"[318969 rows x 9 columns]"
]
},
"execution_count": 49,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Inner-join representations onto merge_2 (representations.id == merge_2.representation_id)\n",
"# and keep only the nine columns listed below.\n",
"# NOTE(review): the stored output shows several identical rows sharing the same id_x;\n",
"# check whether an upstream merge duplicates keys before relying on row counts.\n",
"merge_3 = (\n",
"    representations\n",
"    .merge(merge_2, left_on='id', right_on='representation_id', how='inner')\n",
"    [[\n",
"        'id_x', 'customer_id', 'event_id', 'purchase_date', 'type_of',\n",
"        'is_from_subscription', 'amount', 'is_full_price', 'start_date_time',\n",
"    ]]\n",
")\n",
"merge_3"
]
},
{
"cell_type": "code",
"execution_count": 51,
"id": "db52559b-6562-439b-b16e-f5d8dc9bc891",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>id</th>\n",
" <th>created_at</th>\n",
" <th>updated_at</th>\n",
" <th>season_id</th>\n",
" <th>facility_id</th>\n",
" <th>name</th>\n",
" <th>event_type_id</th>\n",
" <th>manual_added</th>\n",
" <th>is_display</th>\n",
" <th>event_type_key_id</th>\n",
" <th>...</th>\n",
" <th>identifier</th>\n",
" <th>id_x</th>\n",
" <th>customer_id</th>\n",
" <th>event_id</th>\n",
" <th>purchase_date</th>\n",
" <th>type_of</th>\n",
" <th>is_from_subscription</th>\n",
" <th>amount</th>\n",
" <th>is_full_price</th>\n",
" <th>start_date_time</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>20367</td>\n",
" <td>2023-09-13 03:42:45.214293+02:00</td>\n",
" <td>2023-09-13 03:54:30.086969+02:00</td>\n",
" <td>1865</td>\n",
" <td>1054</td>\n",
" <td>marelle</td>\n",
" <td>1055</td>\n",
" <td>False</td>\n",
" <td>True</td>\n",
" <td>1055</td>\n",
" <td>...</td>\n",
" <td>26d1e9a4acad18b9cf79244334c86c93</td>\n",
" <td>1253614</td>\n",
" <td>432123</td>\n",
" <td>20367</td>\n",
" <td>2023-09-07 18:02:58+02:00</td>\n",
" <td>3</td>\n",
" <td>False</td>\n",
" <td>2.0</td>\n",
" <td>False</td>\n",
" <td>2023-11-29 14:30:00+01:00</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>20367</td>\n",
" <td>2023-09-13 03:42:45.214293+02:00</td>\n",
" <td>2023-09-13 03:54:30.086969+02:00</td>\n",
" <td>1865</td>\n",
" <td>1054</td>\n",
" <td>marelle</td>\n",
" <td>1055</td>\n",
" <td>False</td>\n",
" <td>True</td>\n",
" <td>1055</td>\n",
" <td>...</td>\n",
" <td>26d1e9a4acad18b9cf79244334c86c93</td>\n",
" <td>1253614</td>\n",
" <td>432123</td>\n",
" <td>20367</td>\n",
" <td>2023-09-07 18:02:58+02:00</td>\n",
" <td>3</td>\n",
" <td>False</td>\n",
" <td>2.0</td>\n",
" <td>False</td>\n",
" <td>2023-11-29 14:30:00+01:00</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>20367</td>\n",
" <td>2023-09-13 03:42:45.214293+02:00</td>\n",
" <td>2023-09-13 03:54:30.086969+02:00</td>\n",
" <td>1865</td>\n",
" <td>1054</td>\n",
" <td>marelle</td>\n",
" <td>1055</td>\n",
" <td>False</td>\n",
" <td>True</td>\n",
" <td>1055</td>\n",
" <td>...</td>\n",
" <td>26d1e9a4acad18b9cf79244334c86c93</td>\n",
" <td>1252930</td>\n",
" <td>431824</td>\n",
" <td>20367</td>\n",
" <td>2023-09-06 16:06:40+02:00</td>\n",
" <td>1</td>\n",
" <td>False</td>\n",
" <td>5.0</td>\n",
" <td>False</td>\n",
" <td>2023-11-29 14:30:00+01:00</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>20367</td>\n",
" <td>2023-09-13 03:42:45.214293+02:00</td>\n",
" <td>2023-09-13 03:54:30.086969+02:00</td>\n",
" <td>1865</td>\n",
" <td>1054</td>\n",
" <td>marelle</td>\n",
" <td>1055</td>\n",
" <td>False</td>\n",
" <td>True</td>\n",
" <td>1055</td>\n",
" <td>...</td>\n",
" <td>26d1e9a4acad18b9cf79244334c86c93</td>\n",
" <td>1252931</td>\n",
" <td>431824</td>\n",
" <td>20367</td>\n",
" <td>2023-09-06 16:06:42+02:00</td>\n",
" <td>1</td>\n",
" <td>False</td>\n",
" <td>5.0</td>\n",
" <td>False</td>\n",
" <td>2023-11-29 14:30:00+01:00</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>20367</td>\n",
" <td>2023-09-13 03:42:45.214293+02:00</td>\n",
" <td>2023-09-13 03:54:30.086969+02:00</td>\n",
" <td>1865</td>\n",
" <td>1054</td>\n",
" <td>marelle</td>\n",
" <td>1055</td>\n",
" <td>False</td>\n",
" <td>True</td>\n",
" <td>1055</td>\n",
" <td>...</td>\n",
" <td>26d1e9a4acad18b9cf79244334c86c93</td>\n",
" <td>1252932</td>\n",
" <td>431824</td>\n",
" <td>20367</td>\n",
" <td>2023-09-06 16:06:44+02:00</td>\n",
" <td>1</td>\n",
" <td>False</td>\n",
" <td>5.0</td>\n",
" <td>False</td>\n",
" <td>2023-11-29 14:30:00+01:00</td>\n",
" </tr>\n",
" <tr>\n",
" <th>...</th>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>318964</th>\n",
" <td>15439</td>\n",
" <td>2023-09-12 17:42:25.252747+02:00</td>\n",
" <td>2023-09-12 19:00:00.735990+02:00</td>\n",
" <td>1708</td>\n",
" <td>1054</td>\n",
" <td>florilege</td>\n",
" <td>1055</td>\n",
" <td>False</td>\n",
" <td>True</td>\n",
" <td>1055</td>\n",
" <td>...</td>\n",
" <td>4f015946bcbd856aa573cadb7ac42b9f</td>\n",
" <td>1206691</td>\n",
" <td>358863</td>\n",
" <td>15439</td>\n",
" <td>2023-03-28 17:53:40+02:00</td>\n",
" <td>3</td>\n",
" <td>False</td>\n",
" <td>4.0</td>\n",
" <td>False</td>\n",
" <td>2023-03-29 20:00:00+02:00</td>\n",
" </tr>\n",
" <tr>\n",
" <th>318965</th>\n",
" <td>15439</td>\n",
" <td>2023-09-12 17:42:25.252747+02:00</td>\n",
" <td>2023-09-12 19:00:00.735990+02:00</td>\n",
" <td>1708</td>\n",
" <td>1054</td>\n",
" <td>florilege</td>\n",
" <td>1055</td>\n",
" <td>False</td>\n",
" <td>True</td>\n",
" <td>1055</td>\n",
" <td>...</td>\n",
" <td>4f015946bcbd856aa573cadb7ac42b9f</td>\n",
" <td>1218071</td>\n",
" <td>413015</td>\n",
" <td>15439</td>\n",
" <td>2023-03-29 17:01:01+02:00</td>\n",
" <td>1</td>\n",
" <td>False</td>\n",
" <td>4.0</td>\n",
" <td>False</td>\n",
" <td>2023-03-29 20:00:00+02:00</td>\n",
" </tr>\n",
" <tr>\n",
" <th>318966</th>\n",
" <td>15439</td>\n",
" <td>2023-09-12 17:42:25.252747+02:00</td>\n",
" <td>2023-09-12 19:00:00.735990+02:00</td>\n",
" <td>1708</td>\n",
" <td>1054</td>\n",
" <td>florilege</td>\n",
" <td>1055</td>\n",
" <td>False</td>\n",
" <td>True</td>\n",
" <td>1055</td>\n",
" <td>...</td>\n",
" <td>4f015946bcbd856aa573cadb7ac42b9f</td>\n",
" <td>1218125</td>\n",
" <td>344045</td>\n",
" <td>15439</td>\n",
" <td>2023-03-29 18:20:05+02:00</td>\n",
" <td>1</td>\n",
" <td>False</td>\n",
" <td>4.0</td>\n",
" <td>False</td>\n",
" <td>2023-03-29 20:00:00+02:00</td>\n",
" </tr>\n",
" <tr>\n",
" <th>318967</th>\n",
" <td>15439</td>\n",
" <td>2023-09-12 17:42:25.252747+02:00</td>\n",
" <td>2023-09-12 19:00:00.735990+02:00</td>\n",
" <td>1708</td>\n",
" <td>1054</td>\n",
" <td>florilege</td>\n",
" <td>1055</td>\n",
" <td>False</td>\n",
" <td>True</td>\n",
" <td>1055</td>\n",
" <td>...</td>\n",
" <td>4f015946bcbd856aa573cadb7ac42b9f</td>\n",
" <td>1218185</td>\n",
" <td>381006</td>\n",
" <td>15439</td>\n",
" <td>2023-03-29 19:50:18+02:00</td>\n",
" <td>1</td>\n",
" <td>False</td>\n",
" <td>4.0</td>\n",
" <td>False</td>\n",
" <td>2023-03-29 20:00:00+02:00</td>\n",
" </tr>\n",
" <tr>\n",
" <th>318968</th>\n",
" <td>15439</td>\n",
" <td>2023-09-12 17:42:25.252747+02:00</td>\n",
" <td>2023-09-12 19:00:00.735990+02:00</td>\n",
" <td>1708</td>\n",
" <td>1054</td>\n",
" <td>florilege</td>\n",
" <td>1055</td>\n",
" <td>False</td>\n",
" <td>True</td>\n",
" <td>1055</td>\n",
" <td>...</td>\n",
" <td>4f015946bcbd856aa573cadb7ac42b9f</td>\n",
" <td>1239074</td>\n",
" <td>4512</td>\n",
" <td>15439</td>\n",
" <td>2023-01-31 16:14:27+01:00</td>\n",
" <td>1</td>\n",
" <td>False</td>\n",
" <td>4.0</td>\n",
" <td>False</td>\n",
" <td>2023-03-29 20:00:00+02:00</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"<p>318969 rows × 21 columns</p>\n",
"</div>"
],
"text/plain": [
" id created_at \\\n",
"0 20367 2023-09-13 03:42:45.214293+02:00 \n",
"1 20367 2023-09-13 03:42:45.214293+02:00 \n",
"2 20367 2023-09-13 03:42:45.214293+02:00 \n",
"3 20367 2023-09-13 03:42:45.214293+02:00 \n",
"4 20367 2023-09-13 03:42:45.214293+02:00 \n",
"... ... ... \n",
"318964 15439 2023-09-12 17:42:25.252747+02:00 \n",
"318965 15439 2023-09-12 17:42:25.252747+02:00 \n",
"318966 15439 2023-09-12 17:42:25.252747+02:00 \n",
"318967 15439 2023-09-12 17:42:25.252747+02:00 \n",
"318968 15439 2023-09-12 17:42:25.252747+02:00 \n",
"\n",
" updated_at season_id facility_id name \\\n",
"0 2023-09-13 03:54:30.086969+02:00 1865 1054 marelle \n",
"1 2023-09-13 03:54:30.086969+02:00 1865 1054 marelle \n",
"2 2023-09-13 03:54:30.086969+02:00 1865 1054 marelle \n",
"3 2023-09-13 03:54:30.086969+02:00 1865 1054 marelle \n",
"4 2023-09-13 03:54:30.086969+02:00 1865 1054 marelle \n",
"... ... ... ... ... \n",
"318964 2023-09-12 19:00:00.735990+02:00 1708 1054 florilege \n",
"318965 2023-09-12 19:00:00.735990+02:00 1708 1054 florilege \n",
"318966 2023-09-12 19:00:00.735990+02:00 1708 1054 florilege \n",
"318967 2023-09-12 19:00:00.735990+02:00 1708 1054 florilege \n",
"318968 2023-09-12 19:00:00.735990+02:00 1708 1054 florilege \n",
"\n",
" event_type_id manual_added is_display event_type_key_id ... \\\n",
"0 1055 False True 1055 ... \n",
"1 1055 False True 1055 ... \n",
"2 1055 False True 1055 ... \n",
"3 1055 False True 1055 ... \n",
"4 1055 False True 1055 ... \n",
"... ... ... ... ... ... \n",
"318964 1055 False True 1055 ... \n",
"318965 1055 False True 1055 ... \n",
"318966 1055 False True 1055 ... \n",
"318967 1055 False True 1055 ... \n",
"318968 1055 False True 1055 ... \n",
"\n",
" identifier id_x customer_id event_id \\\n",
"0 26d1e9a4acad18b9cf79244334c86c93 1253614 432123 20367 \n",
"1 26d1e9a4acad18b9cf79244334c86c93 1253614 432123 20367 \n",
"2 26d1e9a4acad18b9cf79244334c86c93 1252930 431824 20367 \n",
"3 26d1e9a4acad18b9cf79244334c86c93 1252931 431824 20367 \n",
"4 26d1e9a4acad18b9cf79244334c86c93 1252932 431824 20367 \n",
"... ... ... ... ... \n",
"318964 4f015946bcbd856aa573cadb7ac42b9f 1206691 358863 15439 \n",
"318965 4f015946bcbd856aa573cadb7ac42b9f 1218071 413015 15439 \n",
"318966 4f015946bcbd856aa573cadb7ac42b9f 1218125 344045 15439 \n",
"318967 4f015946bcbd856aa573cadb7ac42b9f 1218185 381006 15439 \n",
"318968 4f015946bcbd856aa573cadb7ac42b9f 1239074 4512 15439 \n",
"\n",
" purchase_date type_of is_from_subscription amount \\\n",
"0 2023-09-07 18:02:58+02:00 3 False 2.0 \n",
"1 2023-09-07 18:02:58+02:00 3 False 2.0 \n",
"2 2023-09-06 16:06:40+02:00 1 False 5.0 \n",
"3 2023-09-06 16:06:42+02:00 1 False 5.0 \n",
"4 2023-09-06 16:06:44+02:00 1 False 5.0 \n",
"... ... ... ... ... \n",
"318964 2023-03-28 17:53:40+02:00 3 False 4.0 \n",
"318965 2023-03-29 17:01:01+02:00 1 False 4.0 \n",
"318966 2023-03-29 18:20:05+02:00 1 False 4.0 \n",
"318967 2023-03-29 19:50:18+02:00 1 False 4.0 \n",
"318968 2023-01-31 16:14:27+01:00 1 False 4.0 \n",
"\n",
" is_full_price start_date_time \n",
"0 False 2023-11-29 14:30:00+01:00 \n",
"1 False 2023-11-29 14:30:00+01:00 \n",
"2 False 2023-11-29 14:30:00+01:00 \n",
"3 False 2023-11-29 14:30:00+01:00 \n",
"4 False 2023-11-29 14:30:00+01:00 \n",
"... ... ... \n",
"318964 False 2023-03-29 20:00:00+02:00 \n",
"318965 False 2023-03-29 20:00:00+02:00 \n",
"318966 False 2023-03-29 20:00:00+02:00 \n",
"318967 False 2023-03-29 20:00:00+02:00 \n",
"318968 False 2023-03-29 20:00:00+02:00 \n",
"\n",
"[318969 rows x 21 columns]"
]
},
"execution_count": 51,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Preview the full inner join of events with merge_3 (events.id == merge_3.event_id)\n",
"# with all columns kept, to inspect what the join produces.\n",
"events.merge(merge_3, left_on='id', right_on='event_id', how='inner')"
]
},
{
"cell_type": "code",
"execution_count": 54,
"id": "d8ab2477-c199-4815-88d9-c5683e466772",
"metadata": {},
"outputs": [],
"source": [
"# Inner-join events onto merge_3 (events.id == merge_3.event_id), keep the listed\n",
"# columns, and expose the events 'name' column as 'event_name'.\n",
"merge_4 = (\n",
"    events\n",
"    .merge(merge_3, left_on='id', right_on='event_id', how='inner')\n",
"    [[\n",
"        'id_x', 'customer_id', 'purchase_date', 'type_of', 'is_from_subscription',\n",
"        'amount', 'is_full_price', 'start_date_time', 'name',\n",
"    ]]\n",
"    .rename(columns={'name': 'event_name'})\n",
")"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "c0917b77-6a73-4ae3-a58a-0bb7964f1406",
"metadata": {},
"outputs": [],
"source": [
"# Join customersplus onto merge_4 (customersplus.id == merge_4.customer_id).\n",
"# The statement previously ended right after `how =`, which is a SyntaxError; 'inner'\n",
"# matches the join type used by the other merge_* cells.\n",
"# NOTE(review): switch to how='left' if customers without any row in merge_4 must be kept.\n",
"merge_5 = pd.merge(customersplus, merge_4, left_on='id', right_on='customer_id', how='inner')"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.10.13"
}
},
"nbformat": 4,
"nbformat_minor": 5
}