3490 lines
132 KiB
Plaintext
3490 lines
132 KiB
Plaintext
{
|
||
"cells": [
|
||
{
|
||
"cell_type": "markdown",
|
||
"id": "56b3d44e-1e3f-4726-9916-0f9af107860e",
|
||
"metadata": {},
|
||
"source": [
|
||
"# Business Data Challenge - Team 1"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 1,
|
||
"id": "15103481-8d74-404c-aa09-7601fe7730da",
|
||
"metadata": {},
|
||
"outputs": [],
|
||
"source": [
|
||
"import pandas as pd\n",
|
||
"import numpy as np"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "markdown",
|
||
"id": "c3bb0d13-34b2-4e1c-9985-468cd87c5a0e",
|
||
"metadata": {},
|
||
"source": [
|
||
"Configuration de l'accès aux données"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 2,
|
||
"id": "5d83bb1a-d341-446e-91f6-1c428607f6d4",
|
||
"metadata": {},
|
||
"outputs": [],
|
||
"source": [
|
||
"import os\n",
|
||
"import s3fs\n",
|
||
"# Build the S3 filesystem client, pointed at the endpoint named by the AWS_S3_ENDPOINT environment variable\n",
|
||
"S3_ENDPOINT_URL = f\"https://{os.environ['AWS_S3_ENDPOINT']}\"\n",
|
||
"fs = s3fs.S3FileSystem(client_kwargs=dict(endpoint_url=S3_ENDPOINT_URL))"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "markdown",
|
||
"id": "f99da24f-0d93-4618-92bc-3ba81dc0445c",
|
||
"metadata": {},
|
||
"source": [
|
||
"## Exemple sur bdc2324-data/11"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 10,
|
||
"id": "699664b9-eee4-4f8d-a207-e524526560c5",
|
||
"metadata": {},
|
||
"outputs": [
|
||
{
|
||
"data": {
|
||
"text/plain": [
|
||
"['bdc2324-data/11/11campaign_stats.csv',\n",
|
||
" 'bdc2324-data/11/11campaigns.csv',\n",
|
||
" 'bdc2324-data/11/11categories.csv',\n",
|
||
" 'bdc2324-data/11/11countries.csv',\n",
|
||
" 'bdc2324-data/11/11currencies.csv',\n",
|
||
" 'bdc2324-data/11/11customer_target_mappings.csv',\n",
|
||
" 'bdc2324-data/11/11customersplus.csv',\n",
|
||
" 'bdc2324-data/11/11event_types.csv',\n",
|
||
" 'bdc2324-data/11/11events.csv',\n",
|
||
" 'bdc2324-data/11/11facilities.csv',\n",
|
||
" 'bdc2324-data/11/11link_stats.csv',\n",
|
||
" 'bdc2324-data/11/11pricing_formulas.csv',\n",
|
||
" 'bdc2324-data/11/11product_packs.csv',\n",
|
||
" 'bdc2324-data/11/11products.csv',\n",
|
||
" 'bdc2324-data/11/11products_groups.csv',\n",
|
||
" 'bdc2324-data/11/11purchases.csv',\n",
|
||
" 'bdc2324-data/11/11representation_category_capacities.csv',\n",
|
||
" 'bdc2324-data/11/11representations.csv',\n",
|
||
" 'bdc2324-data/11/11seasons.csv',\n",
|
||
" 'bdc2324-data/11/11structure_tag_mappings.csv',\n",
|
||
" 'bdc2324-data/11/11suppliers.csv',\n",
|
||
" 'bdc2324-data/11/11tags.csv',\n",
|
||
" 'bdc2324-data/11/11target_types.csv',\n",
|
||
" 'bdc2324-data/11/11targets.csv',\n",
|
||
" 'bdc2324-data/11/11tickets.csv']"
|
||
]
|
||
},
|
||
"execution_count": 10,
|
||
"metadata": {},
|
||
"output_type": "execute_result"
|
||
}
|
||
],
|
||
"source": [
|
||
"# Prefix (bucket/folder) under which this dataset's CSV files are stored\n",
|
||
"BUCKET = \"bdc2324-data/11\"\n",
|
||
"# List the objects available under that prefix\n",
|
||
"fs.ls(BUCKET)"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 23,
|
||
"id": "6d6201cd-a00b-4984-bcd8-72838717ad13",
|
||
"metadata": {},
|
||
"outputs": [],
|
||
"source": [
|
||
"# Load every table used in the analysis from S3\n",
|
||
"liste_base = [\n",
|
||
"    'customer_target_mappings', 'customersplus', 'target_types', 'tags', 'events',\n",
|
||
"    'tickets', 'representations', 'purchases', 'products',\n",
|
||
"]\n",
|
||
"\n",
|
||
"for nom_base in liste_base:\n",
|
||
"    # Files live under BUCKET (defined in the listing cell) and are named '11<table>.csv'\n",
|
||
"    FILE_PATH_S3 = f\"{BUCKET}/11{nom_base}.csv\"\n",
|
||
"    with fs.open(FILE_PATH_S3, mode=\"rb\") as file_in:\n",
|
||
"        # Bind each DataFrame to a module-level name equal to its entry in liste_base\n",
|
||
"        # (e.g. `purchases`), because the following cells refer to the tables by those names\n",
|
||
"        globals()[nom_base] = pd.read_csv(file_in, sep=\",\")"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 24,
|
||
"id": "afe548fe-d93c-4634-9f53-881404ec4c6c",
|
||
"metadata": {},
|
||
"outputs": [
|
||
{
|
||
"data": {
|
||
"text/html": [
|
||
"<div>\n",
|
||
"<style scoped>\n",
|
||
" .dataframe tbody tr th:only-of-type {\n",
|
||
" vertical-align: middle;\n",
|
||
" }\n",
|
||
"\n",
|
||
" .dataframe tbody tr th {\n",
|
||
" vertical-align: top;\n",
|
||
" }\n",
|
||
"\n",
|
||
" .dataframe thead th {\n",
|
||
" text-align: right;\n",
|
||
" }\n",
|
||
"</style>\n",
|
||
"<table border=\"1\" class=\"dataframe\">\n",
|
||
" <thead>\n",
|
||
" <tr style=\"text-align: right;\">\n",
|
||
" <th></th>\n",
|
||
" <th>id_x</th>\n",
|
||
" <th>purchase_date</th>\n",
|
||
" <th>type_of</th>\n",
|
||
" <th>is_from_subscription</th>\n",
|
||
" <th>amount</th>\n",
|
||
" <th>is_full_price</th>\n",
|
||
" <th>start_date_time</th>\n",
|
||
" <th>event_name</th>\n",
|
||
" </tr>\n",
|
||
" </thead>\n",
|
||
" <tbody>\n",
|
||
" <tr>\n",
|
||
" <th>0</th>\n",
|
||
" <td>992423</td>\n",
|
||
" <td>2023-01-11 17:08:41+01:00</td>\n",
|
||
" <td>3</td>\n",
|
||
" <td>False</td>\n",
|
||
" <td>13.0</td>\n",
|
||
" <td>False</td>\n",
|
||
" <td>2023-02-06 20:00:00+01:00</td>\n",
|
||
" <td>zaide</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>1</th>\n",
|
||
" <td>992423</td>\n",
|
||
" <td>2023-01-11 17:08:41+01:00</td>\n",
|
||
" <td>3</td>\n",
|
||
" <td>False</td>\n",
|
||
" <td>13.0</td>\n",
|
||
" <td>False</td>\n",
|
||
" <td>2023-02-06 20:00:00+01:00</td>\n",
|
||
" <td>zaide</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>2</th>\n",
|
||
" <td>1053934</td>\n",
|
||
" <td>2023-03-16 16:23:10+01:00</td>\n",
|
||
" <td>3</td>\n",
|
||
" <td>False</td>\n",
|
||
" <td>62.0</td>\n",
|
||
" <td>False</td>\n",
|
||
" <td>2023-03-19 16:00:00+01:00</td>\n",
|
||
" <td>luisa miller</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>3</th>\n",
|
||
" <td>1053934</td>\n",
|
||
" <td>2023-03-16 16:23:10+01:00</td>\n",
|
||
" <td>3</td>\n",
|
||
" <td>False</td>\n",
|
||
" <td>62.0</td>\n",
|
||
" <td>False</td>\n",
|
||
" <td>2023-03-19 16:00:00+01:00</td>\n",
|
||
" <td>luisa miller</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>4</th>\n",
|
||
" <td>1189141</td>\n",
|
||
" <td>2020-11-26 13:12:53+01:00</td>\n",
|
||
" <td>3</td>\n",
|
||
" <td>False</td>\n",
|
||
" <td>51.3</td>\n",
|
||
" <td>False</td>\n",
|
||
" <td>2020-12-01 20:00:00+01:00</td>\n",
|
||
" <td>iphigenie en tauride</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>...</th>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>318964</th>\n",
|
||
" <td>1090839</td>\n",
|
||
" <td>2019-05-19 21:18:36+02:00</td>\n",
|
||
" <td>1</td>\n",
|
||
" <td>False</td>\n",
|
||
" <td>4.5</td>\n",
|
||
" <td>False</td>\n",
|
||
" <td>2019-05-27 20:00:00+02:00</td>\n",
|
||
" <td>entre femmes</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>318965</th>\n",
|
||
" <td>1090839</td>\n",
|
||
" <td>2019-05-19 21:18:36+02:00</td>\n",
|
||
" <td>1</td>\n",
|
||
" <td>False</td>\n",
|
||
" <td>4.5</td>\n",
|
||
" <td>False</td>\n",
|
||
" <td>2019-05-27 20:00:00+02:00</td>\n",
|
||
" <td>entre femmes</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>318966</th>\n",
|
||
" <td>1090839</td>\n",
|
||
" <td>2019-05-19 21:18:36+02:00</td>\n",
|
||
" <td>1</td>\n",
|
||
" <td>False</td>\n",
|
||
" <td>4.5</td>\n",
|
||
" <td>False</td>\n",
|
||
" <td>2019-05-27 20:00:00+02:00</td>\n",
|
||
" <td>entre femmes</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>318967</th>\n",
|
||
" <td>1244277</td>\n",
|
||
" <td>2019-12-31 11:04:07+01:00</td>\n",
|
||
" <td>1</td>\n",
|
||
" <td>False</td>\n",
|
||
" <td>5.5</td>\n",
|
||
" <td>False</td>\n",
|
||
" <td>2020-02-03 20:00:00+01:00</td>\n",
|
||
" <td>a boire et a manger</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>318968</th>\n",
|
||
" <td>1244277</td>\n",
|
||
" <td>2019-12-31 11:04:07+01:00</td>\n",
|
||
" <td>1</td>\n",
|
||
" <td>False</td>\n",
|
||
" <td>5.5</td>\n",
|
||
" <td>False</td>\n",
|
||
" <td>2020-02-03 20:00:00+01:00</td>\n",
|
||
" <td>a boire et a manger</td>\n",
|
||
" </tr>\n",
|
||
" </tbody>\n",
|
||
"</table>\n",
|
||
"<p>318969 rows × 8 columns</p>\n",
|
||
"</div>"
|
||
],
|
||
"text/plain": [
|
||
" id_x purchase_date type_of is_from_subscription \\\n",
|
||
"0 992423 2023-01-11 17:08:41+01:00 3 False \n",
|
||
"1 992423 2023-01-11 17:08:41+01:00 3 False \n",
|
||
"2 1053934 2023-03-16 16:23:10+01:00 3 False \n",
|
||
"3 1053934 2023-03-16 16:23:10+01:00 3 False \n",
|
||
"4 1189141 2020-11-26 13:12:53+01:00 3 False \n",
|
||
"... ... ... ... ... \n",
|
||
"318964 1090839 2019-05-19 21:18:36+02:00 1 False \n",
|
||
"318965 1090839 2019-05-19 21:18:36+02:00 1 False \n",
|
||
"318966 1090839 2019-05-19 21:18:36+02:00 1 False \n",
|
||
"318967 1244277 2019-12-31 11:04:07+01:00 1 False \n",
|
||
"318968 1244277 2019-12-31 11:04:07+01:00 1 False \n",
|
||
"\n",
|
||
" amount is_full_price start_date_time event_name \n",
|
||
"0 13.0 False 2023-02-06 20:00:00+01:00 zaide \n",
|
||
"1 13.0 False 2023-02-06 20:00:00+01:00 zaide \n",
|
||
"2 62.0 False 2023-03-19 16:00:00+01:00 luisa miller \n",
|
||
"3 62.0 False 2023-03-19 16:00:00+01:00 luisa miller \n",
|
||
"4 51.3 False 2020-12-01 20:00:00+01:00 iphigenie en tauride \n",
|
||
"... ... ... ... ... \n",
|
||
"318964 4.5 False 2019-05-27 20:00:00+02:00 entre femmes \n",
|
||
"318965 4.5 False 2019-05-27 20:00:00+02:00 entre femmes \n",
|
||
"318966 4.5 False 2019-05-27 20:00:00+02:00 entre femmes \n",
|
||
"318967 5.5 False 2020-02-03 20:00:00+01:00 a boire et a manger \n",
|
||
"318968 5.5 False 2020-02-03 20:00:00+01:00 a boire et a manger \n",
|
||
"\n",
|
||
"[318969 rows x 8 columns]"
|
||
]
|
||
},
|
||
"execution_count": 24,
|
||
"metadata": {},
|
||
"output_type": "execute_result"
|
||
}
|
||
],
|
||
"source": [
|
||
"# Successive inner joins: purchases -> tickets -> products -> representations -> events -> customers.\n",
|
||
"# var_choosed is the running list of columns kept after each join.\n",
|
||
"# 'id_x' is the left-hand (purchases) 'id' as suffixed by the first merge.\n",
|
||
"var_choosed = ['id_x', 'customer_id', 'product_id', 'purchase_date', 'type_of', 'is_from_subscription']\n",
|
||
"merge_1 = pd.merge(purchases, tickets, left_on='id', right_on='purchase_id', how='inner')[var_choosed]\n",
|
||
"\n",
|
||
"var_choosed.extend(['amount', 'is_full_price', 'representation_id'])\n",
|
||
"merge_2 = pd.merge(products, merge_1, left_on='id', right_on='product_id', how='inner')[var_choosed]\n",
|
||
"\n",
|
||
"# representation_id is only needed as the key of the next join\n",
|
||
"var_choosed.remove('representation_id')\n",
|
||
"var_choosed.extend(['start_date_time', 'event_id'])\n",
|
||
"merge_3 = pd.merge(representations, merge_2, left_on='id', right_on='representation_id', how='inner')[var_choosed]\n",
|
||
"\n",
|
||
"# event_id is only needed as the key of the next join\n",
|
||
"var_choosed.remove('event_id')\n",
|
||
"# Only 'name' is new here: 'customer_id' has been in var_choosed since the first join.\n",
|
||
"# Listing it a second time duplicates the column in merge_4, and the final merge on\n",
|
||
"# right_on='customer_id' then fails because the key label is not unique.\n",
|
||
"var_choosed.append('name')\n",
|
||
"merge_4 = pd.merge(events, merge_3, left_on='id', right_on='event_id', how='inner')[var_choosed]\n",
|
||
"\n",
|
||
"# Rename the event's 'name' column, in the frame and in the column list\n",
|
||
"merge_4 = merge_4.rename(columns={'name': 'event_name'})\n",
|
||
"var_choosed[var_choosed.index('name')] = \"event_name\"\n",
|
||
"\n",
|
||
"# Final table: one row per ticket, enriched with the customer's attributes\n",
|
||
"var_choosed.extend(['age', 'gender', 'country', 'fidelity', 'profession'])\n",
|
||
"df_customer_event = pd.merge(customersplus, merge_4, left_on='id', right_on='customer_id', how='inner')[var_choosed]\n",
|
||
"df_customer_event"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "markdown",
|
||
"id": "779da86b-ac61-4c61-88d2-fa1c0c19efce",
|
||
"metadata": {},
|
||
"source": [
|
||
"## Type de client au global"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 4,
|
||
"id": "7c89d25f-ee42-4478-9ff0-ee64b781d5c8",
|
||
"metadata": {},
|
||
"outputs": [
|
||
{
|
||
"name": "stdout",
|
||
"output_type": "stream",
|
||
"text": [
|
||
"Index(['id', 'customer_id', 'target_id', 'created_at', 'updated_at', 'name',\n",
|
||
" 'extra_field'],\n",
|
||
" dtype='object')\n",
|
||
"(124302, 7)\n",
|
||
"<class 'pandas.core.frame.DataFrame'>\n",
|
||
"RangeIndex: 124302 entries, 0 to 124301\n",
|
||
"Data columns (total 7 columns):\n",
|
||
" # Column Non-Null Count Dtype \n",
|
||
"--- ------ -------------- ----- \n",
|
||
" 0 id 124302 non-null int64 \n",
|
||
" 1 customer_id 124302 non-null int64 \n",
|
||
" 2 target_id 124302 non-null int64 \n",
|
||
" 3 created_at 124296 non-null object \n",
|
||
" 4 updated_at 124296 non-null object \n",
|
||
" 5 name 0 non-null float64\n",
|
||
" 6 extra_field 0 non-null float64\n",
|
||
"dtypes: float64(2), int64(3), object(2)\n",
|
||
"memory usage: 6.6+ MB\n"
|
||
]
|
||
}
|
||
],
|
||
"source": [
|
||
"# Customer/target mapping table: column index, dimensions, then per-column dtypes and non-null counts\n",
|
||
"print(customer_target_mappings.columns, customer_target_mappings.shape, sep=\"\\n\")\n",
|
||
"customer_target_mappings.info()"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 26,
|
||
"id": "c4b6bdcc-9f13-449b-9a8b-c5ca794637be",
|
||
"metadata": {},
|
||
"outputs": [
|
||
{
|
||
"data": {
|
||
"text/plain": [
|
||
"array([nan])"
|
||
]
|
||
},
|
||
"execution_count": 26,
|
||
"metadata": {},
|
||
"output_type": "execute_result"
|
||
}
|
||
],
|
||
"source": [
|
||
"# Distinct values found in the extra_field column\n",
|
||
"customer_target_mappings['extra_field'].unique()"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 27,
|
||
"id": "47bc8453-0693-4838-8bd8-4d800a82c496",
|
||
"metadata": {},
|
||
"outputs": [
|
||
{
|
||
"data": {
|
||
"text/plain": [
|
||
"array([nan])"
|
||
]
|
||
},
|
||
"execution_count": 27,
|
||
"metadata": {},
|
||
"output_type": "execute_result"
|
||
}
|
||
],
|
||
"source": [
|
||
"# Distinct values found in the name column\n",
|
||
"customer_target_mappings['name'].unique()"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 11,
|
||
"id": "ab3f937b-ef62-499a-8ee2-d47d1d988ace",
|
||
"metadata": {},
|
||
"outputs": [
|
||
{
|
||
"name": "stdout",
|
||
"output_type": "stream",
|
||
"text": [
|
||
"Index(['id', 'is_import', 'name', 'created_at', 'updated_at', 'identifier'], dtype='object')\n",
|
||
"(4, 6)\n",
|
||
"<class 'pandas.core.frame.DataFrame'>\n",
|
||
"RangeIndex: 4 entries, 0 to 3\n",
|
||
"Data columns (total 6 columns):\n",
|
||
" # Column Non-Null Count Dtype \n",
|
||
"--- ------ -------------- ----- \n",
|
||
" 0 id 4 non-null int64 \n",
|
||
" 1 is_import 4 non-null bool \n",
|
||
" 2 name 4 non-null object\n",
|
||
" 3 created_at 4 non-null object\n",
|
||
" 4 updated_at 4 non-null object\n",
|
||
" 5 identifier 4 non-null object\n",
|
||
"dtypes: bool(1), int64(1), object(4)\n",
|
||
"memory usage: 292.0+ bytes\n"
|
||
]
|
||
}
|
||
],
|
||
"source": [
|
||
"# Existing segmentation (target types): column index, dimensions, then dtypes and non-null counts\n",
|
||
"print(target_types.columns, target_types.shape, sep=\"\\n\")\n",
|
||
"target_types.info()"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 12,
|
||
"id": "b70488b9-38fc-40a8-9e2f-3330b3f9eef5",
|
||
"metadata": {
|
||
"scrolled": true
|
||
},
|
||
"outputs": [
|
||
{
|
||
"data": {
|
||
"text/html": [
|
||
"<div>\n",
|
||
"<style scoped>\n",
|
||
" .dataframe tbody tr th:only-of-type {\n",
|
||
" vertical-align: middle;\n",
|
||
" }\n",
|
||
"\n",
|
||
" .dataframe tbody tr th {\n",
|
||
" vertical-align: top;\n",
|
||
" }\n",
|
||
"\n",
|
||
" .dataframe thead th {\n",
|
||
" text-align: right;\n",
|
||
" }\n",
|
||
"</style>\n",
|
||
"<table border=\"1\" class=\"dataframe\">\n",
|
||
" <thead>\n",
|
||
" <tr style=\"text-align: right;\">\n",
|
||
" <th></th>\n",
|
||
" <th>id</th>\n",
|
||
" <th>is_import</th>\n",
|
||
" <th>name</th>\n",
|
||
" <th>created_at</th>\n",
|
||
" <th>updated_at</th>\n",
|
||
" <th>identifier</th>\n",
|
||
" </tr>\n",
|
||
" </thead>\n",
|
||
" <tbody>\n",
|
||
" <tr>\n",
|
||
" <th>0</th>\n",
|
||
" <td>1</td>\n",
|
||
" <td>False</td>\n",
|
||
" <td>manual_static_filter</td>\n",
|
||
" <td>2021-04-29 13:42:14.111085+02:00</td>\n",
|
||
" <td>2021-04-29 13:42:14.111085+02:00</td>\n",
|
||
" <td>fb27e81baa4debc6a4e1a8639c20e808</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>1</th>\n",
|
||
" <td>3</td>\n",
|
||
" <td>True</td>\n",
|
||
" <td>manual_structure</td>\n",
|
||
" <td>2021-05-07 15:20:00.626650+02:00</td>\n",
|
||
" <td>2021-05-07 15:20:00.626650+02:00</td>\n",
|
||
" <td>382bca214204a2d3462f5ec2728d5d1e</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>2</th>\n",
|
||
" <td>6</td>\n",
|
||
" <td>False</td>\n",
|
||
" <td>manual_dynamic_filter</td>\n",
|
||
" <td>2021-09-09 14:27:47.641302+02:00</td>\n",
|
||
" <td>2021-09-09 14:27:47.641302+02:00</td>\n",
|
||
" <td>e0f4b8693184850fefd6d2a38f10584e</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>3</th>\n",
|
||
" <td>2</td>\n",
|
||
" <td>True</td>\n",
|
||
" <td>manual_import</td>\n",
|
||
" <td>2021-04-29 13:49:30.107110+02:00</td>\n",
|
||
" <td>2021-04-29 13:49:30.107110+02:00</td>\n",
|
||
" <td>12213df2ce68a624e4c0070521437bac</td>\n",
|
||
" </tr>\n",
|
||
" </tbody>\n",
|
||
"</table>\n",
|
||
"</div>"
|
||
],
|
||
"text/plain": [
|
||
" id is_import name created_at \\\n",
|
||
"0 1 False manual_static_filter 2021-04-29 13:42:14.111085+02:00 \n",
|
||
"1 3 True manual_structure 2021-05-07 15:20:00.626650+02:00 \n",
|
||
"2 6 False manual_dynamic_filter 2021-09-09 14:27:47.641302+02:00 \n",
|
||
"3 2 True manual_import 2021-04-29 13:49:30.107110+02:00 \n",
|
||
"\n",
|
||
" updated_at identifier \n",
|
||
"0 2021-04-29 13:42:14.111085+02:00 fb27e81baa4debc6a4e1a8639c20e808 \n",
|
||
"1 2021-05-07 15:20:00.626650+02:00 382bca214204a2d3462f5ec2728d5d1e \n",
|
||
"2 2021-09-09 14:27:47.641302+02:00 e0f4b8693184850fefd6d2a38f10584e \n",
|
||
"3 2021-04-29 13:49:30.107110+02:00 12213df2ce68a624e4c0070521437bac "
|
||
]
|
||
},
|
||
"execution_count": 12,
|
||
"metadata": {},
|
||
"output_type": "execute_result"
|
||
}
|
||
],
|
||
"source": [
|
||
"# Display the whole target_types table\n",
|
||
"target_types"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 17,
|
||
"id": "8dd74e87-97c2-493d-b19f-971b684078d3",
|
||
"metadata": {},
|
||
"outputs": [
|
||
{
|
||
"name": "stdout",
|
||
"output_type": "stream",
|
||
"text": [
|
||
"Index(['id', 'name', 'created_at', 'updated_at', 'identifier'], dtype='object')\n",
|
||
"(20, 5)\n",
|
||
"<class 'pandas.core.frame.DataFrame'>\n",
|
||
"RangeIndex: 20 entries, 0 to 19\n",
|
||
"Data columns (total 5 columns):\n",
|
||
" # Column Non-Null Count Dtype \n",
|
||
"--- ------ -------------- ----- \n",
|
||
" 0 id 20 non-null int64 \n",
|
||
" 1 name 19 non-null object\n",
|
||
" 2 created_at 20 non-null object\n",
|
||
" 3 updated_at 20 non-null object\n",
|
||
" 4 identifier 20 non-null object\n",
|
||
"dtypes: int64(1), object(4)\n",
|
||
"memory usage: 928.0+ bytes\n"
|
||
]
|
||
}
|
||
],
|
||
"source": [
|
||
"# Tags = customers (author's note).\n",
|
||
"# NOTE(review): 'tags' is also listed in liste_base in the bulk-loading cell, so this re-read looks redundant - confirm\n",
|
||
"FILE_PATH_S3 = 'bdc2324-data/11/11tags.csv'\n",
|
||
"with fs.open(FILE_PATH_S3, mode=\"rb\") as file_in:\n",
|
||
"    tags = pd.read_csv(file_in, sep=\",\")\n",
|
||
"\n",
|
||
"# Column index and dimensions first, then dtypes and non-null counts\n",
|
||
"print(tags.columns, tags.shape, sep=\"\\n\")\n",
|
||
"tags.info()"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 18,
|
||
"id": "91d54732-666c-4250-ba91-5c9b83d4712a",
|
||
"metadata": {},
|
||
"outputs": [
|
||
{
|
||
"data": {
|
||
"text/html": [
|
||
"<div>\n",
|
||
"<style scoped>\n",
|
||
" .dataframe tbody tr th:only-of-type {\n",
|
||
" vertical-align: middle;\n",
|
||
" }\n",
|
||
"\n",
|
||
" .dataframe tbody tr th {\n",
|
||
" vertical-align: top;\n",
|
||
" }\n",
|
||
"\n",
|
||
" .dataframe thead th {\n",
|
||
" text-align: right;\n",
|
||
" }\n",
|
||
"</style>\n",
|
||
"<table border=\"1\" class=\"dataframe\">\n",
|
||
" <thead>\n",
|
||
" <tr style=\"text-align: right;\">\n",
|
||
" <th></th>\n",
|
||
" <th>id</th>\n",
|
||
" <th>name</th>\n",
|
||
" <th>created_at</th>\n",
|
||
" <th>updated_at</th>\n",
|
||
" <th>identifier</th>\n",
|
||
" </tr>\n",
|
||
" </thead>\n",
|
||
" <tbody>\n",
|
||
" <tr>\n",
|
||
" <th>0</th>\n",
|
||
" <td>2</td>\n",
|
||
" <td>ens-écoles</td>\n",
|
||
" <td>2021-05-07 15:24:19.808501+02:00</td>\n",
|
||
" <td>2021-05-07 15:24:19.808501+02:00</td>\n",
|
||
" <td>b6a360c5f84595940c5774f13fd39cc3</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>1</th>\n",
|
||
" <td>1</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>2021-05-07 15:24:19.805589+02:00</td>\n",
|
||
" <td>2021-05-07 15:24:19.805589+02:00</td>\n",
|
||
" <td>d41d8cd98f00b204e9800998ecf8427e</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>2</th>\n",
|
||
" <td>4</td>\n",
|
||
" <td>ecoles primaires rennes</td>\n",
|
||
" <td>2021-05-07 15:29:06.388415+02:00</td>\n",
|
||
" <td>2021-05-07 15:29:06.388415+02:00</td>\n",
|
||
" <td>ca8649dd64c240d118f60b07d11a7053</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>3</th>\n",
|
||
" <td>5</td>\n",
|
||
" <td>Angers Nantes Opéra</td>\n",
|
||
" <td>2023-01-27 15:59:58.187557+01:00</td>\n",
|
||
" <td>2023-01-27 15:59:58.187557+01:00</td>\n",
|
||
" <td>f8f500f937fe312542399299cdc13f7e</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>4</th>\n",
|
||
" <td>6</td>\n",
|
||
" <td>Opéras</td>\n",
|
||
" <td>2023-01-27 16:03:59.654938+01:00</td>\n",
|
||
" <td>2023-01-27 16:03:59.654938+01:00</td>\n",
|
||
" <td>22eb2c616983ec7b54a093f84b230505</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>5</th>\n",
|
||
" <td>7</td>\n",
|
||
" <td>Ministère de la Culture</td>\n",
|
||
" <td>2023-01-30 11:22:29.636813+01:00</td>\n",
|
||
" <td>2023-01-30 11:22:29.636813+01:00</td>\n",
|
||
" <td>1b8c5c08fde000d90905a3d14af7763d</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>6</th>\n",
|
||
" <td>8</td>\n",
|
||
" <td>Orchestres</td>\n",
|
||
" <td>2023-01-30 11:33:56.392799+01:00</td>\n",
|
||
" <td>2023-01-30 11:33:56.392799+01:00</td>\n",
|
||
" <td>7c2aee0c80642d7e325a450f2dec45e5</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>7</th>\n",
|
||
" <td>9</td>\n",
|
||
" <td>Cooperative</td>\n",
|
||
" <td>2023-01-31 14:44:38.471146+01:00</td>\n",
|
||
" <td>2023-01-31 14:44:38.471146+01:00</td>\n",
|
||
" <td>6c88c36ffaab88d255865aa3111d7686</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>8</th>\n",
|
||
" <td>10</td>\n",
|
||
" <td>Théâtres</td>\n",
|
||
" <td>2023-01-31 14:45:17.804428+01:00</td>\n",
|
||
" <td>2023-01-31 14:45:17.804428+01:00</td>\n",
|
||
" <td>b2c19672df82021702b79482c8cda85a</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>9</th>\n",
|
||
" <td>11</td>\n",
|
||
" <td>La co[opera]tive</td>\n",
|
||
" <td>2023-02-16 17:11:35.004478+01:00</td>\n",
|
||
" <td>2023-02-16 17:11:35.004478+01:00</td>\n",
|
||
" <td>5dbaa3a1f278c0fcf981d447ad20957a</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>10</th>\n",
|
||
" <td>12</td>\n",
|
||
" <td>Ville de Rennes</td>\n",
|
||
" <td>2023-02-16 17:37:13.816196+01:00</td>\n",
|
||
" <td>2023-02-16 17:37:13.816196+01:00</td>\n",
|
||
" <td>bc483d04d9c3a08f167a3ce64366ca72</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>11</th>\n",
|
||
" <td>13</td>\n",
|
||
" <td>Ensembles en résidence</td>\n",
|
||
" <td>2023-02-16 17:55:54.877374+01:00</td>\n",
|
||
" <td>2023-02-16 17:55:54.877374+01:00</td>\n",
|
||
" <td>e70635e771de13268dccf02bb2abfaf9</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>12</th>\n",
|
||
" <td>14</td>\n",
|
||
" <td>Ministère</td>\n",
|
||
" <td>2023-02-17 11:17:54.429462+01:00</td>\n",
|
||
" <td>2023-02-17 11:17:54.429462+01:00</td>\n",
|
||
" <td>a3f0582853fd19f5b57e3651f8a20e7a</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>13</th>\n",
|
||
" <td>15</td>\n",
|
||
" <td>Rennes métropole</td>\n",
|
||
" <td>2023-02-17 11:53:24.490786+01:00</td>\n",
|
||
" <td>2023-02-17 11:53:24.490786+01:00</td>\n",
|
||
" <td>e98b8db5941b96c29c353b6f2f502055</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>14</th>\n",
|
||
" <td>16</td>\n",
|
||
" <td>Ville de Rennes - équipements culturels</td>\n",
|
||
" <td>2023-02-17 12:00:10.649104+01:00</td>\n",
|
||
" <td>2023-02-17 12:00:10.649104+01:00</td>\n",
|
||
" <td>a44edffc7edb852982efa7f4aa6d0e25</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>15</th>\n",
|
||
" <td>17</td>\n",
|
||
" <td>Structures culturelles rennaises</td>\n",
|
||
" <td>2023-02-17 12:05:55.583016+01:00</td>\n",
|
||
" <td>2023-02-17 12:05:55.583016+01:00</td>\n",
|
||
" <td>241550517e4e3b1c926e9aeab0f621cd</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>16</th>\n",
|
||
" <td>18</td>\n",
|
||
" <td>Université Rennes 2</td>\n",
|
||
" <td>2023-02-17 14:23:44.832959+01:00</td>\n",
|
||
" <td>2023-02-17 14:23:44.832959+01:00</td>\n",
|
||
" <td>4057c5cee51c4e10aa819f0cf48adc3f</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>17</th>\n",
|
||
" <td>19</td>\n",
|
||
" <td>Centres chorégraphiques nationaux</td>\n",
|
||
" <td>2023-02-17 15:29:41.827321+01:00</td>\n",
|
||
" <td>2023-02-17 15:29:41.827321+01:00</td>\n",
|
||
" <td>41e75941dfb766365498d917abe0102f</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>18</th>\n",
|
||
" <td>20</td>\n",
|
||
" <td>Télévision</td>\n",
|
||
" <td>2023-02-17 15:46:13.746092+01:00</td>\n",
|
||
" <td>2023-02-17 15:46:13.746092+01:00</td>\n",
|
||
" <td>36d6409c539dd79c1f3af8c5948603eb</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>19</th>\n",
|
||
" <td>21</td>\n",
|
||
" <td>structures culturelles nationales</td>\n",
|
||
" <td>2023-02-17 15:56:00.555722+01:00</td>\n",
|
||
" <td>2023-02-17 15:56:00.555722+01:00</td>\n",
|
||
" <td>5311cf7e42aac53289e1c4a338d5cfa4</td>\n",
|
||
" </tr>\n",
|
||
" </tbody>\n",
|
||
"</table>\n",
|
||
"</div>"
|
||
],
|
||
"text/plain": [
|
||
" id name \\\n",
|
||
"0 2 ens-écoles \n",
|
||
"1 1 NaN \n",
|
||
"2 4 ecoles primaires rennes \n",
|
||
"3 5 Angers Nantes Opéra \n",
|
||
"4 6 Opéras \n",
|
||
"5 7 Ministère de la Culture \n",
|
||
"6 8 Orchestres \n",
|
||
"7 9 Cooperative \n",
|
||
"8 10 Théâtres \n",
|
||
"9 11 La co[opera]tive \n",
|
||
"10 12 Ville de Rennes \n",
|
||
"11 13 Ensembles en résidence \n",
|
||
"12 14 Ministère \n",
|
||
"13 15 Rennes métropole \n",
|
||
"14 16 Ville de Rennes - équipements culturels \n",
|
||
"15 17 Structures culturelles rennaises \n",
|
||
"16 18 Université Rennes 2 \n",
|
||
"17 19 Centres chorégraphiques nationaux \n",
|
||
"18 20 Télévision \n",
|
||
"19 21 structures culturelles nationales \n",
|
||
"\n",
|
||
" created_at updated_at \\\n",
|
||
"0 2021-05-07 15:24:19.808501+02:00 2021-05-07 15:24:19.808501+02:00 \n",
|
||
"1 2021-05-07 15:24:19.805589+02:00 2021-05-07 15:24:19.805589+02:00 \n",
|
||
"2 2021-05-07 15:29:06.388415+02:00 2021-05-07 15:29:06.388415+02:00 \n",
|
||
"3 2023-01-27 15:59:58.187557+01:00 2023-01-27 15:59:58.187557+01:00 \n",
|
||
"4 2023-01-27 16:03:59.654938+01:00 2023-01-27 16:03:59.654938+01:00 \n",
|
||
"5 2023-01-30 11:22:29.636813+01:00 2023-01-30 11:22:29.636813+01:00 \n",
|
||
"6 2023-01-30 11:33:56.392799+01:00 2023-01-30 11:33:56.392799+01:00 \n",
|
||
"7 2023-01-31 14:44:38.471146+01:00 2023-01-31 14:44:38.471146+01:00 \n",
|
||
"8 2023-01-31 14:45:17.804428+01:00 2023-01-31 14:45:17.804428+01:00 \n",
|
||
"9 2023-02-16 17:11:35.004478+01:00 2023-02-16 17:11:35.004478+01:00 \n",
|
||
"10 2023-02-16 17:37:13.816196+01:00 2023-02-16 17:37:13.816196+01:00 \n",
|
||
"11 2023-02-16 17:55:54.877374+01:00 2023-02-16 17:55:54.877374+01:00 \n",
|
||
"12 2023-02-17 11:17:54.429462+01:00 2023-02-17 11:17:54.429462+01:00 \n",
|
||
"13 2023-02-17 11:53:24.490786+01:00 2023-02-17 11:53:24.490786+01:00 \n",
|
||
"14 2023-02-17 12:00:10.649104+01:00 2023-02-17 12:00:10.649104+01:00 \n",
|
||
"15 2023-02-17 12:05:55.583016+01:00 2023-02-17 12:05:55.583016+01:00 \n",
|
||
"16 2023-02-17 14:23:44.832959+01:00 2023-02-17 14:23:44.832959+01:00 \n",
|
||
"17 2023-02-17 15:29:41.827321+01:00 2023-02-17 15:29:41.827321+01:00 \n",
|
||
"18 2023-02-17 15:46:13.746092+01:00 2023-02-17 15:46:13.746092+01:00 \n",
|
||
"19 2023-02-17 15:56:00.555722+01:00 2023-02-17 15:56:00.555722+01:00 \n",
|
||
"\n",
|
||
" identifier \n",
|
||
"0 b6a360c5f84595940c5774f13fd39cc3 \n",
|
||
"1 d41d8cd98f00b204e9800998ecf8427e \n",
|
||
"2 ca8649dd64c240d118f60b07d11a7053 \n",
|
||
"3 f8f500f937fe312542399299cdc13f7e \n",
|
||
"4 22eb2c616983ec7b54a093f84b230505 \n",
|
||
"5 1b8c5c08fde000d90905a3d14af7763d \n",
|
||
"6 7c2aee0c80642d7e325a450f2dec45e5 \n",
|
||
"7 6c88c36ffaab88d255865aa3111d7686 \n",
|
||
"8 b2c19672df82021702b79482c8cda85a \n",
|
||
"9 5dbaa3a1f278c0fcf981d447ad20957a \n",
|
||
"10 bc483d04d9c3a08f167a3ce64366ca72 \n",
|
||
"11 e70635e771de13268dccf02bb2abfaf9 \n",
|
||
"12 a3f0582853fd19f5b57e3651f8a20e7a \n",
|
||
"13 e98b8db5941b96c29c353b6f2f502055 \n",
|
||
"14 a44edffc7edb852982efa7f4aa6d0e25 \n",
|
||
"15 241550517e4e3b1c926e9aeab0f621cd \n",
|
||
"16 4057c5cee51c4e10aa819f0cf48adc3f \n",
|
||
"17 41e75941dfb766365498d917abe0102f \n",
|
||
"18 36d6409c539dd79c1f3af8c5948603eb \n",
|
||
"19 5311cf7e42aac53289e1c4a338d5cfa4 "
|
||
]
|
||
},
|
||
"execution_count": 18,
|
||
"metadata": {},
|
||
"output_type": "execute_result"
|
||
}
|
||
],
|
||
"source": [
|
||
"# Display the whole tags table\n",
|
||
"tags"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 19,
|
||
"id": "4cc9f444-b7e6-4ee5-8ce8-64c63ab7825a",
|
||
"metadata": {},
|
||
"outputs": [
|
||
{
|
||
"name": "stdout",
|
||
"output_type": "stream",
|
||
"text": [
|
||
"Index(['id', 'structure_id', 'tag_id', 'created_at', 'updated_at'], dtype='object')\n",
|
||
"(179, 5)\n",
|
||
"<class 'pandas.core.frame.DataFrame'>\n",
|
||
"RangeIndex: 179 entries, 0 to 178\n",
|
||
"Data columns (total 5 columns):\n",
|
||
" # Column Non-Null Count Dtype \n",
|
||
"--- ------ -------------- ----- \n",
|
||
" 0 id 179 non-null int64 \n",
|
||
" 1 structure_id 179 non-null int64 \n",
|
||
" 2 tag_id 179 non-null int64 \n",
|
||
" 3 created_at 179 non-null object\n",
|
||
" 4 updated_at 179 non-null object\n",
|
||
"dtypes: int64(3), object(2)\n",
|
||
"memory usage: 7.1+ KB\n"
|
||
]
|
||
}
|
||
],
|
||
"source": [
|
||
"# Structure = customers (author's note): load the structure/tag mapping table from S3\n",
|
||
"FILE_PATH_S3 = 'bdc2324-data/11/11structure_tag_mappings.csv'\n",
|
||
"with fs.open(FILE_PATH_S3, mode=\"rb\") as file_in:\n",
|
||
"    structure_tag_mappings = pd.read_csv(file_in, sep=\",\")\n",
|
||
"\n",
|
||
"# Column index and dimensions first, then dtypes and non-null counts\n",
|
||
"print(structure_tag_mappings.columns, structure_tag_mappings.shape, sep=\"\\n\")\n",
|
||
"structure_tag_mappings.info()"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 20,
|
||
"id": "dcf776df-5c8e-4972-b2c1-b41291ba7e66",
|
||
"metadata": {},
|
||
"outputs": [
|
||
{
|
||
"data": {
|
||
"text/html": [
|
||
"<div>\n",
|
||
"<style scoped>\n",
|
||
" .dataframe tbody tr th:only-of-type {\n",
|
||
" vertical-align: middle;\n",
|
||
" }\n",
|
||
"\n",
|
||
" .dataframe tbody tr th {\n",
|
||
" vertical-align: top;\n",
|
||
" }\n",
|
||
"\n",
|
||
" .dataframe thead th {\n",
|
||
" text-align: right;\n",
|
||
" }\n",
|
||
"</style>\n",
|
||
"<table border=\"1\" class=\"dataframe\">\n",
|
||
" <thead>\n",
|
||
" <tr style=\"text-align: right;\">\n",
|
||
" <th></th>\n",
|
||
" <th>id</th>\n",
|
||
" <th>structure_id</th>\n",
|
||
" <th>tag_id</th>\n",
|
||
" <th>created_at</th>\n",
|
||
" <th>updated_at</th>\n",
|
||
" </tr>\n",
|
||
" </thead>\n",
|
||
" <tbody>\n",
|
||
" <tr>\n",
|
||
" <th>0</th>\n",
|
||
" <td>123</td>\n",
|
||
" <td>187</td>\n",
|
||
" <td>6</td>\n",
|
||
" <td>2023-01-27 16:03:59.680222+01:00</td>\n",
|
||
" <td>2023-01-27 16:03:59.680222+01:00</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>1</th>\n",
|
||
" <td>2</td>\n",
|
||
" <td>2</td>\n",
|
||
" <td>2</td>\n",
|
||
" <td>2021-05-07 15:24:19.872895+02:00</td>\n",
|
||
" <td>2021-05-07 15:24:19.872895+02:00</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>2</th>\n",
|
||
" <td>3</td>\n",
|
||
" <td>3</td>\n",
|
||
" <td>2</td>\n",
|
||
" <td>2021-05-07 15:24:19.873830+02:00</td>\n",
|
||
" <td>2021-05-07 15:24:19.873830+02:00</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>3</th>\n",
|
||
" <td>4</td>\n",
|
||
" <td>4</td>\n",
|
||
" <td>2</td>\n",
|
||
" <td>2021-05-07 15:24:19.874628+02:00</td>\n",
|
||
" <td>2021-05-07 15:24:19.874628+02:00</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>4</th>\n",
|
||
" <td>5</td>\n",
|
||
" <td>5</td>\n",
|
||
" <td>2</td>\n",
|
||
" <td>2021-05-07 15:24:19.875421+02:00</td>\n",
|
||
" <td>2021-05-07 15:24:19.875421+02:00</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>...</th>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>174</th>\n",
|
||
" <td>184</td>\n",
|
||
" <td>236</td>\n",
|
||
" <td>10</td>\n",
|
||
" <td>2023-02-17 16:35:25.041114+01:00</td>\n",
|
||
" <td>2023-02-17 16:35:25.041114+01:00</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>175</th>\n",
|
||
" <td>185</td>\n",
|
||
" <td>237</td>\n",
|
||
" <td>17</td>\n",
|
||
" <td>2023-02-17 16:39:10.799478+01:00</td>\n",
|
||
" <td>2023-02-17 16:39:10.799478+01:00</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>176</th>\n",
|
||
" <td>186</td>\n",
|
||
" <td>238</td>\n",
|
||
" <td>19</td>\n",
|
||
" <td>2023-02-17 16:53:21.098690+01:00</td>\n",
|
||
" <td>2023-02-17 16:53:21.098690+01:00</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>177</th>\n",
|
||
" <td>187</td>\n",
|
||
" <td>239</td>\n",
|
||
" <td>10</td>\n",
|
||
" <td>2023-02-17 16:57:42.623481+01:00</td>\n",
|
||
" <td>2023-02-17 16:57:42.623481+01:00</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>178</th>\n",
|
||
" <td>188</td>\n",
|
||
" <td>240</td>\n",
|
||
" <td>10</td>\n",
|
||
" <td>2023-02-17 16:59:22.067723+01:00</td>\n",
|
||
" <td>2023-02-17 16:59:22.067723+01:00</td>\n",
|
||
" </tr>\n",
|
||
" </tbody>\n",
|
||
"</table>\n",
|
||
"<p>179 rows × 5 columns</p>\n",
|
||
"</div>"
|
||
],
|
||
"text/plain": [
|
||
" id structure_id tag_id created_at \\\n",
|
||
"0 123 187 6 2023-01-27 16:03:59.680222+01:00 \n",
|
||
"1 2 2 2 2021-05-07 15:24:19.872895+02:00 \n",
|
||
"2 3 3 2 2021-05-07 15:24:19.873830+02:00 \n",
|
||
"3 4 4 2 2021-05-07 15:24:19.874628+02:00 \n",
|
||
"4 5 5 2 2021-05-07 15:24:19.875421+02:00 \n",
|
||
".. ... ... ... ... \n",
|
||
"174 184 236 10 2023-02-17 16:35:25.041114+01:00 \n",
|
||
"175 185 237 17 2023-02-17 16:39:10.799478+01:00 \n",
|
||
"176 186 238 19 2023-02-17 16:53:21.098690+01:00 \n",
|
||
"177 187 239 10 2023-02-17 16:57:42.623481+01:00 \n",
|
||
"178 188 240 10 2023-02-17 16:59:22.067723+01:00 \n",
|
||
"\n",
|
||
" updated_at \n",
|
||
"0 2023-01-27 16:03:59.680222+01:00 \n",
|
||
"1 2021-05-07 15:24:19.872895+02:00 \n",
|
||
"2 2021-05-07 15:24:19.873830+02:00 \n",
|
||
"3 2021-05-07 15:24:19.874628+02:00 \n",
|
||
"4 2021-05-07 15:24:19.875421+02:00 \n",
|
||
".. ... \n",
|
||
"174 2023-02-17 16:35:25.041114+01:00 \n",
|
||
"175 2023-02-17 16:39:10.799478+01:00 \n",
|
||
"176 2023-02-17 16:53:21.098690+01:00 \n",
|
||
"177 2023-02-17 16:57:42.623481+01:00 \n",
|
||
"178 2023-02-17 16:59:22.067723+01:00 \n",
|
||
"\n",
|
||
"[179 rows x 5 columns]"
|
||
]
|
||
},
|
||
"execution_count": 20,
|
||
"metadata": {},
|
||
"output_type": "execute_result"
|
||
}
|
||
],
|
||
"source": [
|
||
"structure_tag_mappings"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 24,
|
||
"id": "41bf1529-5a7c-409e-9791-2024c08c11f0",
|
||
"metadata": {},
|
||
"outputs": [
|
||
{
|
||
"name": "stdout",
|
||
"output_type": "stream",
|
||
"text": [
|
||
"Index(['id', 'lastname', 'firstname', 'birthdate', 'email', 'street_id',\n",
|
||
" 'created_at', 'updated_at', 'civility', 'is_partner', 'extra',\n",
|
||
" 'deleted_at', 'reference', 'gender', 'is_email_true', 'extra_field',\n",
|
||
" 'identifier', 'opt_in', 'structure_id', 'note', 'profession',\n",
|
||
" 'language', 'mcp_contact_id', 'need_reload', 'last_buying_date',\n",
|
||
" 'max_price', 'ticket_sum', 'average_price', 'fidelity',\n",
|
||
" 'average_purchase_delay', 'average_price_basket',\n",
|
||
" 'average_ticket_basket', 'total_price', 'preferred_category',\n",
|
||
" 'preferred_supplier', 'preferred_formula', 'purchase_count',\n",
|
||
" 'first_buying_date', 'last_visiting_date', 'zipcode', 'country', 'age',\n",
|
||
" 'tenant_id'],\n",
|
||
" dtype='object')\n",
|
||
"(71307, 43)\n",
|
||
"<class 'pandas.core.frame.DataFrame'>\n",
|
||
"RangeIndex: 71307 entries, 0 to 71306\n",
|
||
"Data columns (total 43 columns):\n",
|
||
" # Column Non-Null Count Dtype \n",
|
||
"--- ------ -------------- ----- \n",
|
||
" 0 id 71307 non-null int64 \n",
|
||
" 1 lastname 41045 non-null object \n",
|
||
" 2 firstname 39140 non-null object \n",
|
||
" 3 birthdate 18174 non-null object \n",
|
||
" 4 email 58203 non-null object \n",
|
||
" 5 street_id 71307 non-null int64 \n",
|
||
" 6 created_at 71307 non-null object \n",
|
||
" 7 updated_at 71307 non-null object \n",
|
||
" 8 civility 0 non-null float64\n",
|
||
" 9 is_partner 71307 non-null bool \n",
|
||
" 10 extra 0 non-null float64\n",
|
||
" 11 deleted_at 0 non-null float64\n",
|
||
" 12 reference 0 non-null float64\n",
|
||
" 13 gender 71307 non-null int64 \n",
|
||
" 14 is_email_true 71307 non-null bool \n",
|
||
" 15 extra_field 0 non-null float64\n",
|
||
" 16 identifier 71307 non-null object \n",
|
||
" 17 opt_in 71307 non-null bool \n",
|
||
" 18 structure_id 616 non-null float64\n",
|
||
" 19 note 451 non-null object \n",
|
||
" 20 profession 812 non-null object \n",
|
||
" 21 language 0 non-null float64\n",
|
||
" 22 mcp_contact_id 22417 non-null float64\n",
|
||
" 23 need_reload 71307 non-null bool \n",
|
||
" 24 last_buying_date 34040 non-null object \n",
|
||
" 25 max_price 34040 non-null float64\n",
|
||
" 26 ticket_sum 71307 non-null int64 \n",
|
||
" 27 average_price 68694 non-null float64\n",
|
||
" 28 fidelity 71307 non-null int64 \n",
|
||
" 29 average_purchase_delay 34040 non-null float64\n",
|
||
" 30 average_price_basket 34040 non-null float64\n",
|
||
" 31 average_ticket_basket 34040 non-null float64\n",
|
||
" 32 total_price 36653 non-null float64\n",
|
||
" 33 preferred_category 0 non-null float64\n",
|
||
" 34 preferred_supplier 0 non-null float64\n",
|
||
" 35 preferred_formula 0 non-null float64\n",
|
||
" 36 purchase_count 71307 non-null int64 \n",
|
||
" 37 first_buying_date 34040 non-null object \n",
|
||
" 38 last_visiting_date 0 non-null float64\n",
|
||
" 39 zipcode 33756 non-null object \n",
|
||
" 40 country 39910 non-null object \n",
|
||
" 41 age 18174 non-null float64\n",
|
||
" 42 tenant_id 71307 non-null int64 \n",
|
||
"dtypes: bool(4), float64(19), int64(7), object(13)\n",
|
||
"memory usage: 21.5+ MB\n"
|
||
]
|
||
}
|
||
],
|
||
"source": [
|
||
"# Tags = clients\n",
|
||
"FILE_PATH_S3 = 'bdc2324-data/11/11customersplus.csv'\n",
|
||
"\n",
|
||
"with fs.open(FILE_PATH_S3, mode=\"rb\") as file_in:\n",
|
||
" customersplus = pd.read_csv(file_in, sep=\",\")\n",
|
||
"\n",
|
||
"print(customersplus.columns)\n",
|
||
"print(customersplus.shape)\n",
|
||
"customersplus.info()"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 25,
|
||
"id": "948a0b2b-8d1c-4afb-802e-670d67dd8c20",
|
||
"metadata": {},
|
||
"outputs": [
|
||
{
|
||
"data": {
|
||
"text/html": [
|
||
"<div>\n",
|
||
"<style scoped>\n",
|
||
" .dataframe tbody tr th:only-of-type {\n",
|
||
" vertical-align: middle;\n",
|
||
" }\n",
|
||
"\n",
|
||
" .dataframe tbody tr th {\n",
|
||
" vertical-align: top;\n",
|
||
" }\n",
|
||
"\n",
|
||
" .dataframe thead th {\n",
|
||
" text-align: right;\n",
|
||
" }\n",
|
||
"</style>\n",
|
||
"<table border=\"1\" class=\"dataframe\">\n",
|
||
" <thead>\n",
|
||
" <tr style=\"text-align: right;\">\n",
|
||
" <th></th>\n",
|
||
" <th>id</th>\n",
|
||
" <th>lastname</th>\n",
|
||
" <th>firstname</th>\n",
|
||
" <th>birthdate</th>\n",
|
||
" <th>email</th>\n",
|
||
" <th>street_id</th>\n",
|
||
" <th>created_at</th>\n",
|
||
" <th>updated_at</th>\n",
|
||
" <th>civility</th>\n",
|
||
" <th>is_partner</th>\n",
|
||
" <th>...</th>\n",
|
||
" <th>preferred_category</th>\n",
|
||
" <th>preferred_supplier</th>\n",
|
||
" <th>preferred_formula</th>\n",
|
||
" <th>purchase_count</th>\n",
|
||
" <th>first_buying_date</th>\n",
|
||
" <th>last_visiting_date</th>\n",
|
||
" <th>zipcode</th>\n",
|
||
" <th>country</th>\n",
|
||
" <th>age</th>\n",
|
||
" <th>tenant_id</th>\n",
|
||
" </tr>\n",
|
||
" </thead>\n",
|
||
" <tbody>\n",
|
||
" <tr>\n",
|
||
" <th>0</th>\n",
|
||
" <td>286834</td>\n",
|
||
" <td>lastname286834</td>\n",
|
||
" <td>firstname286834</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>email286834</td>\n",
|
||
" <td>6</td>\n",
|
||
" <td>2022-05-19 10:09:09.361137+02:00</td>\n",
|
||
" <td>2022-05-19 10:09:09.361137+02:00</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>False</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>fr</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>1556</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>1</th>\n",
|
||
" <td>330695</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>email330695</td>\n",
|
||
" <td>1</td>\n",
|
||
" <td>2022-07-16 04:10:34.135134+02:00</td>\n",
|
||
" <td>2022-07-16 04:10:34.156704+02:00</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>False</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>1556</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>2</th>\n",
|
||
" <td>330978</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>email330978</td>\n",
|
||
" <td>1</td>\n",
|
||
" <td>2022-07-21 22:14:09.811721+02:00</td>\n",
|
||
" <td>2022-07-21 22:14:09.836051+02:00</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>False</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>1556</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>3</th>\n",
|
||
" <td>338697</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>email338697</td>\n",
|
||
" <td>1</td>\n",
|
||
" <td>2022-09-15 19:02:03.950536+02:00</td>\n",
|
||
" <td>2022-09-15 19:02:03.985642+02:00</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>False</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>1556</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>4</th>\n",
|
||
" <td>338726</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>email338726</td>\n",
|
||
" <td>1</td>\n",
|
||
" <td>2022-09-16 01:24:40.719882+02:00</td>\n",
|
||
" <td>2022-09-16 01:24:40.742753+02:00</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>False</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>1556</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>...</th>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>71302</th>\n",
|
||
" <td>27105</td>\n",
|
||
" <td>lastname27105</td>\n",
|
||
" <td>firstname27105</td>\n",
|
||
" <td>1957-01-26</td>\n",
|
||
" <td>email27105</td>\n",
|
||
" <td>205024</td>\n",
|
||
" <td>2021-04-22 15:12:59.986534+02:00</td>\n",
|
||
" <td>2023-09-12 18:59:31.613235+02:00</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>False</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>2</td>\n",
|
||
" <td>2018-12-31 18:56:57+01:00</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>35700</td>\n",
|
||
" <td>fr</td>\n",
|
||
" <td>66.0</td>\n",
|
||
" <td>1556</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>71303</th>\n",
|
||
" <td>27108</td>\n",
|
||
" <td>lastname27108</td>\n",
|
||
" <td>firstname27108</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>205024</td>\n",
|
||
" <td>2021-04-22 15:12:59.989197+02:00</td>\n",
|
||
" <td>2023-09-12 18:27:34.380843+02:00</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>False</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>6</td>\n",
|
||
" <td>2015-12-29 14:51:46+01:00</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>35700</td>\n",
|
||
" <td>fr</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>1556</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>71304</th>\n",
|
||
" <td>27110</td>\n",
|
||
" <td>lastname27110</td>\n",
|
||
" <td>firstname27110</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>6</td>\n",
|
||
" <td>2021-04-22 15:12:59.991029+02:00</td>\n",
|
||
" <td>2022-04-14 11:41:33.738500+02:00</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>False</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>1</td>\n",
|
||
" <td>2018-12-31 19:12:59+01:00</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>fr</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>1556</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>71305</th>\n",
|
||
" <td>10607</td>\n",
|
||
" <td>lastname10607</td>\n",
|
||
" <td>firstname10607</td>\n",
|
||
" <td>1963-01-04</td>\n",
|
||
" <td>email10607</td>\n",
|
||
" <td>313332</td>\n",
|
||
" <td>2021-04-22 14:56:45.742226+02:00</td>\n",
|
||
" <td>2023-09-12 17:55:17.723195+02:00</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>False</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>26</td>\n",
|
||
" <td>2015-10-10 14:11:21+02:00</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>35850</td>\n",
|
||
" <td>fr</td>\n",
|
||
" <td>60.0</td>\n",
|
||
" <td>1556</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>71306</th>\n",
|
||
" <td>19095</td>\n",
|
||
" <td>lastname19095</td>\n",
|
||
" <td>firstname19095</td>\n",
|
||
" <td>1979-07-16</td>\n",
|
||
" <td>email19095</td>\n",
|
||
" <td>6</td>\n",
|
||
" <td>2021-04-22 15:06:30.120537+02:00</td>\n",
|
||
" <td>2023-09-12 18:27:36.904104+02:00</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>False</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>2</td>\n",
|
||
" <td>2019-05-19 21:18:36+02:00</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>fr</td>\n",
|
||
" <td>44.0</td>\n",
|
||
" <td>1556</td>\n",
|
||
" </tr>\n",
|
||
" </tbody>\n",
|
||
"</table>\n",
|
||
"<p>71307 rows × 43 columns</p>\n",
|
||
"</div>"
|
||
],
|
||
"text/plain": [
|
||
" id lastname firstname birthdate email \\\n",
|
||
"0 286834 lastname286834 firstname286834 NaN email286834 \n",
|
||
"1 330695 NaN NaN NaN email330695 \n",
|
||
"2 330978 NaN NaN NaN email330978 \n",
|
||
"3 338697 NaN NaN NaN email338697 \n",
|
||
"4 338726 NaN NaN NaN email338726 \n",
|
||
"... ... ... ... ... ... \n",
|
||
"71302 27105 lastname27105 firstname27105 1957-01-26 email27105 \n",
|
||
"71303 27108 lastname27108 firstname27108 NaN NaN \n",
|
||
"71304 27110 lastname27110 firstname27110 NaN NaN \n",
|
||
"71305 10607 lastname10607 firstname10607 1963-01-04 email10607 \n",
|
||
"71306 19095 lastname19095 firstname19095 1979-07-16 email19095 \n",
|
||
"\n",
|
||
" street_id created_at \\\n",
|
||
"0 6 2022-05-19 10:09:09.361137+02:00 \n",
|
||
"1 1 2022-07-16 04:10:34.135134+02:00 \n",
|
||
"2 1 2022-07-21 22:14:09.811721+02:00 \n",
|
||
"3 1 2022-09-15 19:02:03.950536+02:00 \n",
|
||
"4 1 2022-09-16 01:24:40.719882+02:00 \n",
|
||
"... ... ... \n",
|
||
"71302 205024 2021-04-22 15:12:59.986534+02:00 \n",
|
||
"71303 205024 2021-04-22 15:12:59.989197+02:00 \n",
|
||
"71304 6 2021-04-22 15:12:59.991029+02:00 \n",
|
||
"71305 313332 2021-04-22 14:56:45.742226+02:00 \n",
|
||
"71306 6 2021-04-22 15:06:30.120537+02:00 \n",
|
||
"\n",
|
||
" updated_at civility is_partner ... \\\n",
|
||
"0 2022-05-19 10:09:09.361137+02:00 NaN False ... \n",
|
||
"1 2022-07-16 04:10:34.156704+02:00 NaN False ... \n",
|
||
"2 2022-07-21 22:14:09.836051+02:00 NaN False ... \n",
|
||
"3 2022-09-15 19:02:03.985642+02:00 NaN False ... \n",
|
||
"4 2022-09-16 01:24:40.742753+02:00 NaN False ... \n",
|
||
"... ... ... ... ... \n",
|
||
"71302 2023-09-12 18:59:31.613235+02:00 NaN False ... \n",
|
||
"71303 2023-09-12 18:27:34.380843+02:00 NaN False ... \n",
|
||
"71304 2022-04-14 11:41:33.738500+02:00 NaN False ... \n",
|
||
"71305 2023-09-12 17:55:17.723195+02:00 NaN False ... \n",
|
||
"71306 2023-09-12 18:27:36.904104+02:00 NaN False ... \n",
|
||
"\n",
|
||
" preferred_category preferred_supplier preferred_formula \\\n",
|
||
"0 NaN NaN NaN \n",
|
||
"1 NaN NaN NaN \n",
|
||
"2 NaN NaN NaN \n",
|
||
"3 NaN NaN NaN \n",
|
||
"4 NaN NaN NaN \n",
|
||
"... ... ... ... \n",
|
||
"71302 NaN NaN NaN \n",
|
||
"71303 NaN NaN NaN \n",
|
||
"71304 NaN NaN NaN \n",
|
||
"71305 NaN NaN NaN \n",
|
||
"71306 NaN NaN NaN \n",
|
||
"\n",
|
||
" purchase_count first_buying_date last_visiting_date zipcode \\\n",
|
||
"0 0 NaN NaN NaN \n",
|
||
"1 0 NaN NaN NaN \n",
|
||
"2 0 NaN NaN NaN \n",
|
||
"3 0 NaN NaN NaN \n",
|
||
"4 0 NaN NaN NaN \n",
|
||
"... ... ... ... ... \n",
|
||
"71302 2 2018-12-31 18:56:57+01:00 NaN 35700 \n",
|
||
"71303 6 2015-12-29 14:51:46+01:00 NaN 35700 \n",
|
||
"71304 1 2018-12-31 19:12:59+01:00 NaN NaN \n",
|
||
"71305 26 2015-10-10 14:11:21+02:00 NaN 35850 \n",
|
||
"71306 2 2019-05-19 21:18:36+02:00 NaN NaN \n",
|
||
"\n",
|
||
" country age tenant_id \n",
|
||
"0 fr NaN 1556 \n",
|
||
"1 NaN NaN 1556 \n",
|
||
"2 NaN NaN 1556 \n",
|
||
"3 NaN NaN 1556 \n",
|
||
"4 NaN NaN 1556 \n",
|
||
"... ... ... ... \n",
|
||
"71302 fr 66.0 1556 \n",
|
||
"71303 fr NaN 1556 \n",
|
||
"71304 fr NaN 1556 \n",
|
||
"71305 fr 60.0 1556 \n",
|
||
"71306 fr 44.0 1556 \n",
|
||
"\n",
|
||
"[71307 rows x 43 columns]"
|
||
]
|
||
},
|
||
"execution_count": 25,
|
||
"metadata": {},
|
||
"output_type": "execute_result"
|
||
}
|
||
],
|
||
"source": [
|
||
"customersplus"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 6,
|
||
"id": "c40c44a0-e7c2-4ad1-b700-0d6ea05d62b2",
|
||
"metadata": {},
|
||
"outputs": [],
|
||
"source": [
|
||
"# But : lier les caractéristiques socio-demo et les comportements d'achat\n"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 29,
|
||
"id": "8259ae6c-353f-43a6-add3-f974fac6e5d4",
|
||
"metadata": {},
|
||
"outputs": [
|
||
{
|
||
"name": "stdout",
|
||
"output_type": "stream",
|
||
"text": [
|
||
"Index(['id', 'number', 'created_at', 'updated_at', 'purchase_id', 'product_id',\n",
|
||
" 'is_from_subscription', 'type_of', 'supplier_id', 'barcode',\n",
|
||
" 'identifier'],\n",
|
||
" dtype='object')\n",
|
||
"(318969, 11)\n",
|
||
"<class 'pandas.core.frame.DataFrame'>\n",
|
||
"RangeIndex: 318969 entries, 0 to 318968\n",
|
||
"Data columns (total 11 columns):\n",
|
||
" # Column Non-Null Count Dtype \n",
|
||
"--- ------ -------------- ----- \n",
|
||
" 0 id 318969 non-null int64 \n",
|
||
" 1 number 318969 non-null object \n",
|
||
" 2 created_at 318969 non-null object \n",
|
||
" 3 updated_at 318969 non-null object \n",
|
||
" 4 purchase_id 318969 non-null int64 \n",
|
||
" 5 product_id 318969 non-null int64 \n",
|
||
" 6 is_from_subscription 318969 non-null bool \n",
|
||
" 7 type_of 318969 non-null int64 \n",
|
||
" 8 supplier_id 318969 non-null int64 \n",
|
||
" 9 barcode 0 non-null float64\n",
|
||
" 10 identifier 318969 non-null object \n",
|
||
"dtypes: bool(1), float64(1), int64(5), object(4)\n",
|
||
"memory usage: 24.6+ MB\n"
|
||
]
|
||
}
|
||
],
|
||
"source": [
|
||
"# tickets\n",
|
||
"FILE_PATH_S3 = 'bdc2324-data/11/11tickets.csv'\n",
|
||
"\n",
|
||
"with fs.open(FILE_PATH_S3, mode=\"rb\") as file_in:\n",
|
||
" tickets = pd.read_csv(file_in, sep=\",\")\n",
|
||
"\n",
|
||
"print(tickets.columns)\n",
|
||
"print(tickets.shape)\n",
|
||
"tickets.info()"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 30,
|
||
"id": "f54830cb-1f95-4f71-9b04-358c745fb454",
|
||
"metadata": {},
|
||
"outputs": [
|
||
{
|
||
"data": {
|
||
"text/html": [
|
||
"<div>\n",
|
||
"<style scoped>\n",
|
||
" .dataframe tbody tr th:only-of-type {\n",
|
||
" vertical-align: middle;\n",
|
||
" }\n",
|
||
"\n",
|
||
" .dataframe tbody tr th {\n",
|
||
" vertical-align: top;\n",
|
||
" }\n",
|
||
"\n",
|
||
" .dataframe thead th {\n",
|
||
" text-align: right;\n",
|
||
" }\n",
|
||
"</style>\n",
|
||
"<table border=\"1\" class=\"dataframe\">\n",
|
||
" <thead>\n",
|
||
" <tr style=\"text-align: right;\">\n",
|
||
" <th></th>\n",
|
||
" <th>id</th>\n",
|
||
" <th>number</th>\n",
|
||
" <th>created_at</th>\n",
|
||
" <th>updated_at</th>\n",
|
||
" <th>purchase_id</th>\n",
|
||
" <th>product_id</th>\n",
|
||
" <th>is_from_subscription</th>\n",
|
||
" <th>type_of</th>\n",
|
||
" <th>supplier_id</th>\n",
|
||
" <th>barcode</th>\n",
|
||
" <th>identifier</th>\n",
|
||
" </tr>\n",
|
||
" </thead>\n",
|
||
" <tbody>\n",
|
||
" <tr>\n",
|
||
" <th>0</th>\n",
|
||
" <td>2119081</td>\n",
|
||
" <td>1433_136_212_68356</td>\n",
|
||
" <td>2023-09-12 17:42:45.396336+02:00</td>\n",
|
||
" <td>2023-09-12 17:42:45.396336+02:00</td>\n",
|
||
" <td>861764</td>\n",
|
||
" <td>209879</td>\n",
|
||
" <td>False</td>\n",
|
||
" <td>1</td>\n",
|
||
" <td>1702</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>f694c255855ce5643c6fcc7fed5e9237</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>1</th>\n",
|
||
" <td>2119082</td>\n",
|
||
" <td>1433_136_194_68356</td>\n",
|
||
" <td>2023-09-12 17:42:45.409056+02:00</td>\n",
|
||
" <td>2023-09-12 17:42:45.409056+02:00</td>\n",
|
||
" <td>861763</td>\n",
|
||
" <td>209879</td>\n",
|
||
" <td>False</td>\n",
|
||
" <td>1</td>\n",
|
||
" <td>1702</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>838d6101db2fc8bc80536d8b91b49859</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>2</th>\n",
|
||
" <td>2119083</td>\n",
|
||
" <td>33158_158_343_68357</td>\n",
|
||
" <td>2023-09-12 17:42:45.409824+02:00</td>\n",
|
||
" <td>2023-09-12 17:42:45.409824+02:00</td>\n",
|
||
" <td>861769</td>\n",
|
||
" <td>209880</td>\n",
|
||
" <td>False</td>\n",
|
||
" <td>1</td>\n",
|
||
" <td>1702</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>8a8d938d66a4dc57bcb44c2773c6fdfa</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>3</th>\n",
|
||
" <td>2119084</td>\n",
|
||
" <td>33158_158_297_68357</td>\n",
|
||
" <td>2023-09-12 17:42:45.410447+02:00</td>\n",
|
||
" <td>2023-09-12 17:42:45.410447+02:00</td>\n",
|
||
" <td>861767</td>\n",
|
||
" <td>209880</td>\n",
|
||
" <td>False</td>\n",
|
||
" <td>1</td>\n",
|
||
" <td>1702</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>b7a3dd0794c0957c942d45b8913e5b96</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>4</th>\n",
|
||
" <td>2119085</td>\n",
|
||
" <td>33158_158_318_68357</td>\n",
|
||
" <td>2023-09-12 17:42:45.411059+02:00</td>\n",
|
||
" <td>2023-09-12 17:42:45.411059+02:00</td>\n",
|
||
" <td>861768</td>\n",
|
||
" <td>209880</td>\n",
|
||
" <td>False</td>\n",
|
||
" <td>1</td>\n",
|
||
" <td>1702</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>d7ea7e443581ebe520dd13f6cad31af7</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>...</th>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>318964</th>\n",
|
||
" <td>2564021</td>\n",
|
||
" <td>44247_204_239_89278</td>\n",
|
||
" <td>2023-09-12 18:59:48.750953+02:00</td>\n",
|
||
" <td>2023-09-12 18:59:48.750953+02:00</td>\n",
|
||
" <td>1244281</td>\n",
|
||
" <td>210158</td>\n",
|
||
" <td>False</td>\n",
|
||
" <td>1</td>\n",
|
||
" <td>1702</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>82c9af8b2167f7ac34a5e834242b0239</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>318965</th>\n",
|
||
" <td>2564022</td>\n",
|
||
" <td>44247_204_299_89278</td>\n",
|
||
" <td>2023-09-12 18:59:48.751441+02:00</td>\n",
|
||
" <td>2023-09-12 18:59:48.751441+02:00</td>\n",
|
||
" <td>1244284</td>\n",
|
||
" <td>210158</td>\n",
|
||
" <td>False</td>\n",
|
||
" <td>1</td>\n",
|
||
" <td>1702</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>235e8e608f066cb72949bbd397d0a76f</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>318966</th>\n",
|
||
" <td>2564023</td>\n",
|
||
" <td>44247_204_259_89278</td>\n",
|
||
" <td>2023-09-12 18:59:48.751924+02:00</td>\n",
|
||
" <td>2023-09-12 18:59:48.751924+02:00</td>\n",
|
||
" <td>1244282</td>\n",
|
||
" <td>210158</td>\n",
|
||
" <td>False</td>\n",
|
||
" <td>1</td>\n",
|
||
" <td>1702</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>ec22fa828931f030f7e79a4cc5478c4b</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>318967</th>\n",
|
||
" <td>2564024</td>\n",
|
||
" <td>44247_204_279_89278</td>\n",
|
||
" <td>2023-09-12 18:59:48.752425+02:00</td>\n",
|
||
" <td>2023-09-12 18:59:48.752425+02:00</td>\n",
|
||
" <td>1244283</td>\n",
|
||
" <td>210158</td>\n",
|
||
" <td>False</td>\n",
|
||
" <td>1</td>\n",
|
||
" <td>1702</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>31ec4deaf718e04caf193e1ff8d621ef</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>318968</th>\n",
|
||
" <td>2513156</td>\n",
|
||
" <td>4854_178_2847_89170</td>\n",
|
||
" <td>2023-09-12 18:52:20.331807+02:00</td>\n",
|
||
" <td>2023-09-12 18:59:48.752904+02:00</td>\n",
|
||
" <td>1244285</td>\n",
|
||
" <td>261922</td>\n",
|
||
" <td>False</td>\n",
|
||
" <td>3</td>\n",
|
||
" <td>1702</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>48aef9efab29bfb1537656908863bcc1</td>\n",
|
||
" </tr>\n",
|
||
" </tbody>\n",
|
||
"</table>\n",
|
||
"<p>318969 rows × 11 columns</p>\n",
|
||
"</div>"
|
||
],
|
||
"text/plain": [
|
||
" id number created_at \\\n",
|
||
"0 2119081 1433_136_212_68356 2023-09-12 17:42:45.396336+02:00 \n",
|
||
"1 2119082 1433_136_194_68356 2023-09-12 17:42:45.409056+02:00 \n",
|
||
"2 2119083 33158_158_343_68357 2023-09-12 17:42:45.409824+02:00 \n",
|
||
"3 2119084 33158_158_297_68357 2023-09-12 17:42:45.410447+02:00 \n",
|
||
"4 2119085 33158_158_318_68357 2023-09-12 17:42:45.411059+02:00 \n",
|
||
"... ... ... ... \n",
|
||
"318964 2564021 44247_204_239_89278 2023-09-12 18:59:48.750953+02:00 \n",
|
||
"318965 2564022 44247_204_299_89278 2023-09-12 18:59:48.751441+02:00 \n",
|
||
"318966 2564023 44247_204_259_89278 2023-09-12 18:59:48.751924+02:00 \n",
|
||
"318967 2564024 44247_204_279_89278 2023-09-12 18:59:48.752425+02:00 \n",
|
||
"318968 2513156 4854_178_2847_89170 2023-09-12 18:52:20.331807+02:00 \n",
|
||
"\n",
|
||
" updated_at purchase_id product_id \\\n",
|
||
"0 2023-09-12 17:42:45.396336+02:00 861764 209879 \n",
|
||
"1 2023-09-12 17:42:45.409056+02:00 861763 209879 \n",
|
||
"2 2023-09-12 17:42:45.409824+02:00 861769 209880 \n",
|
||
"3 2023-09-12 17:42:45.410447+02:00 861767 209880 \n",
|
||
"4 2023-09-12 17:42:45.411059+02:00 861768 209880 \n",
|
||
"... ... ... ... \n",
|
||
"318964 2023-09-12 18:59:48.750953+02:00 1244281 210158 \n",
|
||
"318965 2023-09-12 18:59:48.751441+02:00 1244284 210158 \n",
|
||
"318966 2023-09-12 18:59:48.751924+02:00 1244282 210158 \n",
|
||
"318967 2023-09-12 18:59:48.752425+02:00 1244283 210158 \n",
|
||
"318968 2023-09-12 18:59:48.752904+02:00 1244285 261922 \n",
|
||
"\n",
|
||
" is_from_subscription type_of supplier_id barcode \\\n",
|
||
"0 False 1 1702 NaN \n",
|
||
"1 False 1 1702 NaN \n",
|
||
"2 False 1 1702 NaN \n",
|
||
"3 False 1 1702 NaN \n",
|
||
"4 False 1 1702 NaN \n",
|
||
"... ... ... ... ... \n",
|
||
"318964 False 1 1702 NaN \n",
|
||
"318965 False 1 1702 NaN \n",
|
||
"318966 False 1 1702 NaN \n",
|
||
"318967 False 1 1702 NaN \n",
|
||
"318968 False 3 1702 NaN \n",
|
||
"\n",
|
||
" identifier \n",
|
||
"0 f694c255855ce5643c6fcc7fed5e9237 \n",
|
||
"1 838d6101db2fc8bc80536d8b91b49859 \n",
|
||
"2 8a8d938d66a4dc57bcb44c2773c6fdfa \n",
|
||
"3 b7a3dd0794c0957c942d45b8913e5b96 \n",
|
||
"4 d7ea7e443581ebe520dd13f6cad31af7 \n",
|
||
"... ... \n",
|
||
"318964 82c9af8b2167f7ac34a5e834242b0239 \n",
|
||
"318965 235e8e608f066cb72949bbd397d0a76f \n",
|
||
"318966 ec22fa828931f030f7e79a4cc5478c4b \n",
|
||
"318967 31ec4deaf718e04caf193e1ff8d621ef \n",
|
||
"318968 48aef9efab29bfb1537656908863bcc1 \n",
|
||
"\n",
|
||
"[318969 rows x 11 columns]"
|
||
]
|
||
},
|
||
"execution_count": 30,
|
||
"metadata": {},
|
||
"output_type": "execute_result"
|
||
}
|
||
],
|
||
"source": [
|
||
"tickets"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 33,
|
||
"id": "ad743347-33d1-41f0-852d-f9e6354f82ed",
|
||
"metadata": {},
|
||
"outputs": [
|
||
{
|
||
"data": {
|
||
"text/plain": [
|
||
"array([1, 3, 0])"
|
||
]
|
||
},
|
||
"execution_count": 33,
|
||
"metadata": {},
|
||
"output_type": "execute_result"
|
||
}
|
||
],
|
||
"source": [
|
||
"tickets['type_of'].unique()"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "markdown",
|
||
"id": "b88808fe-3b4e-49ed-9885-d52910b6f211",
|
||
"metadata": {},
|
||
"source": [
|
||
"## Types d'évenement et client"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 6,
|
||
"id": "ecb03a47-1418-4fb1-8c78-cd222d38b7fd",
|
||
"metadata": {},
|
||
"outputs": [
|
||
{
|
||
"name": "stdout",
|
||
"output_type": "stream",
|
||
"text": [
|
||
"Index(['id', 'created_at', 'updated_at', 'season_id', 'facility_id', 'name',\n",
|
||
" 'event_type_id', 'manual_added', 'is_display', 'event_type_key_id',\n",
|
||
" 'facility_key_id', 'identifier'],\n",
|
||
" dtype='object')\n",
|
||
"(403, 12)\n",
|
||
"<class 'pandas.core.frame.DataFrame'>\n",
|
||
"RangeIndex: 403 entries, 0 to 402\n",
|
||
"Data columns (total 12 columns):\n",
|
||
" # Column Non-Null Count Dtype \n",
|
||
"--- ------ -------------- ----- \n",
|
||
" 0 id 403 non-null int64 \n",
|
||
" 1 created_at 403 non-null object\n",
|
||
" 2 updated_at 403 non-null object\n",
|
||
" 3 season_id 403 non-null int64 \n",
|
||
" 4 facility_id 403 non-null int64 \n",
|
||
" 5 name 403 non-null object\n",
|
||
" 6 event_type_id 403 non-null int64 \n",
|
||
" 7 manual_added 403 non-null bool \n",
|
||
" 8 is_display 403 non-null bool \n",
|
||
" 9 event_type_key_id 403 non-null int64 \n",
|
||
" 10 facility_key_id 403 non-null int64 \n",
|
||
" 11 identifier 403 non-null object\n",
|
||
"dtypes: bool(2), int64(6), object(4)\n",
|
||
"memory usage: 32.4+ KB\n"
|
||
]
|
||
}
|
||
],
|
||
"source": [
|
||
"# Evenement = events.csv\n",
|
||
"FILE_PATH_S3 = 'bdc2324-data/11/11events.csv'\n",
|
||
"\n",
|
||
"with fs.open(FILE_PATH_S3, mode=\"rb\") as file_in:\n",
|
||
" events = pd.read_csv(file_in, sep=\",\")\n",
|
||
"\n",
|
||
"print(events.columns)\n",
|
||
"print(events.shape)\n",
|
||
"events.info()"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 7,
|
||
"id": "19706610-9e90-4e6f-8bd0-da124b87cff7",
|
||
"metadata": {},
|
||
"outputs": [
|
||
{
|
||
"data": {
|
||
"text/html": [
|
||
"<div>\n",
|
||
"<style scoped>\n",
|
||
" .dataframe tbody tr th:only-of-type {\n",
|
||
" vertical-align: middle;\n",
|
||
" }\n",
|
||
"\n",
|
||
" .dataframe tbody tr th {\n",
|
||
" vertical-align: top;\n",
|
||
" }\n",
|
||
"\n",
|
||
" .dataframe thead th {\n",
|
||
" text-align: right;\n",
|
||
" }\n",
|
||
"</style>\n",
|
||
"<table border=\"1\" class=\"dataframe\">\n",
|
||
" <thead>\n",
|
||
" <tr style=\"text-align: right;\">\n",
|
||
" <th></th>\n",
|
||
" <th>id</th>\n",
|
||
" <th>created_at</th>\n",
|
||
" <th>updated_at</th>\n",
|
||
" <th>season_id</th>\n",
|
||
" <th>facility_id</th>\n",
|
||
" <th>name</th>\n",
|
||
" <th>event_type_id</th>\n",
|
||
" <th>manual_added</th>\n",
|
||
" <th>is_display</th>\n",
|
||
" <th>event_type_key_id</th>\n",
|
||
" <th>facility_key_id</th>\n",
|
||
" <th>identifier</th>\n",
|
||
" </tr>\n",
|
||
" </thead>\n",
|
||
" <tbody>\n",
|
||
" <tr>\n",
|
||
" <th>0</th>\n",
|
||
" <td>20367</td>\n",
|
||
" <td>2023-09-13 03:42:45.214293+02:00</td>\n",
|
||
" <td>2023-09-13 03:54:30.086969+02:00</td>\n",
|
||
" <td>1865</td>\n",
|
||
" <td>1054</td>\n",
|
||
" <td>marelle</td>\n",
|
||
" <td>1055</td>\n",
|
||
" <td>False</td>\n",
|
||
" <td>True</td>\n",
|
||
" <td>1055</td>\n",
|
||
" <td>1054</td>\n",
|
||
" <td>26d1e9a4acad18b9cf79244334c86c93</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>1</th>\n",
|
||
" <td>20371</td>\n",
|
||
" <td>2023-09-13 03:42:45.218728+02:00</td>\n",
|
||
" <td>2023-09-13 03:54:30.103943+02:00</td>\n",
|
||
" <td>1865</td>\n",
|
||
" <td>1054</td>\n",
|
||
" <td>dialogues</td>\n",
|
||
" <td>1055</td>\n",
|
||
" <td>False</td>\n",
|
||
" <td>True</td>\n",
|
||
" <td>1055</td>\n",
|
||
" <td>1054</td>\n",
|
||
" <td>60356fc5e8ed6c9c1be9c5ec67e77766</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>2</th>\n",
|
||
" <td>20570</td>\n",
|
||
" <td>2023-10-05 04:48:29.374504+02:00</td>\n",
|
||
" <td>2023-10-05 04:48:36.562528+02:00</td>\n",
|
||
" <td>1865</td>\n",
|
||
" <td>1054</td>\n",
|
||
" <td>les grandes epopees</td>\n",
|
||
" <td>1055</td>\n",
|
||
" <td>False</td>\n",
|
||
" <td>True</td>\n",
|
||
" <td>1055</td>\n",
|
||
" <td>1054</td>\n",
|
||
" <td>f8ab088e06252bf34e1b12ad2ce1a403</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>3</th>\n",
|
||
" <td>20757</td>\n",
|
||
" <td>2023-11-01 03:55:20.846196+01:00</td>\n",
|
||
" <td>2023-11-01 03:55:28.412457+01:00</td>\n",
|
||
" <td>1865</td>\n",
|
||
" <td>1054</td>\n",
|
||
" <td>scolaire marelle</td>\n",
|
||
" <td>1055</td>\n",
|
||
" <td>False</td>\n",
|
||
" <td>True</td>\n",
|
||
" <td>1055</td>\n",
|
||
" <td>1054</td>\n",
|
||
" <td>447fa80f9a793b7587bb85ebbda6442c</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>4</th>\n",
|
||
" <td>20364</td>\n",
|
||
" <td>2023-09-13 03:42:45.196791+02:00</td>\n",
|
||
" <td>2023-09-13 03:54:30.075456+02:00</td>\n",
|
||
" <td>1865</td>\n",
|
||
" <td>1054</td>\n",
|
||
" <td>le couronnement de poppee</td>\n",
|
||
" <td>1055</td>\n",
|
||
" <td>False</td>\n",
|
||
" <td>True</td>\n",
|
||
" <td>1055</td>\n",
|
||
" <td>1054</td>\n",
|
||
" <td>3b37f5d2cd354cbc422868621ac7ebc2</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>...</th>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>398</th>\n",
|
||
" <td>15603</td>\n",
|
||
" <td>2023-09-12 17:42:25.327618+02:00</td>\n",
|
||
" <td>2023-09-12 19:00:00.893400+02:00</td>\n",
|
||
" <td>1706</td>\n",
|
||
" <td>1054</td>\n",
|
||
" <td>marelle</td>\n",
|
||
" <td>1055</td>\n",
|
||
" <td>False</td>\n",
|
||
" <td>True</td>\n",
|
||
" <td>1055</td>\n",
|
||
" <td>1054</td>\n",
|
||
" <td>fde88b72fb82b1fe42fbbfbfc3d6b4d3</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>399</th>\n",
|
||
" <td>15621</td>\n",
|
||
" <td>2023-09-12 17:42:25.335792+02:00</td>\n",
|
||
" <td>2023-09-12 19:00:00.899622+02:00</td>\n",
|
||
" <td>1708</td>\n",
|
||
" <td>1054</td>\n",
|
||
" <td>cartes d'adhesion</td>\n",
|
||
" <td>1055</td>\n",
|
||
" <td>False</td>\n",
|
||
" <td>True</td>\n",
|
||
" <td>1055</td>\n",
|
||
" <td>1054</td>\n",
|
||
" <td>051b96aad2b720bad4450a59ed7dfbf6</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>400</th>\n",
|
||
" <td>15740</td>\n",
|
||
" <td>2023-09-12 17:47:05.112101+02:00</td>\n",
|
||
" <td>2023-09-12 19:00:00.906123+02:00</td>\n",
|
||
" <td>1711</td>\n",
|
||
" <td>1054</td>\n",
|
||
" <td>repetition le medecin malgre lui</td>\n",
|
||
" <td>1055</td>\n",
|
||
" <td>False</td>\n",
|
||
" <td>True</td>\n",
|
||
" <td>1055</td>\n",
|
||
" <td>1054</td>\n",
|
||
" <td>addd6885bea5ddf60ec3539dfc3e79e8</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>401</th>\n",
|
||
" <td>15520</td>\n",
|
||
" <td>2023-09-12 17:42:25.290280+02:00</td>\n",
|
||
" <td>2023-09-12 19:00:00.835625+02:00</td>\n",
|
||
" <td>1708</td>\n",
|
||
" <td>1054</td>\n",
|
||
" <td>opera au village</td>\n",
|
||
" <td>1055</td>\n",
|
||
" <td>False</td>\n",
|
||
" <td>True</td>\n",
|
||
" <td>1055</td>\n",
|
||
" <td>1054</td>\n",
|
||
" <td>94f250d10d4a56358ceab23b384439ff</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>402</th>\n",
|
||
" <td>15439</td>\n",
|
||
" <td>2023-09-12 17:42:25.252747+02:00</td>\n",
|
||
" <td>2023-09-12 19:00:00.735990+02:00</td>\n",
|
||
" <td>1708</td>\n",
|
||
" <td>1054</td>\n",
|
||
" <td>florilege</td>\n",
|
||
" <td>1055</td>\n",
|
||
" <td>False</td>\n",
|
||
" <td>True</td>\n",
|
||
" <td>1055</td>\n",
|
||
" <td>1054</td>\n",
|
||
" <td>4f015946bcbd856aa573cadb7ac42b9f</td>\n",
|
||
" </tr>\n",
|
||
" </tbody>\n",
|
||
"</table>\n",
|
||
"<p>403 rows × 12 columns</p>\n",
|
||
"</div>"
|
||
],
|
||
"text/plain": [
|
||
" id created_at \\\n",
|
||
"0 20367 2023-09-13 03:42:45.214293+02:00 \n",
|
||
"1 20371 2023-09-13 03:42:45.218728+02:00 \n",
|
||
"2 20570 2023-10-05 04:48:29.374504+02:00 \n",
|
||
"3 20757 2023-11-01 03:55:20.846196+01:00 \n",
|
||
"4 20364 2023-09-13 03:42:45.196791+02:00 \n",
|
||
".. ... ... \n",
|
||
"398 15603 2023-09-12 17:42:25.327618+02:00 \n",
|
||
"399 15621 2023-09-12 17:42:25.335792+02:00 \n",
|
||
"400 15740 2023-09-12 17:47:05.112101+02:00 \n",
|
||
"401 15520 2023-09-12 17:42:25.290280+02:00 \n",
|
||
"402 15439 2023-09-12 17:42:25.252747+02:00 \n",
|
||
"\n",
|
||
" updated_at season_id facility_id \\\n",
|
||
"0 2023-09-13 03:54:30.086969+02:00 1865 1054 \n",
|
||
"1 2023-09-13 03:54:30.103943+02:00 1865 1054 \n",
|
||
"2 2023-10-05 04:48:36.562528+02:00 1865 1054 \n",
|
||
"3 2023-11-01 03:55:28.412457+01:00 1865 1054 \n",
|
||
"4 2023-09-13 03:54:30.075456+02:00 1865 1054 \n",
|
||
".. ... ... ... \n",
|
||
"398 2023-09-12 19:00:00.893400+02:00 1706 1054 \n",
|
||
"399 2023-09-12 19:00:00.899622+02:00 1708 1054 \n",
|
||
"400 2023-09-12 19:00:00.906123+02:00 1711 1054 \n",
|
||
"401 2023-09-12 19:00:00.835625+02:00 1708 1054 \n",
|
||
"402 2023-09-12 19:00:00.735990+02:00 1708 1054 \n",
|
||
"\n",
|
||
" name event_type_id manual_added \\\n",
|
||
"0 marelle 1055 False \n",
|
||
"1 dialogues 1055 False \n",
|
||
"2 les grandes epopees 1055 False \n",
|
||
"3 scolaire marelle 1055 False \n",
|
||
"4 le couronnement de poppee 1055 False \n",
|
||
".. ... ... ... \n",
|
||
"398 marelle 1055 False \n",
|
||
"399 cartes d'adhesion 1055 False \n",
|
||
"400 repetition le medecin malgre lui 1055 False \n",
|
||
"401 opera au village 1055 False \n",
|
||
"402 florilege 1055 False \n",
|
||
"\n",
|
||
" is_display event_type_key_id facility_key_id \\\n",
|
||
"0 True 1055 1054 \n",
|
||
"1 True 1055 1054 \n",
|
||
"2 True 1055 1054 \n",
|
||
"3 True 1055 1054 \n",
|
||
"4 True 1055 1054 \n",
|
||
".. ... ... ... \n",
|
||
"398 True 1055 1054 \n",
|
||
"399 True 1055 1054 \n",
|
||
"400 True 1055 1054 \n",
|
||
"401 True 1055 1054 \n",
|
||
"402 True 1055 1054 \n",
|
||
"\n",
|
||
" identifier \n",
|
||
"0 26d1e9a4acad18b9cf79244334c86c93 \n",
|
||
"1 60356fc5e8ed6c9c1be9c5ec67e77766 \n",
|
||
"2 f8ab088e06252bf34e1b12ad2ce1a403 \n",
|
||
"3 447fa80f9a793b7587bb85ebbda6442c \n",
|
||
"4 3b37f5d2cd354cbc422868621ac7ebc2 \n",
|
||
".. ... \n",
|
||
"398 fde88b72fb82b1fe42fbbfbfc3d6b4d3 \n",
|
||
"399 051b96aad2b720bad4450a59ed7dfbf6 \n",
|
||
"400 addd6885bea5ddf60ec3539dfc3e79e8 \n",
|
||
"401 94f250d10d4a56358ceab23b384439ff \n",
|
||
"402 4f015946bcbd856aa573cadb7ac42b9f \n",
|
||
"\n",
|
||
"[403 rows x 12 columns]"
|
||
]
|
||
},
|
||
"execution_count": 7,
|
||
"metadata": {},
|
||
"output_type": "execute_result"
|
||
}
|
||
],
|
||
"source": [
|
||
"events"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 15,
|
||
"id": "6cb04679-26e7-4ed8-bfc1-42285da96374",
|
||
"metadata": {},
|
||
"outputs": [
|
||
{
|
||
"data": {
|
||
"text/plain": [
|
||
"357"
|
||
]
|
||
},
|
||
"execution_count": 15,
|
||
"metadata": {},
|
||
"output_type": "execute_result"
|
||
}
|
||
],
|
||
"source": [
|
||
"events['name'].nunique()"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 16,
|
||
"id": "c10297e8-a8f9-45f9-8553-17e3fdb6f8c1",
|
||
"metadata": {},
|
||
"outputs": [
|
||
{
|
||
"name": "stdout",
|
||
"output_type": "stream",
|
||
"text": [
|
||
"Index(['id', 'serial', 'event_id', 'created_at', 'updated_at',\n",
|
||
" 'start_date_time', 'open', 'satisfaction', 'end_date_time', 'name',\n",
|
||
" 'is_display', 'representation_type_id', 'expected_filling',\n",
|
||
" 'max_filling', 'extra_field', 'identifier'],\n",
|
||
" dtype='object')\n",
|
||
"(996, 16)\n",
|
||
"<class 'pandas.core.frame.DataFrame'>\n",
|
||
"RangeIndex: 996 entries, 0 to 995\n",
|
||
"Data columns (total 16 columns):\n",
|
||
" # Column Non-Null Count Dtype \n",
|
||
"--- ------ -------------- ----- \n",
|
||
" 0 id 996 non-null int64 \n",
|
||
" 1 serial 0 non-null float64\n",
|
||
" 2 event_id 996 non-null int64 \n",
|
||
" 3 created_at 996 non-null object \n",
|
||
" 4 updated_at 996 non-null object \n",
|
||
" 5 start_date_time 996 non-null object \n",
|
||
" 6 open 996 non-null bool \n",
|
||
" 7 satisfaction 0 non-null float64\n",
|
||
" 8 end_date_time 996 non-null object \n",
|
||
" 9 name 0 non-null float64\n",
|
||
" 10 is_display 996 non-null bool \n",
|
||
" 11 representation_type_id 0 non-null float64\n",
|
||
" 12 expected_filling 24 non-null float64\n",
|
||
" 13 max_filling 24 non-null float64\n",
|
||
" 14 extra_field 0 non-null float64\n",
|
||
" 15 identifier 996 non-null object \n",
|
||
"dtypes: bool(2), float64(7), int64(2), object(5)\n",
|
||
"memory usage: 111.0+ KB\n"
|
||
]
|
||
}
|
||
],
|
||
"source": [
|
||
"# Représentation des évenements = representations.csv\n",
|
||
"FILE_PATH_S3 = 'bdc2324-data/11/11representations.csv'\n",
|
||
"\n",
|
||
"with fs.open(FILE_PATH_S3, mode=\"rb\") as file_in:\n",
|
||
" representations = pd.read_csv(file_in, sep=\",\")\n",
|
||
"\n",
|
||
"print(representations.columns)\n",
|
||
"print(representations.shape)\n",
|
||
"representations.info()"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 17,
|
||
"id": "41ef6a1b-e99e-4c73-a2ae-ba7d438d90c2",
|
||
"metadata": {},
|
||
"outputs": [
|
||
{
|
||
"data": {
|
||
"text/html": [
|
||
"<div>\n",
|
||
"<style scoped>\n",
|
||
" .dataframe tbody tr th:only-of-type {\n",
|
||
" vertical-align: middle;\n",
|
||
" }\n",
|
||
"\n",
|
||
" .dataframe tbody tr th {\n",
|
||
" vertical-align: top;\n",
|
||
" }\n",
|
||
"\n",
|
||
" .dataframe thead th {\n",
|
||
" text-align: right;\n",
|
||
" }\n",
|
||
"</style>\n",
|
||
"<table border=\"1\" class=\"dataframe\">\n",
|
||
" <thead>\n",
|
||
" <tr style=\"text-align: right;\">\n",
|
||
" <th></th>\n",
|
||
" <th>id</th>\n",
|
||
" <th>serial</th>\n",
|
||
" <th>event_id</th>\n",
|
||
" <th>created_at</th>\n",
|
||
" <th>updated_at</th>\n",
|
||
" <th>start_date_time</th>\n",
|
||
" <th>open</th>\n",
|
||
" <th>satisfaction</th>\n",
|
||
" <th>end_date_time</th>\n",
|
||
" <th>name</th>\n",
|
||
" <th>is_display</th>\n",
|
||
" <th>representation_type_id</th>\n",
|
||
" <th>expected_filling</th>\n",
|
||
" <th>max_filling</th>\n",
|
||
" <th>extra_field</th>\n",
|
||
" <th>identifier</th>\n",
|
||
" </tr>\n",
|
||
" </thead>\n",
|
||
" <tbody>\n",
|
||
" <tr>\n",
|
||
" <th>0</th>\n",
|
||
" <td>44351</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>20371</td>\n",
|
||
" <td>2023-09-13 03:42:45.245879+02:00</td>\n",
|
||
" <td>2023-09-13 03:42:45.245879+02:00</td>\n",
|
||
" <td>2023-12-21 20:00:00+01:00</td>\n",
|
||
" <td>True</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>1901-01-01 00:09:21+00:09</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>True</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>550.0</td>\n",
|
||
" <td>550.0</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>33520762e8cc28982e3841cbc2be8ce2</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>1</th>\n",
|
||
" <td>45497</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>20757</td>\n",
|
||
" <td>2023-11-01 03:55:20.875712+01:00</td>\n",
|
||
" <td>2023-11-01 03:55:20.875712+01:00</td>\n",
|
||
" <td>2023-11-28 10:00:00+01:00</td>\n",
|
||
" <td>True</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>1901-01-01 00:09:21+00:09</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>True</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>5c34b84e3d11276e0995d984c94cd28d</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>2</th>\n",
|
||
" <td>44383</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>20383</td>\n",
|
||
" <td>2023-09-13 10:41:08.964302+02:00</td>\n",
|
||
" <td>2023-09-13 10:41:08.964302+02:00</td>\n",
|
||
" <td>2023-06-04 17:00:00+02:00</td>\n",
|
||
" <td>True</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>1901-01-01 00:09:21+00:09</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>True</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>bf3c65a1dfefbd747dcc2360e6887eac</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>3</th>\n",
|
||
" <td>44384</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>20383</td>\n",
|
||
" <td>2023-09-13 10:41:08.972401+02:00</td>\n",
|
||
" <td>2023-09-13 10:41:08.972401+02:00</td>\n",
|
||
" <td>2023-06-03 17:30:00+02:00</td>\n",
|
||
" <td>True</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>1901-01-01 00:09:21+00:09</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>True</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>b0e69ae8b78ebab3066aac83de22d239</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>4</th>\n",
|
||
" <td>44385</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>20384</td>\n",
|
||
" <td>2023-09-13 10:41:08.973290+02:00</td>\n",
|
||
" <td>2023-09-13 10:41:08.973290+02:00</td>\n",
|
||
" <td>2023-06-03 16:15:00+02:00</td>\n",
|
||
" <td>True</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>1901-01-01 00:09:21+00:09</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>True</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>9fb91c8b1cf9e444111c511e212ac5c1</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>...</th>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>991</th>\n",
|
||
" <td>33894</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>15647</td>\n",
|
||
" <td>2023-09-12 17:42:25.564297+02:00</td>\n",
|
||
" <td>2023-09-12 17:42:25.564297+02:00</td>\n",
|
||
" <td>2022-11-08 20:00:00+01:00</td>\n",
|
||
" <td>True</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>1901-01-01 00:09:21+00:09</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>True</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>44bbcecfd007ceaad05805391beccabb</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>992</th>\n",
|
||
" <td>33873</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>15640</td>\n",
|
||
" <td>2023-09-12 17:42:25.554863+02:00</td>\n",
|
||
" <td>2023-09-12 17:42:25.554863+02:00</td>\n",
|
||
" <td>2022-11-14 20:00:00+01:00</td>\n",
|
||
" <td>True</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>1901-01-01 00:09:21+00:09</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>True</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>151edbec8e0a3cd80071038e857f3493</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>993</th>\n",
|
||
" <td>33610</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>15520</td>\n",
|
||
" <td>2023-09-12 17:42:25.442979+02:00</td>\n",
|
||
" <td>2023-09-12 17:42:25.442979+02:00</td>\n",
|
||
" <td>2023-06-19 18:00:00+02:00</td>\n",
|
||
" <td>True</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>1901-01-01 00:09:21+00:09</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>True</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>9e9e38d527427e1b6f67e0c3f12b82fc</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>994</th>\n",
|
||
" <td>33953</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>15520</td>\n",
|
||
" <td>2023-09-12 17:42:25.590746+02:00</td>\n",
|
||
" <td>2023-09-12 17:42:25.590746+02:00</td>\n",
|
||
" <td>2023-06-19 20:00:00+02:00</td>\n",
|
||
" <td>True</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>1901-01-01 00:09:21+00:09</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>True</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>7bf0978aabb6cac1bb4cd2784afb2b6b</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>995</th>\n",
|
||
" <td>33639</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>15533</td>\n",
|
||
" <td>2023-09-12 17:42:25.455708+02:00</td>\n",
|
||
" <td>2023-09-12 17:42:25.455708+02:00</td>\n",
|
||
" <td>2023-04-15 17:30:00+02:00</td>\n",
|
||
" <td>True</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>1901-01-01 00:09:21+00:09</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>True</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>fae68f1e09710ec8747957af6e22f61d</td>\n",
|
||
" </tr>\n",
|
||
" </tbody>\n",
|
||
"</table>\n",
|
||
"<p>996 rows × 16 columns</p>\n",
|
||
"</div>"
|
||
],
|
||
"text/plain": [
|
||
" id serial event_id created_at \\\n",
|
||
"0 44351 NaN 20371 2023-09-13 03:42:45.245879+02:00 \n",
|
||
"1 45497 NaN 20757 2023-11-01 03:55:20.875712+01:00 \n",
|
||
"2 44383 NaN 20383 2023-09-13 10:41:08.964302+02:00 \n",
|
||
"3 44384 NaN 20383 2023-09-13 10:41:08.972401+02:00 \n",
|
||
"4 44385 NaN 20384 2023-09-13 10:41:08.973290+02:00 \n",
|
||
".. ... ... ... ... \n",
|
||
"991 33894 NaN 15647 2023-09-12 17:42:25.564297+02:00 \n",
|
||
"992 33873 NaN 15640 2023-09-12 17:42:25.554863+02:00 \n",
|
||
"993 33610 NaN 15520 2023-09-12 17:42:25.442979+02:00 \n",
|
||
"994 33953 NaN 15520 2023-09-12 17:42:25.590746+02:00 \n",
|
||
"995 33639 NaN 15533 2023-09-12 17:42:25.455708+02:00 \n",
|
||
"\n",
|
||
" updated_at start_date_time open \\\n",
|
||
"0 2023-09-13 03:42:45.245879+02:00 2023-12-21 20:00:00+01:00 True \n",
|
||
"1 2023-11-01 03:55:20.875712+01:00 2023-11-28 10:00:00+01:00 True \n",
|
||
"2 2023-09-13 10:41:08.964302+02:00 2023-06-04 17:00:00+02:00 True \n",
|
||
"3 2023-09-13 10:41:08.972401+02:00 2023-06-03 17:30:00+02:00 True \n",
|
||
"4 2023-09-13 10:41:08.973290+02:00 2023-06-03 16:15:00+02:00 True \n",
|
||
".. ... ... ... \n",
|
||
"991 2023-09-12 17:42:25.564297+02:00 2022-11-08 20:00:00+01:00 True \n",
|
||
"992 2023-09-12 17:42:25.554863+02:00 2022-11-14 20:00:00+01:00 True \n",
|
||
"993 2023-09-12 17:42:25.442979+02:00 2023-06-19 18:00:00+02:00 True \n",
|
||
"994 2023-09-12 17:42:25.590746+02:00 2023-06-19 20:00:00+02:00 True \n",
|
||
"995 2023-09-12 17:42:25.455708+02:00 2023-04-15 17:30:00+02:00 True \n",
|
||
"\n",
|
||
" satisfaction end_date_time name is_display \\\n",
|
||
"0 NaN 1901-01-01 00:09:21+00:09 NaN True \n",
|
||
"1 NaN 1901-01-01 00:09:21+00:09 NaN True \n",
|
||
"2 NaN 1901-01-01 00:09:21+00:09 NaN True \n",
|
||
"3 NaN 1901-01-01 00:09:21+00:09 NaN True \n",
|
||
"4 NaN 1901-01-01 00:09:21+00:09 NaN True \n",
|
||
".. ... ... ... ... \n",
|
||
"991 NaN 1901-01-01 00:09:21+00:09 NaN True \n",
|
||
"992 NaN 1901-01-01 00:09:21+00:09 NaN True \n",
|
||
"993 NaN 1901-01-01 00:09:21+00:09 NaN True \n",
|
||
"994 NaN 1901-01-01 00:09:21+00:09 NaN True \n",
|
||
"995 NaN 1901-01-01 00:09:21+00:09 NaN True \n",
|
||
"\n",
|
||
" representation_type_id expected_filling max_filling extra_field \\\n",
|
||
"0 NaN 550.0 550.0 NaN \n",
|
||
"1 NaN NaN NaN NaN \n",
|
||
"2 NaN NaN NaN NaN \n",
|
||
"3 NaN NaN NaN NaN \n",
|
||
"4 NaN NaN NaN NaN \n",
|
||
".. ... ... ... ... \n",
|
||
"991 NaN NaN NaN NaN \n",
|
||
"992 NaN NaN NaN NaN \n",
|
||
"993 NaN NaN NaN NaN \n",
|
||
"994 NaN NaN NaN NaN \n",
|
||
"995 NaN NaN NaN NaN \n",
|
||
"\n",
|
||
" identifier \n",
|
||
"0 33520762e8cc28982e3841cbc2be8ce2 \n",
|
||
"1 5c34b84e3d11276e0995d984c94cd28d \n",
|
||
"2 bf3c65a1dfefbd747dcc2360e6887eac \n",
|
||
"3 b0e69ae8b78ebab3066aac83de22d239 \n",
|
||
"4 9fb91c8b1cf9e444111c511e212ac5c1 \n",
|
||
".. ... \n",
|
||
"991 44bbcecfd007ceaad05805391beccabb \n",
|
||
"992 151edbec8e0a3cd80071038e857f3493 \n",
|
||
"993 9e9e38d527427e1b6f67e0c3f12b82fc \n",
|
||
"994 7bf0978aabb6cac1bb4cd2784afb2b6b \n",
|
||
"995 fae68f1e09710ec8747957af6e22f61d \n",
|
||
"\n",
|
||
"[996 rows x 16 columns]"
|
||
]
|
||
},
|
||
"execution_count": 17,
|
||
"metadata": {},
|
||
"output_type": "execute_result"
|
||
}
|
||
],
|
||
"source": [
|
||
"representations"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 18,
|
||
"id": "ae6cdad3-2184-4ae7-928c-2f8bd7769a5b",
|
||
"metadata": {},
|
||
"outputs": [
|
||
{
|
||
"name": "stdout",
|
||
"output_type": "stream",
|
||
"text": [
|
||
"Index(['id', 'amount', 'is_full_price', 'representation_id',\n",
|
||
" 'pricing_formula_id', 'created_at', 'updated_at', 'category_id',\n",
|
||
" 'apply_price', 'products_group_id', 'product_pack_id', 'extra_field',\n",
|
||
" 'amount_consumption', 'identifier'],\n",
|
||
" dtype='object')\n",
|
||
"(14648, 14)\n",
|
||
"<class 'pandas.core.frame.DataFrame'>\n",
|
||
"RangeIndex: 14648 entries, 0 to 14647\n",
|
||
"Data columns (total 14 columns):\n",
|
||
" # Column Non-Null Count Dtype \n",
|
||
"--- ------ -------------- ----- \n",
|
||
" 0 id 14648 non-null int64 \n",
|
||
" 1 amount 14648 non-null float64\n",
|
||
" 2 is_full_price 14648 non-null bool \n",
|
||
" 3 representation_id 14648 non-null int64 \n",
|
||
" 4 pricing_formula_id 14648 non-null int64 \n",
|
||
" 5 created_at 14648 non-null object \n",
|
||
" 6 updated_at 14648 non-null object \n",
|
||
" 7 category_id 14648 non-null int64 \n",
|
||
" 8 apply_price 14648 non-null float64\n",
|
||
" 9 products_group_id 14648 non-null int64 \n",
|
||
" 10 product_pack_id 14648 non-null int64 \n",
|
||
" 11 extra_field 0 non-null float64\n",
|
||
" 12 amount_consumption 0 non-null float64\n",
|
||
" 13 identifier 14648 non-null object \n",
|
||
"dtypes: bool(1), float64(4), int64(6), object(3)\n",
|
||
"memory usage: 1.5+ MB\n"
|
||
]
|
||
}
|
||
],
|
||
"source": [
|
||
"# Produits vendues = products.csv\n",
|
||
"FILE_PATH_S3 = 'bdc2324-data/11/11products.csv'\n",
|
||
"\n",
|
||
"with fs.open(FILE_PATH_S3, mode=\"rb\") as file_in:\n",
|
||
" products = pd.read_csv(file_in, sep=\",\")\n",
|
||
"\n",
|
||
"print(products.columns)\n",
|
||
"print(products.shape)\n",
|
||
"products.info()"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 19,
|
||
"id": "34f1825d-148a-4a6e-88d6-61449fee3ee4",
|
||
"metadata": {},
|
||
"outputs": [
|
||
{
|
||
"data": {
|
||
"text/html": [
|
||
"<div>\n",
|
||
"<style scoped>\n",
|
||
" .dataframe tbody tr th:only-of-type {\n",
|
||
" vertical-align: middle;\n",
|
||
" }\n",
|
||
"\n",
|
||
" .dataframe tbody tr th {\n",
|
||
" vertical-align: top;\n",
|
||
" }\n",
|
||
"\n",
|
||
" .dataframe thead th {\n",
|
||
" text-align: right;\n",
|
||
" }\n",
|
||
"</style>\n",
|
||
"<table border=\"1\" class=\"dataframe\">\n",
|
||
" <thead>\n",
|
||
" <tr style=\"text-align: right;\">\n",
|
||
" <th></th>\n",
|
||
" <th>id</th>\n",
|
||
" <th>amount</th>\n",
|
||
" <th>is_full_price</th>\n",
|
||
" <th>representation_id</th>\n",
|
||
" <th>pricing_formula_id</th>\n",
|
||
" <th>created_at</th>\n",
|
||
" <th>updated_at</th>\n",
|
||
" <th>category_id</th>\n",
|
||
" <th>apply_price</th>\n",
|
||
" <th>products_group_id</th>\n",
|
||
" <th>product_pack_id</th>\n",
|
||
" <th>extra_field</th>\n",
|
||
" <th>amount_consumption</th>\n",
|
||
" <th>identifier</th>\n",
|
||
" </tr>\n",
|
||
" </thead>\n",
|
||
" <tbody>\n",
|
||
" <tr>\n",
|
||
" <th>0</th>\n",
|
||
" <td>268325</td>\n",
|
||
" <td>18.0</td>\n",
|
||
" <td>False</td>\n",
|
||
" <td>44332</td>\n",
|
||
" <td>20477</td>\n",
|
||
" <td>2023-09-13 03:42:45.415594+02:00</td>\n",
|
||
" <td>2023-09-13 03:42:45.415594+02:00</td>\n",
|
||
" <td>4972</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>268108</td>\n",
|
||
" <td>1</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>b823bbea3ba837da2ef8efaf1287272d</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>1</th>\n",
|
||
" <td>274118</td>\n",
|
||
" <td>36.8</td>\n",
|
||
" <td>False</td>\n",
|
||
" <td>44340</td>\n",
|
||
" <td>20502</td>\n",
|
||
" <td>2023-10-25 03:26:57.430694+02:00</td>\n",
|
||
" <td>2023-10-25 03:26:57.430694+02:00</td>\n",
|
||
" <td>4969</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>273901</td>\n",
|
||
" <td>1</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>81e8b7991f6948e3ef7cfe5011d13532</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>2</th>\n",
|
||
" <td>268338</td>\n",
|
||
" <td>39.1</td>\n",
|
||
" <td>False</td>\n",
|
||
" <td>44340</td>\n",
|
||
" <td>20497</td>\n",
|
||
" <td>2023-09-13 03:42:45.430942+02:00</td>\n",
|
||
" <td>2023-09-13 03:42:45.430942+02:00</td>\n",
|
||
" <td>4969</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>268121</td>\n",
|
||
" <td>1</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>be8bc0399db4d04aefa9f44afd4d5efa</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>3</th>\n",
|
||
" <td>209883</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>False</td>\n",
|
||
" <td>33443</td>\n",
|
||
" <td>20475</td>\n",
|
||
" <td>2023-09-12 17:42:27.595998+02:00</td>\n",
|
||
" <td>2023-09-12 17:42:27.595998+02:00</td>\n",
|
||
" <td>4970</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>209706</td>\n",
|
||
" <td>1</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>01a9eea5f8ad53491faa864bfac44183</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>4</th>\n",
|
||
" <td>268326</td>\n",
|
||
" <td>63.0</td>\n",
|
||
" <td>False</td>\n",
|
||
" <td>44333</td>\n",
|
||
" <td>20477</td>\n",
|
||
" <td>2023-09-13 03:42:45.417283+02:00</td>\n",
|
||
" <td>2023-09-13 03:42:45.417283+02:00</td>\n",
|
||
" <td>4969</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>268109</td>\n",
|
||
" <td>1</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>781a917ecfdabb14169701d7b143bbe4</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>...</th>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>14643</th>\n",
|
||
" <td>217878</td>\n",
|
||
" <td>33.6</td>\n",
|
||
" <td>False</td>\n",
|
||
" <td>33919</td>\n",
|
||
" <td>20489</td>\n",
|
||
" <td>2023-09-12 17:51:11.572882+02:00</td>\n",
|
||
" <td>2023-09-12 17:51:11.572882+02:00</td>\n",
|
||
" <td>4971</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>217695</td>\n",
|
||
" <td>1</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>82bba69321466069411b3023343b44a4</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>14644</th>\n",
|
||
" <td>268315</td>\n",
|
||
" <td>10.0</td>\n",
|
||
" <td>False</td>\n",
|
||
" <td>33919</td>\n",
|
||
" <td>20504</td>\n",
|
||
" <td>2023-09-12 18:59:29.995176+02:00</td>\n",
|
||
" <td>2023-09-12 18:59:29.995176+02:00</td>\n",
|
||
" <td>4969</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>268098</td>\n",
|
||
" <td>1</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>eae56a8eb0a4315c5713b2053103d595</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>14645</th>\n",
|
||
" <td>210148</td>\n",
|
||
" <td>5.0</td>\n",
|
||
" <td>False</td>\n",
|
||
" <td>33531</td>\n",
|
||
" <td>20473</td>\n",
|
||
" <td>2023-09-12 17:42:27.733260+02:00</td>\n",
|
||
" <td>2023-09-12 17:42:27.733260+02:00</td>\n",
|
||
" <td>4975</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>209971</td>\n",
|
||
" <td>1</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>449f86c1ef2b478d3389f7d0e27d0e6b</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>14646</th>\n",
|
||
" <td>212054</td>\n",
|
||
" <td>30.0</td>\n",
|
||
" <td>False</td>\n",
|
||
" <td>33810</td>\n",
|
||
" <td>20473</td>\n",
|
||
" <td>2023-09-12 17:42:28.724681+02:00</td>\n",
|
||
" <td>2023-09-12 17:42:28.724681+02:00</td>\n",
|
||
" <td>4972</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>211876</td>\n",
|
||
" <td>1</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>2090203e2c0b58ea8f505089faee6d62</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>14647</th>\n",
|
||
" <td>261922</td>\n",
|
||
" <td>21.0</td>\n",
|
||
" <td>False</td>\n",
|
||
" <td>33766</td>\n",
|
||
" <td>20488</td>\n",
|
||
" <td>2023-09-12 18:52:00.519838+02:00</td>\n",
|
||
" <td>2023-09-12 18:52:00.519838+02:00</td>\n",
|
||
" <td>4972</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>261709</td>\n",
|
||
" <td>1</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>9139ee36a92bed766ae95372cca77336</td>\n",
|
||
" </tr>\n",
|
||
" </tbody>\n",
|
||
"</table>\n",
|
||
"<p>14648 rows × 14 columns</p>\n",
|
||
"</div>"
|
||
],
|
||
"text/plain": [
|
||
" id amount is_full_price representation_id pricing_formula_id \\\n",
|
||
"0 268325 18.0 False 44332 20477 \n",
|
||
"1 274118 36.8 False 44340 20502 \n",
|
||
"2 268338 39.1 False 44340 20497 \n",
|
||
"3 209883 0.0 False 33443 20475 \n",
|
||
"4 268326 63.0 False 44333 20477 \n",
|
||
"... ... ... ... ... ... \n",
|
||
"14643 217878 33.6 False 33919 20489 \n",
|
||
"14644 268315 10.0 False 33919 20504 \n",
|
||
"14645 210148 5.0 False 33531 20473 \n",
|
||
"14646 212054 30.0 False 33810 20473 \n",
|
||
"14647 261922 21.0 False 33766 20488 \n",
|
||
"\n",
|
||
" created_at updated_at \\\n",
|
||
"0 2023-09-13 03:42:45.415594+02:00 2023-09-13 03:42:45.415594+02:00 \n",
|
||
"1 2023-10-25 03:26:57.430694+02:00 2023-10-25 03:26:57.430694+02:00 \n",
|
||
"2 2023-09-13 03:42:45.430942+02:00 2023-09-13 03:42:45.430942+02:00 \n",
|
||
"3 2023-09-12 17:42:27.595998+02:00 2023-09-12 17:42:27.595998+02:00 \n",
|
||
"4 2023-09-13 03:42:45.417283+02:00 2023-09-13 03:42:45.417283+02:00 \n",
|
||
"... ... ... \n",
|
||
"14643 2023-09-12 17:51:11.572882+02:00 2023-09-12 17:51:11.572882+02:00 \n",
|
||
"14644 2023-09-12 18:59:29.995176+02:00 2023-09-12 18:59:29.995176+02:00 \n",
|
||
"14645 2023-09-12 17:42:27.733260+02:00 2023-09-12 17:42:27.733260+02:00 \n",
|
||
"14646 2023-09-12 17:42:28.724681+02:00 2023-09-12 17:42:28.724681+02:00 \n",
|
||
"14647 2023-09-12 18:52:00.519838+02:00 2023-09-12 18:52:00.519838+02:00 \n",
|
||
"\n",
|
||
" category_id apply_price products_group_id product_pack_id \\\n",
|
||
"0 4972 0.0 268108 1 \n",
|
||
"1 4969 0.0 273901 1 \n",
|
||
"2 4969 0.0 268121 1 \n",
|
||
"3 4970 0.0 209706 1 \n",
|
||
"4 4969 0.0 268109 1 \n",
|
||
"... ... ... ... ... \n",
|
||
"14643 4971 0.0 217695 1 \n",
|
||
"14644 4969 0.0 268098 1 \n",
|
||
"14645 4975 0.0 209971 1 \n",
|
||
"14646 4972 0.0 211876 1 \n",
|
||
"14647 4972 0.0 261709 1 \n",
|
||
"\n",
|
||
" extra_field amount_consumption identifier \n",
|
||
"0 NaN NaN b823bbea3ba837da2ef8efaf1287272d \n",
|
||
"1 NaN NaN 81e8b7991f6948e3ef7cfe5011d13532 \n",
|
||
"2 NaN NaN be8bc0399db4d04aefa9f44afd4d5efa \n",
|
||
"3 NaN NaN 01a9eea5f8ad53491faa864bfac44183 \n",
|
||
"4 NaN NaN 781a917ecfdabb14169701d7b143bbe4 \n",
|
||
"... ... ... ... \n",
|
||
"14643 NaN NaN 82bba69321466069411b3023343b44a4 \n",
|
||
"14644 NaN NaN eae56a8eb0a4315c5713b2053103d595 \n",
|
||
"14645 NaN NaN 449f86c1ef2b478d3389f7d0e27d0e6b \n",
|
||
"14646 NaN NaN 2090203e2c0b58ea8f505089faee6d62 \n",
|
||
"14647 NaN NaN 9139ee36a92bed766ae95372cca77336 \n",
|
||
"\n",
|
||
"[14648 rows x 14 columns]"
|
||
]
|
||
},
|
||
"execution_count": 19,
|
||
"metadata": {},
|
||
"output_type": "execute_result"
|
||
}
|
||
],
|
||
"source": [
|
||
"products"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 20,
|
||
"id": "6735b338-26b5-479d-825d-677ea533dad5",
|
||
"metadata": {},
|
||
"outputs": [
|
||
{
|
||
"name": "stdout",
|
||
"output_type": "stream",
|
||
"text": [
|
||
"Index(['id', 'name', 'created_at', 'updated_at', 'street_id', 'fixed_capacity',\n",
|
||
" 'identifier'],\n",
|
||
" dtype='object')\n",
|
||
"(1, 7)\n",
|
||
"<class 'pandas.core.frame.DataFrame'>\n",
|
||
"RangeIndex: 1 entries, 0 to 0\n",
|
||
"Data columns (total 7 columns):\n",
|
||
" # Column Non-Null Count Dtype \n",
|
||
"--- ------ -------------- ----- \n",
|
||
" 0 id 1 non-null int64 \n",
|
||
" 1 name 0 non-null float64\n",
|
||
" 2 created_at 1 non-null object \n",
|
||
" 3 updated_at 1 non-null object \n",
|
||
" 4 street_id 1 non-null int64 \n",
|
||
" 5 fixed_capacity 0 non-null float64\n",
|
||
" 6 identifier 1 non-null object \n",
|
||
"dtypes: float64(2), int64(2), object(3)\n",
|
||
"memory usage: 184.0+ bytes\n"
|
||
]
|
||
}
|
||
],
|
||
"source": [
|
||
"# Lieu = facilities.csv\n",
|
||
"FILE_PATH_S3 = 'bdc2324-data/11/11facilities.csv'\n",
|
||
"\n",
|
||
"with fs.open(FILE_PATH_S3, mode=\"rb\") as file_in:\n",
|
||
" facilities = pd.read_csv(file_in, sep=\",\")\n",
|
||
"\n",
|
||
"print(facilities.columns)\n",
|
||
"print(facilities.shape)\n",
|
||
"facilities.info()"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 21,
|
||
"id": "428b86c2-50f4-42a5-9bbb-a17ffe820bf9",
|
||
"metadata": {},
|
||
"outputs": [
|
||
{
|
||
"data": {
|
||
"text/html": [
|
||
"<div>\n",
|
||
"<style scoped>\n",
|
||
" .dataframe tbody tr th:only-of-type {\n",
|
||
" vertical-align: middle;\n",
|
||
" }\n",
|
||
"\n",
|
||
" .dataframe tbody tr th {\n",
|
||
" vertical-align: top;\n",
|
||
" }\n",
|
||
"\n",
|
||
" .dataframe thead th {\n",
|
||
" text-align: right;\n",
|
||
" }\n",
|
||
"</style>\n",
|
||
"<table border=\"1\" class=\"dataframe\">\n",
|
||
" <thead>\n",
|
||
" <tr style=\"text-align: right;\">\n",
|
||
" <th></th>\n",
|
||
" <th>id</th>\n",
|
||
" <th>name</th>\n",
|
||
" <th>created_at</th>\n",
|
||
" <th>updated_at</th>\n",
|
||
" <th>street_id</th>\n",
|
||
" <th>fixed_capacity</th>\n",
|
||
" <th>identifier</th>\n",
|
||
" </tr>\n",
|
||
" </thead>\n",
|
||
" <tbody>\n",
|
||
" <tr>\n",
|
||
" <th>0</th>\n",
|
||
" <td>1054</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>2023-09-12 17:42:25.223064+02:00</td>\n",
|
||
" <td>2023-09-12 17:42:25.223064+02:00</td>\n",
|
||
" <td>1</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>d41d8cd98f00b204e9800998ecf8427e</td>\n",
|
||
" </tr>\n",
|
||
" </tbody>\n",
|
||
"</table>\n",
|
||
"</div>"
|
||
],
|
||
"text/plain": [
|
||
" id name created_at \\\n",
|
||
"0 1054 NaN 2023-09-12 17:42:25.223064+02:00 \n",
|
||
"\n",
|
||
" updated_at street_id fixed_capacity \\\n",
|
||
"0 2023-09-12 17:42:25.223064+02:00 1 NaN \n",
|
||
"\n",
|
||
" identifier \n",
|
||
"0 d41d8cd98f00b204e9800998ecf8427e "
|
||
]
|
||
},
|
||
"execution_count": 21,
|
||
"metadata": {},
|
||
"output_type": "execute_result"
|
||
}
|
||
],
|
||
"source": [
|
||
"facilities"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 22,
|
||
"id": "f6b26ad5-a4cc-4219-a0b0-406d9b025458",
|
||
"metadata": {},
|
||
"outputs": [
|
||
{
|
||
"name": "stdout",
|
||
"output_type": "stream",
|
||
"text": [
|
||
"Index(['id', 'name', 'created_at', 'updated_at', 'start_date_time',\n",
|
||
" 'identifier'],\n",
|
||
" dtype='object')\n",
|
||
"(9, 6)\n",
|
||
"<class 'pandas.core.frame.DataFrame'>\n",
|
||
"RangeIndex: 9 entries, 0 to 8\n",
|
||
"Data columns (total 6 columns):\n",
|
||
" # Column Non-Null Count Dtype \n",
|
||
"--- ------ -------------- ----- \n",
|
||
" 0 id 9 non-null int64 \n",
|
||
" 1 name 9 non-null object \n",
|
||
" 2 created_at 9 non-null object \n",
|
||
" 3 updated_at 9 non-null object \n",
|
||
" 4 start_date_time 0 non-null float64\n",
|
||
" 5 identifier 9 non-null object \n",
|
||
"dtypes: float64(1), int64(1), object(4)\n",
|
||
"memory usage: 560.0+ bytes\n"
|
||
]
|
||
}
|
||
],
|
||
"source": [
|
||
"# Saisons = seasons.csv période sur deux années consécutives\n",
|
||
"FILE_PATH_S3 = 'bdc2324-data/11/11seasons.csv'\n",
|
||
"\n",
|
||
"with fs.open(FILE_PATH_S3, mode=\"rb\") as file_in:\n",
|
||
" seasons = pd.read_csv(file_in, sep=\",\")\n",
|
||
"\n",
|
||
"print(seasons.columns)\n",
|
||
"print(seasons.shape)\n",
|
||
"seasons.info()"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 24,
|
||
"id": "75c8c0ef-4ff5-45b1-a791-8ba2e9a4437e",
|
||
"metadata": {},
|
||
"outputs": [
|
||
{
|
||
"data": {
|
||
"text/plain": [
|
||
"array(['saison 2023-2024', 'saison 2021-2022', 'saison 2015-2016',\n",
|
||
" 'saison 2016-2017', 'saison 2017-2018', 'saison 2018-2019',\n",
|
||
" 'saison 2020-2021', 'saison 2019-2020', 'saison 2022-2023'],\n",
|
||
" dtype=object)"
|
||
]
|
||
},
|
||
"execution_count": 24,
|
||
"metadata": {},
|
||
"output_type": "execute_result"
|
||
}
|
||
],
|
||
"source": [
|
||
"seasons['name'].unique()"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 25,
|
||
"id": "cd0d10df-10cc-4f75-8b88-35f676c91f5b",
|
||
"metadata": {},
|
||
"outputs": [
|
||
{
|
||
"name": "stdout",
|
||
"output_type": "stream",
|
||
"text": [
|
||
"Index(['id', 'purchase_date', 'customer_id', 'created_at', 'updated_at',\n",
|
||
" 'number', 'identifier'],\n",
|
||
" dtype='object')\n",
|
||
"(410695, 7)\n",
|
||
"<class 'pandas.core.frame.DataFrame'>\n",
|
||
"RangeIndex: 410695 entries, 0 to 410694\n",
|
||
"Data columns (total 7 columns):\n",
|
||
" # Column Non-Null Count Dtype \n",
|
||
"--- ------ -------------- ----- \n",
|
||
" 0 id 410695 non-null int64 \n",
|
||
" 1 purchase_date 410695 non-null object \n",
|
||
" 2 customer_id 410695 non-null int64 \n",
|
||
" 3 created_at 410695 non-null object \n",
|
||
" 4 updated_at 410695 non-null object \n",
|
||
" 5 number 0 non-null float64\n",
|
||
" 6 identifier 410695 non-null object \n",
|
||
"dtypes: float64(1), int64(2), object(4)\n",
|
||
"memory usage: 21.9+ MB\n"
|
||
]
|
||
}
|
||
],
|
||
"source": [
|
||
"# Achats = purchases.csv \n",
|
||
"FILE_PATH_S3 = 'bdc2324-data/11/11purchases.csv'\n",
|
||
"\n",
|
||
"with fs.open(FILE_PATH_S3, mode=\"rb\") as file_in:\n",
|
||
" purchases = pd.read_csv(file_in, sep=\",\")\n",
|
||
"\n",
|
||
"print(purchases.columns)\n",
|
||
"print(purchases.shape)\n",
|
||
"purchases.info()"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 28,
|
||
"id": "8f986fdb-ca37-4cbb-b526-2a6d0ce7ca2c",
|
||
"metadata": {},
|
||
"outputs": [
|
||
{
|
||
"data": {
|
||
"text/html": [
|
||
"<div>\n",
|
||
"<style scoped>\n",
|
||
" .dataframe tbody tr th:only-of-type {\n",
|
||
" vertical-align: middle;\n",
|
||
" }\n",
|
||
"\n",
|
||
" .dataframe tbody tr th {\n",
|
||
" vertical-align: top;\n",
|
||
" }\n",
|
||
"\n",
|
||
" .dataframe thead th {\n",
|
||
" text-align: right;\n",
|
||
" }\n",
|
||
"</style>\n",
|
||
"<table border=\"1\" class=\"dataframe\">\n",
|
||
" <thead>\n",
|
||
" <tr style=\"text-align: right;\">\n",
|
||
" <th></th>\n",
|
||
" <th>id</th>\n",
|
||
" <th>purchase_date</th>\n",
|
||
" <th>customer_id</th>\n",
|
||
" <th>created_at</th>\n",
|
||
" <th>updated_at</th>\n",
|
||
" <th>number</th>\n",
|
||
" <th>identifier</th>\n",
|
||
" </tr>\n",
|
||
" </thead>\n",
|
||
" <tbody>\n",
|
||
" <tr>\n",
|
||
" <th>0</th>\n",
|
||
" <td>861761</td>\n",
|
||
" <td>2019-03-01 16:28:49+01:00</td>\n",
|
||
" <td>4966</td>\n",
|
||
" <td>2023-09-12 17:42:37.564150+02:00</td>\n",
|
||
" <td>2023-09-12 17:42:37.564150+02:00</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>d20eb0c3a7efec0bbe338dee40dc3378</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>1</th>\n",
|
||
" <td>861762</td>\n",
|
||
" <td>2019-03-01 16:29:11+01:00</td>\n",
|
||
" <td>4966</td>\n",
|
||
" <td>2023-09-12 17:42:37.571159+02:00</td>\n",
|
||
" <td>2023-09-12 17:42:37.571159+02:00</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>cff3abfc018517bce5ccfc58f5cacf40</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>2</th>\n",
|
||
" <td>861763</td>\n",
|
||
" <td>2019-03-01 16:29:17+01:00</td>\n",
|
||
" <td>4966</td>\n",
|
||
" <td>2023-09-12 17:42:37.571646+02:00</td>\n",
|
||
" <td>2023-09-12 17:42:37.571646+02:00</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>e1155cf26b34f792bdb23e49244d7264</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>3</th>\n",
|
||
" <td>861764</td>\n",
|
||
" <td>2019-03-01 16:29:19+01:00</td>\n",
|
||
" <td>4966</td>\n",
|
||
" <td>2023-09-12 17:42:37.572063+02:00</td>\n",
|
||
" <td>2023-09-12 17:42:37.572063+02:00</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>e8b95cc6a1a8b103ffa39755ce3bfc4d</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>4</th>\n",
|
||
" <td>861765</td>\n",
|
||
" <td>2019-03-01 16:32:08+01:00</td>\n",
|
||
" <td>405994</td>\n",
|
||
" <td>2023-09-12 17:42:37.572470+02:00</td>\n",
|
||
" <td>2023-09-12 17:42:37.572470+02:00</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>1b763278914f1309e357abe5033a3f0f</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>...</th>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>410690</th>\n",
|
||
" <td>1285964</td>\n",
|
||
" <td>2023-10-21 21:46:41+02:00</td>\n",
|
||
" <td>517309</td>\n",
|
||
" <td>2023-10-23 03:43:16.457501+02:00</td>\n",
|
||
" <td>2023-10-23 03:43:16.457501+02:00</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>72c4e90c2b151dcffc87b19ea8a0c4f1</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>410691</th>\n",
|
||
" <td>1285965</td>\n",
|
||
" <td>2023-10-21 21:47:07+02:00</td>\n",
|
||
" <td>517309</td>\n",
|
||
" <td>2023-10-23 03:43:16.458458+02:00</td>\n",
|
||
" <td>2023-10-23 03:43:16.458458+02:00</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>ee65532087132145daa6154fbae050ea</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>410692</th>\n",
|
||
" <td>1285966</td>\n",
|
||
" <td>2023-10-21 21:47:20+02:00</td>\n",
|
||
" <td>517309</td>\n",
|
||
" <td>2023-10-23 03:43:16.458811+02:00</td>\n",
|
||
" <td>2023-10-23 03:43:16.458811+02:00</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>7e825dd352bc6a11ab81cb8068e325e6</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>410693</th>\n",
|
||
" <td>1285967</td>\n",
|
||
" <td>2023-10-21 23:07:06+02:00</td>\n",
|
||
" <td>399969</td>\n",
|
||
" <td>2023-10-23 03:43:16.459738+02:00</td>\n",
|
||
" <td>2023-10-23 03:43:16.459738+02:00</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>fdb92627a48d6ba8fa817d60a83dbea8</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>410694</th>\n",
|
||
" <td>1285968</td>\n",
|
||
" <td>2023-10-21 23:07:39+02:00</td>\n",
|
||
" <td>399969</td>\n",
|
||
" <td>2023-10-23 03:43:16.462409+02:00</td>\n",
|
||
" <td>2023-10-23 03:43:16.462409+02:00</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>e9dbaff4f7037a5b0efa11263584dfad</td>\n",
|
||
" </tr>\n",
|
||
" </tbody>\n",
|
||
"</table>\n",
|
||
"<p>410695 rows × 7 columns</p>\n",
|
||
"</div>"
|
||
],
|
||
"text/plain": [
|
||
" id purchase_date customer_id \\\n",
|
||
"0 861761 2019-03-01 16:28:49+01:00 4966 \n",
|
||
"1 861762 2019-03-01 16:29:11+01:00 4966 \n",
|
||
"2 861763 2019-03-01 16:29:17+01:00 4966 \n",
|
||
"3 861764 2019-03-01 16:29:19+01:00 4966 \n",
|
||
"4 861765 2019-03-01 16:32:08+01:00 405994 \n",
|
||
"... ... ... ... \n",
|
||
"410690 1285964 2023-10-21 21:46:41+02:00 517309 \n",
|
||
"410691 1285965 2023-10-21 21:47:07+02:00 517309 \n",
|
||
"410692 1285966 2023-10-21 21:47:20+02:00 517309 \n",
|
||
"410693 1285967 2023-10-21 23:07:06+02:00 399969 \n",
|
||
"410694 1285968 2023-10-21 23:07:39+02:00 399969 \n",
|
||
"\n",
|
||
" created_at updated_at \\\n",
|
||
"0 2023-09-12 17:42:37.564150+02:00 2023-09-12 17:42:37.564150+02:00 \n",
|
||
"1 2023-09-12 17:42:37.571159+02:00 2023-09-12 17:42:37.571159+02:00 \n",
|
||
"2 2023-09-12 17:42:37.571646+02:00 2023-09-12 17:42:37.571646+02:00 \n",
|
||
"3 2023-09-12 17:42:37.572063+02:00 2023-09-12 17:42:37.572063+02:00 \n",
|
||
"4 2023-09-12 17:42:37.572470+02:00 2023-09-12 17:42:37.572470+02:00 \n",
|
||
"... ... ... \n",
|
||
"410690 2023-10-23 03:43:16.457501+02:00 2023-10-23 03:43:16.457501+02:00 \n",
|
||
"410691 2023-10-23 03:43:16.458458+02:00 2023-10-23 03:43:16.458458+02:00 \n",
|
||
"410692 2023-10-23 03:43:16.458811+02:00 2023-10-23 03:43:16.458811+02:00 \n",
|
||
"410693 2023-10-23 03:43:16.459738+02:00 2023-10-23 03:43:16.459738+02:00 \n",
|
||
"410694 2023-10-23 03:43:16.462409+02:00 2023-10-23 03:43:16.462409+02:00 \n",
|
||
"\n",
|
||
" number identifier \n",
|
||
"0 NaN d20eb0c3a7efec0bbe338dee40dc3378 \n",
|
||
"1 NaN cff3abfc018517bce5ccfc58f5cacf40 \n",
|
||
"2 NaN e1155cf26b34f792bdb23e49244d7264 \n",
|
||
"3 NaN e8b95cc6a1a8b103ffa39755ce3bfc4d \n",
|
||
"4 NaN 1b763278914f1309e357abe5033a3f0f \n",
|
||
"... ... ... \n",
|
||
"410690 NaN 72c4e90c2b151dcffc87b19ea8a0c4f1 \n",
|
||
"410691 NaN ee65532087132145daa6154fbae050ea \n",
|
||
"410692 NaN 7e825dd352bc6a11ab81cb8068e325e6 \n",
|
||
"410693 NaN fdb92627a48d6ba8fa817d60a83dbea8 \n",
|
||
"410694 NaN e9dbaff4f7037a5b0efa11263584dfad \n",
|
||
"\n",
|
||
"[410695 rows x 7 columns]"
|
||
]
|
||
},
|
||
"execution_count": 28,
|
||
"metadata": {},
|
||
"output_type": "execute_result"
|
||
}
|
||
],
|
||
"source": [
|
||
"purchases"
|
||
]
|
||
}
|
||
],
|
||
"metadata": {
|
||
"kernelspec": {
|
||
"display_name": "Python 3 (ipykernel)",
|
||
"language": "python",
|
||
"name": "python3"
|
||
},
|
||
"language_info": {
|
||
"codemirror_mode": {
|
||
"name": "ipython",
|
||
"version": 3
|
||
},
|
||
"file_extension": ".py",
|
||
"mimetype": "text/x-python",
|
||
"name": "python",
|
||
"nbconvert_exporter": "python",
|
||
"pygments_lexer": "ipython3",
|
||
"version": "3.10.13"
|
||
}
|
||
},
|
||
"nbformat": 4,
|
||
"nbformat_minor": 5
|
||
}
|