BDC-team-1/Clean-Notebook.ipynb
2024-01-01 12:58:56 +00:00

1578 lines
58 KiB
Plaintext
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

{
"cells": [
{
"cell_type": "markdown",
"id": "56b3d44e-1e3f-4726-9916-0f9af107860e",
"metadata": {},
"source": [
"# Business Data Challenge - Team 1"
]
},
{
"cell_type": "code",
"execution_count": 1,
"id": "15103481-8d74-404c-aa09-7601fe7730da",
"metadata": {},
"outputs": [],
"source": [
"import pandas as pd\n",
"import numpy as np"
]
},
{
"cell_type": "markdown",
"id": "c3bb0d13-34b2-4e1c-9985-468cd87c5a0e",
"metadata": {},
"source": [
"Configuration de l'accès aux données"
]
},
{
"cell_type": "code",
"execution_count": 4,
"id": "5d83bb1a-d341-446e-91f6-1c428607f6d4",
"metadata": {},
"outputs": [],
"source": [
"import os\n",
"import s3fs\n",
"# Create filesystem object\n",
"S3_ENDPOINT_URL = \"https://\" + os.environ[\"AWS_S3_ENDPOINT\"]\n",
"fs = s3fs.S3FileSystem(client_kwargs={'endpoint_url': S3_ENDPOINT_URL})"
]
},
{
"cell_type": "markdown",
"id": "f99da24f-0d93-4618-92bc-3ba81dc0445c",
"metadata": {},
"source": [
"## Exemple sur bdc2324-data/11"
]
},
{
"cell_type": "code",
"execution_count": 23,
"id": "699664b9-eee4-4f8d-a207-e524526560c5",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"['bdc2324-data/2/2campaign_stats.csv',\n",
" 'bdc2324-data/2/2campaigns.csv',\n",
" 'bdc2324-data/2/2categories.csv',\n",
" 'bdc2324-data/2/2contribution_sites.csv',\n",
" 'bdc2324-data/2/2contributions.csv',\n",
" 'bdc2324-data/2/2countries.csv',\n",
" 'bdc2324-data/2/2currencies.csv',\n",
" 'bdc2324-data/2/2customer_target_mappings.csv',\n",
" 'bdc2324-data/2/2customersplus.csv',\n",
" 'bdc2324-data/2/2event_types.csv',\n",
" 'bdc2324-data/2/2events.csv',\n",
" 'bdc2324-data/2/2facilities.csv',\n",
" 'bdc2324-data/2/2link_stats.csv',\n",
" 'bdc2324-data/2/2pricing_formulas.csv',\n",
" 'bdc2324-data/2/2product_packs.csv',\n",
" 'bdc2324-data/2/2products.csv',\n",
" 'bdc2324-data/2/2products_groups.csv',\n",
" 'bdc2324-data/2/2purchases.csv',\n",
" 'bdc2324-data/2/2representation_category_capacities.csv',\n",
" 'bdc2324-data/2/2representations.csv',\n",
" 'bdc2324-data/2/2seasons.csv',\n",
" 'bdc2324-data/2/2structure_tag_mappings.csv',\n",
" 'bdc2324-data/2/2suppliers.csv',\n",
" 'bdc2324-data/2/2tags.csv',\n",
" 'bdc2324-data/2/2target_types.csv',\n",
" 'bdc2324-data/2/2targets.csv',\n",
" 'bdc2324-data/2/2tickets.csv']"
]
},
"execution_count": 23,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"BUCKET = \"bdc2324-data/2\"\n",
"fs.ls(BUCKET)"
]
},
{
"cell_type": "markdown",
"id": "779da86b-ac61-4c61-88d2-fa1c0c19efce",
"metadata": {},
"source": [
"## Type de client au global"
]
},
{
"cell_type": "code",
"execution_count": 10,
"id": "d22aa131-5069-43d4-a42e-24f38cc7240d",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Index(['id', 'customer_id', 'target_id', 'created_at', 'updated_at', 'name',\n",
" 'extra_field'],\n",
" dtype='object')\n",
"(124302, 7)\n",
"<class 'pandas.core.frame.DataFrame'>\n",
"RangeIndex: 124302 entries, 0 to 124301\n",
"Data columns (total 7 columns):\n",
" # Column Non-Null Count Dtype \n",
"--- ------ -------------- ----- \n",
" 0 id 124302 non-null int64 \n",
" 1 customer_id 124302 non-null int64 \n",
" 2 target_id 124302 non-null int64 \n",
" 3 created_at 124296 non-null object \n",
" 4 updated_at 124296 non-null object \n",
" 5 name 0 non-null float64\n",
" 6 extra_field 0 non-null float64\n",
"dtypes: float64(2), int64(3), object(2)\n",
"memory usage: 6.6+ MB\n"
]
}
],
"source": [
"# Segmentation existante\n",
"FILE_PATH_S3 = 'bdc2324-data/11/11customer_target_mappings.csv'\n",
"\n",
"with fs.open(FILE_PATH_S3, mode=\"rb\") as file_in:\n",
" customer_target_mappings = pd.read_csv(file_in, sep=\",\")\n",
"\n",
"print(customer_target_mappings.columns)\n",
"print(customer_target_mappings.shape)\n",
"customer_target_mappings.info()"
]
},
{
"cell_type": "code",
"execution_count": 13,
"id": "967b20e2-5a30-4724-989f-b9e39c7c67e7",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>id</th>\n",
" <th>customer_id</th>\n",
" <th>target_id</th>\n",
" <th>created_at</th>\n",
" <th>updated_at</th>\n",
" <th>name</th>\n",
" <th>extra_field</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>793889</td>\n",
" <td>344151</td>\n",
" <td>101</td>\n",
" <td>2022-09-29 17:55:41.083666+02:00</td>\n",
" <td>2022-09-29 17:55:41.083666+02:00</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>793890</td>\n",
" <td>344152</td>\n",
" <td>101</td>\n",
" <td>2022-09-29 19:16:07.252114+02:00</td>\n",
" <td>2022-09-29 19:16:07.252114+02:00</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>793891</td>\n",
" <td>344153</td>\n",
" <td>101</td>\n",
" <td>2022-09-29 19:55:10.443450+02:00</td>\n",
" <td>2022-09-29 19:55:10.443450+02:00</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>793892</td>\n",
" <td>344154</td>\n",
" <td>101</td>\n",
" <td>2022-09-29 20:16:08.269407+02:00</td>\n",
" <td>2022-09-29 20:16:08.269407+02:00</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>793893</td>\n",
" <td>344155</td>\n",
" <td>101</td>\n",
" <td>2022-09-29 21:03:40.541998+02:00</td>\n",
" <td>2022-09-29 21:03:40.541998+02:00</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>...</th>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>124297</th>\n",
" <td>742001</td>\n",
" <td>329855</td>\n",
" <td>101</td>\n",
" <td>2022-07-11 18:17:09.607162+02:00</td>\n",
" <td>2022-07-11 18:17:09.607162+02:00</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>124298</th>\n",
" <td>742002</td>\n",
" <td>329856</td>\n",
" <td>101</td>\n",
" <td>2022-07-11 18:44:45.636248+02:00</td>\n",
" <td>2022-07-11 18:44:45.636248+02:00</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>124299</th>\n",
" <td>742000</td>\n",
" <td>329854</td>\n",
" <td>101</td>\n",
" <td>2022-07-11 17:46:48.914507+02:00</td>\n",
" <td>2022-07-11 17:46:48.914507+02:00</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>124300</th>\n",
" <td>742003</td>\n",
" <td>329857</td>\n",
" <td>134</td>\n",
" <td>2022-07-11 18:44:55.915889+02:00</td>\n",
" <td>2022-07-11 18:44:55.915889+02:00</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>124301</th>\n",
" <td>741996</td>\n",
" <td>329850</td>\n",
" <td>101</td>\n",
" <td>2022-07-11 16:52:37.227487+02:00</td>\n",
" <td>2022-07-11 16:52:37.227487+02:00</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"<p>124302 rows × 7 columns</p>\n",
"</div>"
],
"text/plain": [
" id customer_id target_id created_at \\\n",
"0 793889 344151 101 2022-09-29 17:55:41.083666+02:00 \n",
"1 793890 344152 101 2022-09-29 19:16:07.252114+02:00 \n",
"2 793891 344153 101 2022-09-29 19:55:10.443450+02:00 \n",
"3 793892 344154 101 2022-09-29 20:16:08.269407+02:00 \n",
"4 793893 344155 101 2022-09-29 21:03:40.541998+02:00 \n",
"... ... ... ... ... \n",
"124297 742001 329855 101 2022-07-11 18:17:09.607162+02:00 \n",
"124298 742002 329856 101 2022-07-11 18:44:45.636248+02:00 \n",
"124299 742000 329854 101 2022-07-11 17:46:48.914507+02:00 \n",
"124300 742003 329857 134 2022-07-11 18:44:55.915889+02:00 \n",
"124301 741996 329850 101 2022-07-11 16:52:37.227487+02:00 \n",
"\n",
" updated_at name extra_field \n",
"0 2022-09-29 17:55:41.083666+02:00 NaN NaN \n",
"1 2022-09-29 19:16:07.252114+02:00 NaN NaN \n",
"2 2022-09-29 19:55:10.443450+02:00 NaN NaN \n",
"3 2022-09-29 20:16:08.269407+02:00 NaN NaN \n",
"4 2022-09-29 21:03:40.541998+02:00 NaN NaN \n",
"... ... ... ... \n",
"124297 2022-07-11 18:17:09.607162+02:00 NaN NaN \n",
"124298 2022-07-11 18:44:45.636248+02:00 NaN NaN \n",
"124299 2022-07-11 17:46:48.914507+02:00 NaN NaN \n",
"124300 2022-07-11 18:44:55.915889+02:00 NaN NaN \n",
"124301 2022-07-11 16:52:37.227487+02:00 NaN NaN \n",
"\n",
"[124302 rows x 7 columns]"
]
},
"execution_count": 13,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"customer_target_mappings"
]
},
{
"cell_type": "code",
"execution_count": 26,
"id": "c4b6bdcc-9f13-449b-9a8b-c5ca794637be",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"array([nan])"
]
},
"execution_count": 26,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"customer_target_mappings['extra_field'].unique()"
]
},
{
"cell_type": "code",
"execution_count": 27,
"id": "47bc8453-0693-4838-8bd8-4d800a82c496",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"array([nan])"
]
},
"execution_count": 27,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"customer_target_mappings['name'].unique()"
]
},
{
"cell_type": "code",
"execution_count": 11,
"id": "ab3f937b-ef62-499a-8ee2-d47d1d988ace",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Index(['id', 'is_import', 'name', 'created_at', 'updated_at', 'identifier'], dtype='object')\n",
"(4, 6)\n",
"<class 'pandas.core.frame.DataFrame'>\n",
"RangeIndex: 4 entries, 0 to 3\n",
"Data columns (total 6 columns):\n",
" # Column Non-Null Count Dtype \n",
"--- ------ -------------- ----- \n",
" 0 id 4 non-null int64 \n",
" 1 is_import 4 non-null bool \n",
" 2 name 4 non-null object\n",
" 3 created_at 4 non-null object\n",
" 4 updated_at 4 non-null object\n",
" 5 identifier 4 non-null object\n",
"dtypes: bool(1), int64(1), object(4)\n",
"memory usage: 292.0+ bytes\n"
]
}
],
"source": [
"# Segmentation existante\n",
"FILE_PATH_S3 = 'bdc2324-data/11/11target_types.csv'\n",
"\n",
"with fs.open(FILE_PATH_S3, mode=\"rb\") as file_in:\n",
" target_types = pd.read_csv(file_in, sep=\",\")\n",
"\n",
"print(target_types.columns)\n",
"print(target_types.shape)\n",
"target_types.info()"
]
},
{
"cell_type": "code",
"execution_count": 12,
"id": "b70488b9-38fc-40a8-9e2f-3330b3f9eef5",
"metadata": {
"scrolled": true
},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>id</th>\n",
" <th>is_import</th>\n",
" <th>name</th>\n",
" <th>created_at</th>\n",
" <th>updated_at</th>\n",
" <th>identifier</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>1</td>\n",
" <td>False</td>\n",
" <td>manual_static_filter</td>\n",
" <td>2021-04-29 13:42:14.111085+02:00</td>\n",
" <td>2021-04-29 13:42:14.111085+02:00</td>\n",
" <td>fb27e81baa4debc6a4e1a8639c20e808</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>3</td>\n",
" <td>True</td>\n",
" <td>manual_structure</td>\n",
" <td>2021-05-07 15:20:00.626650+02:00</td>\n",
" <td>2021-05-07 15:20:00.626650+02:00</td>\n",
" <td>382bca214204a2d3462f5ec2728d5d1e</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>6</td>\n",
" <td>False</td>\n",
" <td>manual_dynamic_filter</td>\n",
" <td>2021-09-09 14:27:47.641302+02:00</td>\n",
" <td>2021-09-09 14:27:47.641302+02:00</td>\n",
" <td>e0f4b8693184850fefd6d2a38f10584e</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>2</td>\n",
" <td>True</td>\n",
" <td>manual_import</td>\n",
" <td>2021-04-29 13:49:30.107110+02:00</td>\n",
" <td>2021-04-29 13:49:30.107110+02:00</td>\n",
" <td>12213df2ce68a624e4c0070521437bac</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" id is_import name created_at \\\n",
"0 1 False manual_static_filter 2021-04-29 13:42:14.111085+02:00 \n",
"1 3 True manual_structure 2021-05-07 15:20:00.626650+02:00 \n",
"2 6 False manual_dynamic_filter 2021-09-09 14:27:47.641302+02:00 \n",
"3 2 True manual_import 2021-04-29 13:49:30.107110+02:00 \n",
"\n",
" updated_at identifier \n",
"0 2021-04-29 13:42:14.111085+02:00 fb27e81baa4debc6a4e1a8639c20e808 \n",
"1 2021-05-07 15:20:00.626650+02:00 382bca214204a2d3462f5ec2728d5d1e \n",
"2 2021-09-09 14:27:47.641302+02:00 e0f4b8693184850fefd6d2a38f10584e \n",
"3 2021-04-29 13:49:30.107110+02:00 12213df2ce68a624e4c0070521437bac "
]
},
"execution_count": 12,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"target_types"
]
},
{
"cell_type": "code",
"execution_count": 17,
"id": "8dd74e87-97c2-493d-b19f-971b684078d3",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Index(['id', 'name', 'created_at', 'updated_at', 'identifier'], dtype='object')\n",
"(20, 5)\n",
"<class 'pandas.core.frame.DataFrame'>\n",
"RangeIndex: 20 entries, 0 to 19\n",
"Data columns (total 5 columns):\n",
" # Column Non-Null Count Dtype \n",
"--- ------ -------------- ----- \n",
" 0 id 20 non-null int64 \n",
" 1 name 19 non-null object\n",
" 2 created_at 20 non-null object\n",
" 3 updated_at 20 non-null object\n",
" 4 identifier 20 non-null object\n",
"dtypes: int64(1), object(4)\n",
"memory usage: 928.0+ bytes\n"
]
}
],
"source": [
"# Tags = clients\n",
"FILE_PATH_S3 = 'bdc2324-data/11/11tags.csv'\n",
"\n",
"with fs.open(FILE_PATH_S3, mode=\"rb\") as file_in:\n",
" tags = pd.read_csv(file_in, sep=\",\")\n",
"\n",
"print(tags.columns)\n",
"print(tags.shape)\n",
"tags.info()"
]
},
{
"cell_type": "code",
"execution_count": 18,
"id": "91d54732-666c-4250-ba91-5c9b83d4712a",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>id</th>\n",
" <th>name</th>\n",
" <th>created_at</th>\n",
" <th>updated_at</th>\n",
" <th>identifier</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>2</td>\n",
" <td>ens-écoles</td>\n",
" <td>2021-05-07 15:24:19.808501+02:00</td>\n",
" <td>2021-05-07 15:24:19.808501+02:00</td>\n",
" <td>b6a360c5f84595940c5774f13fd39cc3</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>1</td>\n",
" <td>NaN</td>\n",
" <td>2021-05-07 15:24:19.805589+02:00</td>\n",
" <td>2021-05-07 15:24:19.805589+02:00</td>\n",
" <td>d41d8cd98f00b204e9800998ecf8427e</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>4</td>\n",
" <td>ecoles primaires rennes</td>\n",
" <td>2021-05-07 15:29:06.388415+02:00</td>\n",
" <td>2021-05-07 15:29:06.388415+02:00</td>\n",
" <td>ca8649dd64c240d118f60b07d11a7053</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>5</td>\n",
" <td>Angers Nantes Opéra</td>\n",
" <td>2023-01-27 15:59:58.187557+01:00</td>\n",
" <td>2023-01-27 15:59:58.187557+01:00</td>\n",
" <td>f8f500f937fe312542399299cdc13f7e</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>6</td>\n",
" <td>Opéras</td>\n",
" <td>2023-01-27 16:03:59.654938+01:00</td>\n",
" <td>2023-01-27 16:03:59.654938+01:00</td>\n",
" <td>22eb2c616983ec7b54a093f84b230505</td>\n",
" </tr>\n",
" <tr>\n",
" <th>5</th>\n",
" <td>7</td>\n",
" <td>Ministère de la Culture</td>\n",
" <td>2023-01-30 11:22:29.636813+01:00</td>\n",
" <td>2023-01-30 11:22:29.636813+01:00</td>\n",
" <td>1b8c5c08fde000d90905a3d14af7763d</td>\n",
" </tr>\n",
" <tr>\n",
" <th>6</th>\n",
" <td>8</td>\n",
" <td>Orchestres</td>\n",
" <td>2023-01-30 11:33:56.392799+01:00</td>\n",
" <td>2023-01-30 11:33:56.392799+01:00</td>\n",
" <td>7c2aee0c80642d7e325a450f2dec45e5</td>\n",
" </tr>\n",
" <tr>\n",
" <th>7</th>\n",
" <td>9</td>\n",
" <td>Cooperative</td>\n",
" <td>2023-01-31 14:44:38.471146+01:00</td>\n",
" <td>2023-01-31 14:44:38.471146+01:00</td>\n",
" <td>6c88c36ffaab88d255865aa3111d7686</td>\n",
" </tr>\n",
" <tr>\n",
" <th>8</th>\n",
" <td>10</td>\n",
" <td>Théâtres</td>\n",
" <td>2023-01-31 14:45:17.804428+01:00</td>\n",
" <td>2023-01-31 14:45:17.804428+01:00</td>\n",
" <td>b2c19672df82021702b79482c8cda85a</td>\n",
" </tr>\n",
" <tr>\n",
" <th>9</th>\n",
" <td>11</td>\n",
" <td>La co[opera]tive</td>\n",
" <td>2023-02-16 17:11:35.004478+01:00</td>\n",
" <td>2023-02-16 17:11:35.004478+01:00</td>\n",
" <td>5dbaa3a1f278c0fcf981d447ad20957a</td>\n",
" </tr>\n",
" <tr>\n",
" <th>10</th>\n",
" <td>12</td>\n",
" <td>Ville de Rennes</td>\n",
" <td>2023-02-16 17:37:13.816196+01:00</td>\n",
" <td>2023-02-16 17:37:13.816196+01:00</td>\n",
" <td>bc483d04d9c3a08f167a3ce64366ca72</td>\n",
" </tr>\n",
" <tr>\n",
" <th>11</th>\n",
" <td>13</td>\n",
" <td>Ensembles en résidence</td>\n",
" <td>2023-02-16 17:55:54.877374+01:00</td>\n",
" <td>2023-02-16 17:55:54.877374+01:00</td>\n",
" <td>e70635e771de13268dccf02bb2abfaf9</td>\n",
" </tr>\n",
" <tr>\n",
" <th>12</th>\n",
" <td>14</td>\n",
" <td>Ministère</td>\n",
" <td>2023-02-17 11:17:54.429462+01:00</td>\n",
" <td>2023-02-17 11:17:54.429462+01:00</td>\n",
" <td>a3f0582853fd19f5b57e3651f8a20e7a</td>\n",
" </tr>\n",
" <tr>\n",
" <th>13</th>\n",
" <td>15</td>\n",
" <td>Rennes métropole</td>\n",
" <td>2023-02-17 11:53:24.490786+01:00</td>\n",
" <td>2023-02-17 11:53:24.490786+01:00</td>\n",
" <td>e98b8db5941b96c29c353b6f2f502055</td>\n",
" </tr>\n",
" <tr>\n",
" <th>14</th>\n",
" <td>16</td>\n",
" <td>Ville de Rennes - équipements culturels</td>\n",
" <td>2023-02-17 12:00:10.649104+01:00</td>\n",
" <td>2023-02-17 12:00:10.649104+01:00</td>\n",
" <td>a44edffc7edb852982efa7f4aa6d0e25</td>\n",
" </tr>\n",
" <tr>\n",
" <th>15</th>\n",
" <td>17</td>\n",
" <td>Structures culturelles rennaises</td>\n",
" <td>2023-02-17 12:05:55.583016+01:00</td>\n",
" <td>2023-02-17 12:05:55.583016+01:00</td>\n",
" <td>241550517e4e3b1c926e9aeab0f621cd</td>\n",
" </tr>\n",
" <tr>\n",
" <th>16</th>\n",
" <td>18</td>\n",
" <td>Université Rennes 2</td>\n",
" <td>2023-02-17 14:23:44.832959+01:00</td>\n",
" <td>2023-02-17 14:23:44.832959+01:00</td>\n",
" <td>4057c5cee51c4e10aa819f0cf48adc3f</td>\n",
" </tr>\n",
" <tr>\n",
" <th>17</th>\n",
" <td>19</td>\n",
" <td>Centres chorégraphiques nationaux</td>\n",
" <td>2023-02-17 15:29:41.827321+01:00</td>\n",
" <td>2023-02-17 15:29:41.827321+01:00</td>\n",
" <td>41e75941dfb766365498d917abe0102f</td>\n",
" </tr>\n",
" <tr>\n",
" <th>18</th>\n",
" <td>20</td>\n",
" <td>Télévision</td>\n",
" <td>2023-02-17 15:46:13.746092+01:00</td>\n",
" <td>2023-02-17 15:46:13.746092+01:00</td>\n",
" <td>36d6409c539dd79c1f3af8c5948603eb</td>\n",
" </tr>\n",
" <tr>\n",
" <th>19</th>\n",
" <td>21</td>\n",
" <td>structures culturelles nationales</td>\n",
" <td>2023-02-17 15:56:00.555722+01:00</td>\n",
" <td>2023-02-17 15:56:00.555722+01:00</td>\n",
" <td>5311cf7e42aac53289e1c4a338d5cfa4</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" id name \\\n",
"0 2 ens-écoles \n",
"1 1 NaN \n",
"2 4 ecoles primaires rennes \n",
"3 5 Angers Nantes Opéra \n",
"4 6 Opéras \n",
"5 7 Ministère de la Culture \n",
"6 8 Orchestres \n",
"7 9 Cooperative \n",
"8 10 Théâtres \n",
"9 11 La co[opera]tive \n",
"10 12 Ville de Rennes \n",
"11 13 Ensembles en résidence \n",
"12 14 Ministère \n",
"13 15 Rennes métropole \n",
"14 16 Ville de Rennes - équipements culturels \n",
"15 17 Structures culturelles rennaises \n",
"16 18 Université Rennes 2 \n",
"17 19 Centres chorégraphiques nationaux \n",
"18 20 Télévision \n",
"19 21 structures culturelles nationales \n",
"\n",
" created_at updated_at \\\n",
"0 2021-05-07 15:24:19.808501+02:00 2021-05-07 15:24:19.808501+02:00 \n",
"1 2021-05-07 15:24:19.805589+02:00 2021-05-07 15:24:19.805589+02:00 \n",
"2 2021-05-07 15:29:06.388415+02:00 2021-05-07 15:29:06.388415+02:00 \n",
"3 2023-01-27 15:59:58.187557+01:00 2023-01-27 15:59:58.187557+01:00 \n",
"4 2023-01-27 16:03:59.654938+01:00 2023-01-27 16:03:59.654938+01:00 \n",
"5 2023-01-30 11:22:29.636813+01:00 2023-01-30 11:22:29.636813+01:00 \n",
"6 2023-01-30 11:33:56.392799+01:00 2023-01-30 11:33:56.392799+01:00 \n",
"7 2023-01-31 14:44:38.471146+01:00 2023-01-31 14:44:38.471146+01:00 \n",
"8 2023-01-31 14:45:17.804428+01:00 2023-01-31 14:45:17.804428+01:00 \n",
"9 2023-02-16 17:11:35.004478+01:00 2023-02-16 17:11:35.004478+01:00 \n",
"10 2023-02-16 17:37:13.816196+01:00 2023-02-16 17:37:13.816196+01:00 \n",
"11 2023-02-16 17:55:54.877374+01:00 2023-02-16 17:55:54.877374+01:00 \n",
"12 2023-02-17 11:17:54.429462+01:00 2023-02-17 11:17:54.429462+01:00 \n",
"13 2023-02-17 11:53:24.490786+01:00 2023-02-17 11:53:24.490786+01:00 \n",
"14 2023-02-17 12:00:10.649104+01:00 2023-02-17 12:00:10.649104+01:00 \n",
"15 2023-02-17 12:05:55.583016+01:00 2023-02-17 12:05:55.583016+01:00 \n",
"16 2023-02-17 14:23:44.832959+01:00 2023-02-17 14:23:44.832959+01:00 \n",
"17 2023-02-17 15:29:41.827321+01:00 2023-02-17 15:29:41.827321+01:00 \n",
"18 2023-02-17 15:46:13.746092+01:00 2023-02-17 15:46:13.746092+01:00 \n",
"19 2023-02-17 15:56:00.555722+01:00 2023-02-17 15:56:00.555722+01:00 \n",
"\n",
" identifier \n",
"0 b6a360c5f84595940c5774f13fd39cc3 \n",
"1 d41d8cd98f00b204e9800998ecf8427e \n",
"2 ca8649dd64c240d118f60b07d11a7053 \n",
"3 f8f500f937fe312542399299cdc13f7e \n",
"4 22eb2c616983ec7b54a093f84b230505 \n",
"5 1b8c5c08fde000d90905a3d14af7763d \n",
"6 7c2aee0c80642d7e325a450f2dec45e5 \n",
"7 6c88c36ffaab88d255865aa3111d7686 \n",
"8 b2c19672df82021702b79482c8cda85a \n",
"9 5dbaa3a1f278c0fcf981d447ad20957a \n",
"10 bc483d04d9c3a08f167a3ce64366ca72 \n",
"11 e70635e771de13268dccf02bb2abfaf9 \n",
"12 a3f0582853fd19f5b57e3651f8a20e7a \n",
"13 e98b8db5941b96c29c353b6f2f502055 \n",
"14 a44edffc7edb852982efa7f4aa6d0e25 \n",
"15 241550517e4e3b1c926e9aeab0f621cd \n",
"16 4057c5cee51c4e10aa819f0cf48adc3f \n",
"17 41e75941dfb766365498d917abe0102f \n",
"18 36d6409c539dd79c1f3af8c5948603eb \n",
"19 5311cf7e42aac53289e1c4a338d5cfa4 "
]
},
"execution_count": 18,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"tags"
]
},
{
"cell_type": "code",
"execution_count": 19,
"id": "4cc9f444-b7e6-4ee5-8ce8-64c63ab7825a",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Index(['id', 'structure_id', 'tag_id', 'created_at', 'updated_at'], dtype='object')\n",
"(179, 5)\n",
"<class 'pandas.core.frame.DataFrame'>\n",
"RangeIndex: 179 entries, 0 to 178\n",
"Data columns (total 5 columns):\n",
" # Column Non-Null Count Dtype \n",
"--- ------ -------------- ----- \n",
" 0 id 179 non-null int64 \n",
" 1 structure_id 179 non-null int64 \n",
" 2 tag_id 179 non-null int64 \n",
" 3 created_at 179 non-null object\n",
" 4 updated_at 179 non-null object\n",
"dtypes: int64(3), object(2)\n",
"memory usage: 7.1+ KB\n"
]
}
],
"source": [
"# Structure = clients\n",
"FILE_PATH_S3 = 'bdc2324-data/11/11structure_tag_mappings.csv'\n",
"\n",
"with fs.open(FILE_PATH_S3, mode=\"rb\") as file_in:\n",
" structure_tag_mappings = pd.read_csv(file_in, sep=\",\")\n",
"\n",
"print(structure_tag_mappings.columns)\n",
"print(structure_tag_mappings.shape)\n",
"structure_tag_mappings.info()"
]
},
{
"cell_type": "code",
"execution_count": 20,
"id": "dcf776df-5c8e-4972-b2c1-b41291ba7e66",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>id</th>\n",
" <th>structure_id</th>\n",
" <th>tag_id</th>\n",
" <th>created_at</th>\n",
" <th>updated_at</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>123</td>\n",
" <td>187</td>\n",
" <td>6</td>\n",
" <td>2023-01-27 16:03:59.680222+01:00</td>\n",
" <td>2023-01-27 16:03:59.680222+01:00</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>2</td>\n",
" <td>2</td>\n",
" <td>2</td>\n",
" <td>2021-05-07 15:24:19.872895+02:00</td>\n",
" <td>2021-05-07 15:24:19.872895+02:00</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>3</td>\n",
" <td>3</td>\n",
" <td>2</td>\n",
" <td>2021-05-07 15:24:19.873830+02:00</td>\n",
" <td>2021-05-07 15:24:19.873830+02:00</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>4</td>\n",
" <td>4</td>\n",
" <td>2</td>\n",
" <td>2021-05-07 15:24:19.874628+02:00</td>\n",
" <td>2021-05-07 15:24:19.874628+02:00</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>5</td>\n",
" <td>5</td>\n",
" <td>2</td>\n",
" <td>2021-05-07 15:24:19.875421+02:00</td>\n",
" <td>2021-05-07 15:24:19.875421+02:00</td>\n",
" </tr>\n",
" <tr>\n",
" <th>...</th>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>174</th>\n",
" <td>184</td>\n",
" <td>236</td>\n",
" <td>10</td>\n",
" <td>2023-02-17 16:35:25.041114+01:00</td>\n",
" <td>2023-02-17 16:35:25.041114+01:00</td>\n",
" </tr>\n",
" <tr>\n",
" <th>175</th>\n",
" <td>185</td>\n",
" <td>237</td>\n",
" <td>17</td>\n",
" <td>2023-02-17 16:39:10.799478+01:00</td>\n",
" <td>2023-02-17 16:39:10.799478+01:00</td>\n",
" </tr>\n",
" <tr>\n",
" <th>176</th>\n",
" <td>186</td>\n",
" <td>238</td>\n",
" <td>19</td>\n",
" <td>2023-02-17 16:53:21.098690+01:00</td>\n",
" <td>2023-02-17 16:53:21.098690+01:00</td>\n",
" </tr>\n",
" <tr>\n",
" <th>177</th>\n",
" <td>187</td>\n",
" <td>239</td>\n",
" <td>10</td>\n",
" <td>2023-02-17 16:57:42.623481+01:00</td>\n",
" <td>2023-02-17 16:57:42.623481+01:00</td>\n",
" </tr>\n",
" <tr>\n",
" <th>178</th>\n",
" <td>188</td>\n",
" <td>240</td>\n",
" <td>10</td>\n",
" <td>2023-02-17 16:59:22.067723+01:00</td>\n",
" <td>2023-02-17 16:59:22.067723+01:00</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"<p>179 rows × 5 columns</p>\n",
"</div>"
],
"text/plain": [
" id structure_id tag_id created_at \\\n",
"0 123 187 6 2023-01-27 16:03:59.680222+01:00 \n",
"1 2 2 2 2021-05-07 15:24:19.872895+02:00 \n",
"2 3 3 2 2021-05-07 15:24:19.873830+02:00 \n",
"3 4 4 2 2021-05-07 15:24:19.874628+02:00 \n",
"4 5 5 2 2021-05-07 15:24:19.875421+02:00 \n",
".. ... ... ... ... \n",
"174 184 236 10 2023-02-17 16:35:25.041114+01:00 \n",
"175 185 237 17 2023-02-17 16:39:10.799478+01:00 \n",
"176 186 238 19 2023-02-17 16:53:21.098690+01:00 \n",
"177 187 239 10 2023-02-17 16:57:42.623481+01:00 \n",
"178 188 240 10 2023-02-17 16:59:22.067723+01:00 \n",
"\n",
" updated_at \n",
"0 2023-01-27 16:03:59.680222+01:00 \n",
"1 2021-05-07 15:24:19.872895+02:00 \n",
"2 2021-05-07 15:24:19.873830+02:00 \n",
"3 2021-05-07 15:24:19.874628+02:00 \n",
"4 2021-05-07 15:24:19.875421+02:00 \n",
".. ... \n",
"174 2023-02-17 16:35:25.041114+01:00 \n",
"175 2023-02-17 16:39:10.799478+01:00 \n",
"176 2023-02-17 16:53:21.098690+01:00 \n",
"177 2023-02-17 16:57:42.623481+01:00 \n",
"178 2023-02-17 16:59:22.067723+01:00 \n",
"\n",
"[179 rows x 5 columns]"
]
},
"execution_count": 20,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"structure_tag_mappings"
]
},
{
"cell_type": "code",
"execution_count": 24,
"id": "41bf1529-5a7c-409e-9791-2024c08c11f0",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Index(['id', 'lastname', 'firstname', 'birthdate', 'email', 'street_id',\n",
" 'created_at', 'updated_at', 'civility', 'is_partner', 'extra',\n",
" 'deleted_at', 'reference', 'gender', 'is_email_true', 'extra_field',\n",
" 'identifier', 'opt_in', 'structure_id', 'note', 'profession',\n",
" 'language', 'mcp_contact_id', 'need_reload', 'last_buying_date',\n",
" 'max_price', 'ticket_sum', 'average_price', 'fidelity',\n",
" 'average_purchase_delay', 'average_price_basket',\n",
" 'average_ticket_basket', 'total_price', 'preferred_category',\n",
" 'preferred_supplier', 'preferred_formula', 'purchase_count',\n",
" 'first_buying_date', 'last_visiting_date', 'zipcode', 'country', 'age',\n",
" 'tenant_id'],\n",
" dtype='object')\n",
"(71307, 43)\n",
"<class 'pandas.core.frame.DataFrame'>\n",
"RangeIndex: 71307 entries, 0 to 71306\n",
"Data columns (total 43 columns):\n",
" # Column Non-Null Count Dtype \n",
"--- ------ -------------- ----- \n",
" 0 id 71307 non-null int64 \n",
" 1 lastname 41045 non-null object \n",
" 2 firstname 39140 non-null object \n",
" 3 birthdate 18174 non-null object \n",
" 4 email 58203 non-null object \n",
" 5 street_id 71307 non-null int64 \n",
" 6 created_at 71307 non-null object \n",
" 7 updated_at 71307 non-null object \n",
" 8 civility 0 non-null float64\n",
" 9 is_partner 71307 non-null bool \n",
" 10 extra 0 non-null float64\n",
" 11 deleted_at 0 non-null float64\n",
" 12 reference 0 non-null float64\n",
" 13 gender 71307 non-null int64 \n",
" 14 is_email_true 71307 non-null bool \n",
" 15 extra_field 0 non-null float64\n",
" 16 identifier 71307 non-null object \n",
" 17 opt_in 71307 non-null bool \n",
" 18 structure_id 616 non-null float64\n",
" 19 note 451 non-null object \n",
" 20 profession 812 non-null object \n",
" 21 language 0 non-null float64\n",
" 22 mcp_contact_id 22417 non-null float64\n",
" 23 need_reload 71307 non-null bool \n",
" 24 last_buying_date 34040 non-null object \n",
" 25 max_price 34040 non-null float64\n",
" 26 ticket_sum 71307 non-null int64 \n",
" 27 average_price 68694 non-null float64\n",
" 28 fidelity 71307 non-null int64 \n",
" 29 average_purchase_delay 34040 non-null float64\n",
" 30 average_price_basket 34040 non-null float64\n",
" 31 average_ticket_basket 34040 non-null float64\n",
" 32 total_price 36653 non-null float64\n",
" 33 preferred_category 0 non-null float64\n",
" 34 preferred_supplier 0 non-null float64\n",
" 35 preferred_formula 0 non-null float64\n",
" 36 purchase_count 71307 non-null int64 \n",
" 37 first_buying_date 34040 non-null object \n",
" 38 last_visiting_date 0 non-null float64\n",
" 39 zipcode 33756 non-null object \n",
" 40 country 39910 non-null object \n",
" 41 age 18174 non-null float64\n",
" 42 tenant_id 71307 non-null int64 \n",
"dtypes: bool(4), float64(19), int64(7), object(13)\n",
"memory usage: 21.5+ MB\n"
]
}
],
"source": [
"# Customers table (customersplus)\n",
"FILE_PATH_S3 = 'bdc2324-data/11/11customersplus.csv'\n",
"\n",
"with fs.open(FILE_PATH_S3, mode=\"rb\") as file_in:\n",
" customersplus = pd.read_csv(file_in, sep=\",\")\n",
"\n",
"print(customersplus.columns)\n",
"print(customersplus.shape)\n",
"customersplus.info()"
]
},
{
"cell_type": "code",
"execution_count": 25,
"id": "948a0b2b-8d1c-4afb-802e-670d67dd8c20",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>id</th>\n",
" <th>lastname</th>\n",
" <th>firstname</th>\n",
" <th>birthdate</th>\n",
" <th>email</th>\n",
" <th>street_id</th>\n",
" <th>created_at</th>\n",
" <th>updated_at</th>\n",
" <th>civility</th>\n",
" <th>is_partner</th>\n",
" <th>...</th>\n",
" <th>preferred_category</th>\n",
" <th>preferred_supplier</th>\n",
" <th>preferred_formula</th>\n",
" <th>purchase_count</th>\n",
" <th>first_buying_date</th>\n",
" <th>last_visiting_date</th>\n",
" <th>zipcode</th>\n",
" <th>country</th>\n",
" <th>age</th>\n",
" <th>tenant_id</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>286834</td>\n",
" <td>lastname286834</td>\n",
" <td>firstname286834</td>\n",
" <td>NaN</td>\n",
" <td>email286834</td>\n",
" <td>6</td>\n",
" <td>2022-05-19 10:09:09.361137+02:00</td>\n",
" <td>2022-05-19 10:09:09.361137+02:00</td>\n",
" <td>NaN</td>\n",
" <td>False</td>\n",
" <td>...</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>0</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>fr</td>\n",
" <td>NaN</td>\n",
" <td>1556</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>330695</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>email330695</td>\n",
" <td>1</td>\n",
" <td>2022-07-16 04:10:34.135134+02:00</td>\n",
" <td>2022-07-16 04:10:34.156704+02:00</td>\n",
" <td>NaN</td>\n",
" <td>False</td>\n",
" <td>...</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>0</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>1556</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>330978</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>email330978</td>\n",
" <td>1</td>\n",
" <td>2022-07-21 22:14:09.811721+02:00</td>\n",
" <td>2022-07-21 22:14:09.836051+02:00</td>\n",
" <td>NaN</td>\n",
" <td>False</td>\n",
" <td>...</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>0</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>1556</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>338697</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>email338697</td>\n",
" <td>1</td>\n",
" <td>2022-09-15 19:02:03.950536+02:00</td>\n",
" <td>2022-09-15 19:02:03.985642+02:00</td>\n",
" <td>NaN</td>\n",
" <td>False</td>\n",
" <td>...</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>0</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>1556</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>338726</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>email338726</td>\n",
" <td>1</td>\n",
" <td>2022-09-16 01:24:40.719882+02:00</td>\n",
" <td>2022-09-16 01:24:40.742753+02:00</td>\n",
" <td>NaN</td>\n",
" <td>False</td>\n",
" <td>...</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>0</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>1556</td>\n",
" </tr>\n",
" <tr>\n",
" <th>...</th>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>71302</th>\n",
" <td>27105</td>\n",
" <td>lastname27105</td>\n",
" <td>firstname27105</td>\n",
" <td>1957-01-26</td>\n",
" <td>email27105</td>\n",
" <td>205024</td>\n",
" <td>2021-04-22 15:12:59.986534+02:00</td>\n",
" <td>2023-09-12 18:59:31.613235+02:00</td>\n",
" <td>NaN</td>\n",
" <td>False</td>\n",
" <td>...</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>2</td>\n",
" <td>2018-12-31 18:56:57+01:00</td>\n",
" <td>NaN</td>\n",
" <td>35700</td>\n",
" <td>fr</td>\n",
" <td>66.0</td>\n",
" <td>1556</td>\n",
" </tr>\n",
" <tr>\n",
" <th>71303</th>\n",
" <td>27108</td>\n",
" <td>lastname27108</td>\n",
" <td>firstname27108</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>205024</td>\n",
" <td>2021-04-22 15:12:59.989197+02:00</td>\n",
" <td>2023-09-12 18:27:34.380843+02:00</td>\n",
" <td>NaN</td>\n",
" <td>False</td>\n",
" <td>...</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>6</td>\n",
" <td>2015-12-29 14:51:46+01:00</td>\n",
" <td>NaN</td>\n",
" <td>35700</td>\n",
" <td>fr</td>\n",
" <td>NaN</td>\n",
" <td>1556</td>\n",
" </tr>\n",
" <tr>\n",
" <th>71304</th>\n",
" <td>27110</td>\n",
" <td>lastname27110</td>\n",
" <td>firstname27110</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>6</td>\n",
" <td>2021-04-22 15:12:59.991029+02:00</td>\n",
" <td>2022-04-14 11:41:33.738500+02:00</td>\n",
" <td>NaN</td>\n",
" <td>False</td>\n",
" <td>...</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>1</td>\n",
" <td>2018-12-31 19:12:59+01:00</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>fr</td>\n",
" <td>NaN</td>\n",
" <td>1556</td>\n",
" </tr>\n",
" <tr>\n",
" <th>71305</th>\n",
" <td>10607</td>\n",
" <td>lastname10607</td>\n",
" <td>firstname10607</td>\n",
" <td>1963-01-04</td>\n",
" <td>email10607</td>\n",
" <td>313332</td>\n",
" <td>2021-04-22 14:56:45.742226+02:00</td>\n",
" <td>2023-09-12 17:55:17.723195+02:00</td>\n",
" <td>NaN</td>\n",
" <td>False</td>\n",
" <td>...</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>26</td>\n",
" <td>2015-10-10 14:11:21+02:00</td>\n",
" <td>NaN</td>\n",
" <td>35850</td>\n",
" <td>fr</td>\n",
" <td>60.0</td>\n",
" <td>1556</td>\n",
" </tr>\n",
" <tr>\n",
" <th>71306</th>\n",
" <td>19095</td>\n",
" <td>lastname19095</td>\n",
" <td>firstname19095</td>\n",
" <td>1979-07-16</td>\n",
" <td>email19095</td>\n",
" <td>6</td>\n",
" <td>2021-04-22 15:06:30.120537+02:00</td>\n",
" <td>2023-09-12 18:27:36.904104+02:00</td>\n",
" <td>NaN</td>\n",
" <td>False</td>\n",
" <td>...</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>2</td>\n",
" <td>2019-05-19 21:18:36+02:00</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>fr</td>\n",
" <td>44.0</td>\n",
" <td>1556</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"<p>71307 rows × 43 columns</p>\n",
"</div>"
],
"text/plain": [
" id lastname firstname birthdate email \\\n",
"0 286834 lastname286834 firstname286834 NaN email286834 \n",
"1 330695 NaN NaN NaN email330695 \n",
"2 330978 NaN NaN NaN email330978 \n",
"3 338697 NaN NaN NaN email338697 \n",
"4 338726 NaN NaN NaN email338726 \n",
"... ... ... ... ... ... \n",
"71302 27105 lastname27105 firstname27105 1957-01-26 email27105 \n",
"71303 27108 lastname27108 firstname27108 NaN NaN \n",
"71304 27110 lastname27110 firstname27110 NaN NaN \n",
"71305 10607 lastname10607 firstname10607 1963-01-04 email10607 \n",
"71306 19095 lastname19095 firstname19095 1979-07-16 email19095 \n",
"\n",
" street_id created_at \\\n",
"0 6 2022-05-19 10:09:09.361137+02:00 \n",
"1 1 2022-07-16 04:10:34.135134+02:00 \n",
"2 1 2022-07-21 22:14:09.811721+02:00 \n",
"3 1 2022-09-15 19:02:03.950536+02:00 \n",
"4 1 2022-09-16 01:24:40.719882+02:00 \n",
"... ... ... \n",
"71302 205024 2021-04-22 15:12:59.986534+02:00 \n",
"71303 205024 2021-04-22 15:12:59.989197+02:00 \n",
"71304 6 2021-04-22 15:12:59.991029+02:00 \n",
"71305 313332 2021-04-22 14:56:45.742226+02:00 \n",
"71306 6 2021-04-22 15:06:30.120537+02:00 \n",
"\n",
" updated_at civility is_partner ... \\\n",
"0 2022-05-19 10:09:09.361137+02:00 NaN False ... \n",
"1 2022-07-16 04:10:34.156704+02:00 NaN False ... \n",
"2 2022-07-21 22:14:09.836051+02:00 NaN False ... \n",
"3 2022-09-15 19:02:03.985642+02:00 NaN False ... \n",
"4 2022-09-16 01:24:40.742753+02:00 NaN False ... \n",
"... ... ... ... ... \n",
"71302 2023-09-12 18:59:31.613235+02:00 NaN False ... \n",
"71303 2023-09-12 18:27:34.380843+02:00 NaN False ... \n",
"71304 2022-04-14 11:41:33.738500+02:00 NaN False ... \n",
"71305 2023-09-12 17:55:17.723195+02:00 NaN False ... \n",
"71306 2023-09-12 18:27:36.904104+02:00 NaN False ... \n",
"\n",
" preferred_category preferred_supplier preferred_formula \\\n",
"0 NaN NaN NaN \n",
"1 NaN NaN NaN \n",
"2 NaN NaN NaN \n",
"3 NaN NaN NaN \n",
"4 NaN NaN NaN \n",
"... ... ... ... \n",
"71302 NaN NaN NaN \n",
"71303 NaN NaN NaN \n",
"71304 NaN NaN NaN \n",
"71305 NaN NaN NaN \n",
"71306 NaN NaN NaN \n",
"\n",
" purchase_count first_buying_date last_visiting_date zipcode \\\n",
"0 0 NaN NaN NaN \n",
"1 0 NaN NaN NaN \n",
"2 0 NaN NaN NaN \n",
"3 0 NaN NaN NaN \n",
"4 0 NaN NaN NaN \n",
"... ... ... ... ... \n",
"71302 2 2018-12-31 18:56:57+01:00 NaN 35700 \n",
"71303 6 2015-12-29 14:51:46+01:00 NaN 35700 \n",
"71304 1 2018-12-31 19:12:59+01:00 NaN NaN \n",
"71305 26 2015-10-10 14:11:21+02:00 NaN 35850 \n",
"71306 2 2019-05-19 21:18:36+02:00 NaN NaN \n",
"\n",
" country age tenant_id \n",
"0 fr NaN 1556 \n",
"1 NaN NaN 1556 \n",
"2 NaN NaN 1556 \n",
"3 NaN NaN 1556 \n",
"4 NaN NaN 1556 \n",
"... ... ... ... \n",
"71302 fr 66.0 1556 \n",
"71303 fr NaN 1556 \n",
"71304 fr NaN 1556 \n",
"71305 fr 60.0 1556 \n",
"71306 fr 44.0 1556 \n",
"\n",
"[71307 rows x 43 columns]"
]
},
"execution_count": 25,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"customersplus"
]
},
{
"cell_type": "markdown",
"id": "c40c44a0-e7c2-4ad1-b700-0d6ea05d62b2",
"metadata": {},
"source": [
"**But :** lier les caractéristiques socio-démographiques et les comportements d'achat."
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.10.13"
}
},
"nbformat": 4,
"nbformat_minor": 5
}