Ajout Notebook simple

This commit is contained in:
Antoine JOUBREL 2024-01-01 12:30:35 +00:00
parent 2e1054f4f9
commit 3e60560242

511
Clean-Notebook.ipynb Normal file
View File

@ -0,0 +1,511 @@
{
"cells": [
{
"cell_type": "markdown",
"id": "56b3d44e-1e3f-4726-9916-0f9af107860e",
"metadata": {},
"source": [
"# Business Data Challenge - Team 1"
]
},
{
"cell_type": "code",
"execution_count": 1,
"id": "15103481-8d74-404c-aa09-7601fe7730da",
"metadata": {},
"outputs": [],
"source": [
"import pandas as pd\n",
"import numpy as np"
]
},
{
"cell_type": "markdown",
"id": "c3bb0d13-34b2-4e1c-9985-468cd87c5a0e",
"metadata": {},
"source": [
"Configuration de l'accès aux données"
]
},
{
"cell_type": "code",
"execution_count": 4,
"id": "5d83bb1a-d341-446e-91f6-1c428607f6d4",
"metadata": {},
"outputs": [],
"source": [
"import os\n",
"import s3fs\n",
"\n",
"# The endpoint host comes from the AWS_S3_ENDPOINT environment variable.\n",
"S3_ENDPOINT_URL = f\"https://{os.environ['AWS_S3_ENDPOINT']}\"\n",
"\n",
"# Filesystem handle reused by the following cells to list and open objects.\n",
"fs = s3fs.S3FileSystem(client_kwargs=dict(endpoint_url=S3_ENDPOINT_URL))"
]
},
{
"cell_type": "markdown",
"id": "f99da24f-0d93-4618-92bc-3ba81dc0445c",
"metadata": {},
"source": [
"## Exemple sur bdc2324-data/11"
]
},
{
"cell_type": "code",
"execution_count": 5,
"id": "699664b9-eee4-4f8d-a207-e524526560c5",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"['bdc2324-data/11/11campaign_stats.csv',\n",
" 'bdc2324-data/11/11campaigns.csv',\n",
" 'bdc2324-data/11/11categories.csv',\n",
" 'bdc2324-data/11/11countries.csv',\n",
" 'bdc2324-data/11/11currencies.csv',\n",
" 'bdc2324-data/11/11customer_target_mappings.csv',\n",
" 'bdc2324-data/11/11customersplus.csv',\n",
" 'bdc2324-data/11/11event_types.csv',\n",
" 'bdc2324-data/11/11events.csv',\n",
" 'bdc2324-data/11/11facilities.csv',\n",
" 'bdc2324-data/11/11link_stats.csv',\n",
" 'bdc2324-data/11/11pricing_formulas.csv',\n",
" 'bdc2324-data/11/11product_packs.csv',\n",
" 'bdc2324-data/11/11products.csv',\n",
" 'bdc2324-data/11/11products_groups.csv',\n",
" 'bdc2324-data/11/11purchases.csv',\n",
" 'bdc2324-data/11/11representation_category_capacities.csv',\n",
" 'bdc2324-data/11/11representations.csv',\n",
" 'bdc2324-data/11/11seasons.csv',\n",
" 'bdc2324-data/11/11structure_tag_mappings.csv',\n",
" 'bdc2324-data/11/11suppliers.csv',\n",
" 'bdc2324-data/11/11tags.csv',\n",
" 'bdc2324-data/11/11target_types.csv',\n",
" 'bdc2324-data/11/11targets.csv',\n",
" 'bdc2324-data/11/11tickets.csv']"
]
},
"execution_count": 5,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Prefix explored in this section; the listing of its objects is the cell output.\n",
"BUCKET = \"bdc2324-data/11\"\n",
"\n",
"fs.ls(BUCKET)"
]
},
{
"cell_type": "markdown",
"id": "779da86b-ac61-4c61-88d2-fa1c0c19efce",
"metadata": {},
"source": [
"## Type de client au global"
]
},
{
"cell_type": "code",
"execution_count": 10,
"id": "d22aa131-5069-43d4-a42e-24f38cc7240d",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Index(['id', 'customer_id', 'target_id', 'created_at', 'updated_at', 'name',\n",
" 'extra_field'],\n",
" dtype='object')\n",
"(124302, 7)\n",
"<class 'pandas.core.frame.DataFrame'>\n",
"RangeIndex: 124302 entries, 0 to 124301\n",
"Data columns (total 7 columns):\n",
" # Column Non-Null Count Dtype \n",
"--- ------ -------------- ----- \n",
" 0 id 124302 non-null int64 \n",
" 1 customer_id 124302 non-null int64 \n",
" 2 target_id 124302 non-null int64 \n",
" 3 created_at 124296 non-null object \n",
" 4 updated_at 124296 non-null object \n",
" 5 name 0 non-null float64\n",
" 6 extra_field 0 non-null float64\n",
"dtypes: float64(2), int64(3), object(2)\n",
"memory usage: 6.6+ MB\n"
]
}
],
"source": [
"# Existing segmentation\n",
"FILE_PATH_S3 = 'bdc2324-data/11/11customer_target_mappings.csv'\n",
"\n",
"# Read the CSV straight from S3 into a DataFrame.\n",
"with fs.open(FILE_PATH_S3, mode=\"rb\") as file_in:\n",
"    customer_target_mappings = pd.read_csv(file_in, sep=\",\")\n",
"\n",
"# Column names and (rows, columns) on separate lines, then the dtype / non-null summary.\n",
"print(customer_target_mappings.columns, customer_target_mappings.shape, sep=\"\\n\")\n",
"customer_target_mappings.info()"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "967b20e2-5a30-4724-989f-b9e39c7c67e7",
"metadata": {},
"outputs": [],
"source": [
"# Preview only the first rows: the frame holds 124,302 rows, so a bare display\n",
"# adds little beyond head() while bloating the saved notebook.\n",
"customer_target_mappings.head()"
]
},
{
"cell_type": "code",
"execution_count": 11,
"id": "ab3f937b-ef62-499a-8ee2-d47d1d988ace",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Index(['id', 'is_import', 'name', 'created_at', 'updated_at', 'identifier'], dtype='object')\n",
"(4, 6)\n",
"<class 'pandas.core.frame.DataFrame'>\n",
"RangeIndex: 4 entries, 0 to 3\n",
"Data columns (total 6 columns):\n",
" # Column Non-Null Count Dtype \n",
"--- ------ -------------- ----- \n",
" 0 id 4 non-null int64 \n",
" 1 is_import 4 non-null bool \n",
" 2 name 4 non-null object\n",
" 3 created_at 4 non-null object\n",
" 4 updated_at 4 non-null object\n",
" 5 identifier 4 non-null object\n",
"dtypes: bool(1), int64(1), object(4)\n",
"memory usage: 292.0+ bytes\n"
]
}
],
"source": [
"# Existing segmentation\n",
"FILE_PATH_S3 = 'bdc2324-data/11/11target_types.csv'\n",
"\n",
"# Read the CSV straight from S3 into a DataFrame.\n",
"with fs.open(FILE_PATH_S3, mode=\"rb\") as file_in:\n",
"    target_types = pd.read_csv(file_in, sep=\",\")\n",
"\n",
"# Column names and (rows, columns) on separate lines, then the dtype / non-null summary.\n",
"print(target_types.columns, target_types.shape, sep=\"\\n\")\n",
"target_types.info()"
]
},
{
"cell_type": "code",
"execution_count": 12,
"id": "b70488b9-38fc-40a8-9e2f-3330b3f9eef5",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>id</th>\n",
" <th>is_import</th>\n",
" <th>name</th>\n",
" <th>created_at</th>\n",
" <th>updated_at</th>\n",
" <th>identifier</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>1</td>\n",
" <td>False</td>\n",
" <td>manual_static_filter</td>\n",
" <td>2021-04-29 13:42:14.111085+02:00</td>\n",
" <td>2021-04-29 13:42:14.111085+02:00</td>\n",
" <td>fb27e81baa4debc6a4e1a8639c20e808</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>3</td>\n",
" <td>True</td>\n",
" <td>manual_structure</td>\n",
" <td>2021-05-07 15:20:00.626650+02:00</td>\n",
" <td>2021-05-07 15:20:00.626650+02:00</td>\n",
" <td>382bca214204a2d3462f5ec2728d5d1e</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>6</td>\n",
" <td>False</td>\n",
" <td>manual_dynamic_filter</td>\n",
" <td>2021-09-09 14:27:47.641302+02:00</td>\n",
" <td>2021-09-09 14:27:47.641302+02:00</td>\n",
" <td>e0f4b8693184850fefd6d2a38f10584e</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>2</td>\n",
" <td>True</td>\n",
" <td>manual_import</td>\n",
" <td>2021-04-29 13:49:30.107110+02:00</td>\n",
" <td>2021-04-29 13:49:30.107110+02:00</td>\n",
" <td>12213df2ce68a624e4c0070521437bac</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" id is_import name created_at \\\n",
"0 1 False manual_static_filter 2021-04-29 13:42:14.111085+02:00 \n",
"1 3 True manual_structure 2021-05-07 15:20:00.626650+02:00 \n",
"2 6 False manual_dynamic_filter 2021-09-09 14:27:47.641302+02:00 \n",
"3 2 True manual_import 2021-04-29 13:49:30.107110+02:00 \n",
"\n",
" updated_at identifier \n",
"0 2021-04-29 13:42:14.111085+02:00 fb27e81baa4debc6a4e1a8639c20e808 \n",
"1 2021-05-07 15:20:00.626650+02:00 382bca214204a2d3462f5ec2728d5d1e \n",
"2 2021-09-09 14:27:47.641302+02:00 e0f4b8693184850fefd6d2a38f10584e \n",
"3 2021-04-29 13:49:30.107110+02:00 12213df2ce68a624e4c0070521437bac "
]
},
"execution_count": 12,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"target_types"
]
},
{
"cell_type": "code",
"execution_count": 6,
"id": "c40c44a0-e7c2-4ad1-b700-0d6ea05d62b2",
"metadata": {},
"outputs": [],
"source": [
"# But : lier les caractéristiques socio-démographiques et les comportements d'achat\n"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.10.13"
}
},
"nbformat": 4,
"nbformat_minor": 5
}