BDC-team-1/Clean-Notebook.ipynb

512 lines
17 KiB
Plaintext
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

{
"cells": [
{
"cell_type": "markdown",
"id": "56b3d44e-1e3f-4726-9916-0f9af107860e",
"metadata": {},
"source": [
"# Business Data Challenge - Team 1"
]
},
{
"cell_type": "code",
"execution_count": 1,
"id": "15103481-8d74-404c-aa09-7601fe7730da",
"metadata": {},
"outputs": [],
"source": [
"import pandas as pd\n",
"import numpy as np"
]
},
{
"cell_type": "markdown",
"id": "c3bb0d13-34b2-4e1c-9985-468cd87c5a0e",
"metadata": {},
"source": [
"Configuration de l'accès aux données"
]
},
{
"cell_type": "code",
"execution_count": 4,
"id": "5d83bb1a-d341-446e-91f6-1c428607f6d4",
"metadata": {},
"outputs": [],
"source": [
"import os\n",
"import s3fs\n",
"\n",
"# The S3 host name is read from the AWS_S3_ENDPOINT environment variable;\n",
"# a missing variable raises KeyError on this line.\n",
"S3_ENDPOINT_URL = f\"https://{os.environ['AWS_S3_ENDPOINT']}\"\n",
"\n",
"# Filesystem handle reused by the following cells to list and open bucket objects.\n",
"fs = s3fs.S3FileSystem(client_kwargs=dict(endpoint_url=S3_ENDPOINT_URL))"
]
},
{
"cell_type": "markdown",
"id": "f99da24f-0d93-4618-92bc-3ba81dc0445c",
"metadata": {},
"source": [
"## Exemple sur bdc2324-data/11"
]
},
{
"cell_type": "code",
"execution_count": 5,
"id": "699664b9-eee4-4f8d-a207-e524526560c5",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"['bdc2324-data/11/11campaign_stats.csv',\n",
" 'bdc2324-data/11/11campaigns.csv',\n",
" 'bdc2324-data/11/11categories.csv',\n",
" 'bdc2324-data/11/11countries.csv',\n",
" 'bdc2324-data/11/11currencies.csv',\n",
" 'bdc2324-data/11/11customer_target_mappings.csv',\n",
" 'bdc2324-data/11/11customersplus.csv',\n",
" 'bdc2324-data/11/11event_types.csv',\n",
" 'bdc2324-data/11/11events.csv',\n",
" 'bdc2324-data/11/11facilities.csv',\n",
" 'bdc2324-data/11/11link_stats.csv',\n",
" 'bdc2324-data/11/11pricing_formulas.csv',\n",
" 'bdc2324-data/11/11product_packs.csv',\n",
" 'bdc2324-data/11/11products.csv',\n",
" 'bdc2324-data/11/11products_groups.csv',\n",
" 'bdc2324-data/11/11purchases.csv',\n",
" 'bdc2324-data/11/11representation_category_capacities.csv',\n",
" 'bdc2324-data/11/11representations.csv',\n",
" 'bdc2324-data/11/11seasons.csv',\n",
" 'bdc2324-data/11/11structure_tag_mappings.csv',\n",
" 'bdc2324-data/11/11suppliers.csv',\n",
" 'bdc2324-data/11/11tags.csv',\n",
" 'bdc2324-data/11/11target_types.csv',\n",
" 'bdc2324-data/11/11targets.csv',\n",
" 'bdc2324-data/11/11tickets.csv']"
]
},
"execution_count": 5,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Bucket prefix whose CSV files are read in the cells below\n",
"BUCKET = \"bdc2324-data/11\"\n",
"# List every object stored under that prefix\n",
"fs.ls(BUCKET)"
]
},
{
"cell_type": "markdown",
"id": "779da86b-ac61-4c61-88d2-fa1c0c19efce",
"metadata": {},
"source": [
"## Type de client au global"
]
},
{
"cell_type": "code",
"execution_count": 10,
"id": "d22aa131-5069-43d4-a42e-24f38cc7240d",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Index(['id', 'customer_id', 'target_id', 'created_at', 'updated_at', 'name',\n",
" 'extra_field'],\n",
" dtype='object')\n",
"(124302, 7)\n",
"<class 'pandas.core.frame.DataFrame'>\n",
"RangeIndex: 124302 entries, 0 to 124301\n",
"Data columns (total 7 columns):\n",
" # Column Non-Null Count Dtype \n",
"--- ------ -------------- ----- \n",
" 0 id 124302 non-null int64 \n",
" 1 customer_id 124302 non-null int64 \n",
" 2 target_id 124302 non-null int64 \n",
" 3 created_at 124296 non-null object \n",
" 4 updated_at 124296 non-null object \n",
" 5 name 0 non-null float64\n",
" 6 extra_field 0 non-null float64\n",
"dtypes: float64(2), int64(3), object(2)\n",
"memory usage: 6.6+ MB\n"
]
}
],
"source": [
"# Existing segmentation: customer-to-target mapping table stored on S3\n",
"FILE_PATH_S3 = 'bdc2324-data/11/11customer_target_mappings.csv'\n",
"\n",
"# Open the object in binary mode and let pandas parse the comma-separated content\n",
"with fs.open(FILE_PATH_S3, \"rb\") as csv_stream:\n",
"    customer_target_mappings = pd.read_csv(csv_stream, sep=\",\")\n",
"\n",
"# Structural summary: column labels, (rows, columns), then dtypes and non-null counts\n",
"print(customer_target_mappings.columns, customer_target_mappings.shape, sep=\"\\n\")\n",
"customer_target_mappings.info()"
]
},
{
"cell_type": "code",
"execution_count": 13,
"id": "967b20e2-5a30-4724-989f-b9e39c7c67e7",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>id</th>\n",
" <th>customer_id</th>\n",
" <th>target_id</th>\n",
" <th>created_at</th>\n",
" <th>updated_at</th>\n",
" <th>name</th>\n",
" <th>extra_field</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>793889</td>\n",
" <td>344151</td>\n",
" <td>101</td>\n",
" <td>2022-09-29 17:55:41.083666+02:00</td>\n",
" <td>2022-09-29 17:55:41.083666+02:00</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>793890</td>\n",
" <td>344152</td>\n",
" <td>101</td>\n",
" <td>2022-09-29 19:16:07.252114+02:00</td>\n",
" <td>2022-09-29 19:16:07.252114+02:00</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>793891</td>\n",
" <td>344153</td>\n",
" <td>101</td>\n",
" <td>2022-09-29 19:55:10.443450+02:00</td>\n",
" <td>2022-09-29 19:55:10.443450+02:00</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>793892</td>\n",
" <td>344154</td>\n",
" <td>101</td>\n",
" <td>2022-09-29 20:16:08.269407+02:00</td>\n",
" <td>2022-09-29 20:16:08.269407+02:00</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>793893</td>\n",
" <td>344155</td>\n",
" <td>101</td>\n",
" <td>2022-09-29 21:03:40.541998+02:00</td>\n",
" <td>2022-09-29 21:03:40.541998+02:00</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>...</th>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>124297</th>\n",
" <td>742001</td>\n",
" <td>329855</td>\n",
" <td>101</td>\n",
" <td>2022-07-11 18:17:09.607162+02:00</td>\n",
" <td>2022-07-11 18:17:09.607162+02:00</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>124298</th>\n",
" <td>742002</td>\n",
" <td>329856</td>\n",
" <td>101</td>\n",
" <td>2022-07-11 18:44:45.636248+02:00</td>\n",
" <td>2022-07-11 18:44:45.636248+02:00</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>124299</th>\n",
" <td>742000</td>\n",
" <td>329854</td>\n",
" <td>101</td>\n",
" <td>2022-07-11 17:46:48.914507+02:00</td>\n",
" <td>2022-07-11 17:46:48.914507+02:00</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>124300</th>\n",
" <td>742003</td>\n",
" <td>329857</td>\n",
" <td>134</td>\n",
" <td>2022-07-11 18:44:55.915889+02:00</td>\n",
" <td>2022-07-11 18:44:55.915889+02:00</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>124301</th>\n",
" <td>741996</td>\n",
" <td>329850</td>\n",
" <td>101</td>\n",
" <td>2022-07-11 16:52:37.227487+02:00</td>\n",
" <td>2022-07-11 16:52:37.227487+02:00</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"<p>124302 rows × 7 columns</p>\n",
"</div>"
],
"text/plain": [
" id customer_id target_id created_at \\\n",
"0 793889 344151 101 2022-09-29 17:55:41.083666+02:00 \n",
"1 793890 344152 101 2022-09-29 19:16:07.252114+02:00 \n",
"2 793891 344153 101 2022-09-29 19:55:10.443450+02:00 \n",
"3 793892 344154 101 2022-09-29 20:16:08.269407+02:00 \n",
"4 793893 344155 101 2022-09-29 21:03:40.541998+02:00 \n",
"... ... ... ... ... \n",
"124297 742001 329855 101 2022-07-11 18:17:09.607162+02:00 \n",
"124298 742002 329856 101 2022-07-11 18:44:45.636248+02:00 \n",
"124299 742000 329854 101 2022-07-11 17:46:48.914507+02:00 \n",
"124300 742003 329857 134 2022-07-11 18:44:55.915889+02:00 \n",
"124301 741996 329850 101 2022-07-11 16:52:37.227487+02:00 \n",
"\n",
" updated_at name extra_field \n",
"0 2022-09-29 17:55:41.083666+02:00 NaN NaN \n",
"1 2022-09-29 19:16:07.252114+02:00 NaN NaN \n",
"2 2022-09-29 19:55:10.443450+02:00 NaN NaN \n",
"3 2022-09-29 20:16:08.269407+02:00 NaN NaN \n",
"4 2022-09-29 21:03:40.541998+02:00 NaN NaN \n",
"... ... ... ... \n",
"124297 2022-07-11 18:17:09.607162+02:00 NaN NaN \n",
"124298 2022-07-11 18:44:45.636248+02:00 NaN NaN \n",
"124299 2022-07-11 17:46:48.914507+02:00 NaN NaN \n",
"124300 2022-07-11 18:44:55.915889+02:00 NaN NaN \n",
"124301 2022-07-11 16:52:37.227487+02:00 NaN NaN \n",
"\n",
"[124302 rows x 7 columns]"
]
},
"execution_count": 13,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Show the mapping table; pandas abbreviates the rendering to the first and last rows\n",
"customer_target_mappings"
]
},
{
"cell_type": "code",
"execution_count": 11,
"id": "ab3f937b-ef62-499a-8ee2-d47d1d988ace",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Index(['id', 'is_import', 'name', 'created_at', 'updated_at', 'identifier'], dtype='object')\n",
"(4, 6)\n",
"<class 'pandas.core.frame.DataFrame'>\n",
"RangeIndex: 4 entries, 0 to 3\n",
"Data columns (total 6 columns):\n",
" # Column Non-Null Count Dtype \n",
"--- ------ -------------- ----- \n",
" 0 id 4 non-null int64 \n",
" 1 is_import 4 non-null bool \n",
" 2 name 4 non-null object\n",
" 3 created_at 4 non-null object\n",
" 4 updated_at 4 non-null object\n",
" 5 identifier 4 non-null object\n",
"dtypes: bool(1), int64(1), object(4)\n",
"memory usage: 292.0+ bytes\n"
]
}
],
"source": [
"# Existing segmentation: table of target types stored on S3\n",
"FILE_PATH_S3 = 'bdc2324-data/11/11target_types.csv'\n",
"\n",
"# Open the object in binary mode and let pandas parse the comma-separated content\n",
"with fs.open(FILE_PATH_S3, \"rb\") as csv_stream:\n",
"    target_types = pd.read_csv(csv_stream, sep=\",\")\n",
"\n",
"# Structural summary: column labels, (rows, columns), then dtypes and non-null counts\n",
"print(target_types.columns, target_types.shape, sep=\"\\n\")\n",
"target_types.info()"
]
},
{
"cell_type": "code",
"execution_count": 12,
"id": "b70488b9-38fc-40a8-9e2f-3330b3f9eef5",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>id</th>\n",
" <th>is_import</th>\n",
" <th>name</th>\n",
" <th>created_at</th>\n",
" <th>updated_at</th>\n",
" <th>identifier</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>1</td>\n",
" <td>False</td>\n",
" <td>manual_static_filter</td>\n",
" <td>2021-04-29 13:42:14.111085+02:00</td>\n",
" <td>2021-04-29 13:42:14.111085+02:00</td>\n",
" <td>fb27e81baa4debc6a4e1a8639c20e808</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>3</td>\n",
" <td>True</td>\n",
" <td>manual_structure</td>\n",
" <td>2021-05-07 15:20:00.626650+02:00</td>\n",
" <td>2021-05-07 15:20:00.626650+02:00</td>\n",
" <td>382bca214204a2d3462f5ec2728d5d1e</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>6</td>\n",
" <td>False</td>\n",
" <td>manual_dynamic_filter</td>\n",
" <td>2021-09-09 14:27:47.641302+02:00</td>\n",
" <td>2021-09-09 14:27:47.641302+02:00</td>\n",
" <td>e0f4b8693184850fefd6d2a38f10584e</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>2</td>\n",
" <td>True</td>\n",
" <td>manual_import</td>\n",
" <td>2021-04-29 13:49:30.107110+02:00</td>\n",
" <td>2021-04-29 13:49:30.107110+02:00</td>\n",
" <td>12213df2ce68a624e4c0070521437bac</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" id is_import name created_at \\\n",
"0 1 False manual_static_filter 2021-04-29 13:42:14.111085+02:00 \n",
"1 3 True manual_structure 2021-05-07 15:20:00.626650+02:00 \n",
"2 6 False manual_dynamic_filter 2021-09-09 14:27:47.641302+02:00 \n",
"3 2 True manual_import 2021-04-29 13:49:30.107110+02:00 \n",
"\n",
" updated_at identifier \n",
"0 2021-04-29 13:42:14.111085+02:00 fb27e81baa4debc6a4e1a8639c20e808 \n",
"1 2021-05-07 15:20:00.626650+02:00 382bca214204a2d3462f5ec2728d5d1e \n",
"2 2021-09-09 14:27:47.641302+02:00 e0f4b8693184850fefd6d2a38f10584e \n",
"3 2021-04-29 13:49:30.107110+02:00 12213df2ce68a624e4c0070521437bac "
]
},
"execution_count": 12,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Show the whole target-type table (it only has 4 rows)\n",
"target_types"
]
},
{
"cell_type": "code",
"execution_count": 6,
"id": "c40c44a0-e7c2-4ad1-b700-0d6ea05d62b2",
"metadata": {},
"outputs": [],
"source": [
"# But : lier les caractéristiques socio-démographiques et les comportements d'achat\n"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.10.13"
}
},
"nbformat": 4,
"nbformat_minor": 5
}