diff --git a/Clean-Notebook.ipynb b/Clean-Notebook.ipynb new file mode 100644 index 0000000..99ea3e5 --- /dev/null +++ b/Clean-Notebook.ipynb @@ -0,0 +1,511 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "56b3d44e-1e3f-4726-9916-0f9af107860e", + "metadata": {}, + "source": [ + "# Business Data Challenge - Team 1" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "id": "15103481-8d74-404c-aa09-7601fe7730da", + "metadata": {}, + "outputs": [], + "source": [ + "import pandas as pd\n", + "import numpy as np" + ] + }, + { + "cell_type": "markdown", + "id": "c3bb0d13-34b2-4e1c-9985-468cd87c5a0e", + "metadata": {}, + "source": [ + "Configuration de l'accès aux données" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "id": "5d83bb1a-d341-446e-91f6-1c428607f6d4", + "metadata": {}, + "outputs": [], + "source": [ + "import os\n", + "import s3fs\n", + "# Create filesystem object\n", + "S3_ENDPOINT_URL = \"https://\" + os.environ[\"AWS_S3_ENDPOINT\"]\n", + "fs = s3fs.S3FileSystem(client_kwargs={'endpoint_url': S3_ENDPOINT_URL})" + ] + }, + { + "cell_type": "markdown", + "id": "f99da24f-0d93-4618-92bc-3ba81dc0445c", + "metadata": {}, + "source": [ + "## Exemple sur bdc2324-data/11" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "id": "699664b9-eee4-4f8d-a207-e524526560c5", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "['bdc2324-data/11/11campaign_stats.csv',\n", + " 'bdc2324-data/11/11campaigns.csv',\n", + " 'bdc2324-data/11/11categories.csv',\n", + " 'bdc2324-data/11/11countries.csv',\n", + " 'bdc2324-data/11/11currencies.csv',\n", + " 'bdc2324-data/11/11customer_target_mappings.csv',\n", + " 'bdc2324-data/11/11customersplus.csv',\n", + " 'bdc2324-data/11/11event_types.csv',\n", + " 'bdc2324-data/11/11events.csv',\n", + " 'bdc2324-data/11/11facilities.csv',\n", + " 'bdc2324-data/11/11link_stats.csv',\n", + " 'bdc2324-data/11/11pricing_formulas.csv',\n", + " 'bdc2324-data/11/11product_packs.csv',\n", + " 
'bdc2324-data/11/11products.csv',\n", + " 'bdc2324-data/11/11products_groups.csv',\n", + " 'bdc2324-data/11/11purchases.csv',\n", + " 'bdc2324-data/11/11representation_category_capacities.csv',\n", + " 'bdc2324-data/11/11representations.csv',\n", + " 'bdc2324-data/11/11seasons.csv',\n", + " 'bdc2324-data/11/11structure_tag_mappings.csv',\n", + " 'bdc2324-data/11/11suppliers.csv',\n", + " 'bdc2324-data/11/11tags.csv',\n", + " 'bdc2324-data/11/11target_types.csv',\n", + " 'bdc2324-data/11/11targets.csv',\n", + " 'bdc2324-data/11/11tickets.csv']" + ] + }, + "execution_count": 5, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "BUCKET = \"bdc2324-data/11\"\n", + "fs.ls(BUCKET)" + ] + }, + { + "cell_type": "markdown", + "id": "779da86b-ac61-4c61-88d2-fa1c0c19efce", + "metadata": {}, + "source": [ + "## Type de client au global" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "id": "d22aa131-5069-43d4-a42e-24f38cc7240d", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Index(['id', 'customer_id', 'target_id', 'created_at', 'updated_at', 'name',\n", + " 'extra_field'],\n", + " dtype='object')\n", + "(124302, 7)\n", + "\n", + "RangeIndex: 124302 entries, 0 to 124301\n", + "Data columns (total 7 columns):\n", + " # Column Non-Null Count Dtype \n", + "--- ------ -------------- ----- \n", + " 0 id 124302 non-null int64 \n", + " 1 customer_id 124302 non-null int64 \n", + " 2 target_id 124302 non-null int64 \n", + " 3 created_at 124296 non-null object \n", + " 4 updated_at 124296 non-null object \n", + " 5 name 0 non-null float64\n", + " 6 extra_field 0 non-null float64\n", + "dtypes: float64(2), int64(3), object(2)\n", + "memory usage: 6.6+ MB\n" + ] + } + ], + "source": [ + "# Segmentation existante\n", + "FILE_PATH_S3 = 'bdc2324-data/11/11customer_target_mappings.csv'\n", + "\n", + "with fs.open(FILE_PATH_S3, mode=\"rb\") as file_in:\n", + " customer_target_mappings = 
pd.read_csv(file_in, sep=\",\")\n", + "\n", + "print(customer_target_mappings.columns)\n", + "print(customer_target_mappings.shape)\n", + "customer_target_mappings.info()" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "id": "967b20e2-5a30-4724-989f-b9e39c7c67e7", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
idcustomer_idtarget_idcreated_atupdated_atnameextra_field
07938893441511012022-09-29 17:55:41.083666+02:002022-09-29 17:55:41.083666+02:00NaNNaN
17938903441521012022-09-29 19:16:07.252114+02:002022-09-29 19:16:07.252114+02:00NaNNaN
27938913441531012022-09-29 19:55:10.443450+02:002022-09-29 19:55:10.443450+02:00NaNNaN
37938923441541012022-09-29 20:16:08.269407+02:002022-09-29 20:16:08.269407+02:00NaNNaN
47938933441551012022-09-29 21:03:40.541998+02:002022-09-29 21:03:40.541998+02:00NaNNaN
........................
1242977420013298551012022-07-11 18:17:09.607162+02:002022-07-11 18:17:09.607162+02:00NaNNaN
1242987420023298561012022-07-11 18:44:45.636248+02:002022-07-11 18:44:45.636248+02:00NaNNaN
1242997420003298541012022-07-11 17:46:48.914507+02:002022-07-11 17:46:48.914507+02:00NaNNaN
1243007420033298571342022-07-11 18:44:55.915889+02:002022-07-11 18:44:55.915889+02:00NaNNaN
1243017419963298501012022-07-11 16:52:37.227487+02:002022-07-11 16:52:37.227487+02:00NaNNaN
\n", + "

124302 rows × 7 columns

\n", + "
" + ], + "text/plain": [ + " id customer_id target_id created_at \\\n", + "0 793889 344151 101 2022-09-29 17:55:41.083666+02:00 \n", + "1 793890 344152 101 2022-09-29 19:16:07.252114+02:00 \n", + "2 793891 344153 101 2022-09-29 19:55:10.443450+02:00 \n", + "3 793892 344154 101 2022-09-29 20:16:08.269407+02:00 \n", + "4 793893 344155 101 2022-09-29 21:03:40.541998+02:00 \n", + "... ... ... ... ... \n", + "124297 742001 329855 101 2022-07-11 18:17:09.607162+02:00 \n", + "124298 742002 329856 101 2022-07-11 18:44:45.636248+02:00 \n", + "124299 742000 329854 101 2022-07-11 17:46:48.914507+02:00 \n", + "124300 742003 329857 134 2022-07-11 18:44:55.915889+02:00 \n", + "124301 741996 329850 101 2022-07-11 16:52:37.227487+02:00 \n", + "\n", + " updated_at name extra_field \n", + "0 2022-09-29 17:55:41.083666+02:00 NaN NaN \n", + "1 2022-09-29 19:16:07.252114+02:00 NaN NaN \n", + "2 2022-09-29 19:55:10.443450+02:00 NaN NaN \n", + "3 2022-09-29 20:16:08.269407+02:00 NaN NaN \n", + "4 2022-09-29 21:03:40.541998+02:00 NaN NaN \n", + "... ... ... ... 
\n", + "124297 2022-07-11 18:17:09.607162+02:00 NaN NaN \n", + "124298 2022-07-11 18:44:45.636248+02:00 NaN NaN \n", + "124299 2022-07-11 17:46:48.914507+02:00 NaN NaN \n", + "124300 2022-07-11 18:44:55.915889+02:00 NaN NaN \n", + "124301 2022-07-11 16:52:37.227487+02:00 NaN NaN \n", + "\n", + "[124302 rows x 7 columns]" + ] + }, + "execution_count": 13, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "customer_target_mappings" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "id": "ab3f937b-ef62-499a-8ee2-d47d1d988ace", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Index(['id', 'is_import', 'name', 'created_at', 'updated_at', 'identifier'], dtype='object')\n", + "(4, 6)\n", + "\n", + "RangeIndex: 4 entries, 0 to 3\n", + "Data columns (total 6 columns):\n", + " # Column Non-Null Count Dtype \n", + "--- ------ -------------- ----- \n", + " 0 id 4 non-null int64 \n", + " 1 is_import 4 non-null bool \n", + " 2 name 4 non-null object\n", + " 3 created_at 4 non-null object\n", + " 4 updated_at 4 non-null object\n", + " 5 identifier 4 non-null object\n", + "dtypes: bool(1), int64(1), object(4)\n", + "memory usage: 292.0+ bytes\n" + ] + } + ], + "source": [ + "# Segmentation existante\n", + "FILE_PATH_S3 = 'bdc2324-data/11/11target_types.csv'\n", + "\n", + "with fs.open(FILE_PATH_S3, mode=\"rb\") as file_in:\n", + " target_types = pd.read_csv(file_in, sep=\",\")\n", + "\n", + "print(target_types.columns)\n", + "print(target_types.shape)\n", + "target_types.info()" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "id": "b70488b9-38fc-40a8-9e2f-3330b3f9eef5", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
idis_importnamecreated_atupdated_atidentifier
01Falsemanual_static_filter2021-04-29 13:42:14.111085+02:002021-04-29 13:42:14.111085+02:00fb27e81baa4debc6a4e1a8639c20e808
13Truemanual_structure2021-05-07 15:20:00.626650+02:002021-05-07 15:20:00.626650+02:00382bca214204a2d3462f5ec2728d5d1e
26Falsemanual_dynamic_filter2021-09-09 14:27:47.641302+02:002021-09-09 14:27:47.641302+02:00e0f4b8693184850fefd6d2a38f10584e
32Truemanual_import2021-04-29 13:49:30.107110+02:002021-04-29 13:49:30.107110+02:0012213df2ce68a624e4c0070521437bac
\n", + "
" + ], + "text/plain": [ + " id is_import name created_at \\\n", + "0 1 False manual_static_filter 2021-04-29 13:42:14.111085+02:00 \n", + "1 3 True manual_structure 2021-05-07 15:20:00.626650+02:00 \n", + "2 6 False manual_dynamic_filter 2021-09-09 14:27:47.641302+02:00 \n", + "3 2 True manual_import 2021-04-29 13:49:30.107110+02:00 \n", + "\n", + " updated_at identifier \n", + "0 2021-04-29 13:42:14.111085+02:00 fb27e81baa4debc6a4e1a8639c20e808 \n", + "1 2021-05-07 15:20:00.626650+02:00 382bca214204a2d3462f5ec2728d5d1e \n", + "2 2021-09-09 14:27:47.641302+02:00 e0f4b8693184850fefd6d2a38f10584e \n", + "3 2021-04-29 13:49:30.107110+02:00 12213df2ce68a624e4c0070521437bac " + ] + }, + "execution_count": 12, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "target_types" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "id": "c40c44a0-e7c2-4ad1-b700-0d6ea05d62b2", + "metadata": {}, + "outputs": [], + "source": [ + "# But : lier les caractéristiques socio-demo et les comportements d'achat\n" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.13" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +}