diff --git a/Notebook_Fanta.ipynb b/Notebook_Fanta.ipynb index 363fcab..5fd0f44 100644 --- a/Notebook_Fanta.ipynb +++ b/Notebook_Fanta.ipynb @@ -1,6 +1,823 @@ { - "cells": [], - "metadata": {}, + "cells": [ + { + "cell_type": "markdown", + "id": "aa74dbe0-f974-4b5c-94f4-4dba9fbc64fa", + "metadata": {}, + "source": [ + "# Business Data Challenge - Team 1" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "id": "94c498e7-7c50-45f9-b3f4-a1ab19b7ccc4", + "metadata": {}, + "outputs": [], + "source": [ + "import pandas as pd\n", + "import numpy as np" + ] + }, + { + "cell_type": "markdown", + "id": "7a3b50ac-b1ff-4f3d-9938-e048fdc8e027", + "metadata": {}, + "source": [ + "Configuration de l'accès aux données" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "id": "0b029d42-fb02-481e-a407-7e41886198a6", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "['bdc2324-data/1',\n", + " 'bdc2324-data/10',\n", + " 'bdc2324-data/101',\n", + " 'bdc2324-data/11',\n", + " 'bdc2324-data/12',\n", + " 'bdc2324-data/13',\n", + " 'bdc2324-data/14',\n", + " 'bdc2324-data/2',\n", + " 'bdc2324-data/3',\n", + " 'bdc2324-data/4',\n", + " 'bdc2324-data/5',\n", + " 'bdc2324-data/6',\n", + " 'bdc2324-data/7',\n", + " 'bdc2324-data/8',\n", + " 'bdc2324-data/9']" + ] + }, + "execution_count": 2, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "import os\n", + "import s3fs\n", + "# Create filesystem object\n", + "S3_ENDPOINT_URL = \"https://\" + os.environ[\"AWS_S3_ENDPOINT\"]\n", + "fs = s3fs.S3FileSystem(client_kwargs={'endpoint_url': S3_ENDPOINT_URL})\n", + "\n", + "BUCKET = \"bdc2324-data\"\n", + "fs.ls(BUCKET)" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "id": "fbaf9aa7-ff70-4dbe-a969-b801c593510b", + "metadata": {}, + "outputs": [], + "source": [ + "# Chargement des fichiers campaign_stats.csv\n", + "FILE_PATH_S3 = 'bdc2324-data/1/1campaign_stats.csv'\n", + "\n", + "with fs.open(FILE_PATH_S3, mode=\"rb\") as file_in:\n", + " campaign_stats_1 = pd.read_csv(file_in, sep=\",\")\n", + "\n", + "FILE_PATH_S3 = 'bdc2324-data/2/2campaign_stats.csv'\n", + "\n", + "with fs.open(FILE_PATH_S3, mode=\"rb\") as file_in:\n", + " campaign_stats_2 = pd.read_csv(file_in, sep=\",\")\n", + "\n", + "FILE_PATH_S3 = 'bdc2324-data/3/3campaign_stats.csv'\n", + "\n", + "with fs.open(FILE_PATH_S3, mode=\"rb\") as file_in:\n", + " campaign_stats_3 = pd.read_csv(file_in, sep=\",\")" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "id": "1e0418bc-8e97-4a04-b7f3-bda3bef7d36e", + "metadata": {}, + "outputs": [], + "source": [ + "# Conversion des dates 'sent_at'\n", + "campaign_stats_1['sent_at'] = pd.to_datetime(campaign_stats_1['sent_at'], format = 'ISO8601', utc = True)\n", + "campaign_stats_2['sent_at'] = pd.to_datetime(campaign_stats_2['sent_at'], format = 'ISO8601', utc = True)\n", + "campaign_stats_3['sent_at'] = pd.to_datetime(campaign_stats_3['sent_at'], format = 'ISO8601', utc = True)" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "id": "cc5c20ba-e827-4e5a-97a5-7f3947e0621c", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "2023-11-09 18:10:45+00:00\n", + "2020-06-02 08:24:08+00:00\n", + "2023-10-12 01:39:48+00:00\n", + "2023-10-10 17:06:29+00:00\n", + "2023-11-01 09:20:48+00:00\n", + "2021-03-31 14:59:02+00:00\n" + ] + } + ], + "source": [ + "# Chaque unites correspond à une période ? --> Non, les dossiers ont juste pour but de réduire la taille des fichiers\n", + "print(campaign_stats_1['sent_at'].max())\n", + "print(campaign_stats_1['sent_at'].min())\n", + "\n", + "print(campaign_stats_2['sent_at'].max())\n", + "print(campaign_stats_2['sent_at'].min())\n", + "\n", + "print(campaign_stats_3['sent_at'].max())\n", + "print(campaign_stats_3['sent_at'].min())" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "id": "c75632df-b018-4bb8-a99d-83f15af94369", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "0 2021-03-28 16:01:09+00:00\n", + "1 2021-03-28 16:01:09+00:00\n", + "2 2021-03-28 16:00:59+00:00\n", + "3 2021-03-28 16:00:59+00:00\n", + "4 2021-03-28 16:01:06+00:00\n", + " ... \n", + "6214803 2023-10-23 09:32:33+00:00\n", + "6214804 2023-10-23 09:32:49+00:00\n", + "6214805 2023-10-23 09:33:28+00:00\n", + "6214806 2023-10-23 09:31:53+00:00\n", + "6214807 2023-10-23 09:33:54+00:00\n", + "Name: sent_at, Length: 6214808, dtype: datetime64[ns, UTC]" + ] + }, + "execution_count": 7, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "campaign_stats_1['sent_at']" + ] + }, + { + "cell_type": "markdown", + "id": "f4c0c63e-0418-4cfe-a57d-7af57bca0c22", + "metadata": {}, + "source": [ + "### Customersplus.csv" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "id": "d3bf880d-1065-4d5b-9954-1830aa5081af", + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/tmp/ipykernel_1362/4118060109.py:9: DtypeWarning: Columns (20) have mixed types. Specify dtype option on import or set low_memory=False.\n", + " customers_plus_2 = pd.read_csv(file_in, sep=\",\")\n" + ] + } + ], + "source": [ + "FILE_PATH_S3 = 'bdc2324-data/1/1customersplus.csv'\n", + "\n", + "with fs.open(FILE_PATH_S3, mode=\"rb\") as file_in:\n", + " customers_plus_1 = pd.read_csv(file_in, sep=\",\")\n", + "\n", + "FILE_PATH_S3 = 'bdc2324-data/2/2customersplus.csv'\n", + "\n", + "with fs.open(FILE_PATH_S3, mode=\"rb\") as file_in:\n", + " customers_plus_2 = pd.read_csv(file_in, sep=\",\")" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "id": "7368f381-db8e-4a4d-9fe2-5947eb55be58", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "Index(['id', 'lastname', 'firstname', 'birthdate', 'email', 'street_id',\n", + " 'created_at', 'updated_at', 'civility', 'is_partner', 'extra',\n", + " 'deleted_at', 'reference', 'gender', 'is_email_true', 'extra_field',\n", + " 'identifier', 'opt_in', 'structure_id', 'note', 'profession',\n", + " 'language', 'mcp_contact_id', 'need_reload', 'last_buying_date',\n", + " 'max_price', 'ticket_sum', 'average_price', 'fidelity',\n", + " 'average_purchase_delay', 'average_price_basket',\n", + " 'average_ticket_basket', 'total_price', 'preferred_category',\n", + " 'preferred_supplier', 'preferred_formula', 'purchase_count',\n", + " 'first_buying_date', 'last_visiting_date', 'zipcode', 'country', 'age',\n", + " 'tenant_id'],\n", + " dtype='object')" + ] + }, + "execution_count": 10, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "customers_plus_1.columns" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "08091935-b159-47fa-806c-e1444f3b227e", + "metadata": {}, + "outputs": [], + "source": [ + "customers_plus_1.shape" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "9f8c8868-c1ac-4cee-af08-533d928f6764", + "metadata": {}, + "outputs": [], + "source": [ + "customers_plus_1['id'].nunique()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "bf95daf2-4852-4718-b474-207a1ebd8ac4", + "metadata": {}, + "outputs": [], + "source": [ + "customers_plus_2['id'].nunique()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "1425c385-3216-4e4f-ae8f-a121624721ba", + "metadata": {}, + "outputs": [], + "source": [ + "common_id = set(customers_plus_2['id']).intersection(customers_plus_1['id'])" + ] + }, + { + "cell_type": "code", + "execution_count": 61, + "id": "92533026-e27c-4f1f-81ca-64eda32a34c0", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "1" + ] + }, + "execution_count": 61, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "common_id = set(customers_plus_2['id']).intersection(customers_plus_1['id'])\n", + "# Exemple id commun = caractéristiques communes\n", + "print(customers_plus_2[customers_plus_2['id'] == list(common_id)[0]])\n", + "\n", + "print(customers_plus_1[customers_plus_1['id'] == list(common_id)[0]])" + ] + }, + { + "cell_type": "code", + "execution_count": 49, + "id": "bf9ebc94-0ba6-443d-8e53-22477a6e79a7", + "metadata": { + "scrolled": true + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "id 0.000000\n", + "lastname 43.461341\n", + "firstname 44.995588\n", + "birthdate 96.419870\n", + "email 8.622075\n", + "street_id 0.000000\n", + "created_at 0.000000\n", + "updated_at 0.000000\n", + "civility 100.000000\n", + "is_partner 0.000000\n", + "extra 100.000000\n", + "deleted_at 100.000000\n", + "reference 100.000000\n", + "gender 0.000000\n", + "is_email_true 0.000000\n", + "extra_field 100.000000\n", + "identifier 0.000000\n", + "opt_in 0.000000\n", + "structure_id 88.072380\n", + "note 99.403421\n", + "profession 95.913503\n", + "language 99.280945\n", + "mcp_contact_id 34.876141\n", + "need_reload 0.000000\n", + "last_buying_date 51.653431\n", + "max_price 51.653431\n", + "ticket_sum 0.000000\n", + "average_price 8.639195\n", + "fidelity 0.000000\n", + "average_purchase_delay 51.653431\n", + "average_price_basket 51.653431\n", + "average_ticket_basket 51.653431\n", + "total_price 43.014236\n", + "preferred_category 100.000000\n", + "preferred_supplier 100.000000\n", + "preferred_formula 100.000000\n", + "purchase_count 0.000000\n", + "first_buying_date 51.653431\n", + "last_visiting_date 100.000000\n", + "zipcode 71.176564\n", + "country 5.459418\n", + "age 96.419870\n", + "tenant_id 0.000000\n", + "dtype: float64\n" + ] + } + ], + "source": [ + "pd.DataFrame(customers_plus_1.isna().mean()*100)" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "id": "6d62e73f-3925-490f-9fd4-d0e838903cb2", + "metadata": {}, + "outputs": [], + "source": [ + "# Chargement de toutes les données\n", + "liste_base = ['customer_target_mappings', 'customersplus', 'target_types', 'tags', 'events', 'tickets', 'representations', 'purchases', 'products']\n", + "\n", + "for nom_base in liste_base:\n", + " FILE_PATH_S3 = 'bdc2324-data/11/11' + nom_base + '.csv'\n", + " with fs.open(FILE_PATH_S3, mode=\"rb\") as file_in:\n", + " globals()[nom_base] = pd.read_csv(file_in, sep=\",\")" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "id": "12b24f1c-eb3e-45be-aaf3-b9273180caa3", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + " | id | \n", + "lastname | \n", + "firstname | \n", + "birthdate | \n", + "street_id | \n", + "created_at | \n", + "updated_at | \n", + "civility | \n", + "is_partner | \n", + "... | \n", + "tenant_id | \n", + "id_x | \n", + "customer_id | \n", + "purchase_date | \n", + "type_of | \n", + "is_from_subscription | \n", + "amount | \n", + "is_full_price | \n", + "start_date_time | \n", + "event_name | \n", + "|
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | \n", + "405082 | \n", + "lastname405082 | \n", + "NaN | \n", + "NaN | \n", + "NaN | \n", + "6 | \n", + "2023-01-12 06:30:31.197484+01:00 | \n", + "2023-01-12 06:30:31.197484+01:00 | \n", + "NaN | \n", + "False | \n", + "... | \n", + "1556 | \n", + "992423 | \n", + "405082 | \n", + "2023-01-11 17:08:41+01:00 | \n", + "3 | \n", + "False | \n", + "13.0 | \n", + "False | \n", + "2023-02-06 20:00:00+01:00 | \n", + "zaide | \n", + "
1 | \n", + "405082 | \n", + "lastname405082 | \n", + "NaN | \n", + "NaN | \n", + "NaN | \n", + "6 | \n", + "2023-01-12 06:30:31.197484+01:00 | \n", + "2023-01-12 06:30:31.197484+01:00 | \n", + "NaN | \n", + "False | \n", + "... | \n", + "1556 | \n", + "992423 | \n", + "405082 | \n", + "2023-01-11 17:08:41+01:00 | \n", + "3 | \n", + "False | \n", + "13.0 | \n", + "False | \n", + "2023-02-06 20:00:00+01:00 | \n", + "zaide | \n", + "
2 | \n", + "411168 | \n", + "lastname411168 | \n", + "NaN | \n", + "NaN | \n", + "NaN | \n", + "6 | \n", + "2023-03-17 06:30:35.431967+01:00 | \n", + "2023-03-17 06:30:35.431967+01:00 | \n", + "NaN | \n", + "False | \n", + "... | \n", + "1556 | \n", + "1053934 | \n", + "411168 | \n", + "2023-03-16 16:23:10+01:00 | \n", + "3 | \n", + "False | \n", + "62.0 | \n", + "False | \n", + "2023-03-19 16:00:00+01:00 | \n", + "luisa miller | \n", + "
3 | \n", + "411168 | \n", + "lastname411168 | \n", + "NaN | \n", + "NaN | \n", + "NaN | \n", + "6 | \n", + "2023-03-17 06:30:35.431967+01:00 | \n", + "2023-03-17 06:30:35.431967+01:00 | \n", + "NaN | \n", + "False | \n", + "... | \n", + "1556 | \n", + "1053934 | \n", + "411168 | \n", + "2023-03-16 16:23:10+01:00 | \n", + "3 | \n", + "False | \n", + "62.0 | \n", + "False | \n", + "2023-03-19 16:00:00+01:00 | \n", + "luisa miller | \n", + "
4 | \n", + "4380 | \n", + "lastname4380 | \n", + "firstname4380 | \n", + "NaN | \n", + "NaN | \n", + "1 | \n", + "2021-04-22 14:51:55.432952+02:00 | \n", + "2022-04-14 11:41:33.738500+02:00 | \n", + "NaN | \n", + "False | \n", + "... | \n", + "1556 | \n", + "1189141 | \n", + "4380 | \n", + "2020-11-26 13:12:53+01:00 | \n", + "3 | \n", + "False | \n", + "51.3 | \n", + "False | \n", + "2020-12-01 20:00:00+01:00 | \n", + "iphigenie en tauride | \n", + "
... | \n", + "... | \n", + "... | \n", + "... | \n", + "... | \n", + "... | \n", + "... | \n", + "... | \n", + "... | \n", + "... | \n", + "... | \n", + "... | \n", + "... | \n", + "... | \n", + "... | \n", + "... | \n", + "... | \n", + "... | \n", + "... | \n", + "... | \n", + "... | \n", + "... | \n", + "
318964 | \n", + "19095 | \n", + "lastname19095 | \n", + "firstname19095 | \n", + "1979-07-16 | \n", + "email19095 | \n", + "6 | \n", + "2021-04-22 15:06:30.120537+02:00 | \n", + "2023-09-12 18:27:36.904104+02:00 | \n", + "NaN | \n", + "False | \n", + "... | \n", + "1556 | \n", + "1090839 | \n", + "19095 | \n", + "2019-05-19 21:18:36+02:00 | \n", + "1 | \n", + "False | \n", + "4.5 | \n", + "False | \n", + "2019-05-27 20:00:00+02:00 | \n", + "entre femmes | \n", + "
318965 | \n", + "19095 | \n", + "lastname19095 | \n", + "firstname19095 | \n", + "1979-07-16 | \n", + "email19095 | \n", + "6 | \n", + "2021-04-22 15:06:30.120537+02:00 | \n", + "2023-09-12 18:27:36.904104+02:00 | \n", + "NaN | \n", + "False | \n", + "... | \n", + "1556 | \n", + "1090839 | \n", + "19095 | \n", + "2019-05-19 21:18:36+02:00 | \n", + "1 | \n", + "False | \n", + "4.5 | \n", + "False | \n", + "2019-05-27 20:00:00+02:00 | \n", + "entre femmes | \n", + "
318966 | \n", + "19095 | \n", + "lastname19095 | \n", + "firstname19095 | \n", + "1979-07-16 | \n", + "email19095 | \n", + "6 | \n", + "2021-04-22 15:06:30.120537+02:00 | \n", + "2023-09-12 18:27:36.904104+02:00 | \n", + "NaN | \n", + "False | \n", + "... | \n", + "1556 | \n", + "1090839 | \n", + "19095 | \n", + "2019-05-19 21:18:36+02:00 | \n", + "1 | \n", + "False | \n", + "4.5 | \n", + "False | \n", + "2019-05-27 20:00:00+02:00 | \n", + "entre femmes | \n", + "
318967 | \n", + "19095 | \n", + "lastname19095 | \n", + "firstname19095 | \n", + "1979-07-16 | \n", + "email19095 | \n", + "6 | \n", + "2021-04-22 15:06:30.120537+02:00 | \n", + "2023-09-12 18:27:36.904104+02:00 | \n", + "NaN | \n", + "False | \n", + "... | \n", + "1556 | \n", + "1244277 | \n", + "19095 | \n", + "2019-12-31 11:04:07+01:00 | \n", + "1 | \n", + "False | \n", + "5.5 | \n", + "False | \n", + "2020-02-03 20:00:00+01:00 | \n", + "a boire et a manger | \n", + "
318968 | \n", + "19095 | \n", + "lastname19095 | \n", + "firstname19095 | \n", + "1979-07-16 | \n", + "email19095 | \n", + "6 | \n", + "2021-04-22 15:06:30.120537+02:00 | \n", + "2023-09-12 18:27:36.904104+02:00 | \n", + "NaN | \n", + "False | \n", + "... | \n", + "1556 | \n", + "1244277 | \n", + "19095 | \n", + "2019-12-31 11:04:07+01:00 | \n", + "1 | \n", + "False | \n", + "5.5 | \n", + "False | \n", + "2020-02-03 20:00:00+01:00 | \n", + "a boire et a manger | \n", + "
318969 rows × 52 columns
\n", + "