{ "cells": [
 { "cell_type": "markdown", "id": "aa74dbe0-f974-4b5c-94f4-4dba9fbc64fa", "metadata": {}, "source": [ "# Business Data Challenge - Team 1" ] },
 { "cell_type": "code", "execution_count": 1, "id": "94c498e7-7c50-45f9-b3f4-a1ab19b7ccc4", "metadata": {}, "outputs": [], "source": [
  "# All imports live in this cell: standard library first, then third-party\n",
  "import os\n",
  "\n",
  "import numpy as np\n",
  "import pandas as pd\n",
  "import s3fs"
 ] },
 { "cell_type": "markdown", "id": "7a3b50ac-b1ff-4f3d-9938-e048fdc8e027", "metadata": {}, "source": [ "Configuration de l'accès aux données" ] },
 { "cell_type": "code", "execution_count": 2, "id": "0b029d42-fb02-481e-a407-7e41886198a6", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "['bdc2324-data/1',\n", " 'bdc2324-data/10',\n", " 'bdc2324-data/101',\n", " 'bdc2324-data/11',\n", " 'bdc2324-data/12',\n", " 'bdc2324-data/13',\n", " 'bdc2324-data/14',\n", " 'bdc2324-data/2',\n", " 'bdc2324-data/3',\n", " 'bdc2324-data/4',\n", " 'bdc2324-data/5',\n", " 'bdc2324-data/6',\n", " 'bdc2324-data/7',\n", " 'bdc2324-data/8',\n", " 'bdc2324-data/9']" ] }, "execution_count": 2, "metadata": {}, "output_type": "execute_result" } ], "source": [
  "# Create the S3 filesystem object; the endpoint host is read from the environment\n",
  "S3_ENDPOINT_URL = \"https://\" + os.environ[\"AWS_S3_ENDPOINT\"]\n",
  "fs = s3fs.S3FileSystem(client_kwargs={'endpoint_url': S3_ENDPOINT_URL})\n",
  "\n",
  "BUCKET = \"bdc2324-data\"\n",
  "\n",
  "\n",
  "def load_csv_from_s3(path, **read_csv_kwargs):\n",
  "    \"\"\"Read a comma-separated CSV file stored on S3 into a DataFrame.\n",
  "\n",
  "    Parameters\n",
  "    ----------\n",
  "    path : str\n",
  "        Location of the file, written as 'bucket/key'.\n",
  "    **read_csv_kwargs\n",
  "        Extra keyword arguments forwarded to pandas.read_csv.\n",
  "\n",
  "    Returns\n",
  "    -------\n",
  "    pandas.DataFrame\n",
  "    \"\"\"\n",
  "    with fs.open(path, mode=\"rb\") as file_in:\n",
  "        return pd.read_csv(file_in, sep=\",\", **read_csv_kwargs)\n",
  "\n",
  "\n",
  "# List the top-level folders of the bucket\n",
  "fs.ls(BUCKET)"
 ] },
 { "cell_type": "code", "execution_count": 4, "id": "fbaf9aa7-ff70-4dbe-a969-b801c593510b", "metadata": {}, "outputs": [], "source": [
  "# Load the campaign_stats.csv file of the first three data folders\n",
  "campaign_stats_1 = load_csv_from_s3(f'{BUCKET}/1/1campaign_stats.csv')\n",
  "campaign_stats_2 = load_csv_from_s3(f'{BUCKET}/2/2campaign_stats.csv')\n",
  "campaign_stats_3 = load_csv_from_s3(f'{BUCKET}/3/3campaign_stats.csv')"
 ] },
 { "cell_type": "code", "execution_count": 5, "id": "1e0418bc-8e97-4a04-b7f3-bda3bef7d36e", "metadata": {}, "outputs": [], "source": [
  "# Parse the 'sent_at' column (ISO 8601 strings) into timezone-aware UTC datetimes\n",
  "for stats_df in (campaign_stats_1, campaign_stats_2, campaign_stats_3):\n",
  "    stats_df['sent_at'] = pd.to_datetime(stats_df['sent_at'], format='ISO8601', utc=True)"
 ] },
 { "cell_type": "code", "execution_count": 6, "id": "cc5c20ba-e827-4e5a-97a5-7f3947e0621c", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "2023-11-09 18:10:45+00:00\n", "2020-06-02 08:24:08+00:00\n", "2023-10-12 01:39:48+00:00\n", "2023-10-10 17:06:29+00:00\n", "2023-11-01 09:20:48+00:00\n", "2021-03-31 14:59:02+00:00\n" ] } ], "source": [
  "# Does each folder correspond to a time period? --> No, the folders only exist to keep the files small\n",
  "# Print the latest, then the earliest, 'sent_at' of each folder\n",
  "for stats_df in (campaign_stats_1, campaign_stats_2, campaign_stats_3):\n",
  "    print(stats_df['sent_at'].max())\n",
  "    print(stats_df['sent_at'].min())"
 ] },
 { "cell_type": "code", "execution_count": 7, "id": "c75632df-b018-4bb8-a99d-83f15af94369", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "0 2021-03-28 16:01:09+00:00\n", "1 2021-03-28 16:01:09+00:00\n", "2 2021-03-28 16:00:59+00:00\n", "3 2021-03-28 16:00:59+00:00\n", "4 2021-03-28 16:01:06+00:00\n", " ... 
\n", "6214803 2023-10-23 09:32:33+00:00\n", "6214804 2023-10-23 09:32:49+00:00\n", "6214805 2023-10-23 09:33:28+00:00\n", "6214806 2023-10-23 09:31:53+00:00\n", "6214807 2023-10-23 09:33:54+00:00\n", "Name: sent_at, Length: 6214808, dtype: datetime64[ns, UTC]" ] }, "execution_count": 7, "metadata": {}, "output_type": "execute_result" } ], "source": [ "campaign_stats_1['sent_at']" ] },
 { "cell_type": "markdown", "id": "f4c0c63e-0418-4cfe-a57d-7af57bca0c22", "metadata": {}, "source": [ "### Customersplus.csv" ] },
 { "cell_type": "code", "execution_count": 8, "id": "d3bf880d-1065-4d5b-9954-1830aa5081af", "metadata": {}, "outputs": [], "source": [
  "def load_customersplus(directory):\n",
  "    \"\"\"Load the customersplus.csv file of one data folder of the bucket.\n",
  "\n",
  "    `directory` is the folder number; it is also the prefix of the file name\n",
  "    (folder 1 holds '1customersplus.csv').\n",
  "    \"\"\"\n",
  "    path = f'{BUCKET}/{directory}/{directory}customersplus.csv'\n",
  "    with fs.open(path, mode=\"rb\") as file_in:\n",
  "        # low_memory=False is the remedy suggested by the DtypeWarning that the\n",
  "        # default chunked parsing raised for column 20 (mixed types) of folder 2\n",
  "        return pd.read_csv(file_in, sep=\",\", low_memory=False)\n",
  "\n",
  "\n",
  "customers_plus_1 = load_customersplus(1)\n",
  "customers_plus_2 = load_customersplus(2)"
 ] },
 { "cell_type": "code", "execution_count": 10, "id": "7368f381-db8e-4a4d-9fe2-5947eb55be58", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "Index(['id', 'lastname', 'firstname', 'birthdate', 'email', 'street_id',\n", " 'created_at', 'updated_at', 'civility', 'is_partner', 'extra',\n", " 'deleted_at', 'reference', 'gender', 'is_email_true', 'extra_field',\n", " 'identifier', 'opt_in', 'structure_id', 'note', 'profession',\n", " 'language', 'mcp_contact_id', 'need_reload', 'last_buying_date',\n", " 'max_price', 'ticket_sum', 'average_price', 'fidelity',\n", " 'average_purchase_delay', 'average_price_basket',\n", " 'average_ticket_basket', 'total_price', 'preferred_category',\n", " 'preferred_supplier', 
'preferred_formula', 'purchase_count',\n", " 'first_buying_date', 'last_visiting_date', 'zipcode', 'country', 'age',\n", " 'tenant_id'],\n", " dtype='object')" ] }, "execution_count": 10, "metadata": {}, "output_type": "execute_result" } ], "source": [ "customers_plus_1.columns" ] },
 { "cell_type": "code", "execution_count": null, "id": "08091935-b159-47fa-806c-e1444f3b227e", "metadata": {}, "outputs": [], "source": [ "customers_plus_1.shape" ] },
 { "cell_type": "code", "execution_count": null, "id": "9f8c8868-c1ac-4cee-af08-533d928f6764", "metadata": {}, "outputs": [], "source": [ "customers_plus_1['id'].nunique()" ] },
 { "cell_type": "code", "execution_count": null, "id": "bf95daf2-4852-4718-b474-207a1ebd8ac4", "metadata": {}, "outputs": [], "source": [ "customers_plus_2['id'].nunique()" ] },
 { "cell_type": "code", "execution_count": null, "id": "1425c385-3216-4e4f-ae8f-a121624721ba", "metadata": {}, "outputs": [], "source": [
  "# Customer ids that appear in both customersplus files\n",
  "common_id = set(customers_plus_2['id']).intersection(customers_plus_1['id'])"
 ] },
 { "cell_type": "code", "execution_count": 61, "id": "92533026-e27c-4f1f-81ca-64eda32a34c0", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "1" ] }, "execution_count": 61, "metadata": {}, "output_type": "execute_result" } ], "source": [
  "# Example of a shared id: do both files describe the same customer?\n",
  "# The guard avoids the IndexError that indexing an empty set raised.\n",
  "if common_id:\n",
  "    example_id = next(iter(common_id))\n",
  "    display(customers_plus_2[customers_plus_2['id'] == example_id])\n",
  "    display(customers_plus_1[customers_plus_1['id'] == example_id])\n",
  "\n",
  "# Number of ids shared by the two files\n",
  "len(common_id)"
 ] },
 { "cell_type": "code", "execution_count": 49, "id": "bf9ebc94-0ba6-443d-8e53-22477a6e79a7", "metadata": { "scrolled": true }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "id 0.000000\n", "lastname 43.461341\n", "firstname 44.995588\n", "birthdate 96.419870\n", "email 8.622075\n", "street_id 0.000000\n", "created_at 0.000000\n", "updated_at 0.000000\n", "civility 100.000000\n", "is_partner 0.000000\n", "extra 100.000000\n", "deleted_at 100.000000\n", "reference 100.000000\n", "gender 0.000000\n", "is_email_true 0.000000\n", "extra_field 100.000000\n", "identifier 0.000000\n", "opt_in 0.000000\n", "structure_id 88.072380\n", "note 99.403421\n", "profession 95.913503\n", "language 99.280945\n", "mcp_contact_id 34.876141\n", "need_reload 0.000000\n", "last_buying_date 51.653431\n", "max_price 51.653431\n", "ticket_sum 0.000000\n", "average_price 8.639195\n", "fidelity 0.000000\n", "average_purchase_delay 51.653431\n", "average_price_basket 51.653431\n", "average_ticket_basket 51.653431\n", "total_price 43.014236\n", "preferred_category 100.000000\n", "preferred_supplier 100.000000\n", "preferred_formula 100.000000\n", "purchase_count 0.000000\n", "first_buying_date 51.653431\n", "last_visiting_date 100.000000\n", "zipcode 71.176564\n", "country 5.459418\n", "age 96.419870\n", "tenant_id 0.000000\n", "dtype: float64\n" ] } ], "source": [
  "# Percentage of missing values in each column of customers_plus_1\n",
  "pd.DataFrame(customers_plus_1.isna().mean()*100)"
 ] },
 { "cell_type": "code", "execution_count": 11, "id": "6d62e73f-3925-490f-9fd4-d0e838903cb2", "metadata": {}, "outputs": [], "source": [
  "# Load every table of data folder 11\n",
  "liste_base = ['customer_target_mappings', 'customersplus', 'target_types', 'tags', 'events', 'tickets', 'representations', 'purchases', 'products']\n",
  "\n",
  "tables_11 = {}\n",
  "for nom_base in liste_base:\n",
  "    FILE_PATH_S3 = f'{BUCKET}/11/11{nom_base}.csv'\n",
  "    with fs.open(FILE_PATH_S3, mode=\"rb\") as file_in:\n",
  "        tables_11[nom_base] = pd.read_csv(file_in, sep=\",\")\n",
  "\n",
  "# Bind each table to an explicit name instead of writing into globals(), so the\n",
  "# variables defined by this cell are visible to readers and linters\n",
  "customer_target_mappings = tables_11['customer_target_mappings']\n",
  "customersplus = tables_11['customersplus']\n",
  "target_types = tables_11['target_types']\n",
  "tags = tables_11['tags']\n",
  "events = tables_11['events']\n",
  "tickets = tables_11['tickets']\n",
  "representations = tables_11['representations']\n",
  "purchases = tables_11['purchases']\n",
  "products = tables_11['products']"
 ] },
 { "cell_type": "code", "execution_count": 12, "id": "12b24f1c-eb3e-45be-aaf3-b9273180caa3", "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", " | id | \n", "lastname | \n", "firstname | \n", "birthdate | \n", "street_id | \n", "created_at | \n", "updated_at | \n", "civility | \n", "is_partner | \n", "... | \n", "tenant_id | \n", "id_x | \n", "customer_id | \n", "purchase_date | \n", "type_of | \n", "is_from_subscription | \n", "amount | \n", "is_full_price | \n", "start_date_time | \n", "event_name | \n", "|
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | \n", "405082 | \n", "lastname405082 | \n", "NaN | \n", "NaN | \n", "NaN | \n", "6 | \n", "2023-01-12 06:30:31.197484+01:00 | \n", "2023-01-12 06:30:31.197484+01:00 | \n", "NaN | \n", "False | \n", "... | \n", "1556 | \n", "992423 | \n", "405082 | \n", "2023-01-11 17:08:41+01:00 | \n", "3 | \n", "False | \n", "13.0 | \n", "False | \n", "2023-02-06 20:00:00+01:00 | \n", "zaide | \n", "
1 | \n", "405082 | \n", "lastname405082 | \n", "NaN | \n", "NaN | \n", "NaN | \n", "6 | \n", "2023-01-12 06:30:31.197484+01:00 | \n", "2023-01-12 06:30:31.197484+01:00 | \n", "NaN | \n", "False | \n", "... | \n", "1556 | \n", "992423 | \n", "405082 | \n", "2023-01-11 17:08:41+01:00 | \n", "3 | \n", "False | \n", "13.0 | \n", "False | \n", "2023-02-06 20:00:00+01:00 | \n", "zaide | \n", "
2 | \n", "411168 | \n", "lastname411168 | \n", "NaN | \n", "NaN | \n", "NaN | \n", "6 | \n", "2023-03-17 06:30:35.431967+01:00 | \n", "2023-03-17 06:30:35.431967+01:00 | \n", "NaN | \n", "False | \n", "... | \n", "1556 | \n", "1053934 | \n", "411168 | \n", "2023-03-16 16:23:10+01:00 | \n", "3 | \n", "False | \n", "62.0 | \n", "False | \n", "2023-03-19 16:00:00+01:00 | \n", "luisa miller | \n", "
3 | \n", "411168 | \n", "lastname411168 | \n", "NaN | \n", "NaN | \n", "NaN | \n", "6 | \n", "2023-03-17 06:30:35.431967+01:00 | \n", "2023-03-17 06:30:35.431967+01:00 | \n", "NaN | \n", "False | \n", "... | \n", "1556 | \n", "1053934 | \n", "411168 | \n", "2023-03-16 16:23:10+01:00 | \n", "3 | \n", "False | \n", "62.0 | \n", "False | \n", "2023-03-19 16:00:00+01:00 | \n", "luisa miller | \n", "
4 | \n", "4380 | \n", "lastname4380 | \n", "firstname4380 | \n", "NaN | \n", "NaN | \n", "1 | \n", "2021-04-22 14:51:55.432952+02:00 | \n", "2022-04-14 11:41:33.738500+02:00 | \n", "NaN | \n", "False | \n", "... | \n", "1556 | \n", "1189141 | \n", "4380 | \n", "2020-11-26 13:12:53+01:00 | \n", "3 | \n", "False | \n", "51.3 | \n", "False | \n", "2020-12-01 20:00:00+01:00 | \n", "iphigenie en tauride | \n", "
... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "
318964 | \n", "19095 | \n", "lastname19095 | \n", "firstname19095 | \n", "1979-07-16 | \n", "email19095 | \n", "6 | \n", "2021-04-22 15:06:30.120537+02:00 | \n", "2023-09-12 18:27:36.904104+02:00 | \n", "NaN | \n", "False | \n", "... | \n", "1556 | \n", "1090839 | \n", "19095 | \n", "2019-05-19 21:18:36+02:00 | \n", "1 | \n", "False | \n", "4.5 | \n", "False | \n", "2019-05-27 20:00:00+02:00 | \n", "entre femmes | \n", "
318965 | \n", "19095 | \n", "lastname19095 | \n", "firstname19095 | \n", "1979-07-16 | \n", "email19095 | \n", "6 | \n", "2021-04-22 15:06:30.120537+02:00 | \n", "2023-09-12 18:27:36.904104+02:00 | \n", "NaN | \n", "False | \n", "... | \n", "1556 | \n", "1090839 | \n", "19095 | \n", "2019-05-19 21:18:36+02:00 | \n", "1 | \n", "False | \n", "4.5 | \n", "False | \n", "2019-05-27 20:00:00+02:00 | \n", "entre femmes | \n", "
318966 | \n", "19095 | \n", "lastname19095 | \n", "firstname19095 | \n", "1979-07-16 | \n", "email19095 | \n", "6 | \n", "2021-04-22 15:06:30.120537+02:00 | \n", "2023-09-12 18:27:36.904104+02:00 | \n", "NaN | \n", "False | \n", "... | \n", "1556 | \n", "1090839 | \n", "19095 | \n", "2019-05-19 21:18:36+02:00 | \n", "1 | \n", "False | \n", "4.5 | \n", "False | \n", "2019-05-27 20:00:00+02:00 | \n", "entre femmes | \n", "
318967 | \n", "19095 | \n", "lastname19095 | \n", "firstname19095 | \n", "1979-07-16 | \n", "email19095 | \n", "6 | \n", "2021-04-22 15:06:30.120537+02:00 | \n", "2023-09-12 18:27:36.904104+02:00 | \n", "NaN | \n", "False | \n", "... | \n", "1556 | \n", "1244277 | \n", "19095 | \n", "2019-12-31 11:04:07+01:00 | \n", "1 | \n", "False | \n", "5.5 | \n", "False | \n", "2020-02-03 20:00:00+01:00 | \n", "a boire et a manger | \n", "
318968 | \n", "19095 | \n", "lastname19095 | \n", "firstname19095 | \n", "1979-07-16 | \n", "email19095 | \n", "6 | \n", "2021-04-22 15:06:30.120537+02:00 | \n", "2023-09-12 18:27:36.904104+02:00 | \n", "NaN | \n", "False | \n", "... | \n", "1556 | \n", "1244277 | \n", "19095 | \n", "2019-12-31 11:04:07+01:00 | \n", "1 | \n", "False | \n", "5.5 | \n", "False | \n", "2020-02-03 20:00:00+01:00 | \n", "a boire et a manger | \n", "
318969 rows × 52 columns
\n", "