{ "cells": [
 { "cell_type": "markdown", "id": "8c8e008c-9b92-41f1-88c1-8ec462e4ecab", "metadata": {}, "source": [ "# Business Data Challenge - Team 1" ] },
 { "cell_type": "code", "execution_count": 1, "id": "88af2795-8bf9-4df0-a059-be7c28fb4289", "metadata": {}, "outputs": [], "source": [
  "import os\n",
  "\n",
  "import numpy as np\n",
  "import pandas as pd\n",
  "import s3fs" ] },
 { "cell_type": "markdown", "id": "e05cd2c9-3f76-48e3-b4a6-5055445af2e4", "metadata": {}, "source": [ "Configuration de l'accès aux données" ] },
 { "cell_type": "code", "execution_count": 2, "id": "3ba1f385-2a2f-4b0c-be79-66f618469a9f", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "['bdc2324-data/1',\n", " 'bdc2324-data/10',\n", " 'bdc2324-data/101',\n", " 'bdc2324-data/11',\n", " 'bdc2324-data/12',\n", " 'bdc2324-data/13',\n", " 'bdc2324-data/14',\n", " 'bdc2324-data/2',\n", " 'bdc2324-data/3',\n", " 'bdc2324-data/4',\n", " 'bdc2324-data/5',\n", " 'bdc2324-data/6',\n", " 'bdc2324-data/7',\n", " 'bdc2324-data/8',\n", " 'bdc2324-data/9']" ] }, "execution_count": 2, "metadata": {}, "output_type": "execute_result" } ], "source": [
  "# Create the S3 filesystem object; the endpoint host is read from the AWS_S3_ENDPOINT environment variable\n",
  "S3_ENDPOINT_URL = \"https://\" + os.environ[\"AWS_S3_ENDPOINT\"]\n",
  "fs = s3fs.S3FileSystem(client_kwargs={'endpoint_url': S3_ENDPOINT_URL})\n",
  "\n",
  "# List the folders available in the bucket\n",
  "BUCKET = \"bdc2324-data\"\n",
  "fs.ls(BUCKET)" ] },
 { "cell_type": "code", "execution_count": 4, "id": "ba9d04ad-6cc1-4bac-b1a0-44bedfb09763", "metadata": {}, "outputs": [], "source": [
  "def read_campaign_stats(unit):\n",
  "    \"\"\"Return the campaign_stats.csv file of folder `unit` as a DataFrame.\n",
  "\n",
  "    The file is read from `<BUCKET>/<unit>/<unit>campaign_stats.csv` through the\n",
  "    notebook-level `fs` S3 filesystem, e.g. `bdc2324-data/1/1campaign_stats.csv`\n",
  "    for `unit=1`.\n",
  "    \"\"\"\n",
  "    path = f'{BUCKET}/{unit}/{unit}campaign_stats.csv'\n",
  "    with fs.open(path, mode='rb') as file_in:\n",
  "        return pd.read_csv(file_in, sep=',')\n",
  "\n",
  "\n",
  "# Load the campaign_stats.csv files of folders 1, 2 and 3\n",
  "campaign_stats_1 = read_campaign_stats(1)\n",
  "campaign_stats_2 = read_campaign_stats(2)\n",
  "campaign_stats_3 = read_campaign_stats(3)" ] },
 { "cell_type": "code", "execution_count": 5, "id": "cacaecc1-4d8a-4e20-8cd3-b452cf17db56", "metadata": {}, "outputs": [], "source": [
  "# Parse the 'sent_at' column of each frame as timezone-aware (UTC) datetimes, in place\n",
  "for campaign_stats in (campaign_stats_1, campaign_stats_2, campaign_stats_3):\n",
  "    campaign_stats['sent_at'] = pd.to_datetime(campaign_stats['sent_at'], format='ISO8601', utc=True)" ] },
 { "cell_type": "code", "execution_count": 6, "id": "2ec4b583-dc64-43e9-b3ae-6bbaee0bc135", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "2023-11-09 18:10:45+00:00\n", "2020-06-02 08:24:08+00:00\n", "2023-10-12 01:39:48+00:00\n", "2023-10-10 17:06:29+00:00\n", "2023-11-01 09:20:48+00:00\n", "2021-03-31 14:59:02+00:00\n" ] } ], "source": [
  "# Does each unit (folder) cover its own time period? --> No, the folders only exist to keep file sizes down\n",
  "# Prints the latest, then the earliest, 'sent_at' of folders 1, 2 and 3\n",
  "for campaign_stats in (campaign_stats_1, campaign_stats_2, campaign_stats_3):\n",
  "    print(campaign_stats['sent_at'].max())\n",
  "    print(campaign_stats['sent_at'].min())" ] },
 { "cell_type": "code", "execution_count": 7, "id": "77894273-b3e5-4f29-bd63-9f4df8082b9b", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "0 2021-03-28 16:01:09+00:00\n", "1 2021-03-28 16:01:09+00:00\n", "2 2021-03-28 16:00:59+00:00\n", "3 2021-03-28 16:00:59+00:00\n", "4 2021-03-28 16:01:06+00:00\n", " ... 
\n", "6214803 2023-10-23 09:32:33+00:00\n", "6214804 2023-10-23 09:32:49+00:00\n", "6214805 2023-10-23 09:33:28+00:00\n", "6214806 2023-10-23 09:31:53+00:00\n", "6214807 2023-10-23 09:33:54+00:00\n", "Name: sent_at, Length: 6214808, dtype: datetime64[ns, UTC]" ] }, "execution_count": 7, "metadata": {}, "output_type": "execute_result" } ], "source": [ "campaign_stats_1['sent_at']" ] },
 { "cell_type": "markdown", "id": "31f2edbf-5661-4516-9835-06d4da615c13", "metadata": {}, "source": [ "### Customersplus.csv" ] },
 { "cell_type": "code", "execution_count": 8, "id": "4223c873-cbd3-46d1-ac96-c9a3b9e97092", "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "/tmp/ipykernel_1362/4118060109.py:9: DtypeWarning: Columns (20) have mixed types. Specify dtype option on import or set low_memory=False.\n", " customers_plus_2 = pd.read_csv(file_in, sep=\",\")\n" ] } ], "source": [
  "# Load the customersplus.csv files of folders 1 and 2.\n",
  "# low_memory=False: pandas infers each column's dtype from the whole file rather than\n",
  "# chunk by chunk, which avoids the mixed-type DtypeWarning reported for column 20 of\n",
  "# 2customersplus.csv (the saved stderr output was produced before this option was set).\n",
  "FILE_PATH_S3 = 'bdc2324-data/1/1customersplus.csv'\n",
  "\n",
  "with fs.open(FILE_PATH_S3, mode=\"rb\") as file_in:\n",
  "    customers_plus_1 = pd.read_csv(file_in, sep=\",\", low_memory=False)\n",
  "\n",
  "FILE_PATH_S3 = 'bdc2324-data/2/2customersplus.csv'\n",
  "\n",
  "with fs.open(FILE_PATH_S3, mode=\"rb\") as file_in:\n",
  "    customers_plus_2 = pd.read_csv(file_in, sep=\",\", low_memory=False)" ] },
 { "cell_type": "code", "execution_count": 10, "id": "460f853a-68c0-42a7-9877-b83d3aaec813", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "Index(['id', 'lastname', 'firstname', 'birthdate', 'email', 'street_id',\n", " 'created_at', 'updated_at', 'civility', 'is_partner', 'extra',\n", " 'deleted_at', 'reference', 'gender', 'is_email_true', 'extra_field',\n", " 'identifier', 'opt_in', 'structure_id', 'note', 'profession',\n", " 'language', 'mcp_contact_id', 'need_reload', 'last_buying_date',\n", " 'max_price', 'ticket_sum', 'average_price', 'fidelity',\n", " 'average_purchase_delay', 'average_price_basket',\n", " 'average_ticket_basket', 'total_price', 'preferred_category',\n", " 'preferred_supplier', 
'preferred_formula', 'purchase_count',\n", " 'first_buying_date', 'last_visiting_date', 'zipcode', 'country', 'age',\n", " 'tenant_id'],\n", " dtype='object')" ] }, "execution_count": 10, "metadata": {}, "output_type": "execute_result" } ], "source": [ "customers_plus_1.columns" ] },
 { "cell_type": "code", "execution_count": null, "id": "d5a9398f-72fc-4548-9f53-b20b372144b2", "metadata": {}, "outputs": [], "source": [ "customers_plus_1.shape" ] },
 { "cell_type": "code", "execution_count": null, "id": "7467ddbe-0bd4-44cc-8a16-84aa41853638", "metadata": {}, "outputs": [], "source": [ "customers_plus_1['id'].nunique()" ] },
 { "cell_type": "code", "execution_count": null, "id": "e15f05f8-3a89-4fc3-84a9-dae70e168440", "metadata": {}, "outputs": [], "source": [ "customers_plus_2['id'].nunique()" ] },
 { "cell_type": "code", "execution_count": null, "id": "b40a653e-013f-48d0-8b57-0284587b36c5", "metadata": {}, "outputs": [], "source": [
  "# Customer ids that appear in both customersplus files\n",
  "common_id = set(customers_plus_2['id']).intersection(customers_plus_1['id'])" ] },
 { "cell_type": "code", "execution_count": 61, "id": "32fa2215-3c79-40b5-8643-755865959fc7", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "1" ] }, "execution_count": 61, "metadata": {}, "output_type": "execute_result" } ], "source": [
  "# Example of an id present in both files: show its row in each file to compare them\n",
  "# (original author's note: shared id = shared attributes).\n",
  "# `common_id` comes from the previous cell; the guard avoids an IndexError-style\n",
  "# failure when the two files share no id.\n",
  "if common_id:\n",
  "    example_id = next(iter(common_id))\n",
  "    display(customers_plus_2[customers_plus_2['id'] == example_id])\n",
  "    display(customers_plus_1[customers_plus_1['id'] == example_id])\n",
  "else:\n",
  "    print('No customer id is shared between customers_plus_1 and customers_plus_2')" ] },
 { "cell_type": "code", "execution_count": 49, "id": "0eb345e4-69f5-4e16-ac57-e33674c6c43d", "metadata": { "scrolled": true }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "id 0.000000\n", "lastname 43.461341\n", "firstname 44.995588\n", "birthdate 96.419870\n", "email 8.622075\n", "street_id 0.000000\n", "created_at 0.000000\n", "updated_at 0.000000\n", "civility 100.000000\n", "is_partner 0.000000\n", "extra 100.000000\n", "deleted_at 100.000000\n", "reference 100.000000\n", "gender 0.000000\n", "is_email_true 0.000000\n", "extra_field 100.000000\n", "identifier 0.000000\n", "opt_in 0.000000\n", "structure_id 88.072380\n", "note 99.403421\n", "profession 95.913503\n", "language 99.280945\n", "mcp_contact_id 34.876141\n", "need_reload 0.000000\n", "last_buying_date 51.653431\n", "max_price 51.653431\n", "ticket_sum 0.000000\n", "average_price 8.639195\n", "fidelity 0.000000\n", "average_purchase_delay 51.653431\n", "average_price_basket 51.653431\n", "average_ticket_basket 51.653431\n", "total_price 43.014236\n", "preferred_category 100.000000\n", "preferred_supplier 100.000000\n", "preferred_formula 100.000000\n", "purchase_count 0.000000\n", "first_buying_date 51.653431\n", "last_visiting_date 100.000000\n", "zipcode 71.176564\n", "country 5.459418\n", "age 96.419870\n", "tenant_id 0.000000\n", "dtype: float64\n" ] } ], "source": [
  "# Share of missing values per column of customers_plus_1, in percent\n",
  "pd.DataFrame(customers_plus_1.isna().mean() * 100)" ] },
 { "cell_type": "code", "execution_count": 11, "id": "6f6ce60d-0912-497d-9108-330acccef394", "metadata": {}, "outputs": [], "source": [
  "# Load every table of folder 11.\n",
  "# Each DataFrame is bound to a notebook-level variable named after its table\n",
  "# (`customersplus`, `tickets`, `purchases`, ...) through globals(), so the names\n",
  "# listed below are the variables this cell defines.\n",
  "liste_base = [\n",
  "    'customer_target_mappings', 'customersplus', 'target_types',\n",
  "    'tags', 'events', 'tickets',\n",
  "    'representations', 'purchases', 'products',\n",
  "]\n",
  "\n",
  "for nom_base in liste_base:\n",
  "    FILE_PATH_S3 = f'{BUCKET}/11/11{nom_base}.csv'\n",
  "    with fs.open(FILE_PATH_S3, mode='rb') as file_in:\n",
  "        globals()[nom_base] = pd.read_csv(file_in, sep=',')" ] },
 { "cell_type": "code", "execution_count": 12, "id": "fa8ee17d-5092-40ac-8a0a-3790b016dd4e", "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", " | id | \n", "lastname | \n", "firstname | \n", "birthdate | \n", "street_id | \n", "created_at | \n", "updated_at | \n", "civility | \n", "is_partner | \n", "... | \n", "tenant_id | \n", "id_x | \n", "customer_id | \n", "purchase_date | \n", "type_of | \n", "is_from_subscription | \n", "amount | \n", "is_full_price | \n", "start_date_time | \n", "event_name | \n", "|
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | \n", "405082 | \n", "lastname405082 | \n", "NaN | \n", "NaN | \n", "NaN | \n", "6 | \n", "2023-01-12 06:30:31.197484+01:00 | \n", "2023-01-12 06:30:31.197484+01:00 | \n", "NaN | \n", "False | \n", "... | \n", "1556 | \n", "992423 | \n", "405082 | \n", "2023-01-11 17:08:41+01:00 | \n", "3 | \n", "False | \n", "13.0 | \n", "False | \n", "2023-02-06 20:00:00+01:00 | \n", "zaide | \n", "
1 | \n", "405082 | \n", "lastname405082 | \n", "NaN | \n", "NaN | \n", "NaN | \n", "6 | \n", "2023-01-12 06:30:31.197484+01:00 | \n", "2023-01-12 06:30:31.197484+01:00 | \n", "NaN | \n", "False | \n", "... | \n", "1556 | \n", "992423 | \n", "405082 | \n", "2023-01-11 17:08:41+01:00 | \n", "3 | \n", "False | \n", "13.0 | \n", "False | \n", "2023-02-06 20:00:00+01:00 | \n", "zaide | \n", "
2 | \n", "411168 | \n", "lastname411168 | \n", "NaN | \n", "NaN | \n", "NaN | \n", "6 | \n", "2023-03-17 06:30:35.431967+01:00 | \n", "2023-03-17 06:30:35.431967+01:00 | \n", "NaN | \n", "False | \n", "... | \n", "1556 | \n", "1053934 | \n", "411168 | \n", "2023-03-16 16:23:10+01:00 | \n", "3 | \n", "False | \n", "62.0 | \n", "False | \n", "2023-03-19 16:00:00+01:00 | \n", "luisa miller | \n", "
3 | \n", "411168 | \n", "lastname411168 | \n", "NaN | \n", "NaN | \n", "NaN | \n", "6 | \n", "2023-03-17 06:30:35.431967+01:00 | \n", "2023-03-17 06:30:35.431967+01:00 | \n", "NaN | \n", "False | \n", "... | \n", "1556 | \n", "1053934 | \n", "411168 | \n", "2023-03-16 16:23:10+01:00 | \n", "3 | \n", "False | \n", "62.0 | \n", "False | \n", "2023-03-19 16:00:00+01:00 | \n", "luisa miller | \n", "
4 | \n", "4380 | \n", "lastname4380 | \n", "firstname4380 | \n", "NaN | \n", "NaN | \n", "1 | \n", "2021-04-22 14:51:55.432952+02:00 | \n", "2022-04-14 11:41:33.738500+02:00 | \n", "NaN | \n", "False | \n", "... | \n", "1556 | \n", "1189141 | \n", "4380 | \n", "2020-11-26 13:12:53+01:00 | \n", "3 | \n", "False | \n", "51.3 | \n", "False | \n", "2020-12-01 20:00:00+01:00 | \n", "iphigenie en tauride | \n", "
... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "
318964 | \n", "19095 | \n", "lastname19095 | \n", "firstname19095 | \n", "1979-07-16 | \n", "email19095 | \n", "6 | \n", "2021-04-22 15:06:30.120537+02:00 | \n", "2023-09-12 18:27:36.904104+02:00 | \n", "NaN | \n", "False | \n", "... | \n", "1556 | \n", "1090839 | \n", "19095 | \n", "2019-05-19 21:18:36+02:00 | \n", "1 | \n", "False | \n", "4.5 | \n", "False | \n", "2019-05-27 20:00:00+02:00 | \n", "entre femmes | \n", "
318965 | \n", "19095 | \n", "lastname19095 | \n", "firstname19095 | \n", "1979-07-16 | \n", "email19095 | \n", "6 | \n", "2021-04-22 15:06:30.120537+02:00 | \n", "2023-09-12 18:27:36.904104+02:00 | \n", "NaN | \n", "False | \n", "... | \n", "1556 | \n", "1090839 | \n", "19095 | \n", "2019-05-19 21:18:36+02:00 | \n", "1 | \n", "False | \n", "4.5 | \n", "False | \n", "2019-05-27 20:00:00+02:00 | \n", "entre femmes | \n", "
318966 | \n", "19095 | \n", "lastname19095 | \n", "firstname19095 | \n", "1979-07-16 | \n", "email19095 | \n", "6 | \n", "2021-04-22 15:06:30.120537+02:00 | \n", "2023-09-12 18:27:36.904104+02:00 | \n", "NaN | \n", "False | \n", "... | \n", "1556 | \n", "1090839 | \n", "19095 | \n", "2019-05-19 21:18:36+02:00 | \n", "1 | \n", "False | \n", "4.5 | \n", "False | \n", "2019-05-27 20:00:00+02:00 | \n", "entre femmes | \n", "
318967 | \n", "19095 | \n", "lastname19095 | \n", "firstname19095 | \n", "1979-07-16 | \n", "email19095 | \n", "6 | \n", "2021-04-22 15:06:30.120537+02:00 | \n", "2023-09-12 18:27:36.904104+02:00 | \n", "NaN | \n", "False | \n", "... | \n", "1556 | \n", "1244277 | \n", "19095 | \n", "2019-12-31 11:04:07+01:00 | \n", "1 | \n", "False | \n", "5.5 | \n", "False | \n", "2020-02-03 20:00:00+01:00 | \n", "a boire et a manger | \n", "
318968 | \n", "19095 | \n", "lastname19095 | \n", "firstname19095 | \n", "1979-07-16 | \n", "email19095 | \n", "6 | \n", "2021-04-22 15:06:30.120537+02:00 | \n", "2023-09-12 18:27:36.904104+02:00 | \n", "NaN | \n", "False | \n", "... | \n", "1556 | \n", "1244277 | \n", "19095 | \n", "2019-12-31 11:04:07+01:00 | \n", "1 | \n", "False | \n", "5.5 | \n", "False | \n", "2020-02-03 20:00:00+01:00 | \n", "a boire et a manger | \n", "
318969 rows × 52 columns
\n", "