{ "cells": [ { "cell_type": "markdown", "id": "ad414c84-be46-4d2c-be8b-9fc4d24cc672", "metadata": {}, "source": [ "# Business Data Challenge - Team 1" ] }, { "cell_type": "code", "execution_count": 1, "id": "15103481-8d74-404c-aa09-7601fe7730da", "metadata": {}, "outputs": [], "source": [ "import pandas as pd\n", "import numpy as np\n", "import os\n", "import s3fs\n", "import re" ] }, { "cell_type": "markdown", "id": "ee97665c-39af-4c1c-a62b-c9c79feae18f", "metadata": {}, "source": [ "Configuration de l'accès aux données" ] }, { "cell_type": "code", "execution_count": 2, "id": "5d83bb1a-d341-446e-91f6-1c428607f6d4", "metadata": {}, "outputs": [], "source": [ "# Create filesystem object\n", "S3_ENDPOINT_URL = \"https://\" + os.environ[\"AWS_S3_ENDPOINT\"]\n", "fs = s3fs.S3FileSystem(client_kwargs={'endpoint_url': S3_ENDPOINT_URL})" ] }, { "cell_type": "markdown", "id": "9cbd72c5-6f8e-4366-ab66-96c32c6e963a", "metadata": {}, "source": [ "# Exemple sur Company 1" ] }, { "cell_type": "markdown", "id": "db26e59a-927c-407e-b54b-1815473b0b34", "metadata": {}, "source": [ "## Chargement données" ] }, { "cell_type": "code", "execution_count": 3, "id": "699664b9-eee4-4f8d-a207-e524526560c5", "metadata": {}, "outputs": [], "source": [ "BUCKET = \"bdc2324-data/1\"\n", "liste_database = fs.ls(BUCKET)" ] }, { "cell_type": "code", "execution_count": 4, "id": "aaf64d60-bf92-470c-8210-d09abd6a653e", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "['bdc2324-data/1/1campaign_stats.csv',\n", " 'bdc2324-data/1/1campaigns.csv',\n", " 'bdc2324-data/1/1categories.csv',\n", " 'bdc2324-data/1/1countries.csv',\n", " 'bdc2324-data/1/1currencies.csv',\n", " 'bdc2324-data/1/1customer_target_mappings.csv',\n", " 'bdc2324-data/1/1customersplus.csv',\n", " 'bdc2324-data/1/1event_types.csv',\n", " 'bdc2324-data/1/1events.csv',\n", " 'bdc2324-data/1/1facilities.csv',\n", " 'bdc2324-data/1/1link_stats.csv',\n", " 'bdc2324-data/1/1pricing_formulas.csv',\n", " 
'bdc2324-data/1/1product_packs.csv',\n", " 'bdc2324-data/1/1products.csv',\n", " 'bdc2324-data/1/1products_groups.csv',\n", " 'bdc2324-data/1/1purchases.csv',\n", " 'bdc2324-data/1/1representation_category_capacities.csv',\n", " 'bdc2324-data/1/1representations.csv',\n", " 'bdc2324-data/1/1seasons.csv',\n", " 'bdc2324-data/1/1structure_tag_mappings.csv',\n", " 'bdc2324-data/1/1suppliers.csv',\n", " 'bdc2324-data/1/1tags.csv',\n", " 'bdc2324-data/1/1target_types.csv',\n", " 'bdc2324-data/1/1targets.csv',\n", " 'bdc2324-data/1/1tickets.csv',\n", " 'bdc2324-data/1/1type_of_categories.csv',\n", " 'bdc2324-data/1/1type_of_pricing_formulas.csv',\n", " 'bdc2324-data/1/1type_ofs.csv']" ] }, "execution_count": 4, "metadata": {}, "output_type": "execute_result" } ], "source": [ "liste_database" ] }, { "cell_type": "code", "execution_count": 5, "id": "dd6a3518-b752-4a1e-b77b-9e03e853c3ed", "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "/tmp/ipykernel_50143/4081512283.py:10: DtypeWarning: Columns (1) have mixed types. 
def cleaning_date(df, column_name):
    """
    Parse one column of a DataFrame as timezone-aware (UTC) datetimes.

    The values are parsed with ``pd.to_datetime(..., utc=True, format='ISO8601')``
    and written back into ``df`` under the same column name, so the frame that
    was passed in is modified in place.

    Parameters:
    - df: DataFrame
        The DataFrame holding the column to convert.
    - column_name: str
        Name of the column to convert.

    Returns:
    - DataFrame
        The same ``df`` object, with the column converted.
    """
    raw_values = df[column_name]
    parsed_values = pd.to_datetime(raw_values, format = 'ISO8601', utc = True)
    # Plain column assignment: replaces the column on the caller's frame.
    df[column_name] = parsed_values
    return df
def preprocessing_tickets_area(tickets = None, purchases = None, suppliers = None, type_ofs = None):
    """
    Build one row per ticket, enriched with its supplier name and purchase data.

    Steps:
    1. Keep the useful ticket columns and rename ``id`` to ``ticket_id``.
    2. Keep the supplier ``id``/``name`` and rename ``name`` to ``supplier_name``.
    3. Keep the purchase ``id``/``purchase_date``/``customer_id`` and convert
       ``purchase_date`` with ``cleaning_date``.
    4. Inner-join tickets with suppliers (``supplier_id`` = supplier ``id``),
       then with purchases (``purchase_id`` = purchase ``id``), dropping the
       join keys after each merge.

    Parameters:
    - tickets: DataFrame
        Must contain 'id', 'purchase_id', 'product_id', 'is_from_subscription',
        'type_of' and 'supplier_id'.
    - purchases: DataFrame
        Must contain 'id', 'purchase_date' and 'customer_id'.
    - suppliers: DataFrame
        Must contain 'id' and 'name'.
    - type_ofs: DataFrame, optional
        Accepted for interface compatibility but currently unused: the merge
        with ticket types is disabled, so 'type_of' is returned as found in
        ``tickets``.

    Returns:
    - DataFrame
        Columns: 'ticket_id', 'product_id', 'is_from_subscription', 'type_of',
        'supplier_name', 'purchase_date', 'customer_id'. Because both merges
        are inner joins, tickets without a matching supplier or purchase are
        dropped.

    Raises:
    - TypeError
        If ``tickets``, ``purchases`` or ``suppliers`` is None.

    The input DataFrames are not modified.
    """
    # Fail early with a readable message instead of
    # "'NoneType' object is not subscriptable".
    for arg_name, frame in (('tickets', tickets), ('purchases', purchases), ('suppliers', suppliers)):
        if frame is None:
            raise TypeError(f"preprocessing_tickets_area() requires a DataFrame for '{arg_name}'")

    # Tickets: non-inplace rename returns a new frame, which avoids the
    # SettingWithCopyWarning raised by renaming a column slice in place.
    tickets = tickets[
        ['id', 'purchase_id', 'product_id', 'is_from_subscription', 'type_of', 'supplier_id']
    ].rename(columns = {'id' : 'ticket_id'})

    # Suppliers
    suppliers = suppliers[['id', 'name']].rename(columns = {'name' : 'supplier_name'})

    # Purchases: select first, then convert the date on a private copy so the
    # caller's frame is left untouched.
    purchases = cleaning_date(purchases[['id', 'purchase_date', 'customer_id']].copy(), 'purchase_date')

    # Merge with suppliers, then drop the join keys
    ticket_information = pd.merge(
        tickets, suppliers, left_on = 'supplier_id', right_on = 'id', how = 'inner'
    ).drop(columns = ['supplier_id', 'id'])

    # Merge with purchases, then drop the join keys
    ticket_information = pd.merge(
        ticket_information, purchases, left_on = 'purchase_id', right_on = 'id', how = 'inner'
    ).drop(columns = ['purchase_id', 'id'])

    return ticket_information
SettingWithCopyWarning: \n", "A value is trying to be set on a copy of a slice from a DataFrame\n", "\n", "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n", " tickets.rename(columns = {'id' : 'ticket_id'}, inplace = True)\n", "/tmp/ipykernel_50143/1320335767.py:9: SettingWithCopyWarning: \n", "A value is trying to be set on a copy of a slice from a DataFrame\n", "\n", "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n", " suppliers.rename(columns = {'name' : 'supplier_name'}, inplace = True)\n" ] } ], "source": [ "df1_ticket_information = preprocessing_tickets_area(tickets = df1_tickets, purchases = df1_purchases, suppliers = df1_suppliers, type_ofs = df1_type_ofs)" ] }, { "cell_type": "code", "execution_count": 70, "id": "4b18edfc-6450-4c6a-9e7b-ee5a5808c8c9", "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", " | ticket_id | \n", "product_id | \n", "is_from_subscription | \n", "type_of | \n", "supplier_name | \n", "purchase_date | \n", "customer_id | \n", "
---|---|---|---|---|---|---|---|
0 | \n", "13070859 | \n", "225251 | \n", "False | \n", "1 | \n", "vente en ligne | \n", "2018-12-28 14:47:50+00:00 | \n", "48187 | \n", "
1 | \n", "13070860 | \n", "224914 | \n", "False | \n", "1 | \n", "vente en ligne | \n", "2018-12-28 14:47:50+00:00 | \n", "48187 | \n", "
2 | \n", "13070861 | \n", "224914 | \n", "False | \n", "1 | \n", "vente en ligne | \n", "2018-12-28 14:47:50+00:00 | \n", "48187 | \n", "
3 | \n", "13070862 | \n", "224914 | \n", "False | \n", "1 | \n", "vente en ligne | \n", "2018-12-28 14:47:50+00:00 | \n", "48187 | \n", "
4 | \n", "13070863 | \n", "224914 | \n", "False | \n", "1 | \n", "vente en ligne | \n", "2018-12-28 14:47:50+00:00 | \n", "48187 | \n", "
... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "
1826667 | \n", "20662815 | \n", "405689 | \n", "False | \n", "1 | \n", "vente en ligne | \n", "2023-11-08 17:23:54+00:00 | \n", "1256135 | \n", "
1826668 | \n", "20662816 | \n", "403658 | \n", "False | \n", "1 | \n", "vente en ligne | \n", "2023-11-08 18:32:18+00:00 | \n", "1256136 | \n", "
1826669 | \n", "20662817 | \n", "403658 | \n", "False | \n", "1 | \n", "vente en ligne | \n", "2023-11-08 18:32:18+00:00 | \n", "1256136 | \n", "
1826670 | \n", "20662818 | \n", "403658 | \n", "False | \n", "1 | \n", "vente en ligne | \n", "2023-11-08 19:30:28+00:00 | \n", "1256137 | \n", "
1826671 | \n", "20662819 | \n", "403658 | \n", "False | \n", "1 | \n", "vente en ligne | \n", "2023-11-08 19:30:28+00:00 | \n", "1256137 | \n", "
1826672 rows × 7 columns
\n", "\n", " | customer_id | \n", "
---|---|
target_name | \n", "\n", " |
consentement optin mediation specialisee | \n", "150000 | \n", "
consentement optin jeune public | \n", "149979 | \n", "
consentement optin b2c | \n", "108909 | \n", "
Arenametrix_bascule tel vers sib | \n", "35216 | \n", "
consentement optout b2c | \n", "34523 | \n", "
... | \n", "... | \n", "
Automation_parrainage_newsletter_handicap_visuel | \n", "1 | \n", "
consentement optout mediation specialisee | \n", "1 | \n", "
Inscrits NL LSF formulaire | \n", "1 | \n", "
Market auto - contacts inactifs post-scénario | \n", "1 | \n", "
Inactifs - fin du scénario | \n", "1 | \n", "
283 rows × 1 columns
\n", "\n", " | customer_id | \n", "
---|---|
target_name | \n", "\n", " |
Arenametrix_bascule tel vers sib | \n", "35216 | \n", "
Autres_interet_exposition | \n", "1021 | \n", "
COM Inscrits NL générale (historique) | \n", "23005 | \n", "
Contacts_prenomsdoubles | \n", "11643 | \n", "
DDCP MD Procès du Siècle | \n", "1684 | \n", "
DDCP Newsletter centres de loisirs | \n", "1032 | \n", "
DDCP Newsletter enseignants | \n", "4510 | \n", "
DDCP Newsletter jeune public | \n", "3862 | \n", "
DDCP Newsletter relais champ social | \n", "2270 | \n", "
DDCP PROMO Participants ateliers (adultes et enfants) | \n", "1954 | \n", "
DDCP billets famille | \n", "3609 | \n", "
DDCP promo MD pass musées dps oct 2018 | \n", "1785 | \n", "
DDCP promo Plan B 2019 (concerts) | \n", "1948 | \n", "
DDCP promo spectateurs prog 21-22 (spectacles, ciné, ateliers) | \n", "1293 | \n", "
DDCP rentrée culturelle 2023 | \n", "1757 | \n", "
DDCP_marseille_jazz_2023 | \n", "1043 | \n", "
DRE Festival Jean Rouch | \n", "1502 | \n", "
DRE MucemLab | \n", "2302 | \n", "
DRE chercheurs | \n", "1557 | \n", "
DRE institutionnels | \n", "2229 | \n", "
FORMATION _ acheteurs optin last year | \n", "10485 | \n", "
Inscrits NL générale (export_291019 + operation_videomaton) | \n", "14086 | \n", "
Inscrits NL générale site web | \n", "3732 | \n", "
Inscrits NL jeune public site web | \n", "1249 | \n", "
Votre première liste | \n", "3715 | \n", "
consentement optin b2b | \n", "12735 | \n", "
consentement optin b2c | \n", "108909 | \n", "
consentement optin dre | \n", "4527 | \n", "
consentement optin jeune public | \n", "149979 | \n", "
consentement optin mediation specialisee | \n", "150000 | \n", "
consentement optin newsletter generale | \n", "22095 | \n", "
consentement optin scolaires | \n", "4849 | \n", "
consentement optout b2b | \n", "14219 | \n", "
consentement optout b2c | \n", "34523 | \n", "
consentement optout dre | \n", "14328 | \n", "
consentement optout newsletter generale | \n", "18855 | \n", "
consentement optout scolaires | \n", "15744 | \n", "
ddcp_md_scene_ouverte_au_talent | \n", "1577 | \n", "
ddcp_promo_MD_billet_musée_oct_2019_agarder2 | \n", "5482 | \n", "
ddcp_promo_md_musée_dps 011019 | \n", "6010 | \n", "
ddcp_promo_visiteurs occasionnels_musee_8mois | \n", "6640 | \n", "
ddcp_visiteurs dps 010622 | \n", "12355 | \n", "
festival_jean_rouch | \n", "1502 | \n", "
rappel po barvalo | \n", "1248 | \n", "
structures_etiquette champ social | \n", "1488 | \n", "
\n", " | id | \n", "customer_id | \n", "opened_at | \n", "sent_at | \n", "delivered_at | \n", "campaign_name | \n", "campaign_service_id | \n", "campaign_sent_at | \n", "
---|---|---|---|---|---|---|---|---|
0 | \n", "19793 | \n", "112597 | \n", "NaT | \n", "2021-03-28 16:01:09+00:00 | \n", "2021-03-28 16:24:18+00:00 | \n", "Le Mucem chez vous, gardons le lien #22 | \n", "404 | \n", "2021-03-27 23:00:00+00:00 | \n", "
1 | \n", "14211 | \n", "113666 | \n", "NaT | \n", "2021-03-28 16:01:09+00:00 | \n", "2021-03-28 16:21:02+00:00 | \n", "Le Mucem chez vous, gardons le lien #22 | \n", "404 | \n", "2021-03-27 23:00:00+00:00 | \n", "
2 | \n", "13150 | \n", "280561 | \n", "NaT | \n", "2021-03-28 16:00:59+00:00 | \n", "2021-03-28 16:08:45+00:00 | \n", "Le Mucem chez vous, gardons le lien #22 | \n", "404 | \n", "2021-03-27 23:00:00+00:00 | \n", "
3 | \n", "7073 | \n", "101007 | \n", "2021-03-28 18:11:06+00:00 | \n", "2021-03-28 16:00:59+00:00 | \n", "2021-03-28 16:09:47+00:00 | \n", "Le Mucem chez vous, gardons le lien #22 | \n", "404 | \n", "2021-03-27 23:00:00+00:00 | \n", "
4 | \n", "5175 | \n", "103972 | \n", "NaT | \n", "2021-03-28 16:01:06+00:00 | \n", "2021-03-28 16:05:03+00:00 | \n", "Le Mucem chez vous, gardons le lien #22 | \n", "404 | \n", "2021-03-27 23:00:00+00:00 | \n", "
... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "
6214803 | \n", "8302994 | \n", "266155 | \n", "2023-10-23 09:43:25+00:00 | \n", "2023-10-23 09:32:33+00:00 | \n", "2023-10-23 09:32:34+00:00 | \n", "dre_nov_2023 | \n", "1318 | \n", "2023-10-23 09:31:17+00:00 | \n", "
6214804 | \n", "8303307 | \n", "21355 | \n", "2023-10-23 09:44:02+00:00 | \n", "2023-10-23 09:32:49+00:00 | \n", "2023-10-23 09:32:49+00:00 | \n", "dre_nov_2023 | \n", "1318 | \n", "2023-10-23 09:31:17+00:00 | \n", "
6214805 | \n", "8304346 | \n", "21849 | \n", "2023-10-23 09:45:52+00:00 | \n", "2023-10-23 09:33:28+00:00 | \n", "2023-10-23 09:33:29+00:00 | \n", "dre_nov_2023 | \n", "1318 | \n", "2023-10-23 09:31:17+00:00 | \n", "
6214806 | \n", "8302037 | \n", "667789 | \n", "2023-10-23 09:47:32+00:00 | \n", "2023-10-23 09:31:53+00:00 | \n", "2023-10-23 09:31:54+00:00 | \n", "dre_nov_2023 | \n", "1318 | \n", "2023-10-23 09:31:17+00:00 | \n", "
6214807 | \n", "8304939 | \n", "294154 | \n", "NaT | \n", "2023-10-23 09:33:54+00:00 | \n", "2023-10-23 09:33:55+00:00 | \n", "dre_nov_2023 | \n", "1318 | \n", "2023-10-23 09:31:17+00:00 | \n", "
6214808 rows × 8 columns
\n", "\n", " | customer_id | \n", "nb_campaigns | \n", "nb_campaigns_opened | \n", "time_to_open | \n", "
---|---|---|---|---|
0 | \n", "2 | \n", "4 | \n", "0.0 | \n", "NaT | \n", "
1 | \n", "3 | \n", "222 | \n", "124.0 | \n", "1 days 00:28:30.169354838 | \n", "
2 | \n", "4 | \n", "7 | \n", "7.0 | \n", "1 days 04:31:01.428571428 | \n", "
3 | \n", "5 | \n", "4 | \n", "0.0 | \n", "NaT | \n", "
4 | \n", "6 | \n", "20 | \n", "0.0 | \n", "NaT | \n", "
... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "
130467 | \n", "1256097 | \n", "1 | \n", "1.0 | \n", "0 days 02:11:15 | \n", "
130468 | \n", "1256098 | \n", "1 | \n", "0.0 | \n", "NaT | \n", "
130469 | \n", "1256099 | \n", "1 | \n", "0.0 | \n", "NaT | \n", "
130470 | \n", "1256100 | \n", "1 | \n", "0.0 | \n", "NaT | \n", "
130471 | \n", "1256101 | \n", "1 | \n", "0.0 | \n", "NaT | \n", "
130472 rows × 4 columns
\n", "