{ "cells": [ { "cell_type": "markdown", "id": "ad414c84-be46-4d2c-be8b-9fc4d24cc672", "metadata": {}, "source": [ "# Business Data Challenge - Team 1" ] }, { "cell_type": "code", "execution_count": 1, "id": "15103481-8d74-404c-aa09-7601fe7730da", "metadata": {}, "outputs": [], "source": [ "import pandas as pd\n", "import numpy as np\n", "import os\n", "import s3fs\n", "import re" ] }, { "cell_type": "markdown", "id": "ee97665c-39af-4c1c-a62b-c9c79feae18f", "metadata": {}, "source": [ "Configuration de l'accès aux données" ] }, { "cell_type": "code", "execution_count": 2, "id": "5d83bb1a-d341-446e-91f6-1c428607f6d4", "metadata": {}, "outputs": [], "source": [ "# Create filesystem object\n", "S3_ENDPOINT_URL = \"https://\" + os.environ[\"AWS_S3_ENDPOINT\"]\n", "fs = s3fs.S3FileSystem(client_kwargs={'endpoint_url': S3_ENDPOINT_URL})" ] }, { "cell_type": "markdown", "id": "9cbd72c5-6f8e-4366-ab66-96c32c6e963a", "metadata": {}, "source": [ "# Exemple sur Company 1" ] }, { "cell_type": "markdown", "id": "db26e59a-927c-407e-b54b-1815473b0b34", "metadata": {}, "source": [ "## Chargement données" ] }, { "cell_type": "code", "execution_count": 3, "id": "699664b9-eee4-4f8d-a207-e524526560c5", "metadata": {}, "outputs": [], "source": [ "BUCKET = \"bdc2324-data/1\"\n", "liste_database = fs.ls(BUCKET)" ] }, { "cell_type": "code", "execution_count": 4, "id": "dd6a3518-b752-4a1e-b77b-9e03e853c3ed", "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "/tmp/ipykernel_3658/4081512283.py:10: DtypeWarning: Columns (1) have mixed types. Specify dtype option on import or set low_memory=False.\n", " df = pd.read_csv(file_in)\n" ] } ], "source": [ "# loop to create dataframes from liste\n", "files_path = liste_database\n", "\n", "client_number = files_path[0].split(\"/\")[1]\n", "df_prefix = \"df\" + str(client_number) + \"_\"\n", "\n", "for i in range(len(files_path)) :\n", " current_path = files_path[i]\n", " with fs.open(current_path, mode=\"rb\") as file_in:\n", " df = pd.read_csv(file_in)\n", " # the pattern of the name is df1xxx\n", " nom_dataframe = df_prefix + re.search(r'\\/(\\d+)\\/(\\d+)([a-zA-Z_]+)\\.csv$', current_path).group(3)\n", " globals()[nom_dataframe] = df" ] }, { "cell_type": "markdown", "id": "4004c8bf-11d9-413d-bb42-2cb8ddde7716", "metadata": {}, "source": [ "## Cleaning functions" ] }, { "cell_type": "code", "execution_count": 5, "id": "d237be96-8c86-4a91-b7a1-487e87a16c3d", "metadata": {}, "outputs": [], "source": [ "def cleaning_date(df, column_name):\n", " \"\"\"\n", " Nettoie la colonne spécifiée du DataFrame en convertissant les valeurs en datetime avec le format ISO8601.\n", "\n", " Parameters:\n", " - df: DataFrame\n", " Le DataFrame contenant la colonne à nettoyer.\n", " - column_name: str\n", " Le nom de la colonne à nettoyer.\n", "\n", " Returns:\n", " - DataFrame\n", " Le DataFrame modifié avec la colonne nettoyée.\n", " \"\"\"\n", " df[column_name] = pd.to_datetime(df[column_name], utc = True, format = 'ISO8601')\n", " return df" ] }, { "cell_type": "markdown", "id": "398804d8-2225-4fd3-bceb-75ab1588e359", "metadata": {}, "source": [ "## Preprocessing" ] }, { "cell_type": "markdown", "id": "568cb180-0dd9-4b27-aecb-05e4c3775ba6", "metadata": {}, "source": [ "## customer_plus" ] }, { "cell_type": "code", "execution_count": 6, "id": "7e7b90ce-da54-4f00-bc34-64c543b0858f", "metadata": {}, "outputs": [], "source": [ "def preprocessing_customerplus(customerplus = None):\n", "\n", " customerplus_copy = customerplus.copy()\n", " \n", " # Passage en format date\n", " cleaning_date(customerplus_copy, 'first_buying_date')\n", " cleaning_date(customerplus_copy, 'last_visiting_date')\n", " \n", " # Selection des variables\n", " customerplus_copy.drop(['lastname', 'firstname', 'email', 'civility', 'note', 'created_at', 'updated_at', 'deleted_at', 'extra', 'reference', 'extra_field', 'identifier', 'need_reload', 'preferred_category', 'preferred_supplier', 'preferred_formula', 'zipcode', 'last_visiting_date'], axis = 1, inplace=True)\n", " customerplus_copy.rename(columns = {'id' : 'customer_id'}, inplace = True)\n", "\n", " return customerplus_copy\n" ] }, { "cell_type": "code", "execution_count": 7, "id": "03329e32-00a5-42c8-9470-75f7b6216ccd", "metadata": {}, "outputs": [], "source": [ "df1_customerplus_clean = preprocessing_customerplus(df1_customersplus)" ] }, { "cell_type": "markdown", "id": "bade04b1-0cdf-4d10-bcca-7dc7e4831656", "metadata": {}, "source": [ "## Ticket area" ] }, { "cell_type": "code", "execution_count": 8, "id": "b95464b1-26bc-4aac-84b4-45da83b92251", "metadata": {}, "outputs": [], "source": [ "# Fonction de nettoyage et selection\n", "def preprocessing_tickets_area(tickets = None, purchases = None, suppliers = None, type_ofs = None):\n", " # Base des tickets\n", " tickets = tickets[['id', 'purchase_id', 'product_id', 'is_from_subscription', 'type_of', 'supplier_id']]\n", " tickets.rename(columns = {'id' : 'ticket_id'}, inplace = True)\n", "\n", " # Base des fournisseurs\n", " suppliers = suppliers[['id', 'name']]\n", " suppliers.rename(columns = {'name' : 'supplier_name'}, inplace = True)\n", "\n", " # Base des types de billets\n", " type_ofs = type_ofs[['id', 'name', 'children']]\n", " type_ofs.rename(columns = {'name' : 'type_of_ticket_name'}, inplace = True)\n", "\n", " # Base des achats\n", " # Nettoyage de la date d'achat\n", " cleaning_date(purchases, 'purchase_date')\n", " # Selection des variables\n", " purchases = purchases[['id', 'purchase_date', 'customer_id']]\n", "\n", " # Fusions \n", " # Fusion avec fournisseurs\n", " ticket_information = pd.merge(tickets, suppliers, left_on = 'supplier_id', right_on = 'id', how = 'inner')\n", " ticket_information.drop(['supplier_id', 'id'], axis = 1, inplace=True)\n", " \n", " # Fusion avec type de tickets\n", " ticket_information = pd.merge(ticket_information, type_ofs, left_on = 'type_of', right_on = 'id', how = 'inner')\n", " ticket_information.drop(['type_of', 'id'], axis = 1, inplace=True)\n", " \n", " # Fusion avec achats\n", " ticket_information = pd.merge(ticket_information, purchases, left_on = 'purchase_id', right_on = 'id', how = 'inner')\n", " ticket_information.drop(['purchase_id', 'id'], axis = 1, inplace=True)\n", "\n", " return ticket_information" ] }, { "cell_type": "code", "execution_count": 9, "id": "3e1d2ba7-ff4f-48eb-93a8-2bb648c70396", "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "/tmp/ipykernel_3658/1591303091.py:5: SettingWithCopyWarning: \n", "A value is trying to be set on a copy of a slice from a DataFrame\n", "\n", "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n", " tickets.rename(columns = {'id' : 'ticket_id'}, inplace = True)\n", "/tmp/ipykernel_3658/1591303091.py:9: SettingWithCopyWarning: \n", "A value is trying to be set on a copy of a slice from a DataFrame\n", "\n", "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n", " suppliers.rename(columns = {'name' : 'supplier_name'}, inplace = True)\n", "/tmp/ipykernel_3658/1591303091.py:13: SettingWithCopyWarning: \n", "A value is trying to be set on a copy of a slice from a DataFrame\n", "\n", "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n", " type_ofs.rename(columns = {'name' : 'type_of_ticket_name'}, inplace = True)\n" ] } ], "source": [ "df1_ticket_information = preprocessing_tickets_area(tickets = df1_tickets, purchases = df1_purchases, suppliers = df1_suppliers, type_ofs = df1_type_ofs)" ] }, { "cell_type": "code", "execution_count": 10, "id": "4b18edfc-6450-4c6a-9e7b-ee5a5808c8c9", "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
ticket_idproduct_idis_from_subscriptionsupplier_nametype_of_ticket_namechildrenpurchase_datecustomer_id
013070859225251Falsevente en ligneAtelierpricing_formula2018-12-28 14:47:50+00:0048187
113070860224914Falsevente en ligneAtelierpricing_formula2018-12-28 14:47:50+00:0048187
213070861224914Falsevente en ligneAtelierpricing_formula2018-12-28 14:47:50+00:0048187
313070862224914Falsevente en ligneAtelierpricing_formula2018-12-28 14:47:50+00:0048187
413070863224914Falsevente en ligneAtelierpricing_formula2018-12-28 14:47:50+00:0048187
\n", "
" ], "text/plain": [ " ticket_id product_id is_from_subscription supplier_name \\\n", "0 13070859 225251 False vente en ligne \n", "1 13070860 224914 False vente en ligne \n", "2 13070861 224914 False vente en ligne \n", "3 13070862 224914 False vente en ligne \n", "4 13070863 224914 False vente en ligne \n", "\n", " type_of_ticket_name children purchase_date customer_id \n", "0 Atelier pricing_formula 2018-12-28 14:47:50+00:00 48187 \n", "1 Atelier pricing_formula 2018-12-28 14:47:50+00:00 48187 \n", "2 Atelier pricing_formula 2018-12-28 14:47:50+00:00 48187 \n", "3 Atelier pricing_formula 2018-12-28 14:47:50+00:00 48187 \n", "4 Atelier pricing_formula 2018-12-28 14:47:50+00:00 48187 " ] }, "execution_count": 10, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df1_ticket_information.head()" ] }, { "cell_type": "markdown", "id": "37499eae-1a7f-4dce-83b0-ff942ccf7a9d", "metadata": {}, "source": [ "### KPI tickets" ] }, { "cell_type": "code", "execution_count": 11, "id": "043303fe-e90f-4689-a2a9-5d690555a045", "metadata": {}, "outputs": [], "source": [ "def tickets_kpi_function(tickets_information = None):\n", " tickets_information_copy = tickets_information.copy()\n", " tickets_information_copy['purchase_date_max'] = tickets_information_copy['purchase_date']\n", " tickets_kpi = (tickets_information_copy[['product_id', 'customer_id', 'ticket_id','supplier_name', 'purchase_date', 'purchase_date_max']]\n", " .groupby(['product_id', 'customer_id'])\n", " .agg({'ticket_id': 'count', \n", " 'supplier_name': 'nunique',\n", " 'purchase_date_max' : 'max',\n", " 'purchase_date' : 'min'})\n", " .reset_index()\n", " )\n", " \n", " tickets_kpi.rename(columns = {'ticket_id' : 'nb_tickets', \n", " 'supplier_name' : 'nb_suppliers', \n", " 'purchase_date' : 'purchase_date_min'}, inplace = True)\n", " \n", " tickets_kpi['time_between_purchase'] = tickets_kpi['purchase_date_max'] - tickets_kpi['purchase_date_min']\n", " \n", " return tickets_kpi\n", " " ] }, { "cell_type": "code", "execution_count": 12, "id": "5882234a-1ed5-4269-87a6-0d75613476e3", "metadata": {}, "outputs": [], "source": [ "df1_tickets_kpi = tickets_kpi_function(tickets_information = df1_ticket_information)" ] }, { "cell_type": "code", "execution_count": 13, "id": "a7a452a6-cd5e-4c8b-b250-8a7d26e48fad", "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
product_idcustomer_idnb_ticketsnb_supplierspurchase_date_maxpurchase_date_mintime_between_purchase
01073102805422019-06-05 14:37:13+00:002019-06-05 14:18:38+00:000 days 00:18:35
111008954355112017-02-17 13:32:51+00:002017-02-17 13:32:51+00:000 days 00:00:00
211008954356112017-03-02 14:36:16+00:002017-03-02 14:36:16+00:000 days 00:00:00
311008954357112017-03-06 15:16:41+00:002017-03-06 15:16:41+00:000 days 00:00:00
411008954358112017-03-13 16:07:27+00:002017-03-13 16:07:27+00:000 days 00:00:00
\n", "
" ], "text/plain": [ " product_id customer_id nb_tickets nb_suppliers \\\n", "0 107310 2805 4 2 \n", "1 110089 54355 1 1 \n", "2 110089 54356 1 1 \n", "3 110089 54357 1 1 \n", "4 110089 54358 1 1 \n", "\n", " purchase_date_max purchase_date_min time_between_purchase \n", "0 2019-06-05 14:37:13+00:00 2019-06-05 14:18:38+00:00 0 days 00:18:35 \n", "1 2017-02-17 13:32:51+00:00 2017-02-17 13:32:51+00:00 0 days 00:00:00 \n", "2 2017-03-02 14:36:16+00:00 2017-03-02 14:36:16+00:00 0 days 00:00:00 \n", "3 2017-03-06 15:16:41+00:00 2017-03-06 15:16:41+00:00 0 days 00:00:00 \n", "4 2017-03-13 16:07:27+00:00 2017-03-13 16:07:27+00:00 0 days 00:00:00 " ] }, "execution_count": 13, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df1_tickets_kpi.head()" ] }, { "cell_type": "markdown", "id": "096e47f4-1d65-4575-989d-83227eedad2b", "metadata": {}, "source": [ "## Target area" ] }, { "cell_type": "code", "execution_count": 14, "id": "baed146a-9d3a-4397-a812-3d50c9a2f038", "metadata": {}, "outputs": [], "source": [ "def preprocessing_target_area(targets = None, target_types = None, customer_target_mappings = None):\n", " # Target.csv cleaning\n", " targets = targets[[\"id\", \"target_type_id\", \"name\"]]\n", " targets.rename(columns = {'id' : 'target_id' , 'name' : 'target_name'}, inplace = True)\n", " \n", " # target_type cleaning\n", " target_types = target_types[[\"id\",\"is_import\",\"name\"]].add_prefix(\"target_type_\")\n", " \n", " #customer_target_mappings cleaning\n", " customer_target_mappings = customer_target_mappings[[\"id\", \"customer_id\", \"target_id\"]]\n", " \n", " # Merge target et target_type\n", " targets_full = pd.merge(targets, target_types, left_on='target_type_id', right_on='target_type_id', how='inner')\n", " targets_full.drop(['target_type_id'], axis = 1, inplace=True)\n", " \n", " # Merge\n", " targets_full = pd.merge(customer_target_mappings, targets_full, left_on='target_id', right_on='target_id', how='inner')\n", " targets_full.drop(['target_id'], axis = 1, inplace=True)\n", "\n", " return targets_full" ] }, { "cell_type": "code", "execution_count": 15, "id": "5fbfd88b-b94c-489c-9201-670e96e453e7", "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "/tmp/ipykernel_3658/3848597476.py:4: SettingWithCopyWarning: \n", "A value is trying to be set on a copy of a slice from a DataFrame\n", "\n", "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n", " targets.rename(columns = {'id' : 'target_id' , 'name' : 'target_name'}, inplace = True)\n" ] } ], "source": [ "df1_target_information = preprocessing_target_area(targets = df1_targets, target_types = df1_target_types, customer_target_mappings = df1_customer_target_mappings)" ] }, { "cell_type": "code", "execution_count": 16, "id": "b4f05142-2a22-42ef-a60d-f23cc4b5cb09", "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
customer_id
target_name
consentement optin mediation specialisee150000
consentement optin jeune public149979
consentement optin b2c108909
Arenametrix_bascule tel vers sib35216
consentement optout b2c34523
\n", "
" ], "text/plain": [ " customer_id\n", "target_name \n", "consentement optin mediation specialisee 150000\n", "consentement optin jeune public 149979\n", "consentement optin b2c 108909\n", "Arenametrix_bascule tel vers sib 35216\n", "consentement optout b2c 34523" ] }, "execution_count": 16, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df1_target_information[['target_name', 'customer_id']].groupby('target_name').count().sort_values(by='customer_id', ascending=False).head()" ] }, { "cell_type": "code", "execution_count": 17, "id": "4417ff51-f501-4ab9-a192-4ab75764a8ed", "metadata": { "scrolled": true }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
customer_id
target_name
Arenametrix_bascule tel vers sib35216
Autres_interet_exposition1021
COM Inscrits NL générale (historique)23005
Contacts_prenomsdoubles11643
DDCP MD Procès du Siècle1684
\n", "
" ], "text/plain": [ " customer_id\n", "target_name \n", "Arenametrix_bascule tel vers sib 35216\n", "Autres_interet_exposition 1021\n", "COM Inscrits NL générale (historique) 23005\n", "Contacts_prenomsdoubles 11643\n", "DDCP MD Procès du Siècle 1684" ] }, "execution_count": 17, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df1_target_information_reduced = df1_target_information[['target_name', 'customer_id']].groupby('target_name').count()\n", "df1_target_information_reduced[df1_target_information_reduced['customer_id'] >= 1000].head()" ] }, { "cell_type": "markdown", "id": "cdbb48b4-5e16-4ef4-8791-ed213d68d52f", "metadata": {}, "source": [ "## Campaings area" ] }, { "cell_type": "code", "execution_count": 18, "id": "d883cc7b-ac43-4485-b86f-eaf595fbad85", "metadata": {}, "outputs": [], "source": [ "def preprocessing_campaigns_area(campaign_stats = None, campaigns = None):\n", " # campaign_stats cleaning \n", " campaign_stats = campaign_stats[[\"id\", \"campaign_id\", \"customer_id\", \"opened_at\", \"sent_at\", \"delivered_at\"]]\n", " cleaning_date(campaign_stats, 'opened_at')\n", " cleaning_date(campaign_stats, 'sent_at')\n", " cleaning_date(campaign_stats, 'delivered_at')\n", " \n", " # campaigns cleaning\n", " campaigns = campaigns[[\"id\", \"name\", \"service_id\", \"sent_at\"]].add_prefix(\"campaign_\")\n", " cleaning_date(campaigns, 'campaign_sent_at')\n", " \n", " # Merge \n", " campaigns_full = pd.merge(campaign_stats, campaigns, on = \"campaign_id\", how = \"left\")\n", " campaigns_full.drop(['campaign_id'], axis = 1, inplace=True)\n", "\n", " return campaigns_full" ] }, { "cell_type": "code", "execution_count": 19, "id": "c8552dd6-52c5-4431-b43d-3cd6c578fd9f", "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "/tmp/ipykernel_3658/1967867975.py:15: SettingWithCopyWarning: \n", "A value is trying to be set on a copy of a slice from a DataFrame.\n", "Try using .loc[row_indexer,col_indexer] = value instead\n", "\n", "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n", " df[column_name] = pd.to_datetime(df[column_name], utc = True, format = 'ISO8601')\n", "/tmp/ipykernel_3658/1967867975.py:15: SettingWithCopyWarning: \n", "A value is trying to be set on a copy of a slice from a DataFrame.\n", "Try using .loc[row_indexer,col_indexer] = value instead\n", "\n", "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n", " df[column_name] = pd.to_datetime(df[column_name], utc = True, format = 'ISO8601')\n", "/tmp/ipykernel_3658/1967867975.py:15: SettingWithCopyWarning: \n", "A value is trying to be set on a copy of a slice from a DataFrame.\n", "Try using .loc[row_indexer,col_indexer] = value instead\n", "\n", "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n", " df[column_name] = pd.to_datetime(df[column_name], utc = True, format = 'ISO8601')\n" ] } ], "source": [ "df1_campaigns_information = preprocessing_campaigns_area(campaign_stats = df1_campaign_stats, campaigns = df1_campaigns)" ] }, { "cell_type": "code", "execution_count": 20, "id": "c24457e7-3cad-451a-a65b-7373b656bd6e", "metadata": { "scrolled": true }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
idcustomer_idopened_atsent_atdelivered_atcampaign_namecampaign_service_idcampaign_sent_at
019793112597NaT2021-03-28 16:01:09+00:002021-03-28 16:24:18+00:00Le Mucem chez vous, gardons le lien #224042021-03-27 23:00:00+00:00
114211113666NaT2021-03-28 16:01:09+00:002021-03-28 16:21:02+00:00Le Mucem chez vous, gardons le lien #224042021-03-27 23:00:00+00:00
213150280561NaT2021-03-28 16:00:59+00:002021-03-28 16:08:45+00:00Le Mucem chez vous, gardons le lien #224042021-03-27 23:00:00+00:00
370731010072021-03-28 18:11:06+00:002021-03-28 16:00:59+00:002021-03-28 16:09:47+00:00Le Mucem chez vous, gardons le lien #224042021-03-27 23:00:00+00:00
45175103972NaT2021-03-28 16:01:06+00:002021-03-28 16:05:03+00:00Le Mucem chez vous, gardons le lien #224042021-03-27 23:00:00+00:00
\n", "
" ], "text/plain": [ " id customer_id opened_at sent_at \\\n", "0 19793 112597 NaT 2021-03-28 16:01:09+00:00 \n", "1 14211 113666 NaT 2021-03-28 16:01:09+00:00 \n", "2 13150 280561 NaT 2021-03-28 16:00:59+00:00 \n", "3 7073 101007 2021-03-28 18:11:06+00:00 2021-03-28 16:00:59+00:00 \n", "4 5175 103972 NaT 2021-03-28 16:01:06+00:00 \n", "\n", " delivered_at campaign_name \\\n", "0 2021-03-28 16:24:18+00:00 Le Mucem chez vous, gardons le lien #22 \n", "1 2021-03-28 16:21:02+00:00 Le Mucem chez vous, gardons le lien #22 \n", "2 2021-03-28 16:08:45+00:00 Le Mucem chez vous, gardons le lien #22 \n", "3 2021-03-28 16:09:47+00:00 Le Mucem chez vous, gardons le lien #22 \n", "4 2021-03-28 16:05:03+00:00 Le Mucem chez vous, gardons le lien #22 \n", "\n", " campaign_service_id campaign_sent_at \n", "0 404 2021-03-27 23:00:00+00:00 \n", "1 404 2021-03-27 23:00:00+00:00 \n", "2 404 2021-03-27 23:00:00+00:00 \n", "3 404 2021-03-27 23:00:00+00:00 \n", "4 404 2021-03-27 23:00:00+00:00 " ] }, "execution_count": 20, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df1_campaigns_information.head()" ] }, { "cell_type": "code", "execution_count": 21, "id": "e2c88552-b863-47a2-be23-8d2898fb28bc", "metadata": {}, "outputs": [], "source": [ "def campaigns_kpi_function(campaigns_information = None):\n", " # Nombre de campagnes de mails\n", " nb_campaigns = campaigns_information[['customer_id', 'campaign_name']].groupby('customer_id').count().reset_index()\n", " nb_campaigns.rename(columns = {'campaign_name' : 'nb_campaigns'}, inplace = True)\n", " # Temps d'ouverture en min moyen \n", " campaigns_information['time_to_open'] = campaigns_information['opened_at'] - campaigns_information['delivered_at']\n", " time_to_open = campaigns_information[['customer_id', 'time_to_open']].groupby('customer_id').mean().reset_index()\n", "\n", " # Nombre de mail ouvert \n", " opened_campaign = campaigns_information[['customer_id', 'campaign_name', 'opened_at']]\n", " opened_campaign.dropna(subset=['opened_at'], inplace=True)\n", " opened_campaign = opened_campaign[['customer_id', 'campaign_name']].groupby('customer_id').count().reset_index()\n", " opened_campaign.rename(columns = {'campaign_name' : 'nb_campaigns_opened' }, inplace = True)\n", "\n", " # Fusion des indicateurs\n", " campaigns_reduced = pd.merge(nb_campaigns, opened_campaign, on = 'customer_id', how = 'left')\n", " campaigns_reduced = pd.merge(campaigns_reduced, time_to_open, on = 'customer_id', how = 'left')\n", "\n", " # Remplir les NaN : nb_campaigns_opened\n", " campaigns_reduced['nb_campaigns_opened'].fillna(0, inplace=True)\n", "\n", " # Remplir les NaT : time_to_open (??)\n", "\n", " return campaigns_reduced\n", " " ] }, { "cell_type": "code", "execution_count": 22, "id": "24537647-bc29-4777-9848-ac4120a4aa60", "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "/tmp/ipykernel_3658/3700263836.py:11: SettingWithCopyWarning: \n", "A value is trying to be set on a copy of a slice from a DataFrame\n", "\n", "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n", " opened_campaign.dropna(subset=['opened_at'], inplace=True)\n" ] } ], "source": [ "df1_campaigns_kpi = campaigns_kpi_function(campaigns_information = df1_campaigns_information) " ] }, { "cell_type": "code", "execution_count": 23, "id": "6be2a9a6-056b-4e19-8c26-a18ba3df36b3", "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
customer_idnb_campaignsnb_campaigns_openedtime_to_open
0240.0NaT
13222124.01 days 00:28:30.169354838
2477.01 days 04:31:01.428571428
3540.0NaT
46200.0NaT
\n", "
" ], "text/plain": [ " customer_id nb_campaigns nb_campaigns_opened time_to_open\n", "0 2 4 0.0 NaT\n", "1 3 222 124.0 1 days 00:28:30.169354838\n", "2 4 7 7.0 1 days 04:31:01.428571428\n", "3 5 4 0.0 NaT\n", "4 6 20 0.0 NaT" ] }, "execution_count": 23, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df1_campaigns_kpi.head()" ] }, { "cell_type": "markdown", "id": "56520a97-ede8-4920-a211-3b5b136af33d", "metadata": {}, "source": [ "## Create Products Table" ] }, { "cell_type": "markdown", "id": "9782e9d3-ba20-46bf-8562-bd0969972ddc", "metadata": {}, "source": [ "Some useful functions" ] }, { "cell_type": "code", "execution_count": 24, "id": "30488a40-1b38-4b9a-9d3b-26a0597c5e6d", "metadata": {}, "outputs": [], "source": [ "BUCKET = \"bdc2324-data\"\n", "directory_path = '1'" ] }, { "cell_type": "code", "execution_count": 25, "id": "607eb4b4-eed9-4b50-b823-f75c116dd37c", "metadata": {}, "outputs": [], "source": [ "def display_databases(file_name):\n", " \"\"\"\n", " This function returns the file from s3 storage\n", " \"\"\"\n", " file_path = BUCKET + \"/\" + directory_path + \"/\" + file_name\n", " print(\"File path : \", file_path)\n", " with fs.open(file_path, mode=\"rb\") as file_in:\n", " df = pd.read_csv(file_in, sep=\",\")\n", " \n", " print(\"Shape : \", df.shape)\n", " return df\n", "\n", "\n", "def remove_horodates(df):\n", " \"\"\"\n", " this function remove horodate columns like created_at and updated_at\n", " \"\"\"\n", " df = df.drop(columns = [\"created_at\", \"updated_at\"])\n", " return df\n", "\n", "\n", "def order_columns_id(df):\n", " \"\"\"\n", " this function puts all id columns at the beginning in order to read the dataset easier\n", " \"\"\"\n", " substring = 'id'\n", " id_columns = [col for col in df.columns if substring in col]\n", " remaining_col = [col for col in df.columns if substring not in col]\n", " new_order = id_columns + remaining_col\n", " return df[new_order]\n", "\n", "\n", "def process_df_2(df):\n", " \"\"\"\n", " This function organizes dataframe\n", " \"\"\"\n", " df = remove_horodates(df)\n", " print(\"Number of columns : \", len(df.columns))\n", " df = order_columns_id(df)\n", " print(\"Columns : \", df.columns)\n", " return df\n", "\n", "def load_dataset(name):\n", " \"\"\"\n", " This function loads csv file\n", " \"\"\"\n", " df = display_databases(name)\n", " df = process_df_2(df)\n", " # drop na :\n", " #df = df.dropna(axis=1, thresh=len(df))\n", " # if identifier in table : delete it\n", " if 'identifier' in df.columns:\n", " df = df.drop(columns = 'identifier')\n", " return df" ] }, { "cell_type": "markdown", "id": "d23f28c0-bc95-438b-8d14-5b7bb6e267bd", "metadata": {}, "source": [ "Create theme tables" ] }, { "cell_type": "code", "execution_count": 26, "id": "350b09b9-451f-4d47-81fe-f34b892db027", "metadata": {}, "outputs": [], "source": [ "def create_products_table():\n", " # first merge products and categories\n", " print(\"first merge products and categories\")\n", " products = load_dataset(\"1products.csv\")\n", " categories = load_dataset(\"1categories.csv\")\n", " # Drop useless columns\n", " products = products.drop(columns = ['apply_price', 'extra_field', 'amount_consumption'])\n", " categories = categories.drop(columns = ['extra_field', 'quota'])\n", "\n", " #Merge\n", " products_theme = products.merge(categories, how = 'left', left_on = 'category_id',\n", " right_on = 'id', suffixes=('_products', '_categories'))\n", " products_theme = products_theme.rename(columns = {\"name\" : \"name_categories\"})\n", " \n", " # Second merge products_theme and type of categories\n", " print(\"Second merge products_theme and type of categories\")\n", " type_of_categories = load_dataset(\"1type_of_categories.csv\")\n", " type_of_categories = type_of_categories.drop(columns = 'id')\n", " products_theme = products_theme.merge(type_of_categories, how = 'left', left_on = 'category_id',\n", " right_on = 'category_id' )\n", "\n", " # Index cleaning\n", " products_theme = products_theme.drop(columns = ['id_categories'])\n", " products_theme = order_columns_id(products_theme)\n", " return products_theme\n", "\n", "\n", "def create_events_table():\n", " # first merge events and seasons : \n", " print(\"first merge events and seasons : \")\n", " events = load_dataset(\"1events.csv\")\n", " seasons = load_dataset(\"1seasons.csv\")\n", "\n", " # Drop useless columns\n", " events = events.drop(columns = ['manual_added', 'is_display'])\n", " seasons = seasons.drop(columns = ['start_date_time'])\n", " \n", " events_theme = events.merge(seasons, how = 'left', left_on = 'season_id', right_on = 'id', suffixes=('_events', '_seasons'))\n", "\n", " # Secondly merge events_theme and event_types\n", " print(\"Secondly merge events_theme and event_types : \")\n", " event_types = load_dataset(\"1event_types.csv\")\n", " event_types = event_types.drop(columns = ['fidelity_delay'])\n", " \n", " events_theme = events_theme.merge(event_types, how = 'left', left_on = 'event_type_id', right_on = 'id', suffixes=('_events', '_event_type'))\n", " events_theme = events_theme.rename(columns = {\"name\" : \"name_event_types\"})\n", " events_theme = events_theme.drop(columns = 'id')\n", "\n", " # thirdly merge events_theme and facilities\n", " print(\"thirdly merge events_theme and facilities : \")\n", " facilities = load_dataset(\"1facilities.csv\")\n", " facilities = facilities.drop(columns = ['fixed_capacity'])\n", " \n", " events_theme = events_theme.merge(facilities, how = 'left', left_on = 'facility_id', right_on = 'id', suffixes=('_events', '_facility'))\n", " events_theme = events_theme.rename(columns = {\"name\" : \"name_facilities\", \"id_events\" : \"event_id\"})\n", " events_theme = events_theme.drop(columns = 'id')\n", "\n", " # Index cleaning\n", " events_theme = events_theme.drop(columns = ['id_seasons'])\n", " events_theme = order_columns_id(events_theme)\n", " return events_theme\n", "\n", "\n", "def create_representations_table():\n", " representations = load_dataset(\"1representations.csv\")\n", " representations = representations.drop(columns = ['serial', 'open', 'satisfaction', 'is_display', 'expected_filling',\n", " 'max_filling', 'extra_field', 'start_date_time', 'end_date_time', 'name',\n", " 'representation_type_id'])\n", " \n", " representations_capacity = load_dataset(\"1representation_category_capacities.csv\")\n", " representations_capacity = representations_capacity.drop(columns = ['expected_filling', 'max_filling'])\n", "\n", " representations_theme = representations.merge(representations_capacity, how='left',\n", " left_on='id', right_on='representation_id',\n", " suffixes=('_representation', '_representation_cap'))\n", " # index cleaning\n", " representations_theme = representations_theme.drop(columns = [\"id_representation\"])\n", " representations_theme = order_columns_id(representations_theme)\n", " return representations_theme" ] }, { "cell_type": "code", "execution_count": 27, "id": "0fccc8ef-e575-4857-a401-94a7274394df", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "first merge products and categories\n", "File path : bdc2324-data/1/1products.csv\n", "Shape : (94803, 14)\n", "Number of columns : 12\n", "Columns : Index(['id', 'representation_id', 'pricing_formula_id', 'category_id',\n", " 'products_group_id', 'product_pack_id', 'identifier', 'amount',\n", " 'is_full_price', 'apply_price', 'extra_field', 'amount_consumption'],\n", " dtype='object')\n", "File path : bdc2324-data/1/1categories.csv\n", "Shape : (27, 7)\n", "Number of columns : 5\n", "Columns : Index(['id', 'identifier', 'name', 'extra_field', 'quota'], dtype='object')\n", "Second merge products_theme and type of categories\n", "File path : bdc2324-data/1/1type_of_categories.csv\n", "Shape : (5, 6)\n", "Number of columns : 4\n", "Columns : Index(['id', 'type_of_id', 'category_id', 'identifier'], dtype='object')\n" ] }, { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
id_productsrepresentation_idpricing_formula_idcategory_idproducts_group_idproduct_pack_idtype_of_idamountis_full_pricename_categories
01068291411441106551NaN9.0Falseindiv activité tr
14782731311471112.09.5Falseindiv entrées tp
220873275137120825112.011.5Falseindiv entrées tp
315714282519951567731NaN8.0Falseindiv entrées tr
4134199311175112.08.5Falseindiv entrées tp
\n", "
" ], "text/plain": [ " id_products representation_id pricing_formula_id category_id \\\n", "0 10682 914 114 41 \n", "1 478 273 131 1 \n", "2 20873 275 137 1 \n", "3 157142 82519 9 5 \n", "4 1341 9 93 1 \n", "\n", " products_group_id product_pack_id type_of_id amount is_full_price \\\n", "0 10655 1 NaN 9.0 False \n", "1 471 1 12.0 9.5 False \n", "2 20825 1 12.0 11.5 False \n", "3 156773 1 NaN 8.0 False \n", "4 1175 1 12.0 8.5 False \n", "\n", " name_categories \n", "0 indiv activité tr \n", "1 indiv entrées tp \n", "2 indiv entrées tp \n", "3 indiv entrées tr \n", "4 indiv entrées tp " ] }, "execution_count": 27, "metadata": {}, "output_type": "execute_result" } ], "source": [ "products_theme = create_products_table()\n", "products_theme.head()" ] }, { "cell_type": "code", "execution_count": 28, "id": "779d8aaf-6668-4f66-8852-847304407ea3", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "first merge events and seasons : \n", "File path : bdc2324-data/1/1events.csv\n", "Shape : (1232, 12)\n", "Number of columns : 10\n", "Columns : Index(['id', 'season_id', 'facility_id', 'event_type_id', 'event_type_key_id',\n", " 'facility_key_id', 'identifier', 'name', 'manual_added', 'is_display'],\n", " dtype='object')\n", "File path : bdc2324-data/1/1seasons.csv\n", "Shape : (13, 6)\n", "Number of columns : 4\n", "Columns : Index(['id', 'identifier', 'name', 'start_date_time'], dtype='object')\n", "Secondly merge events_theme and event_types : \n", "File path : bdc2324-data/1/1event_types.csv\n", "Shape : (9, 6)\n", "Number of columns : 4\n", "Columns : Index(['id', 'fidelity_delay', 'identifier', 'name'], dtype='object')\n", "thirdly merge events_theme and facilities : \n", "File path : bdc2324-data/1/1facilities.csv\n", "Shape : (2, 7)\n", "Number of columns : 5\n", "Columns : Index(['id', 'street_id', 'identifier', 'name', 'fixed_capacity'], dtype='object')\n" ] }, { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
event_idseason_idfacility_idevent_type_idevent_type_key_idfacility_key_idstreet_idname_eventsname_seasonsname_event_typesname_facilities
01921614411frontières2018spectacle vivantmucem
130329276715511visite guidée une autre histoire du monde (1h00)2023offre muséale groupemucem
21611612211visite contée les chercheurs d'or indiv2018offre muséale individuelmucem
3595758214411we dreamt of utopia and we woke up screaming.2021spectacle vivantmucem
4833758214411jeff koons épisodes 42021spectacle vivantmucem
\n", "
" ], "text/plain": [ " event_id season_id facility_id event_type_id event_type_key_id \\\n", "0 192 16 1 4 4 \n", "1 30329 2767 1 5 5 \n", "2 161 16 1 2 2 \n", "3 5957 582 1 4 4 \n", "4 8337 582 1 4 4 \n", "\n", " facility_key_id street_id \\\n", "0 1 1 \n", "1 1 1 \n", "2 1 1 \n", "3 1 1 \n", "4 1 1 \n", "\n", " name_events name_seasons \\\n", "0 frontières 2018 \n", "1 visite guidée une autre histoire du monde (1h00) 2023 \n", "2 visite contée les chercheurs d'or indiv 2018 \n", "3 we dreamt of utopia and we woke up screaming. 2021 \n", "4 jeff koons épisodes 4 2021 \n", "\n", " name_event_types name_facilities \n", "0 spectacle vivant mucem \n", "1 offre muséale groupe mucem \n", "2 offre muséale individuel mucem \n", "3 spectacle vivant mucem \n", "4 spectacle vivant mucem " ] }, "execution_count": 28, "metadata": {}, "output_type": "execute_result" } ], "source": [ "events_theme= create_events_table()\n", "events_theme.head()" ] }, { "cell_type": "code", "execution_count": 29, "id": "7714fa32-303b-4ea7-b174-3fd0fcab5af0", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "File path : bdc2324-data/1/1representations.csv\n", "Shape : (36095, 16)\n", "Number of columns : 14\n", "Columns : Index(['id', 'event_id', 'representation_type_id', 'identifier', 'serial',\n", " 'start_date_time', 'open', 'satisfaction', 'end_date_time', 'name',\n", " 'is_display', 'expected_filling', 'max_filling', 'extra_field'],\n", " dtype='object')\n", "File path : bdc2324-data/1/1representation_category_capacities.csv\n", "Shape : (65241, 7)\n", "Number of columns : 5\n", "Columns : Index(['id', 'representation_id', 'category_id', 'expected_filling',\n", " 'max_filling'],\n", " dtype='object')\n" ] }, { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
event_idid_representation_caprepresentation_idcategory_id
012384123058848202
13725142692
2373842695
337251526910
4373832691
\n", "
" ], "text/plain": [ " event_id id_representation_cap representation_id category_id\n", "0 12384 123058 84820 2\n", "1 37 2514 269 2\n", "2 37 384 269 5\n", "3 37 2515 269 10\n", "4 37 383 269 1" ] }, "execution_count": 29, "metadata": {}, "output_type": "execute_result" } ], "source": [ "representation_theme = create_representations_table()\n", "representation_theme.head()" ] }, { "cell_type": "markdown", "id": "8fa191d5-c867-4d4d-bbab-f29d7d91ce6a", "metadata": {}, "source": [ "Create uniform product database " ] }, { "cell_type": "code", "execution_count": 30, "id": "15a62ed6-35e4-4abc-aeef-a7daeec0a4ba", "metadata": {}, "outputs": [], "source": [ "def uniform_product_df():\n", " \"\"\"\n", " This function returns the uniform product dataset\n", " \"\"\"\n", " print(\"Products theme columns : \", products_theme.columns)\n", " print(\"\\n Representation theme columns : \", representation_theme.columns)\n", " print(\"\\n Events theme columns : \", events_theme.columns)\n", "\n", " products_global = products_theme.merge(representation_theme, how='left',\n", " on= [\"representation_id\", \"category_id\"])\n", " \n", " products_global = products_global.merge(events_theme, how='left', on='event_id',\n", " suffixes = (\"_representation\", \"_event\"))\n", " \n", " products_global = order_columns_id(products_global)\n", "\n", " # remove useless columns \n", " products_global = products_global.drop(columns = ['type_of_id', 'name_events', 'name_seasons', 'name_categories'])\n", " return products_global" ] }, { "cell_type": "code", "execution_count": 31, "id": "89dc9685-1de9-4ce3-a6c0-8d7f1931a951", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Products theme columns : Index(['id_products', 'representation_id', 'pricing_formula_id', 'category_id',\n", " 'products_group_id', 'product_pack_id', 'type_of_id', 'amount',\n", " 'is_full_price', 'name_categories'],\n", " dtype='object')\n", "\n", " Representation theme columns : Index(['event_id', 'id_representation_cap', 'representation_id',\n", " 'category_id'],\n", " dtype='object')\n", "\n", " Events theme columns : Index(['event_id', 'season_id', 'facility_id', 'event_type_id',\n", " 'event_type_key_id', 'facility_key_id', 'street_id', 'name_events',\n", " 'name_seasons', 'name_event_types', 'name_facilities'],\n", " dtype='object')\n" ] }, { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
id_productsrepresentation_idpricing_formula_idcategory_idproducts_group_idproduct_pack_idevent_idid_representation_capseason_idfacility_idevent_type_idevent_type_key_idfacility_key_idstreet_idamountis_full_pricename_event_typesname_facilities
0106829141144110655113287894125119.0Falseoffre muséale individuelmucem
147827313114711373902122119.5Falseoffre muséale individuelmucem
22087327513712082513739521221111.5Falseoffre muséale individuelmucem
315714282519951567731123651201991754124118.0Falseoffre muséale individuelmucem
413419931117518214136118.5Falsenon définimucem
\n", "
" ], "text/plain": [ " id_products representation_id pricing_formula_id category_id \\\n", "0 10682 914 114 41 \n", "1 478 273 131 1 \n", "2 20873 275 137 1 \n", "3 157142 82519 9 5 \n", "4 1341 9 93 1 \n", "\n", " products_group_id product_pack_id event_id id_representation_cap \\\n", "0 10655 1 132 8789 \n", "1 471 1 37 390 \n", "2 20825 1 37 395 \n", "3 156773 1 12365 120199 \n", "4 1175 1 8 21 \n", "\n", " season_id facility_id event_type_id event_type_key_id facility_key_id \\\n", "0 4 1 2 5 1 \n", "1 2 1 2 2 1 \n", "2 2 1 2 2 1 \n", "3 1754 1 2 4 1 \n", "4 4 1 3 6 1 \n", "\n", " street_id amount is_full_price name_event_types name_facilities \n", "0 1 9.0 False offre muséale individuel mucem \n", "1 1 9.5 False offre muséale individuel mucem \n", "2 1 11.5 False offre muséale individuel mucem \n", "3 1 8.0 False offre muséale individuel mucem \n", "4 1 8.5 False non défini mucem " ] }, "execution_count": 31, "metadata": {}, "output_type": "execute_result" } ], "source": [ "products_global = uniform_product_df()\n", "products_global.head()" ] }, { "cell_type": "markdown", "id": "7c3211a5-a851-43bc-a1f0-b39d51857fb7", "metadata": {}, "source": [ "# Fusion des bases locales" ] }, { "cell_type": "code", "execution_count": 32, "id": "46de1912-4a66-46e5-8b9e-7768b2d2723b", "metadata": {}, "outputs": [], "source": [ "# Fusion liée au product\n", "df1_products_purchased = pd.merge(df1_tickets_kpi, products_global, left_on = 'product_id', right_on = 'id_products', how = 'inner')\n", "\n", "# Fusion liée au customer\n", "df1_customer = pd.merge(df1_customerplus_clean, df1_campaigns_kpi, on = 'customer_id', how = 'left')\n", "\n", "# Fusion product et customer\n", "df1_customer_product = pd.merge(df1_customer, df1_products_purchased, on = 'customer_id', how = 'left')" ] }, { "cell_type": "code", "execution_count": null, "id": "1e42a790-b215-4107-a969-85005da06ebd", "metadata": {}, "outputs": [], "source": [] } ], "metadata": { "kernelspec": { "display_name": "Python 3 (ipykernel)", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.10.13" } }, "nbformat": 4, "nbformat_minor": 5 }