BDC-team-1/0_Cleaning_and_merge.ipynb

{
 "cells": [
  {
   "cell_type": "markdown",
   "id": "ad414c84-be46-4d2c-be8b-9fc4d24cc672",
   "metadata": {},
   "source": [
    "# Business Data Challenge - Team 1"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "id": "15103481-8d74-404c-aa09-7601fe7730da",
   "metadata": {},
   "outputs": [],
   "source": [
    "import pandas as pd\n",
    "import numpy as np\n",
    "import os\n",
    "import s3fs\n",
    "import re"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "ee97665c-39af-4c1c-a62b-c9c79feae18f",
   "metadata": {},
   "source": [
    "Configuration de l'accès aux données"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "id": "5d83bb1a-d341-446e-91f6-1c428607f6d4",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Create filesystem object\n",
    "S3_ENDPOINT_URL = \"https://\" + os.environ[\"AWS_S3_ENDPOINT\"]\n",
    "fs = s3fs.S3FileSystem(client_kwargs={'endpoint_url': S3_ENDPOINT_URL})"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "9cbd72c5-6f8e-4366-ab66-96c32c6e963a",
   "metadata": {},
   "source": [
    "# Exemple sur Company 1"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "db26e59a-927c-407e-b54b-1815473b0b34",
   "metadata": {},
   "source": [
    "## Chargement données"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "id": "699664b9-eee4-4f8d-a207-e524526560c5",
   "metadata": {},
   "outputs": [],
   "source": [
    "BUCKET = \"bdc2324-data/1\"\n",
    "liste_database = fs.ls(BUCKET)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "id": "aaf64d60-bf92-470c-8210-d09abd6a653e",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "['bdc2324-data/1/1campaign_stats.csv',\n",
       " 'bdc2324-data/1/1campaigns.csv',\n",
       " 'bdc2324-data/1/1categories.csv',\n",
       " 'bdc2324-data/1/1countries.csv',\n",
       " 'bdc2324-data/1/1currencies.csv',\n",
       " 'bdc2324-data/1/1customer_target_mappings.csv',\n",
       " 'bdc2324-data/1/1customersplus.csv',\n",
       " 'bdc2324-data/1/1event_types.csv',\n",
       " 'bdc2324-data/1/1events.csv',\n",
       " 'bdc2324-data/1/1facilities.csv',\n",
       " 'bdc2324-data/1/1link_stats.csv',\n",
       " 'bdc2324-data/1/1pricing_formulas.csv',\n",
       " 'bdc2324-data/1/1product_packs.csv',\n",
       " 'bdc2324-data/1/1products.csv',\n",
       " 'bdc2324-data/1/1products_groups.csv',\n",
       " 'bdc2324-data/1/1purchases.csv',\n",
       " 'bdc2324-data/1/1representation_category_capacities.csv',\n",
       " 'bdc2324-data/1/1representations.csv',\n",
       " 'bdc2324-data/1/1seasons.csv',\n",
       " 'bdc2324-data/1/1structure_tag_mappings.csv',\n",
       " 'bdc2324-data/1/1suppliers.csv',\n",
       " 'bdc2324-data/1/1tags.csv',\n",
       " 'bdc2324-data/1/1target_types.csv',\n",
       " 'bdc2324-data/1/1targets.csv',\n",
       " 'bdc2324-data/1/1tickets.csv',\n",
       " 'bdc2324-data/1/1type_of_categories.csv',\n",
       " 'bdc2324-data/1/1type_of_pricing_formulas.csv',\n",
       " 'bdc2324-data/1/1type_ofs.csv']"
      ]
     },
     "execution_count": 4,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "liste_database"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "id": "dd6a3518-b752-4a1e-b77b-9e03e853c3ed",
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/tmp/ipykernel_2240/4081512283.py:10: DtypeWarning: Columns (1) have mixed types. Specify dtype option on import or set low_memory=False.\n",
      "  df = pd.read_csv(file_in)\n"
     ]
    }
   ],
   "source": [
    "# loop to create dataframes from liste\n",
    "files_path = liste_database\n",
    "\n",
    "client_number = files_path[0].split(\"/\")[1]\n",
    "df_prefix = \"df\" + str(client_number) + \"_\"\n",
    "\n",
    "for i in range(len(files_path)) :\n",
    "    current_path = files_path[i]\n",
    "    with fs.open(current_path, mode=\"rb\") as file_in:\n",
    "        df = pd.read_csv(file_in)\n",
    "        # the pattern of the name is df1xxx\n",
    "        nom_dataframe = df_prefix + re.search(r'\\/(\\d+)\\/(\\d+)([a-zA-Z_]+)\\.csv$', current_path).group(3)\n",
    "        globals()[nom_dataframe] = df"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "4004c8bf-11d9-413d-bb42-2cb8ddde7716",
   "metadata": {},
   "source": [
    "## Cleaning functions"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "id": "d237be96-8c86-4a91-b7a1-487e87a16c3d",
   "metadata": {},
   "outputs": [],
   "source": [
    "def cleaning_date(df, column_name):\n",
    "    \"\"\"\n",
    "    Nettoie la colonne spécifiée du DataFrame en convertissant les valeurs en datetime avec le format ISO8601.\n",
    "\n",
    "    Parameters:\n",
    "    - df: DataFrame\n",
    "        Le DataFrame contenant la colonne à nettoyer.\n",
    "    - column_name: str\n",
    "        Le nom de la colonne à nettoyer.\n",
    "\n",
    "    Returns:\n",
    "    - DataFrame\n",
    "        Le DataFrame modifié avec la colonne nettoyée.\n",
    "    \"\"\"\n",
    "    df[column_name] = pd.to_datetime(df[column_name], utc = True, format = 'ISO8601')\n",
    "    return df"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "398804d8-2225-4fd3-bceb-75ab1588e359",
   "metadata": {},
   "source": [
    "## Preprocessing"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "568cb180-0dd9-4b27-aecb-05e4c3775ba6",
   "metadata": {},
   "source": [
    "## customer_plus"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "id": "7e7b90ce-da54-4f00-bc34-64c543b0858f",
   "metadata": {},
   "outputs": [],
   "source": [
    "def preprocessing_customerplus(customerplus = None):\n",
    "\n",
    "    customerplus_copy = customerplus.copy()\n",
    "    \n",
    "    # Passage en format date\n",
    "    cleaning_date(customerplus_copy, 'first_buying_date')\n",
    "    cleaning_date(customerplus_copy, 'last_visiting_date')\n",
    "    \n",
    "    # Selection des variables\n",
    "    customerplus_copy.drop(['lastname', 'firstname', 'email', 'civility', 'note', 'created_at', 'updated_at', 'deleted_at', 'extra', 'reference', 'extra_field', 'identifier', 'need_reload', 'preferred_category', 'preferred_supplier', 'preferred_formula', 'zipcode', 'last_visiting_date'], axis = 1, inplace=True)\n",
    "    customerplus_copy.rename(columns = {'id' : 'customer_id'}, inplace = True)\n",
    "\n",
    "    return customerplus_copy\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "id": "03329e32-00a5-42c8-9470-75f7b6216ccd",
   "metadata": {},
   "outputs": [],
   "source": [
    "df1_customerplus_clean = preprocessing_customerplus(df1_customersplus)"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "bade04b1-0cdf-4d10-bcca-7dc7e4831656",
   "metadata": {},
   "source": [
    "## Ticket area"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "id": "b95464b1-26bc-4aac-84b4-45da83b92251",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Fonction de nettoyage et selection\n",
    "def preprocessing_tickets_area(tickets = None, purchases = None, suppliers = None, type_ofs = None):\n",
    "    # Base des tickets\n",
    "    tickets = tickets[['id', 'purchase_id', 'product_id', 'is_from_subscription', 'type_of', 'supplier_id']]\n",
    "    tickets.rename(columns = {'id' : 'ticket_id'}, inplace = True)\n",
    "\n",
    "    # Base des fournisseurs\n",
    "    suppliers = suppliers[['id', 'name']]\n",
    "    suppliers.rename(columns = {'name' : 'supplier_name'}, inplace = True)\n",
    "\n",
    "    # Base des types de billets\n",
    "    type_ofs = type_ofs[['id', 'name', 'children']]\n",
    "    type_ofs.rename(columns = {'name' : 'type_of_ticket_name'}, inplace = True)\n",
    "\n",
    "    # Base des achats\n",
    "    # Nettoyage de la date d'achat\n",
    "    cleaning_date(purchases, 'purchase_date')\n",
    "    # Selection des variables\n",
    "    purchases = purchases[['id', 'purchase_date', 'customer_id']]\n",
    "\n",
    "    # Fusions \n",
    "    # Fusion avec fournisseurs\n",
    "    ticket_information = pd.merge(tickets, suppliers, left_on = 'supplier_id', right_on = 'id', how = 'inner')\n",
    "    ticket_information.drop(['supplier_id', 'id'], axis = 1, inplace=True)\n",
    "    \n",
    "    # Fusion avec type de tickets\n",
    "    ticket_information = pd.merge(ticket_information, type_ofs, left_on = 'type_of', right_on = 'id', how = 'inner')\n",
    "    ticket_information.drop(['type_of', 'id'], axis = 1, inplace=True)\n",
    "    \n",
    "    # Fusion avec achats\n",
    "    ticket_information = pd.merge(ticket_information, purchases, left_on = 'purchase_id', right_on = 'id', how = 'inner')\n",
    "    ticket_information.drop(['purchase_id', 'id'], axis = 1, inplace=True)\n",
    "\n",
    "    return ticket_information"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "id": "3e1d2ba7-ff4f-48eb-93a8-2bb648c70396",
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/tmp/ipykernel_2240/1591303091.py:5: SettingWithCopyWarning: \n",
      "A value is trying to be set on a copy of a slice from a DataFrame\n",
      "\n",
      "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n",
      "  tickets.rename(columns = {'id' : 'ticket_id'}, inplace = True)\n",
      "/tmp/ipykernel_2240/1591303091.py:9: SettingWithCopyWarning: \n",
      "A value is trying to be set on a copy of a slice from a DataFrame\n",
      "\n",
      "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n",
      "  suppliers.rename(columns = {'name' : 'supplier_name'}, inplace = True)\n",
      "/tmp/ipykernel_2240/1591303091.py:13: SettingWithCopyWarning: \n",
      "A value is trying to be set on a copy of a slice from a DataFrame\n",
      "\n",
      "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n",
      "  type_ofs.rename(columns = {'name' : 'type_of_ticket_name'}, inplace = True)\n"
     ]
    }
   ],
   "source": [
    "df1_ticket_information = preprocessing_tickets_area(tickets = df1_tickets, purchases = df1_purchases, suppliers = df1_suppliers, type_ofs = df1_type_ofs)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "id": "4b18edfc-6450-4c6a-9e7b-ee5a5808c8c9",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>ticket_id</th>\n",
       "      <th>product_id</th>\n",
       "      <th>is_from_subscription</th>\n",
       "      <th>supplier_name</th>\n",
       "      <th>type_of_ticket_name</th>\n",
       "      <th>children</th>\n",
       "      <th>purchase_date</th>\n",
       "      <th>customer_id</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>13070859</td>\n",
       "      <td>225251</td>\n",
       "      <td>False</td>\n",
       "      <td>vente en ligne</td>\n",
       "      <td>Atelier</td>\n",
       "      <td>pricing_formula</td>\n",
       "      <td>2018-12-28 14:47:50+00:00</td>\n",
       "      <td>48187</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>13070860</td>\n",
       "      <td>224914</td>\n",
       "      <td>False</td>\n",
       "      <td>vente en ligne</td>\n",
       "      <td>Atelier</td>\n",
       "      <td>pricing_formula</td>\n",
       "      <td>2018-12-28 14:47:50+00:00</td>\n",
       "      <td>48187</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>13070861</td>\n",
       "      <td>224914</td>\n",
       "      <td>False</td>\n",
       "      <td>vente en ligne</td>\n",
       "      <td>Atelier</td>\n",
       "      <td>pricing_formula</td>\n",
       "      <td>2018-12-28 14:47:50+00:00</td>\n",
       "      <td>48187</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>13070862</td>\n",
       "      <td>224914</td>\n",
       "      <td>False</td>\n",
       "      <td>vente en ligne</td>\n",
       "      <td>Atelier</td>\n",
       "      <td>pricing_formula</td>\n",
       "      <td>2018-12-28 14:47:50+00:00</td>\n",
       "      <td>48187</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>13070863</td>\n",
       "      <td>224914</td>\n",
       "      <td>False</td>\n",
       "      <td>vente en ligne</td>\n",
       "      <td>Atelier</td>\n",
       "      <td>pricing_formula</td>\n",
       "      <td>2018-12-28 14:47:50+00:00</td>\n",
       "      <td>48187</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>...</th>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1826667</th>\n",
       "      <td>18643847</td>\n",
       "      <td>350454</td>\n",
       "      <td>False</td>\n",
       "      <td>vad</td>\n",
       "      <td>Billet en nombre</td>\n",
       "      <td>pricing_formula</td>\n",
       "      <td>2022-08-02 08:59:17+00:00</td>\n",
       "      <td>41</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1826668</th>\n",
       "      <td>19853111</td>\n",
       "      <td>383564</td>\n",
       "      <td>False</td>\n",
       "      <td>vad</td>\n",
       "      <td>Billet en nombre</td>\n",
       "      <td>pricing_formula</td>\n",
       "      <td>2022-11-04 14:25:42+00:00</td>\n",
       "      <td>62763</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1826669</th>\n",
       "      <td>19860514</td>\n",
       "      <td>383751</td>\n",
       "      <td>False</td>\n",
       "      <td>vad</td>\n",
       "      <td>Billet en nombre</td>\n",
       "      <td>pricing_formula</td>\n",
       "      <td>2022-11-18 10:47:26+00:00</td>\n",
       "      <td>1195566</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1826670</th>\n",
       "      <td>19860515</td>\n",
       "      <td>383751</td>\n",
       "      <td>False</td>\n",
       "      <td>vad</td>\n",
       "      <td>Billet en nombre</td>\n",
       "      <td>pricing_formula</td>\n",
       "      <td>2022-11-18 10:47:26+00:00</td>\n",
       "      <td>1195566</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1826671</th>\n",
       "      <td>19860516</td>\n",
       "      <td>383751</td>\n",
       "      <td>False</td>\n",
       "      <td>vad</td>\n",
       "      <td>Billet en nombre</td>\n",
       "      <td>pricing_formula</td>\n",
       "      <td>2022-11-18 10:47:26+00:00</td>\n",
       "      <td>1195566</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>1826672 rows × 8 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "         ticket_id  product_id  is_from_subscription   supplier_name  \\\n",
       "0         13070859      225251                 False  vente en ligne   \n",
       "1         13070860      224914                 False  vente en ligne   \n",
       "2         13070861      224914                 False  vente en ligne   \n",
       "3         13070862      224914                 False  vente en ligne   \n",
       "4         13070863      224914                 False  vente en ligne   \n",
       "...            ...         ...                   ...             ...   \n",
       "1826667   18643847      350454                 False             vad   \n",
       "1826668   19853111      383564                 False             vad   \n",
       "1826669   19860514      383751                 False             vad   \n",
       "1826670   19860515      383751                 False             vad   \n",
       "1826671   19860516      383751                 False             vad   \n",
       "\n",
       "        type_of_ticket_name         children             purchase_date  \\\n",
       "0                   Atelier  pricing_formula 2018-12-28 14:47:50+00:00   \n",
       "1                   Atelier  pricing_formula 2018-12-28 14:47:50+00:00   \n",
       "2                   Atelier  pricing_formula 2018-12-28 14:47:50+00:00   \n",
       "3                   Atelier  pricing_formula 2018-12-28 14:47:50+00:00   \n",
       "4                   Atelier  pricing_formula 2018-12-28 14:47:50+00:00   \n",
       "...                     ...              ...                       ...   \n",
       "1826667    Billet en nombre  pricing_formula 2022-08-02 08:59:17+00:00   \n",
       "1826668    Billet en nombre  pricing_formula 2022-11-04 14:25:42+00:00   \n",
       "1826669    Billet en nombre  pricing_formula 2022-11-18 10:47:26+00:00   \n",
       "1826670    Billet en nombre  pricing_formula 2022-11-18 10:47:26+00:00   \n",
       "1826671    Billet en nombre  pricing_formula 2022-11-18 10:47:26+00:00   \n",
       "\n",
       "         customer_id  \n",
       "0              48187  \n",
       "1              48187  \n",
       "2              48187  \n",
       "3              48187  \n",
       "4              48187  \n",
       "...              ...  \n",
       "1826667           41  \n",
       "1826668        62763  \n",
       "1826669      1195566  \n",
       "1826670      1195566  \n",
       "1826671      1195566  \n",
       "\n",
       "[1826672 rows x 8 columns]"
      ]
     },
     "execution_count": 11,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df1_ticket_information"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "37499eae-1a7f-4dce-83b0-ff942ccf7a9d",
   "metadata": {},
   "source": [
    "### KPI tickets"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "id": "043303fe-e90f-4689-a2a9-5d690555a045",
   "metadata": {},
   "outputs": [],
   "source": [
    "def tickets_kpi_function(tickets_information = None):\n",
    "    tickets_information_copy = tickets_information.copy()\n",
    "    tickets_information_copy['purchase_date_max'] = tickets_information_copy['purchase_date']\n",
    "    tickets_kpi = (tickets_information_copy[['product_id', 'customer_id', 'ticket_id','supplier_name', 'purchase_date', 'purchase_date_max']]\n",
    "                   .groupby(['product_id', 'customer_id'])\n",
    "                   .agg({'ticket_id': 'count', \n",
    "                         'supplier_name': 'nunique',\n",
    "                         'purchase_date_max' : 'max',\n",
    "                         'purchase_date' : 'min'})\n",
    "                   .reset_index()\n",
    "                  )\n",
    "    \n",
    "    tickets_kpi.rename(columns = {'ticket_id' : 'nb_tickets', \n",
    "                                  'supplier_name' : 'nb_suppliers', \n",
    "                                  'purchase_date' : 'purchase_date_min'}, inplace = True)\n",
    "    \n",
    "    tickets_kpi['time_between_purchase'] = tickets_kpi['purchase_date_max'] - tickets_kpi['purchase_date_min']\n",
    "    \n",
    "    return tickets_kpi\n",
    "    "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "id": "5882234a-1ed5-4269-87a6-0d75613476e3",
   "metadata": {},
   "outputs": [],
   "source": [
    "df1_tickets_kpi = tickets_kpi_function(tickets_information = df1_ticket_information)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 32,
   "id": "a7a452a6-cd5e-4c8b-b250-8a7d26e48fad",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>product_id</th>\n",
       "      <th>customer_id</th>\n",
       "      <th>nb_tickets</th>\n",
       "      <th>nb_suppliers</th>\n",
       "      <th>purchase_date_max</th>\n",
       "      <th>purchase_date_min</th>\n",
       "      <th>time_between_purchase</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>107310</td>\n",
       "      <td>2805</td>\n",
       "      <td>4</td>\n",
       "      <td>2</td>\n",
       "      <td>2019-06-05 14:37:13+00:00</td>\n",
       "      <td>2019-06-05 14:18:38+00:00</td>\n",
       "      <td>0 days 00:18:35</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>110089</td>\n",
       "      <td>54355</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>2017-02-17 13:32:51+00:00</td>\n",
       "      <td>2017-02-17 13:32:51+00:00</td>\n",
       "      <td>0 days 00:00:00</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>110089</td>\n",
       "      <td>54356</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>2017-03-02 14:36:16+00:00</td>\n",
       "      <td>2017-03-02 14:36:16+00:00</td>\n",
       "      <td>0 days 00:00:00</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>110089</td>\n",
       "      <td>54357</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>2017-03-06 15:16:41+00:00</td>\n",
       "      <td>2017-03-06 15:16:41+00:00</td>\n",
       "      <td>0 days 00:00:00</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>110089</td>\n",
       "      <td>54358</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>2017-03-13 16:07:27+00:00</td>\n",
       "      <td>2017-03-13 16:07:27+00:00</td>\n",
       "      <td>0 days 00:00:00</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>...</th>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>128360</th>\n",
       "      <td>406026</td>\n",
       "      <td>1</td>\n",
       "      <td>11</td>\n",
       "      <td>2</td>\n",
       "      <td>2023-11-08 12:53:31+00:00</td>\n",
       "      <td>2023-11-08 09:30:28+00:00</td>\n",
       "      <td>0 days 03:23:03</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>128361</th>\n",
       "      <td>406027</td>\n",
       "      <td>1</td>\n",
       "      <td>31</td>\n",
       "      <td>2</td>\n",
       "      <td>2023-11-08 15:59:11+00:00</td>\n",
       "      <td>2023-11-08 09:15:36+00:00</td>\n",
       "      <td>0 days 06:43:35</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>128362</th>\n",
       "      <td>406028</td>\n",
       "      <td>1</td>\n",
       "      <td>2</td>\n",
       "      <td>1</td>\n",
       "      <td>2023-11-08 14:56:08+00:00</td>\n",
       "      <td>2023-11-08 11:18:37+00:00</td>\n",
       "      <td>0 days 03:37:31</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>128363</th>\n",
       "      <td>406029</td>\n",
       "      <td>1256130</td>\n",
       "      <td>2</td>\n",
       "      <td>1</td>\n",
       "      <td>2023-11-08 10:35:43+00:00</td>\n",
       "      <td>2023-11-08 10:35:43+00:00</td>\n",
       "      <td>0 days 00:00:00</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>128364</th>\n",
       "      <td>406029</td>\n",
       "      <td>1256133</td>\n",
       "      <td>3</td>\n",
       "      <td>1</td>\n",
       "      <td>2023-11-08 16:51:19+00:00</td>\n",
       "      <td>2023-11-08 16:51:19+00:00</td>\n",
       "      <td>0 days 00:00:00</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>128365 rows × 7 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "        product_id  customer_id  nb_tickets  nb_suppliers  \\\n",
       "0           107310         2805           4             2   \n",
       "1           110089        54355           1             1   \n",
       "2           110089        54356           1             1   \n",
       "3           110089        54357           1             1   \n",
       "4           110089        54358           1             1   \n",
       "...            ...          ...         ...           ...   \n",
       "128360      406026            1          11             2   \n",
       "128361      406027            1          31             2   \n",
       "128362      406028            1           2             1   \n",
       "128363      406029      1256130           2             1   \n",
       "128364      406029      1256133           3             1   \n",
       "\n",
       "               purchase_date_max         purchase_date_min  \\\n",
       "0      2019-06-05 14:37:13+00:00 2019-06-05 14:18:38+00:00   \n",
       "1      2017-02-17 13:32:51+00:00 2017-02-17 13:32:51+00:00   \n",
       "2      2017-03-02 14:36:16+00:00 2017-03-02 14:36:16+00:00   \n",
       "3      2017-03-06 15:16:41+00:00 2017-03-06 15:16:41+00:00   \n",
       "4      2017-03-13 16:07:27+00:00 2017-03-13 16:07:27+00:00   \n",
       "...                          ...                       ...   \n",
       "128360 2023-11-08 12:53:31+00:00 2023-11-08 09:30:28+00:00   \n",
       "128361 2023-11-08 15:59:11+00:00 2023-11-08 09:15:36+00:00   \n",
       "128362 2023-11-08 14:56:08+00:00 2023-11-08 11:18:37+00:00   \n",
       "128363 2023-11-08 10:35:43+00:00 2023-11-08 10:35:43+00:00   \n",
       "128364 2023-11-08 16:51:19+00:00 2023-11-08 16:51:19+00:00   \n",
       "\n",
       "       time_between_purchase  \n",
       "0            0 days 00:18:35  \n",
       "1            0 days 00:00:00  \n",
       "2            0 days 00:00:00  \n",
       "3            0 days 00:00:00  \n",
       "4            0 days 00:00:00  \n",
       "...                      ...  \n",
       "128360       0 days 03:23:03  \n",
       "128361       0 days 06:43:35  \n",
       "128362       0 days 03:37:31  \n",
       "128363       0 days 00:00:00  \n",
       "128364       0 days 00:00:00  \n",
       "\n",
       "[128365 rows x 7 columns]"
      ]
     },
     "execution_count": 32,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df1_tickets_kpi"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "096e47f4-1d65-4575-989d-83227eedad2b",
   "metadata": {},
   "source": [
    "## Target area"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "id": "baed146a-9d3a-4397-a812-3d50c9a2f038",
   "metadata": {},
   "outputs": [],
   "source": [
    "def preprocessing_target_area(targets = None, target_types = None, customer_target_mappings = None):\n",
    "    # Target.csv cleaning\n",
    "    targets = targets[[\"id\", \"target_type_id\", \"name\"]]\n",
    "    targets.rename(columns = {'id' : 'target_id' , 'name' : 'target_name'}, inplace = True)\n",
    "    \n",
    "    # target_type cleaning\n",
    "    target_types = target_types[[\"id\",\"is_import\",\"name\"]].add_prefix(\"target_type_\")\n",
    "    \n",
    "    #customer_target_mappings cleaning\n",
    "    customer_target_mappings = customer_target_mappings[[\"id\", \"customer_id\", \"target_id\"]]\n",
    "    \n",
    "    # Merge target et target_type\n",
    "    targets_full = pd.merge(targets, target_types, left_on='target_type_id', right_on='target_type_id', how='inner')\n",
    "    targets_full.drop(['target_type_id'], axis = 1, inplace=True)\n",
    "    \n",
    "    # Merge\n",
    "    targets_full = pd.merge(customer_target_mappings, targets_full, left_on='target_id', right_on='target_id', how='inner')\n",
    "    targets_full.drop(['target_id'], axis = 1, inplace=True)\n",
    "\n",
    "    return targets_full"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "id": "5fbfd88b-b94c-489c-9201-670e96e453e7",
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/tmp/ipykernel_2240/3848597476.py:4: SettingWithCopyWarning: \n",
      "A value is trying to be set on a copy of a slice from a DataFrame\n",
      "\n",
      "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n",
      "  targets.rename(columns = {'id' : 'target_id' , 'name' : 'target_name'}, inplace = True)\n"
     ]
    }
   ],
   "source": [
    "df1_target_information = preprocessing_target_area(targets = df1_targets, target_types = df1_target_types, customer_target_mappings = df1_customer_target_mappings)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "id": "b4f05142-2a22-42ef-a60d-f23cc4b5cb09",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>customer_id</th>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>target_name</th>\n",
       "      <th></th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>consentement optin mediation specialisee</th>\n",
       "      <td>150000</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>consentement optin jeune public</th>\n",
       "      <td>149979</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>consentement optin b2c</th>\n",
       "      <td>108909</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>Arenametrix_bascule tel vers sib</th>\n",
       "      <td>35216</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>consentement optout b2c</th>\n",
       "      <td>34523</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>...</th>\n",
       "      <td>...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>Automation_parrainage_newsletter_handicap_visuel</th>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>consentement optout mediation specialisee</th>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>Inscrits NL LSF formulaire</th>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>Market auto - contacts inactifs post-scénario</th>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>Inactifs - fin du scénario</th>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>283 rows × 1 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "                                                  customer_id\n",
       "target_name                                                  \n",
       "consentement optin mediation specialisee               150000\n",
       "consentement optin jeune public                        149979\n",
       "consentement optin b2c                                 108909\n",
       "Arenametrix_bascule tel vers sib                        35216\n",
       "consentement optout b2c                                 34523\n",
       "...                                                       ...\n",
       "Automation_parrainage_newsletter_handicap_visuel            1\n",
       "consentement optout mediation specialisee                   1\n",
       "Inscrits NL LSF formulaire                                  1\n",
       "Market auto - contacts inactifs post-scénario               1\n",
       "Inactifs - fin du scénario                                  1\n",
       "\n",
       "[283 rows x 1 columns]"
      ]
     },
     "execution_count": 16,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Non-null customer_id rows per target_name, largest targets first\n",
    "(\n",
    "    df1_target_information\n",
    "    .groupby('target_name')[['customer_id']]\n",
    "    .count()\n",
    "    .sort_values(by='customer_id', ascending=False)\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 17,
   "id": "4417ff51-f501-4ab9-a192-4ab75764a8ed",
   "metadata": {
    "scrolled": true
   },
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>customer_id</th>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>target_name</th>\n",
       "      <th></th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>Arenametrix_bascule tel vers sib</th>\n",
       "      <td>35216</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>Autres_interet_exposition</th>\n",
       "      <td>1021</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>COM Inscrits NL générale (historique)</th>\n",
       "      <td>23005</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>Contacts_prenomsdoubles</th>\n",
       "      <td>11643</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>DDCP MD Procès du Siècle</th>\n",
       "      <td>1684</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>DDCP Newsletter centres de loisirs</th>\n",
       "      <td>1032</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>DDCP Newsletter enseignants</th>\n",
       "      <td>4510</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>DDCP Newsletter jeune public</th>\n",
       "      <td>3862</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>DDCP Newsletter relais champ social</th>\n",
       "      <td>2270</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>DDCP PROMO Participants ateliers (adultes et enfants)</th>\n",
       "      <td>1954</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>DDCP billets famille</th>\n",
       "      <td>3609</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>DDCP promo MD pass musées dps oct 2018</th>\n",
       "      <td>1785</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>DDCP promo Plan B 2019 (concerts)</th>\n",
       "      <td>1948</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>DDCP promo spectateurs prog 21-22 (spectacles, ciné, ateliers)</th>\n",
       "      <td>1293</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>DDCP rentrée culturelle 2023</th>\n",
       "      <td>1757</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>DDCP_marseille_jazz_2023</th>\n",
       "      <td>1043</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>DRE Festival Jean Rouch</th>\n",
       "      <td>1502</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>DRE MucemLab</th>\n",
       "      <td>2302</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>DRE chercheurs</th>\n",
       "      <td>1557</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>DRE institutionnels</th>\n",
       "      <td>2229</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>FORMATION _ acheteurs optin last year</th>\n",
       "      <td>10485</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>Inscrits NL générale (export_291019 + operation_videomaton)</th>\n",
       "      <td>14086</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>Inscrits NL générale site web</th>\n",
       "      <td>3732</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>Inscrits NL jeune public site web</th>\n",
       "      <td>1249</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>Votre première liste</th>\n",
       "      <td>3715</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>consentement optin b2b</th>\n",
       "      <td>12735</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>consentement optin b2c</th>\n",
       "      <td>108909</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>consentement optin dre</th>\n",
       "      <td>4527</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>consentement optin jeune public</th>\n",
       "      <td>149979</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>consentement optin mediation specialisee</th>\n",
       "      <td>150000</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>consentement optin newsletter generale</th>\n",
       "      <td>22095</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>consentement optin scolaires</th>\n",
       "      <td>4849</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>consentement optout b2b</th>\n",
       "      <td>14219</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>consentement optout b2c</th>\n",
       "      <td>34523</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>consentement optout dre</th>\n",
       "      <td>14328</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>consentement optout newsletter generale</th>\n",
       "      <td>18855</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>consentement optout scolaires</th>\n",
       "      <td>15744</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>ddcp_md_scene_ouverte_au_talent</th>\n",
       "      <td>1577</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>ddcp_promo_MD_billet_musée_oct_2019_agarder2</th>\n",
       "      <td>5482</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>ddcp_promo_md_musée_dps 011019</th>\n",
       "      <td>6010</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>ddcp_promo_visiteurs occasionnels_musee_8mois</th>\n",
       "      <td>6640</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>ddcp_visiteurs dps 010622</th>\n",
       "      <td>12355</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>festival_jean_rouch</th>\n",
       "      <td>1502</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>rappel po barvalo</th>\n",
       "      <td>1248</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>structures_etiquette champ social</th>\n",
       "      <td>1488</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "                                                    customer_id\n",
       "target_name                                                    \n",
       "Arenametrix_bascule tel vers sib                          35216\n",
       "Autres_interet_exposition                                  1021\n",
       "COM Inscrits NL générale (historique)                     23005\n",
       "Contacts_prenomsdoubles                                   11643\n",
       "DDCP MD Procès du Siècle                                   1684\n",
       "DDCP Newsletter centres de loisirs                         1032\n",
       "DDCP Newsletter enseignants                                4510\n",
       "DDCP Newsletter jeune public                               3862\n",
       "DDCP Newsletter relais champ social                        2270\n",
       "DDCP PROMO Participants ateliers (adultes et en...         1954\n",
       "DDCP billets famille                                       3609\n",
       "DDCP promo MD pass musées dps oct 2018                     1785\n",
       "DDCP promo Plan B 2019 (concerts)                          1948\n",
       "DDCP promo spectateurs prog 21-22 (spectacles, ...         1293\n",
       "DDCP rentrée culturelle 2023                               1757\n",
       "DDCP_marseille_jazz_2023                                   1043\n",
       "DRE Festival Jean Rouch                                    1502\n",
       "DRE MucemLab                                               2302\n",
       "DRE chercheurs                                             1557\n",
       "DRE institutionnels                                        2229\n",
       "FORMATION _ acheteurs optin last year                     10485\n",
       "Inscrits NL générale (export_291019 + operation...        14086\n",
       "Inscrits NL générale site web                              3732\n",
       "Inscrits NL jeune public site web                          1249\n",
       "Votre première liste                                       3715\n",
       "consentement optin b2b                                    12735\n",
       "consentement optin b2c                                   108909\n",
       "consentement optin dre                                     4527\n",
       "consentement optin jeune public                          149979\n",
       "consentement optin mediation specialisee                 150000\n",
       "consentement optin newsletter generale                    22095\n",
       "consentement optin scolaires                               4849\n",
       "consentement optout b2b                                   14219\n",
       "consentement optout b2c                                   34523\n",
       "consentement optout dre                                   14328\n",
       "consentement optout newsletter generale                   18855\n",
       "consentement optout scolaires                             15744\n",
       "ddcp_md_scene_ouverte_au_talent                            1577\n",
       "ddcp_promo_MD_billet_musée_oct_2019_agarder2               5482\n",
       "ddcp_promo_md_musée_dps 011019                             6010\n",
       "ddcp_promo_visiteurs occasionnels_musee_8mois              6640\n",
       "ddcp_visiteurs dps 010622                                 12355\n",
       "festival_jean_rouch                                        1502\n",
       "rappel po barvalo                                          1248\n",
       "structures_etiquette champ social                          1488"
      ]
     },
     "execution_count": 17,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Non-null customer_id rows per target_name (kept in a variable), then display only\n",
    "# the targets that reach at least 1000 rows\n",
    "df1_target_information_reduced = (\n",
    "    df1_target_information\n",
    "    .groupby('target_name')[['customer_id']]\n",
    "    .count()\n",
    ")\n",
    "df1_target_information_reduced.loc[df1_target_information_reduced['customer_id'].ge(1000)]"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "cdbb48b4-5e16-4ef4-8791-ed213d68d52f",
   "metadata": {},
   "source": [
    "## Campaigns area"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 18,
   "id": "d883cc7b-ac43-4485-b86f-eaf595fbad85",
   "metadata": {},
   "outputs": [],
   "source": [
    "def preprocessing_campaigns_area(campaign_stats = None, campaigns = None):\n",
    "    \"\"\"\n",
    "    Build the campaigns table: campaign_stats rows enriched with their campaign.\n",
    "\n",
    "    Parameters\n",
    "    ----------\n",
    "    campaign_stats : DataFrame containing at least the columns id, campaign_id,\n",
    "        customer_id, opened_at, sent_at and delivered_at.\n",
    "    campaigns : DataFrame containing at least the columns id, name, service_id\n",
    "        and sent_at.\n",
    "\n",
    "    Returns\n",
    "    -------\n",
    "    DataFrame holding the selected campaign_stats columns left-joined with the\n",
    "    campaigns columns (prefixed with campaign_), without the campaign_id join key.\n",
    "    \"\"\"\n",
    "    # campaign_stats cleaning\n",
    "    # cleaning_date is called for its in-place effect (its return value is not used),\n",
    "    # so work on an explicit copy instead of a bare column selection of the caller's\n",
    "    # frame: this avoids pandas' SettingWithCopyWarning.\n",
    "    campaign_stats = campaign_stats[[\"id\", \"campaign_id\", \"customer_id\", \"opened_at\", \"sent_at\", \"delivered_at\"]].copy()\n",
    "    for date_column in ['opened_at', 'sent_at', 'delivered_at']:\n",
    "        cleaning_date(campaign_stats, date_column)\n",
    "\n",
    "    # campaigns cleaning (add_prefix already returns a new DataFrame)\n",
    "    campaigns = campaigns[[\"id\", \"name\", \"service_id\", \"sent_at\"]].add_prefix(\"campaign_\")\n",
    "    cleaning_date(campaigns, 'campaign_sent_at')\n",
    "\n",
    "    # Merge: the left join keeps every campaign_stats row, even without a matching campaign\n",
    "    campaigns_full = pd.merge(campaign_stats, campaigns, on = \"campaign_id\", how = \"left\")\n",
    "\n",
    "    # The join key is no longer needed once the campaign columns are attached\n",
    "    return campaigns_full.drop(columns = ['campaign_id'])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 19,
   "id": "c8552dd6-52c5-4431-b43d-3cd6c578fd9f",
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/tmp/ipykernel_2240/1967867975.py:15: SettingWithCopyWarning: \n",
      "A value is trying to be set on a copy of a slice from a DataFrame.\n",
      "Try using .loc[row_indexer,col_indexer] = value instead\n",
      "\n",
      "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n",
      "  df[column_name] = pd.to_datetime(df[column_name], utc = True, format = 'ISO8601')\n",
      "/tmp/ipykernel_2240/1967867975.py:15: SettingWithCopyWarning: \n",
      "A value is trying to be set on a copy of a slice from a DataFrame.\n",
      "Try using .loc[row_indexer,col_indexer] = value instead\n",
      "\n",
      "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n",
      "  df[column_name] = pd.to_datetime(df[column_name], utc = True, format = 'ISO8601')\n",
      "/tmp/ipykernel_2240/1967867975.py:15: SettingWithCopyWarning: \n",
      "A value is trying to be set on a copy of a slice from a DataFrame.\n",
      "Try using .loc[row_indexer,col_indexer] = value instead\n",
      "\n",
      "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n",
      "  df[column_name] = pd.to_datetime(df[column_name], utc = True, format = 'ISO8601')\n"
     ]
    }
   ],
   "source": [
    "# Build the campaigns table for company 1\n",
    "df1_campaigns_information = preprocessing_campaigns_area(\n",
    "    campaign_stats = df1_campaign_stats,\n",
    "    campaigns = df1_campaigns,\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 20,
   "id": "c24457e7-3cad-451a-a65b-7373b656bd6e",
   "metadata": {
    "scrolled": true
   },
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>id</th>\n",
       "      <th>customer_id</th>\n",
       "      <th>opened_at</th>\n",
       "      <th>sent_at</th>\n",
       "      <th>delivered_at</th>\n",
       "      <th>campaign_name</th>\n",
       "      <th>campaign_service_id</th>\n",
       "      <th>campaign_sent_at</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>19793</td>\n",
       "      <td>112597</td>\n",
       "      <td>NaT</td>\n",
       "      <td>2021-03-28 16:01:09+00:00</td>\n",
       "      <td>2021-03-28 16:24:18+00:00</td>\n",
       "      <td>Le Mucem chez vous, gardons le lien #22</td>\n",
       "      <td>404</td>\n",
       "      <td>2021-03-27 23:00:00+00:00</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>14211</td>\n",
       "      <td>113666</td>\n",
       "      <td>NaT</td>\n",
       "      <td>2021-03-28 16:01:09+00:00</td>\n",
       "      <td>2021-03-28 16:21:02+00:00</td>\n",
       "      <td>Le Mucem chez vous, gardons le lien #22</td>\n",
       "      <td>404</td>\n",
       "      <td>2021-03-27 23:00:00+00:00</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>13150</td>\n",
       "      <td>280561</td>\n",
       "      <td>NaT</td>\n",
       "      <td>2021-03-28 16:00:59+00:00</td>\n",
       "      <td>2021-03-28 16:08:45+00:00</td>\n",
       "      <td>Le Mucem chez vous, gardons le lien #22</td>\n",
       "      <td>404</td>\n",
       "      <td>2021-03-27 23:00:00+00:00</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>7073</td>\n",
       "      <td>101007</td>\n",
       "      <td>2021-03-28 18:11:06+00:00</td>\n",
       "      <td>2021-03-28 16:00:59+00:00</td>\n",
       "      <td>2021-03-28 16:09:47+00:00</td>\n",
       "      <td>Le Mucem chez vous, gardons le lien #22</td>\n",
       "      <td>404</td>\n",
       "      <td>2021-03-27 23:00:00+00:00</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>5175</td>\n",
       "      <td>103972</td>\n",
       "      <td>NaT</td>\n",
       "      <td>2021-03-28 16:01:06+00:00</td>\n",
       "      <td>2021-03-28 16:05:03+00:00</td>\n",
       "      <td>Le Mucem chez vous, gardons le lien #22</td>\n",
       "      <td>404</td>\n",
       "      <td>2021-03-27 23:00:00+00:00</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>...</th>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>6214803</th>\n",
       "      <td>8302994</td>\n",
       "      <td>266155</td>\n",
       "      <td>2023-10-23 09:43:25+00:00</td>\n",
       "      <td>2023-10-23 09:32:33+00:00</td>\n",
       "      <td>2023-10-23 09:32:34+00:00</td>\n",
       "      <td>dre_nov_2023</td>\n",
       "      <td>1318</td>\n",
       "      <td>2023-10-23 09:31:17+00:00</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>6214804</th>\n",
       "      <td>8303307</td>\n",
       "      <td>21355</td>\n",
       "      <td>2023-10-23 09:44:02+00:00</td>\n",
       "      <td>2023-10-23 09:32:49+00:00</td>\n",
       "      <td>2023-10-23 09:32:49+00:00</td>\n",
       "      <td>dre_nov_2023</td>\n",
       "      <td>1318</td>\n",
       "      <td>2023-10-23 09:31:17+00:00</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>6214805</th>\n",
       "      <td>8304346</td>\n",
       "      <td>21849</td>\n",
       "      <td>2023-10-23 09:45:52+00:00</td>\n",
       "      <td>2023-10-23 09:33:28+00:00</td>\n",
       "      <td>2023-10-23 09:33:29+00:00</td>\n",
       "      <td>dre_nov_2023</td>\n",
       "      <td>1318</td>\n",
       "      <td>2023-10-23 09:31:17+00:00</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>6214806</th>\n",
       "      <td>8302037</td>\n",
       "      <td>667789</td>\n",
       "      <td>2023-10-23 09:47:32+00:00</td>\n",
       "      <td>2023-10-23 09:31:53+00:00</td>\n",
       "      <td>2023-10-23 09:31:54+00:00</td>\n",
       "      <td>dre_nov_2023</td>\n",
       "      <td>1318</td>\n",
       "      <td>2023-10-23 09:31:17+00:00</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>6214807</th>\n",
       "      <td>8304939</td>\n",
       "      <td>294154</td>\n",
       "      <td>NaT</td>\n",
       "      <td>2023-10-23 09:33:54+00:00</td>\n",
       "      <td>2023-10-23 09:33:55+00:00</td>\n",
       "      <td>dre_nov_2023</td>\n",
       "      <td>1318</td>\n",
       "      <td>2023-10-23 09:31:17+00:00</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>6214808 rows × 8 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "              id  customer_id                 opened_at  \\\n",
       "0          19793       112597                       NaT   \n",
       "1          14211       113666                       NaT   \n",
       "2          13150       280561                       NaT   \n",
       "3           7073       101007 2021-03-28 18:11:06+00:00   \n",
       "4           5175       103972                       NaT   \n",
       "...          ...          ...                       ...   \n",
       "6214803  8302994       266155 2023-10-23 09:43:25+00:00   \n",
       "6214804  8303307        21355 2023-10-23 09:44:02+00:00   \n",
       "6214805  8304346        21849 2023-10-23 09:45:52+00:00   \n",
       "6214806  8302037       667789 2023-10-23 09:47:32+00:00   \n",
       "6214807  8304939       294154                       NaT   \n",
       "\n",
       "                          sent_at              delivered_at  \\\n",
       "0       2021-03-28 16:01:09+00:00 2021-03-28 16:24:18+00:00   \n",
       "1       2021-03-28 16:01:09+00:00 2021-03-28 16:21:02+00:00   \n",
       "2       2021-03-28 16:00:59+00:00 2021-03-28 16:08:45+00:00   \n",
       "3       2021-03-28 16:00:59+00:00 2021-03-28 16:09:47+00:00   \n",
       "4       2021-03-28 16:01:06+00:00 2021-03-28 16:05:03+00:00   \n",
       "...                           ...                       ...   \n",
       "6214803 2023-10-23 09:32:33+00:00 2023-10-23 09:32:34+00:00   \n",
       "6214804 2023-10-23 09:32:49+00:00 2023-10-23 09:32:49+00:00   \n",
       "6214805 2023-10-23 09:33:28+00:00 2023-10-23 09:33:29+00:00   \n",
       "6214806 2023-10-23 09:31:53+00:00 2023-10-23 09:31:54+00:00   \n",
       "6214807 2023-10-23 09:33:54+00:00 2023-10-23 09:33:55+00:00   \n",
       "\n",
       "                                   campaign_name  campaign_service_id  \\\n",
       "0        Le Mucem chez vous, gardons le lien #22                  404   \n",
       "1        Le Mucem chez vous, gardons le lien #22                  404   \n",
       "2        Le Mucem chez vous, gardons le lien #22                  404   \n",
       "3        Le Mucem chez vous, gardons le lien #22                  404   \n",
       "4        Le Mucem chez vous, gardons le lien #22                  404   \n",
       "...                                          ...                  ...   \n",
       "6214803                             dre_nov_2023                 1318   \n",
       "6214804                             dre_nov_2023                 1318   \n",
       "6214805                             dre_nov_2023                 1318   \n",
       "6214806                             dre_nov_2023                 1318   \n",
       "6214807                             dre_nov_2023                 1318   \n",
       "\n",
       "                 campaign_sent_at  \n",
       "0       2021-03-27 23:00:00+00:00  \n",
       "1       2021-03-27 23:00:00+00:00  \n",
       "2       2021-03-27 23:00:00+00:00  \n",
       "3       2021-03-27 23:00:00+00:00  \n",
       "4       2021-03-27 23:00:00+00:00  \n",
       "...                           ...  \n",
       "6214803 2023-10-23 09:31:17+00:00  \n",
       "6214804 2023-10-23 09:31:17+00:00  \n",
       "6214805 2023-10-23 09:31:17+00:00  \n",
       "6214806 2023-10-23 09:31:17+00:00  \n",
       "6214807 2023-10-23 09:31:17+00:00  \n",
       "\n",
       "[6214808 rows x 8 columns]"
      ]
     },
     "execution_count": 20,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Inspect the result of preprocessing_campaigns_area (pandas truncates the display)\n",
    "df1_campaigns_information"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 21,
   "id": "e2c88552-b863-47a2-be23-8d2898fb28bc",
   "metadata": {},
   "outputs": [],
   "source": [
    "def campaigns_kpi_function(campaigns_information = None):\n",
    "    \"\"\"\n",
    "    Aggregate e-mail campaign indicators per customer.\n",
    "\n",
    "    Parameters\n",
    "    ----------\n",
    "    campaigns_information : DataFrame containing at least the columns customer_id,\n",
    "        campaign_name, opened_at and delivered_at. Side effect: a time_to_open\n",
    "        column (opened_at - delivered_at) is added to this frame in place.\n",
    "\n",
    "    Returns\n",
    "    -------\n",
    "    DataFrame with one row per customer_id and the columns:\n",
    "        nb_campaigns : number of rows with a non-null campaign_name\n",
    "        nb_campaigns_opened : same count restricted to rows with a non-null\n",
    "            opened_at (0 when the customer opened nothing)\n",
    "        time_to_open : mean of opened_at - delivered_at, as a Timedelta\n",
    "            (NaT when no opening time is available)\n",
    "    \"\"\"\n",
    "    # Number of e-mail campaigns\n",
    "    nb_campaigns = (\n",
    "        campaigns_information[['customer_id', 'campaign_name']]\n",
    "        .groupby('customer_id').count().reset_index()\n",
    "        .rename(columns = {'campaign_name' : 'nb_campaigns'})\n",
    "    )\n",
    "\n",
    "    # Mean time between delivery and opening (kept as a Timedelta, not converted to minutes)\n",
    "    campaigns_information['time_to_open'] = campaigns_information['opened_at'] - campaigns_information['delivered_at']\n",
    "    time_to_open = campaigns_information[['customer_id', 'time_to_open']].groupby('customer_id').mean().reset_index()\n",
    "\n",
    "    # Number of opened e-mails\n",
    "    # dropna returns a new frame here: calling it with inplace=True on a column\n",
    "    # selection raises SettingWithCopyWarning.\n",
    "    opened_rows = campaigns_information[['customer_id', 'campaign_name', 'opened_at']].dropna(subset=['opened_at'])\n",
    "    opened_campaign = (\n",
    "        opened_rows[['customer_id', 'campaign_name']]\n",
    "        .groupby('customer_id').count().reset_index()\n",
    "        .rename(columns = {'campaign_name' : 'nb_campaigns_opened'})\n",
    "    )\n",
    "\n",
    "    # Merge the indicators; left joins keep every customer found in nb_campaigns\n",
    "    campaigns_reduced = pd.merge(nb_campaigns, opened_campaign, on = 'customer_id', how = 'left')\n",
    "    campaigns_reduced = pd.merge(campaigns_reduced, time_to_open, on = 'customer_id', how = 'left')\n",
    "\n",
    "    # Fill the NaN of nb_campaigns_opened (customers absent from opened_campaign).\n",
    "    # Assign the result back: fillna(inplace=True) on a selected column is a chained\n",
    "    # assignment, which is deprecated and has no effect under pandas copy-on-write.\n",
    "    campaigns_reduced['nb_campaigns_opened'] = campaigns_reduced['nb_campaigns_opened'].fillna(0)\n",
    "\n",
    "    # TODO: decide how to fill the NaT of time_to_open (left as NaT for now)\n",
    "\n",
    "    return campaigns_reduced"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 22,
   "id": "24537647-bc29-4777-9848-ac4120a4aa60",
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/tmp/ipykernel_2240/3700263836.py:11: SettingWithCopyWarning: \n",
      "A value is trying to be set on a copy of a slice from a DataFrame\n",
      "\n",
      "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n",
      "  opened_campaign.dropna(subset=['opened_at'], inplace=True)\n"
     ]
    }
   ],
   "source": [
    "# Per-customer campaign indicators for company 1\n",
    "df1_campaigns_kpi = campaigns_kpi_function(\n",
    "    campaigns_information = df1_campaigns_information,\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 23,
   "id": "6be2a9a6-056b-4e19-8c26-a18ba3df36b3",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>customer_id</th>\n",
       "      <th>nb_campaigns</th>\n",
       "      <th>nb_campaigns_opened</th>\n",
       "      <th>time_to_open</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>2</td>\n",
       "      <td>4</td>\n",
       "      <td>0.0</td>\n",
       "      <td>NaT</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>3</td>\n",
       "      <td>222</td>\n",
       "      <td>124.0</td>\n",
       "      <td>1 days 00:28:30.169354838</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>4</td>\n",
       "      <td>7</td>\n",
       "      <td>7.0</td>\n",
       "      <td>1 days 04:31:01.428571428</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>5</td>\n",
       "      <td>4</td>\n",
       "      <td>0.0</td>\n",
       "      <td>NaT</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>6</td>\n",
       "      <td>20</td>\n",
       "      <td>0.0</td>\n",
       "      <td>NaT</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>...</th>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>130467</th>\n",
       "      <td>1256097</td>\n",
       "      <td>1</td>\n",
       "      <td>1.0</td>\n",
       "      <td>0 days 02:11:15</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>130468</th>\n",
       "      <td>1256098</td>\n",
       "      <td>1</td>\n",
       "      <td>0.0</td>\n",
       "      <td>NaT</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>130469</th>\n",
       "      <td>1256099</td>\n",
       "      <td>1</td>\n",
       "      <td>0.0</td>\n",
       "      <td>NaT</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>130470</th>\n",
       "      <td>1256100</td>\n",
       "      <td>1</td>\n",
       "      <td>0.0</td>\n",
       "      <td>NaT</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>130471</th>\n",
       "      <td>1256101</td>\n",
       "      <td>1</td>\n",
       "      <td>0.0</td>\n",
       "      <td>NaT</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>130472 rows × 4 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "        customer_id  nb_campaigns  nb_campaigns_opened  \\\n",
       "0                 2             4                  0.0   \n",
       "1                 3           222                124.0   \n",
       "2                 4             7                  7.0   \n",
       "3                 5             4                  0.0   \n",
       "4                 6            20                  0.0   \n",
       "...             ...           ...                  ...   \n",
       "130467      1256097             1                  1.0   \n",
       "130468      1256098             1                  0.0   \n",
       "130469      1256099             1                  0.0   \n",
       "130470      1256100             1                  0.0   \n",
       "130471      1256101             1                  0.0   \n",
       "\n",
       "                    time_to_open  \n",
       "0                            NaT  \n",
       "1      1 days 00:28:30.169354838  \n",
       "2      1 days 04:31:01.428571428  \n",
       "3                            NaT  \n",
       "4                            NaT  \n",
       "...                          ...  \n",
       "130467           0 days 02:11:15  \n",
       "130468                       NaT  \n",
       "130469                       NaT  \n",
       "130470                       NaT  \n",
       "130471                       NaT  \n",
       "\n",
       "[130472 rows x 4 columns]"
      ]
     },
     "execution_count": 23,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df1_campaigns_kpi"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "56520a97-ede8-4920-a211-3b5b136af33d",
   "metadata": {},
   "source": [
    "## Create Products Table"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "9782e9d3-ba20-46bf-8562-bd0969972ddc",
   "metadata": {},
   "source": [
    "Some useful functions"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 24,
   "id": "30488a40-1b38-4b9a-9d3b-26a0597c5e6d",
   "metadata": {},
   "outputs": [],
   "source": [
    "BUCKET = \"bdc2324-data\"\n",
    "directory_path = '1'"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 25,
   "id": "607eb4b4-eed9-4b50-b823-f75c116dd37c",
   "metadata": {},
   "outputs": [],
   "source": [
    "def display_databases(file_name):\n",
    "    \"\"\"\n",
    "    This function returns the file from s3 storage\n",
    "    \"\"\"\n",
    "    file_path = BUCKET + \"/\" + directory_path + \"/\" + file_name\n",
    "    print(\"File path : \", file_path)\n",
    "    with fs.open(file_path, mode=\"rb\") as file_in:\n",
    "        df = pd.read_csv(file_in, sep=\",\")\n",
    "        \n",
    "    print(\"Shape : \", df.shape)\n",
    "    return df\n",
    "\n",
    "\n",
    "def remove_horodates(df):\n",
    "    \"\"\"\n",
    "    this function remove horodate columns like created_at and updated_at\n",
    "    \"\"\"\n",
    "    df = df.drop(columns = [\"created_at\", \"updated_at\"])\n",
    "    return df\n",
    "\n",
    "\n",
    "def order_columns_id(df):\n",
    "    \"\"\"\n",
    "    this function puts all id columns at the beginning in order to read the dataset easier\n",
    "    \"\"\"\n",
    "    substring = 'id'\n",
    "    id_columns = [col for col in df.columns if substring in col]\n",
    "    remaining_col = [col for col in df.columns if substring not in col]\n",
    "    new_order = id_columns + remaining_col\n",
    "    return df[new_order]\n",
    "\n",
    "\n",
    "def process_df_2(df):\n",
    "    \"\"\"\n",
    "    This function organizes dataframe\n",
    "    \"\"\"\n",
    "    df = remove_horodates(df)\n",
    "    print(\"Number of columns : \", len(df.columns))\n",
    "    df = order_columns_id(df)\n",
    "    print(\"Columns : \", df.columns)\n",
    "    return df\n",
    "\n",
    "def load_dataset(name):\n",
    "    \"\"\"\n",
    "    This function loads csv file\n",
    "    \"\"\"\n",
    "    df = display_databases(name)\n",
    "    df = process_df_2(df)\n",
    "    # drop na :\n",
    "    #df = df.dropna(axis=1, thresh=len(df))\n",
    "    # if identifier in table : delete it\n",
    "    if 'identifier' in df.columns:\n",
    "        df = df.drop(columns = 'identifier')\n",
    "    return df"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "d23f28c0-bc95-438b-8d14-5b7bb6e267bd",
   "metadata": {},
   "source": [
    "Create theme tables"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 26,
   "id": "350b09b9-451f-4d47-81fe-f34b892db027",
   "metadata": {},
   "outputs": [],
   "source": [
    "def create_products_table():\n",
    "    # first merge products and categories\n",
    "    print(\"first merge products and categories\")\n",
    "    products = load_dataset(\"1products.csv\")\n",
    "    categories = load_dataset(\"1categories.csv\")\n",
    "    # Drop useless columns\n",
    "    products = products.drop(columns = ['apply_price', 'extra_field', 'amount_consumption'])\n",
    "    categories = categories.drop(columns = ['extra_field', 'quota'])\n",
    "\n",
    "    #Merge\n",
    "    products_theme = products.merge(categories, how = 'left', left_on = 'category_id',\n",
    "                                    right_on = 'id', suffixes=('_products', '_categories'))\n",
    "    products_theme = products_theme.rename(columns = {\"name\" : \"name_categories\"})\n",
    "    \n",
    "    # Second merge products_theme and type of categories\n",
    "    print(\"Second merge products_theme and type of categories\")\n",
    "    type_of_categories = load_dataset(\"1type_of_categories.csv\")\n",
    "    type_of_categories = type_of_categories.drop(columns = 'id')\n",
    "    products_theme = products_theme.merge(type_of_categories, how = 'left', left_on = 'category_id',\n",
    "                                          right_on = 'category_id' )\n",
    "\n",
    "    # Index cleaning\n",
    "    products_theme = products_theme.drop(columns = ['id_categories'])\n",
    "    products_theme  = order_columns_id(products_theme)\n",
    "    return products_theme\n",
    "\n",
    "\n",
    "def create_events_table():\n",
    "    # first merge events and seasons : \n",
    "    print(\"first merge events and seasons : \")\n",
    "    events = load_dataset(\"1events.csv\")\n",
    "    seasons = load_dataset(\"1seasons.csv\")\n",
    "\n",
    "    # Drop useless columns\n",
    "    events = events.drop(columns = ['manual_added', 'is_display'])\n",
    "    seasons = seasons.drop(columns = ['start_date_time'])\n",
    "        \n",
    "    events_theme = events.merge(seasons, how = 'left', left_on = 'season_id', right_on = 'id', suffixes=('_events', '_seasons'))\n",
    "\n",
    "    # Secondly merge events_theme and event_types\n",
    "    print(\"Secondly merge events_theme and event_types : \")\n",
    "    event_types = load_dataset(\"1event_types.csv\")\n",
    "    event_types = event_types.drop(columns = ['fidelity_delay'])\n",
    "    \n",
    "    events_theme = events_theme.merge(event_types, how = 'left', left_on = 'event_type_id', right_on = 'id', suffixes=('_events', '_event_type'))\n",
    "    events_theme = events_theme.rename(columns = {\"name\" : \"name_event_types\"})\n",
    "    events_theme = events_theme.drop(columns = 'id')\n",
    "\n",
    "    # thirdly merge events_theme and facilities\n",
    "    print(\"thirdly merge events_theme and facilities : \")\n",
    "    facilities = load_dataset(\"1facilities.csv\")\n",
    "    facilities = facilities.drop(columns = ['fixed_capacity'])\n",
    "    \n",
    "    events_theme = events_theme.merge(facilities, how = 'left', left_on = 'facility_id', right_on = 'id', suffixes=('_events', '_facility'))\n",
    "    events_theme = events_theme.rename(columns = {\"name\" : \"name_facilities\", \"id_events\" : \"event_id\"})\n",
    "    events_theme = events_theme.drop(columns = 'id')\n",
    "\n",
    "    # Index cleaning\n",
    "    events_theme = events_theme.drop(columns = ['id_seasons'])\n",
    "    events_theme  = order_columns_id(events_theme)\n",
    "    return events_theme\n",
    "\n",
    "\n",
    "def create_representations_table():\n",
    "    representations = load_dataset(\"1representations.csv\")\n",
    "    representations = representations.drop(columns = ['serial', 'open', 'satisfaction', 'is_display', 'expected_filling',\n",
    "                                                     'max_filling', 'extra_field', 'start_date_time', 'end_date_time', 'name',\n",
    "                                                     'representation_type_id'])\n",
    "    \n",
    "    representations_capacity = load_dataset(\"1representation_category_capacities.csv\")\n",
    "    representations_capacity = representations_capacity.drop(columns = ['expected_filling', 'max_filling'])\n",
    "\n",
    "    representations_theme = representations.merge(representations_capacity, how='left',\n",
    "                                                  left_on='id', right_on='representation_id',\n",
    "                                                  suffixes=('_representation', '_representation_cap'))\n",
    "    # index cleaning\n",
    "    representations_theme = representations_theme.drop(columns = [\"id_representation\"])\n",
    "    representations_theme = order_columns_id(representations_theme)\n",
    "    return representations_theme"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 27,
   "id": "0fccc8ef-e575-4857-a401-94a7274394df",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "first merge products and categories\n",
      "File path :  bdc2324-data/1/1products.csv\n",
      "Shape :  (94803, 14)\n",
      "Number of columns :  12\n",
      "Columns :  Index(['id', 'representation_id', 'pricing_formula_id', 'category_id',\n",
      "       'products_group_id', 'product_pack_id', 'identifier', 'amount',\n",
      "       'is_full_price', 'apply_price', 'extra_field', 'amount_consumption'],\n",
      "      dtype='object')\n",
      "File path :  bdc2324-data/1/1categories.csv\n",
      "Shape :  (27, 7)\n",
      "Number of columns :  5\n",
      "Columns :  Index(['id', 'identifier', 'name', 'extra_field', 'quota'], dtype='object')\n",
      "Second merge products_theme and type of categories\n",
      "File path :  bdc2324-data/1/1type_of_categories.csv\n",
      "Shape :  (5, 6)\n",
      "Number of columns :  4\n",
      "Columns :  Index(['id', 'type_of_id', 'category_id', 'identifier'], dtype='object')\n"
     ]
    },
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>id_products</th>\n",
       "      <th>representation_id</th>\n",
       "      <th>pricing_formula_id</th>\n",
       "      <th>category_id</th>\n",
       "      <th>products_group_id</th>\n",
       "      <th>product_pack_id</th>\n",
       "      <th>type_of_id</th>\n",
       "      <th>amount</th>\n",
       "      <th>is_full_price</th>\n",
       "      <th>name_categories</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>10682</td>\n",
       "      <td>914</td>\n",
       "      <td>114</td>\n",
       "      <td>41</td>\n",
       "      <td>10655</td>\n",
       "      <td>1</td>\n",
       "      <td>NaN</td>\n",
       "      <td>9.0</td>\n",
       "      <td>False</td>\n",
       "      <td>indiv activité tr</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>478</td>\n",
       "      <td>273</td>\n",
       "      <td>131</td>\n",
       "      <td>1</td>\n",
       "      <td>471</td>\n",
       "      <td>1</td>\n",
       "      <td>12.0</td>\n",
       "      <td>9.5</td>\n",
       "      <td>False</td>\n",
       "      <td>indiv entrées tp</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>20873</td>\n",
       "      <td>275</td>\n",
       "      <td>137</td>\n",
       "      <td>1</td>\n",
       "      <td>20825</td>\n",
       "      <td>1</td>\n",
       "      <td>12.0</td>\n",
       "      <td>11.5</td>\n",
       "      <td>False</td>\n",
       "      <td>indiv entrées tp</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>157142</td>\n",
       "      <td>82519</td>\n",
       "      <td>9</td>\n",
       "      <td>5</td>\n",
       "      <td>156773</td>\n",
       "      <td>1</td>\n",
       "      <td>NaN</td>\n",
       "      <td>8.0</td>\n",
       "      <td>False</td>\n",
       "      <td>indiv entrées tr</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>1341</td>\n",
       "      <td>9</td>\n",
       "      <td>93</td>\n",
       "      <td>1</td>\n",
       "      <td>1175</td>\n",
       "      <td>1</td>\n",
       "      <td>12.0</td>\n",
       "      <td>8.5</td>\n",
       "      <td>False</td>\n",
       "      <td>indiv entrées tp</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "   id_products  representation_id  pricing_formula_id  category_id  \\\n",
       "0        10682                914                 114           41   \n",
       "1          478                273                 131            1   \n",
       "2        20873                275                 137            1   \n",
       "3       157142              82519                   9            5   \n",
       "4         1341                  9                  93            1   \n",
       "\n",
       "   products_group_id  product_pack_id  type_of_id  amount  is_full_price  \\\n",
       "0              10655                1         NaN     9.0          False   \n",
       "1                471                1        12.0     9.5          False   \n",
       "2              20825                1        12.0    11.5          False   \n",
       "3             156773                1         NaN     8.0          False   \n",
       "4               1175                1        12.0     8.5          False   \n",
       "\n",
       "     name_categories  \n",
       "0  indiv activité tr  \n",
       "1   indiv entrées tp  \n",
       "2   indiv entrées tp  \n",
       "3   indiv entrées tr  \n",
       "4   indiv entrées tp  "
      ]
     },
     "execution_count": 27,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "products_theme = create_products_table()\n",
    "products_theme.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 28,
   "id": "779d8aaf-6668-4f66-8852-847304407ea3",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "first merge events and seasons : \n",
      "File path :  bdc2324-data/1/1events.csv\n",
      "Shape :  (1232, 12)\n",
      "Number of columns :  10\n",
      "Columns :  Index(['id', 'season_id', 'facility_id', 'event_type_id', 'event_type_key_id',\n",
      "       'facility_key_id', 'identifier', 'name', 'manual_added', 'is_display'],\n",
      "      dtype='object')\n",
      "File path :  bdc2324-data/1/1seasons.csv\n",
      "Shape :  (13, 6)\n",
      "Number of columns :  4\n",
      "Columns :  Index(['id', 'identifier', 'name', 'start_date_time'], dtype='object')\n",
      "Secondly merge events_theme and event_types : \n",
      "File path :  bdc2324-data/1/1event_types.csv\n",
      "Shape :  (9, 6)\n",
      "Number of columns :  4\n",
      "Columns :  Index(['id', 'fidelity_delay', 'identifier', 'name'], dtype='object')\n",
      "thirdly merge events_theme and facilities : \n",
      "File path :  bdc2324-data/1/1facilities.csv\n",
      "Shape :  (2, 7)\n",
      "Number of columns :  5\n",
      "Columns :  Index(['id', 'street_id', 'identifier', 'name', 'fixed_capacity'], dtype='object')\n"
     ]
    },
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>event_id</th>\n",
       "      <th>season_id</th>\n",
       "      <th>facility_id</th>\n",
       "      <th>event_type_id</th>\n",
       "      <th>event_type_key_id</th>\n",
       "      <th>facility_key_id</th>\n",
       "      <th>street_id</th>\n",
       "      <th>name_events</th>\n",
       "      <th>name_seasons</th>\n",
       "      <th>name_event_types</th>\n",
       "      <th>name_facilities</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>192</td>\n",
       "      <td>16</td>\n",
       "      <td>1</td>\n",
       "      <td>4</td>\n",
       "      <td>4</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>frontières</td>\n",
       "      <td>2018</td>\n",
       "      <td>spectacle vivant</td>\n",
       "      <td>mucem</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>30329</td>\n",
       "      <td>2767</td>\n",
       "      <td>1</td>\n",
       "      <td>5</td>\n",
       "      <td>5</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>visite guidée une autre histoire du monde (1h00)</td>\n",
       "      <td>2023</td>\n",
       "      <td>offre muséale groupe</td>\n",
       "      <td>mucem</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>161</td>\n",
       "      <td>16</td>\n",
       "      <td>1</td>\n",
       "      <td>2</td>\n",
       "      <td>2</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>visite contée les chercheurs d'or indiv</td>\n",
       "      <td>2018</td>\n",
       "      <td>offre muséale individuel</td>\n",
       "      <td>mucem</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>5957</td>\n",
       "      <td>582</td>\n",
       "      <td>1</td>\n",
       "      <td>4</td>\n",
       "      <td>4</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>we dreamt of utopia and we woke up screaming.</td>\n",
       "      <td>2021</td>\n",
       "      <td>spectacle vivant</td>\n",
       "      <td>mucem</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>8337</td>\n",
       "      <td>582</td>\n",
       "      <td>1</td>\n",
       "      <td>4</td>\n",
       "      <td>4</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>jeff koons épisodes 4</td>\n",
       "      <td>2021</td>\n",
       "      <td>spectacle vivant</td>\n",
       "      <td>mucem</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "   event_id  season_id  facility_id  event_type_id  event_type_key_id  \\\n",
       "0       192         16            1              4                  4   \n",
       "1     30329       2767            1              5                  5   \n",
       "2       161         16            1              2                  2   \n",
       "3      5957        582            1              4                  4   \n",
       "4      8337        582            1              4                  4   \n",
       "\n",
       "   facility_key_id  street_id  \\\n",
       "0                1          1   \n",
       "1                1          1   \n",
       "2                1          1   \n",
       "3                1          1   \n",
       "4                1          1   \n",
       "\n",
       "                                        name_events name_seasons  \\\n",
       "0                                        frontières         2018   \n",
       "1  visite guidée une autre histoire du monde (1h00)         2023   \n",
       "2           visite contée les chercheurs d'or indiv         2018   \n",
       "3     we dreamt of utopia and we woke up screaming.         2021   \n",
       "4                             jeff koons épisodes 4         2021   \n",
       "\n",
       "           name_event_types name_facilities  \n",
       "0          spectacle vivant           mucem  \n",
       "1      offre muséale groupe           mucem  \n",
       "2  offre muséale individuel           mucem  \n",
       "3          spectacle vivant           mucem  \n",
       "4          spectacle vivant           mucem  "
      ]
     },
     "execution_count": 28,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "events_theme= create_events_table()\n",
    "events_theme.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 29,
   "id": "7714fa32-303b-4ea7-b174-3fd0fcab5af0",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "File path :  bdc2324-data/1/1representations.csv\n",
      "Shape :  (36095, 16)\n",
      "Number of columns :  14\n",
      "Columns :  Index(['id', 'event_id', 'representation_type_id', 'identifier', 'serial',\n",
      "       'start_date_time', 'open', 'satisfaction', 'end_date_time', 'name',\n",
      "       'is_display', 'expected_filling', 'max_filling', 'extra_field'],\n",
      "      dtype='object')\n",
      "File path :  bdc2324-data/1/1representation_category_capacities.csv\n",
      "Shape :  (65241, 7)\n",
      "Number of columns :  5\n",
      "Columns :  Index(['id', 'representation_id', 'category_id', 'expected_filling',\n",
      "       'max_filling'],\n",
      "      dtype='object')\n"
     ]
    },
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>event_id</th>\n",
       "      <th>id_representation_cap</th>\n",
       "      <th>representation_id</th>\n",
       "      <th>category_id</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>12384</td>\n",
       "      <td>123058</td>\n",
       "      <td>84820</td>\n",
       "      <td>2</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>37</td>\n",
       "      <td>2514</td>\n",
       "      <td>269</td>\n",
       "      <td>2</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>37</td>\n",
       "      <td>384</td>\n",
       "      <td>269</td>\n",
       "      <td>5</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>37</td>\n",
       "      <td>2515</td>\n",
       "      <td>269</td>\n",
       "      <td>10</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>37</td>\n",
       "      <td>383</td>\n",
       "      <td>269</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "   event_id  id_representation_cap  representation_id  category_id\n",
       "0     12384                 123058              84820            2\n",
       "1        37                   2514                269            2\n",
       "2        37                    384                269            5\n",
       "3        37                   2515                269           10\n",
       "4        37                    383                269            1"
      ]
     },
     "execution_count": 29,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "representation_theme = create_representations_table()\n",
    "representation_theme.head()"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "8fa191d5-c867-4d4d-bbab-f29d7d91ce6a",
   "metadata": {},
   "source": [
    "Create uniform product database "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 30,
   "id": "15a62ed6-35e4-4abc-aeef-a7daeec0a4ba",
   "metadata": {},
   "outputs": [],
   "source": [
    "def uniform_product_df():\n",
    "    \"\"\"\n",
    "    This function returns the uniform product dataset\n",
    "    \"\"\"\n",
    "    print(\"Products theme columns : \", products_theme.columns)\n",
    "    print(\"\\n Representation theme columns : \", representation_theme.columns)\n",
    "    print(\"\\n Events theme columns : \", events_theme.columns)\n",
    "\n",
    "    products_global = products_theme.merge(representation_theme, how='left',\n",
    "                                           on= [\"representation_id\", \"category_id\"])\n",
    "    \n",
    "    products_global = products_global.merge(events_theme, how='left', on='event_id',\n",
    "                                            suffixes = (\"_representation\", \"_event\"))\n",
    "    \n",
    "    products_global = order_columns_id(products_global)\n",
    "\n",
    "    # remove useless columns \n",
    "    products_global = products_global.drop(columns = ['type_of_id', 'name_events', 'name_seasons', 'name_categories'])\n",
    "    return products_global"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 31,
   "id": "89dc9685-1de9-4ce3-a6c0-8d7f1931a951",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Products theme columns :  Index(['id_products', 'representation_id', 'pricing_formula_id', 'category_id',\n",
      "       'products_group_id', 'product_pack_id', 'type_of_id', 'amount',\n",
      "       'is_full_price', 'name_categories'],\n",
      "      dtype='object')\n",
      "\n",
      " Representation theme columns :  Index(['event_id', 'id_representation_cap', 'representation_id',\n",
      "       'category_id'],\n",
      "      dtype='object')\n",
      "\n",
      " Events theme columns :  Index(['event_id', 'season_id', 'facility_id', 'event_type_id',\n",
      "       'event_type_key_id', 'facility_key_id', 'street_id', 'name_events',\n",
      "       'name_seasons', 'name_event_types', 'name_facilities'],\n",
      "      dtype='object')\n"
     ]
    },
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>id_products</th>\n",
       "      <th>representation_id</th>\n",
       "      <th>pricing_formula_id</th>\n",
       "      <th>category_id</th>\n",
       "      <th>products_group_id</th>\n",
       "      <th>product_pack_id</th>\n",
       "      <th>event_id</th>\n",
       "      <th>id_representation_cap</th>\n",
       "      <th>season_id</th>\n",
       "      <th>facility_id</th>\n",
       "      <th>event_type_id</th>\n",
       "      <th>event_type_key_id</th>\n",
       "      <th>facility_key_id</th>\n",
       "      <th>street_id</th>\n",
       "      <th>amount</th>\n",
       "      <th>is_full_price</th>\n",
       "      <th>name_event_types</th>\n",
       "      <th>name_facilities</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>10682</td>\n",
       "      <td>914</td>\n",
       "      <td>114</td>\n",
       "      <td>41</td>\n",
       "      <td>10655</td>\n",
       "      <td>1</td>\n",
       "      <td>132</td>\n",
       "      <td>8789</td>\n",
       "      <td>4</td>\n",
       "      <td>1</td>\n",
       "      <td>2</td>\n",
       "      <td>5</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>9.0</td>\n",
       "      <td>False</td>\n",
       "      <td>offre muséale individuel</td>\n",
       "      <td>mucem</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>478</td>\n",
       "      <td>273</td>\n",
       "      <td>131</td>\n",
       "      <td>1</td>\n",
       "      <td>471</td>\n",
       "      <td>1</td>\n",
       "      <td>37</td>\n",
       "      <td>390</td>\n",
       "      <td>2</td>\n",
       "      <td>1</td>\n",
       "      <td>2</td>\n",
       "      <td>2</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>9.5</td>\n",
       "      <td>False</td>\n",
       "      <td>offre muséale individuel</td>\n",
       "      <td>mucem</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>20873</td>\n",
       "      <td>275</td>\n",
       "      <td>137</td>\n",
       "      <td>1</td>\n",
       "      <td>20825</td>\n",
       "      <td>1</td>\n",
       "      <td>37</td>\n",
       "      <td>395</td>\n",
       "      <td>2</td>\n",
       "      <td>1</td>\n",
       "      <td>2</td>\n",
       "      <td>2</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>11.5</td>\n",
       "      <td>False</td>\n",
       "      <td>offre muséale individuel</td>\n",
       "      <td>mucem</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>157142</td>\n",
       "      <td>82519</td>\n",
       "      <td>9</td>\n",
       "      <td>5</td>\n",
       "      <td>156773</td>\n",
       "      <td>1</td>\n",
       "      <td>12365</td>\n",
       "      <td>120199</td>\n",
       "      <td>1754</td>\n",
       "      <td>1</td>\n",
       "      <td>2</td>\n",
       "      <td>4</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>8.0</td>\n",
       "      <td>False</td>\n",
       "      <td>offre muséale individuel</td>\n",
       "      <td>mucem</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>1341</td>\n",
       "      <td>9</td>\n",
       "      <td>93</td>\n",
       "      <td>1</td>\n",
       "      <td>1175</td>\n",
       "      <td>1</td>\n",
       "      <td>8</td>\n",
       "      <td>21</td>\n",
       "      <td>4</td>\n",
       "      <td>1</td>\n",
       "      <td>3</td>\n",
       "      <td>6</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>8.5</td>\n",
       "      <td>False</td>\n",
       "      <td>non défini</td>\n",
       "      <td>mucem</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "   id_products  representation_id  pricing_formula_id  category_id  \\\n",
       "0        10682                914                 114           41   \n",
       "1          478                273                 131            1   \n",
       "2        20873                275                 137            1   \n",
       "3       157142              82519                   9            5   \n",
       "4         1341                  9                  93            1   \n",
       "\n",
       "   products_group_id  product_pack_id  event_id  id_representation_cap  \\\n",
       "0              10655                1       132                   8789   \n",
       "1                471                1        37                    390   \n",
       "2              20825                1        37                    395   \n",
       "3             156773                1     12365                 120199   \n",
       "4               1175                1         8                     21   \n",
       "\n",
       "   season_id  facility_id  event_type_id  event_type_key_id  facility_key_id  \\\n",
       "0          4            1              2                  5                1   \n",
       "1          2            1              2                  2                1   \n",
       "2          2            1              2                  2                1   \n",
       "3       1754            1              2                  4                1   \n",
       "4          4            1              3                  6                1   \n",
       "\n",
       "   street_id  amount  is_full_price          name_event_types name_facilities  \n",
       "0          1     9.0          False  offre muséale individuel           mucem  \n",
       "1          1     9.5          False  offre muséale individuel           mucem  \n",
       "2          1    11.5          False  offre muséale individuel           mucem  \n",
       "3          1     8.0          False  offre muséale individuel           mucem  \n",
       "4          1     8.5          False                non défini           mucem  "
      ]
     },
     "execution_count": 31,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Build the product table with uniform_product_df() and preview its first five rows\n",
    "products_global = uniform_product_df()\n",
    "products_global.head(n=5)"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "7c3211a5-a851-43bc-a1f0-b39d51857fb7",
   "metadata": {},
   "source": [
    "# Fusion des bases locales"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 42,
   "id": "46de1912-4a66-46e5-8b9e-7768b2d2723b",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Product-related merge: inner join of df1_tickets_kpi with products_global on the product id\n",
    "df1_product_purchased = pd.merge(\n",
    "    left=df1_tickets_kpi,\n",
    "    right=products_global,\n",
    "    how='inner',\n",
    "    left_on='product_id',\n",
    "    right_on='id_products',\n",
    ")\n",
    "\n",
    "# Customer-related merge: left join, so every row of df1_customerplus_clean is kept\n",
    "df1_customer = pd.merge(\n",
    "    left=df1_customerplus_clean,\n",
    "    right=df1_campaigns_kpi,\n",
    "    how='left',\n",
    "    on='customer_id',\n",
    ")\n",
    "\n",
    "# Product + customer merge: left join, so every row of df1_customer is kept\n",
    "df1_customer_product = pd.merge(\n",
    "    left=df1_customer,\n",
    "    right=df1_product_purchased,\n",
    "    how='left',\n",
    "    on='customer_id',\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "1e42a790-b215-4107-a969-85005da06ebd",
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.10.13"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}