BDC-team-1/Spectacle/Exploration_spectacle.ipynb

{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "id": "0eefb67b-5399-44fa-9c1c-7724ec1c7cd2",
   "metadata": {},
   "outputs": [],
   "source": [
    "import pandas as pd\n",
    "import os\n",
    "import s3fs\n",
    "import warnings\n",
    "from datetime import date, timedelta, datetime\n",
    "import numpy as np"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "id": "37977b4e-42e7-4d8e-8b9a-6843292fd128",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Load the shared KPI helpers into the notebook namespace. The script sits one\n",
    "# directory above this notebook; later cells call custom_date_parser,\n",
    "# campaigns_kpi_function and tickets_kpi_function, which are presumably defined there.\n",
    "# NOTE(review): exec() runs the file verbatim. The file name starts with a digit, so it\n",
    "# cannot be pulled in with a plain import statement without renaming it.\n",
    "with open('../0_KPI_functions.py') as kpi_script:\n",
    "    # The context manager closes the file handle that a bare open() call would leak.\n",
    "    exec(kpi_script.read())\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "id": "cca62d72-f809-41a9-bb06-1be7d6b09307",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "['projet-bdc2324-team1/0_Input/Company_10/campaigns_information.csv',\n",
       " 'projet-bdc2324-team1/0_Input/Company_10/customerplus_cleaned.csv',\n",
       " 'projet-bdc2324-team1/0_Input/Company_10/products_purchased_reduced.csv',\n",
       " 'projet-bdc2324-team1/0_Input/Company_10/target_information.csv']"
      ]
     },
     "execution_count": 8,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Build an S3 filesystem client pointed at the endpoint named in the environment\n",
    "S3_ENDPOINT_URL = f\"https://{os.environ['AWS_S3_ENDPOINT']}\"\n",
    "fs = s3fs.S3FileSystem(client_kwargs=dict(endpoint_url=S3_ENDPOINT_URL))\n",
    "\n",
    "# List the input files stored under the Company_10 prefix\n",
    "BUCKET = \"projet-bdc2324-team1/0_Input/Company_10\"\n",
    "fs.ls(BUCKET)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "id": "0e1ce56c-2e50-456c-ba97-ed4a699cc8d4",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Read the cleaned customer table of Company_10 from S3\n",
    "BUCKET = \"projet-bdc2324-team1\"\n",
    "FILE_KEY_S3 = \"0_Input/Company_10/customerplus_cleaned.csv\"\n",
    "FILE_PATH_S3 = f\"{BUCKET}/{FILE_KEY_S3}\"\n",
    "\n",
    "with fs.open(FILE_PATH_S3, mode=\"rb\") as file_in:\n",
    "    df_customerplus_cleaned = pd.read_csv(file_in, sep=\",\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "id": "bcdba447-90f7-450c-b4a3-6da656e38493",
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/tmp/ipykernel_491/3710670046.py:6: FutureWarning: The argument 'date_parser' is deprecated and will be removed in a future version. Please use 'date_format' instead, or read your data in as 'object' dtype and then call 'to_datetime'.\n",
      "  purchases = pd.read_csv(file_in, sep=\",\", parse_dates = ['purchase_date'], date_parser=custom_date_parser)\n"
     ]
    }
   ],
   "source": [
    "# Read the purchases table of Company_10 from S3, parsing purchase_date while loading\n",
    "BUCKET = \"projet-bdc2324-team1\"\n",
    "FILE_KEY_S3 = \"0_Input/Company_10/products_purchased_reduced.csv\"\n",
    "FILE_PATH_S3 = f\"{BUCKET}/{FILE_KEY_S3}\"\n",
    "\n",
    "# NOTE(review): pandas reports date_parser as deprecated (see the FutureWarning this cell\n",
    "# emits). Switching to date_format or a post-load to_datetime depends on what\n",
    "# custom_date_parser does, which is not visible in this notebook - confirm before changing.\n",
    "with fs.open(FILE_PATH_S3, mode=\"rb\") as file_in:\n",
    "    purchases = pd.read_csv(\n",
    "        file_in,\n",
    "        sep=\",\",\n",
    "        parse_dates=['purchase_date'],\n",
    "        date_parser=custom_date_parser,\n",
    "    )"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "id": "637aa400-f49a-4d8d-802a-868b241f8a9d",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Load every raw input table of Company_10 from S3. Unlike the `purchases` frame read\n",
    "# earlier, no date parsing is requested here.\n",
    "dic_base = ['campaigns_information', 'customerplus_cleaned', 'products_purchased_reduced', 'target_information']\n",
    "\n",
    "raw_tables = {}\n",
    "for nom_base in dic_base:\n",
    "    FILE_PATH_S3_fanta = 'projet-bdc2324-team1/0_Input/Company_10/' + nom_base + '.csv'\n",
    "    with fs.open(FILE_PATH_S3_fanta, mode=\"rb\") as file_in:\n",
    "        raw_tables[nom_base] = pd.read_csv(file_in, sep=\",\")\n",
    "\n",
    "# Bind each table to an explicit name instead of writing into globals(), so readers and\n",
    "# static tools can see where every DataFrame used below is defined.\n",
    "campaigns_information = raw_tables['campaigns_information']\n",
    "customerplus_cleaned = raw_tables['customerplus_cleaned']\n",
    "products_purchased_reduced = raw_tables['products_purchased_reduced']\n",
    "target_information = raw_tables['target_information']"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "id": "e60529b5-986f-4685-91e1-782c2b022e09",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>id</th>\n",
       "      <th>customer_id</th>\n",
       "      <th>target_name</th>\n",
       "      <th>target_type_is_import</th>\n",
       "      <th>target_type_name</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>1165098</td>\n",
       "      <td>618562</td>\n",
       "      <td>Newsletter mensuelle</td>\n",
       "      <td>False</td>\n",
       "      <td>manual_static_filter</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>1165100</td>\n",
       "      <td>618559</td>\n",
       "      <td>Newsletter mensuelle</td>\n",
       "      <td>False</td>\n",
       "      <td>manual_static_filter</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>1165101</td>\n",
       "      <td>618561</td>\n",
       "      <td>Newsletter mensuelle</td>\n",
       "      <td>False</td>\n",
       "      <td>manual_static_filter</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>1165102</td>\n",
       "      <td>618560</td>\n",
       "      <td>Newsletter mensuelle</td>\n",
       "      <td>False</td>\n",
       "      <td>manual_static_filter</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>1165103</td>\n",
       "      <td>618558</td>\n",
       "      <td>Newsletter mensuelle</td>\n",
       "      <td>False</td>\n",
       "      <td>manual_static_filter</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>...</th>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>69253</th>\n",
       "      <td>1698158</td>\n",
       "      <td>18580</td>\n",
       "      <td>Newsletter mensuelle</td>\n",
       "      <td>False</td>\n",
       "      <td>manual_static_filter</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>69254</th>\n",
       "      <td>1698159</td>\n",
       "      <td>18569</td>\n",
       "      <td>Newsletter mensuelle</td>\n",
       "      <td>False</td>\n",
       "      <td>manual_static_filter</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>69255</th>\n",
       "      <td>1698160</td>\n",
       "      <td>2962</td>\n",
       "      <td>Newsletter mensuelle</td>\n",
       "      <td>False</td>\n",
       "      <td>manual_static_filter</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>69256</th>\n",
       "      <td>1698161</td>\n",
       "      <td>3825</td>\n",
       "      <td>Newsletter mensuelle</td>\n",
       "      <td>False</td>\n",
       "      <td>manual_static_filter</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>69257</th>\n",
       "      <td>1698162</td>\n",
       "      <td>5731</td>\n",
       "      <td>Newsletter mensuelle</td>\n",
       "      <td>False</td>\n",
       "      <td>manual_static_filter</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>69258 rows × 5 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "            id  customer_id           target_name  target_type_is_import  \\\n",
       "0      1165098       618562  Newsletter mensuelle                  False   \n",
       "1      1165100       618559  Newsletter mensuelle                  False   \n",
       "2      1165101       618561  Newsletter mensuelle                  False   \n",
       "3      1165102       618560  Newsletter mensuelle                  False   \n",
       "4      1165103       618558  Newsletter mensuelle                  False   \n",
       "...        ...          ...                   ...                    ...   \n",
       "69253  1698158        18580  Newsletter mensuelle                  False   \n",
       "69254  1698159        18569  Newsletter mensuelle                  False   \n",
       "69255  1698160         2962  Newsletter mensuelle                  False   \n",
       "69256  1698161         3825  Newsletter mensuelle                  False   \n",
       "69257  1698162         5731  Newsletter mensuelle                  False   \n",
       "\n",
       "           target_type_name  \n",
       "0      manual_static_filter  \n",
       "1      manual_static_filter  \n",
       "2      manual_static_filter  \n",
       "3      manual_static_filter  \n",
       "4      manual_static_filter  \n",
       "...                     ...  \n",
       "69253  manual_static_filter  \n",
       "69254  manual_static_filter  \n",
       "69255  manual_static_filter  \n",
       "69256  manual_static_filter  \n",
       "69257  manual_static_filter  \n",
       "\n",
       "[69258 rows x 5 columns]"
      ]
     },
     "execution_count": 13,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Preview the target_information table\n",
    "target_information"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 46,
   "id": "6ece1bb3-5a2d-41f8-be96-eb70697881dc",
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "<string>:27: SettingWithCopyWarning: \n",
      "A value is trying to be set on a copy of a slice from a DataFrame\n",
      "\n",
      "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n"
     ]
    },
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>customer_id</th>\n",
       "      <th>nb_campaigns</th>\n",
       "      <th>nb_campaigns_opened</th>\n",
       "      <th>time_to_open</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>29</td>\n",
       "      <td>4</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaT</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>37</td>\n",
       "      <td>3</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaT</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>39</td>\n",
       "      <td>4</td>\n",
       "      <td>1.0</td>\n",
       "      <td>0 days 05:16:38</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>41</td>\n",
       "      <td>4</td>\n",
       "      <td>1.0</td>\n",
       "      <td>0 days 01:12:29</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>44</td>\n",
       "      <td>4</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaT</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>...</th>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>57138</th>\n",
       "      <td>827940</td>\n",
       "      <td>1</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaT</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>57139</th>\n",
       "      <td>827941</td>\n",
       "      <td>1</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaT</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>57140</th>\n",
       "      <td>827942</td>\n",
       "      <td>1</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaT</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>57141</th>\n",
       "      <td>827943</td>\n",
       "      <td>1</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaT</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>57142</th>\n",
       "      <td>827944</td>\n",
       "      <td>1</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaT</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>57143 rows × 4 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "       customer_id  nb_campaigns  nb_campaigns_opened    time_to_open\n",
       "0               29             4                  NaN             NaT\n",
       "1               37             3                  NaN             NaT\n",
       "2               39             4                  1.0 0 days 05:16:38\n",
       "3               41             4                  1.0 0 days 01:12:29\n",
       "4               44             4                  NaN             NaT\n",
       "...            ...           ...                  ...             ...\n",
       "57138       827940             1                  NaN             NaT\n",
       "57139       827941             1                  NaN             NaT\n",
       "57140       827942             1                  NaN             NaT\n",
       "57141       827943             1                  NaN             NaT\n",
       "57142       827944             1                  NaN             NaT\n",
       "\n",
       "[57143 rows x 4 columns]"
      ]
     },
     "execution_count": 46,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Per-customer campaign KPIs.\n",
    "# `campaigns` is not defined by any earlier cell, so the original call raised NameError\n",
    "# on a fresh kernel; use the campaigns_information table loaded above instead.\n",
    "# NOTE(review): assumes campaigns_information is the frame the stale `campaigns` name held - confirm.\n",
    "campaigns_kpi_function(campaigns_information)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 52,
   "id": "8c42f4a3-bdbc-44fe-a873-3192b983410d",
   "metadata": {},
   "outputs": [],
   "source": [
    "# KPIs describing purchase behaviour\n",
    "df_tickets_kpi = tickets_kpi_function(purchases)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 73,
   "id": "df124880-1e4f-4eaf-b0ef-72bb4f840d45",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "customer_id              0\n",
       "nb_tickets               0\n",
       "nb_purchases             0\n",
       "total_amount             0\n",
       "nb_suppliers             0\n",
       "vente_internet_max       0\n",
       "purchase_date_min        0\n",
       "purchase_date_max        0\n",
       "time_between_purchase    0\n",
       "nb_tickets_internet      0\n",
       "dtype: int64"
      ]
     },
     "execution_count": 73,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Count missing values in each KPI column\n",
    "df_tickets_kpi.isna().sum()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 76,
   "id": "7e2ab67d-1cf6-41de-804e-23c14e0be7d5",
   "metadata": {},
   "outputs": [],
   "source": [
    "# KPIs describing purchase behaviour: recompute df_tickets_kpi from purchases,\n",
    "# passing the frame by keyword\n",
    "df_tickets_kpi = tickets_kpi_function(tickets_information=purchases)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 77,
   "id": "7be68aa3-16de-4319-93d4-0c28258e3dd8",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>customer_id</th>\n",
       "      <th>nb_tickets</th>\n",
       "      <th>nb_purchases</th>\n",
       "      <th>total_amount</th>\n",
       "      <th>nb_suppliers</th>\n",
       "      <th>vente_internet_max</th>\n",
       "      <th>purchase_date_min</th>\n",
       "      <th>purchase_date_max</th>\n",
       "      <th>time_between_purchase</th>\n",
       "      <th>nb_tickets_internet</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>19482</td>\n",
       "      <td>88</td>\n",
       "      <td>29</td>\n",
       "      <td>872.0</td>\n",
       "      <td>2</td>\n",
       "      <td>1</td>\n",
       "      <td>2643.092500</td>\n",
       "      <td>718.149398</td>\n",
       "      <td>1924.943102</td>\n",
       "      <td>8.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>19484</td>\n",
       "      <td>3</td>\n",
       "      <td>2</td>\n",
       "      <td>62.0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>1745.021736</td>\n",
       "      <td>1743.045035</td>\n",
       "      <td>1.976701</td>\n",
       "      <td>0.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>19485</td>\n",
       "      <td>131</td>\n",
       "      <td>21</td>\n",
       "      <td>1878.0</td>\n",
       "      <td>2</td>\n",
       "      <td>1</td>\n",
       "      <td>2649.044745</td>\n",
       "      <td>85.240845</td>\n",
       "      <td>2563.803900</td>\n",
       "      <td>84.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>19486</td>\n",
       "      <td>10</td>\n",
       "      <td>4</td>\n",
       "      <td>96.0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>1944.077604</td>\n",
       "      <td>1742.794225</td>\n",
       "      <td>201.283380</td>\n",
       "      <td>0.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>19487</td>\n",
       "      <td>2</td>\n",
       "      <td>1</td>\n",
       "      <td>33.0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>1742.877766</td>\n",
       "      <td>1742.877766</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>...</th>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>26100</th>\n",
       "      <td>824877</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>-12.0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>5.956111</td>\n",
       "      <td>5.956111</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>26101</th>\n",
       "      <td>824878</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>12.0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>5.956921</td>\n",
       "      <td>5.956921</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>26102</th>\n",
       "      <td>824879</td>\n",
       "      <td>2</td>\n",
       "      <td>1</td>\n",
       "      <td>-38.0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>5.226238</td>\n",
       "      <td>5.226238</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>26103</th>\n",
       "      <td>824991</td>\n",
       "      <td>14</td>\n",
       "      <td>3</td>\n",
       "      <td>-100.0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>3.021539</td>\n",
       "      <td>3.017222</td>\n",
       "      <td>0.004317</td>\n",
       "      <td>0.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>26104</th>\n",
       "      <td>824998</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>25.0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0.072720</td>\n",
       "      <td>0.072720</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.0</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>26105 rows × 10 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "       customer_id  nb_tickets  nb_purchases  total_amount  nb_suppliers  \\\n",
       "0            19482          88            29         872.0             2   \n",
       "1            19484           3             2          62.0             1   \n",
       "2            19485         131            21        1878.0             2   \n",
       "3            19486          10             4          96.0             1   \n",
       "4            19487           2             1          33.0             1   \n",
       "...            ...         ...           ...           ...           ...   \n",
       "26100       824877           1             1         -12.0             1   \n",
       "26101       824878           1             1          12.0             1   \n",
       "26102       824879           2             1         -38.0             1   \n",
       "26103       824991          14             3        -100.0             1   \n",
       "26104       824998           1             1          25.0             1   \n",
       "\n",
       "       vente_internet_max  purchase_date_min  purchase_date_max  \\\n",
       "0                       1        2643.092500         718.149398   \n",
       "1                       0        1745.021736        1743.045035   \n",
       "2                       1        2649.044745          85.240845   \n",
       "3                       0        1944.077604        1742.794225   \n",
       "4                       0        1742.877766        1742.877766   \n",
       "...                   ...                ...                ...   \n",
       "26100                   0           5.956111           5.956111   \n",
       "26101                   0           5.956921           5.956921   \n",
       "26102                   0           5.226238           5.226238   \n",
       "26103                   0           3.021539           3.017222   \n",
       "26104                   0           0.072720           0.072720   \n",
       "\n",
       "       time_between_purchase  nb_tickets_internet  \n",
       "0                1924.943102                  8.0  \n",
       "1                   1.976701                  0.0  \n",
       "2                2563.803900                 84.0  \n",
       "3                 201.283380                  0.0  \n",
       "4                   0.000000                  0.0  \n",
       "...                      ...                  ...  \n",
       "26100               0.000000                  0.0  \n",
       "26101               0.000000                  0.0  \n",
       "26102               0.000000                  0.0  \n",
       "26103               0.004317                  0.0  \n",
       "26104               0.000000                  0.0  \n",
       "\n",
       "[26105 rows x 10 columns]"
      ]
     },
     "execution_count": 77,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Display the ticket KPI table\n",
    "# NOTE(review): in the saved output purchase_date_min is larger than purchase_date_max on\n",
    "# several rows, so these columns look like elapsed-time values rather than dates - confirm\n",
    "# against tickets_kpi_function.\n",
    "df_tickets_kpi"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 57,
   "id": "4e8c0d75-117f-4400-8d55-b3ae3f43501b",
   "metadata": {
    "scrolled": true
   },
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>customer_id</th>\n",
       "      <th>street_id</th>\n",
       "      <th>structure_id</th>\n",
       "      <th>mcp_contact_id</th>\n",
       "      <th>fidelity</th>\n",
       "      <th>tenant_id</th>\n",
       "      <th>is_partner</th>\n",
       "      <th>deleted_at</th>\n",
       "      <th>gender</th>\n",
       "      <th>is_email_true</th>\n",
       "      <th>...</th>\n",
       "      <th>total_price</th>\n",
       "      <th>purchase_count</th>\n",
       "      <th>first_buying_date</th>\n",
       "      <th>country</th>\n",
       "      <th>gender_label</th>\n",
       "      <th>gender_female</th>\n",
       "      <th>gender_male</th>\n",
       "      <th>gender_other</th>\n",
       "      <th>country_fr</th>\n",
       "      <th>has_tags</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>821538</td>\n",
       "      <td>139</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>0</td>\n",
       "      <td>875</td>\n",
       "      <td>False</td>\n",
       "      <td>NaN</td>\n",
       "      <td>2</td>\n",
       "      <td>True</td>\n",
       "      <td>...</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>other</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>NaN</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>809126</td>\n",
       "      <td>1063</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>0</td>\n",
       "      <td>875</td>\n",
       "      <td>False</td>\n",
       "      <td>NaN</td>\n",
       "      <td>2</td>\n",
       "      <td>True</td>\n",
       "      <td>...</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0</td>\n",
       "      <td>NaN</td>\n",
       "      <td>fr</td>\n",
       "      <td>other</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>1.0</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>11005</td>\n",
       "      <td>1063</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>0</td>\n",
       "      <td>875</td>\n",
       "      <td>False</td>\n",
       "      <td>NaN</td>\n",
       "      <td>2</td>\n",
       "      <td>False</td>\n",
       "      <td>...</td>\n",
       "      <td>NaN</td>\n",
       "      <td>14</td>\n",
       "      <td>NaN</td>\n",
       "      <td>fr</td>\n",
       "      <td>other</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>1.0</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>17663</td>\n",
       "      <td>12731</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>0</td>\n",
       "      <td>875</td>\n",
       "      <td>False</td>\n",
       "      <td>NaN</td>\n",
       "      <td>0</td>\n",
       "      <td>False</td>\n",
       "      <td>...</td>\n",
       "      <td>NaN</td>\n",
       "      <td>1</td>\n",
       "      <td>NaN</td>\n",
       "      <td>fr</td>\n",
       "      <td>female</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1.0</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>38100</td>\n",
       "      <td>12395</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>0</td>\n",
       "      <td>875</td>\n",
       "      <td>False</td>\n",
       "      <td>NaN</td>\n",
       "      <td>0</td>\n",
       "      <td>True</td>\n",
       "      <td>...</td>\n",
       "      <td>NaN</td>\n",
       "      <td>1</td>\n",
       "      <td>NaN</td>\n",
       "      <td>fr</td>\n",
       "      <td>female</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1.0</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>...</th>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>98789</th>\n",
       "      <td>766266</td>\n",
       "      <td>139</td>\n",
       "      <td>NaN</td>\n",
       "      <td>181304.0</td>\n",
       "      <td>0</td>\n",
       "      <td>875</td>\n",
       "      <td>False</td>\n",
       "      <td>NaN</td>\n",
       "      <td>2</td>\n",
       "      <td>True</td>\n",
       "      <td>...</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>other</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>NaN</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>98790</th>\n",
       "      <td>766336</td>\n",
       "      <td>139</td>\n",
       "      <td>NaN</td>\n",
       "      <td>178189.0</td>\n",
       "      <td>0</td>\n",
       "      <td>875</td>\n",
       "      <td>False</td>\n",
       "      <td>NaN</td>\n",
       "      <td>2</td>\n",
       "      <td>True</td>\n",
       "      <td>...</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>other</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>NaN</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>98791</th>\n",
       "      <td>766348</td>\n",
       "      <td>139</td>\n",
       "      <td>NaN</td>\n",
       "      <td>178141.0</td>\n",
       "      <td>0</td>\n",
       "      <td>875</td>\n",
       "      <td>False</td>\n",
       "      <td>NaN</td>\n",
       "      <td>2</td>\n",
       "      <td>True</td>\n",
       "      <td>...</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>other</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>NaN</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>98792</th>\n",
       "      <td>766363</td>\n",
       "      <td>139</td>\n",
       "      <td>NaN</td>\n",
       "      <td>176807.0</td>\n",
       "      <td>0</td>\n",
       "      <td>875</td>\n",
       "      <td>False</td>\n",
       "      <td>NaN</td>\n",
       "      <td>2</td>\n",
       "      <td>True</td>\n",
       "      <td>...</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>other</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>NaN</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>98793</th>\n",
       "      <td>766366</td>\n",
       "      <td>139</td>\n",
       "      <td>NaN</td>\n",
       "      <td>176788.0</td>\n",
       "      <td>0</td>\n",
       "      <td>875</td>\n",
       "      <td>False</td>\n",
       "      <td>NaN</td>\n",
       "      <td>2</td>\n",
       "      <td>True</td>\n",
       "      <td>...</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>other</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>NaN</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>98794 rows × 28 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "       customer_id  street_id  structure_id  mcp_contact_id  fidelity  \\\n",
       "0           821538        139           NaN             NaN         0   \n",
       "1           809126       1063           NaN             NaN         0   \n",
       "2            11005       1063           NaN             NaN         0   \n",
       "3            17663      12731           NaN             NaN         0   \n",
       "4            38100      12395           NaN             NaN         0   \n",
       "...            ...        ...           ...             ...       ...   \n",
       "98789       766266        139           NaN        181304.0         0   \n",
       "98790       766336        139           NaN        178189.0         0   \n",
       "98791       766348        139           NaN        178141.0         0   \n",
       "98792       766363        139           NaN        176807.0         0   \n",
       "98793       766366        139           NaN        176788.0         0   \n",
       "\n",
       "       tenant_id  is_partner  deleted_at  gender  is_email_true  ...  \\\n",
       "0            875       False         NaN       2           True  ...   \n",
       "1            875       False         NaN       2           True  ...   \n",
       "2            875       False         NaN       2          False  ...   \n",
       "3            875       False         NaN       0          False  ...   \n",
       "4            875       False         NaN       0           True  ...   \n",
       "...          ...         ...         ...     ...            ...  ...   \n",
       "98789        875       False         NaN       2           True  ...   \n",
       "98790        875       False         NaN       2           True  ...   \n",
       "98791        875       False         NaN       2           True  ...   \n",
       "98792        875       False         NaN       2           True  ...   \n",
       "98793        875       False         NaN       2           True  ...   \n",
       "\n",
       "       total_price purchase_count  first_buying_date  country  gender_label  \\\n",
       "0              0.0              0                NaN      NaN         other   \n",
       "1              0.0              0                NaN       fr         other   \n",
       "2              NaN             14                NaN       fr         other   \n",
       "3              NaN              1                NaN       fr        female   \n",
       "4              NaN              1                NaN       fr        female   \n",
       "...            ...            ...                ...      ...           ...   \n",
       "98789          0.0              0                NaN      NaN         other   \n",
       "98790          0.0              0                NaN      NaN         other   \n",
       "98791          0.0              0                NaN      NaN         other   \n",
       "98792          0.0              0                NaN      NaN         other   \n",
       "98793          0.0              0                NaN      NaN         other   \n",
       "\n",
       "       gender_female  gender_male  gender_other  country_fr  has_tags  \n",
       "0                  0            0             1         NaN         0  \n",
       "1                  0            0             1         1.0         0  \n",
       "2                  0            0             1         1.0         0  \n",
       "3                  1            0             0         1.0         0  \n",
       "4                  1            0             0         1.0         0  \n",
       "...              ...          ...           ...         ...       ...  \n",
       "98789              0            0             1         NaN         0  \n",
       "98790              0            0             1         NaN         0  \n",
       "98791              0            0             1         NaN         0  \n",
       "98792              0            0             1         NaN         0  \n",
       "98793              0            0             1         NaN         0  \n",
       "\n",
       "[98794 rows x 28 columns]"
      ]
     },
     "execution_count": 57,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    " # KPIs on socio-demographic data\n",
    "# NOTE(review): customerplus_kpi_function is not defined in this notebook; presumably it is loaded by the exec of ../0_KPI_functions.py - confirm.\n",
    "df_customerplus_clean = customerplus_kpi_function(df_customerplus_cleaned)\n",
    " \n",
    "df_customerplus_clean"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 72,
   "id": "59e3a6f5-97e6-48c6-b3f8-4333a0d94eb5",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "customer_id                   0\n",
       "street_id                     0\n",
       "structure_id              96706\n",
       "mcp_contact_id            19094\n",
       "fidelity                      0\n",
       "tenant_id                     0\n",
       "is_partner                    0\n",
       "deleted_at                98794\n",
       "gender                        0\n",
       "is_email_true                 0\n",
       "opt_in                        0\n",
       "last_buying_date          73081\n",
       "max_price                 73081\n",
       "ticket_sum                    0\n",
       "average_price             35539\n",
       "average_purchase_delay    73081\n",
       "average_price_basket      73081\n",
       "average_ticket_basket     73081\n",
       "total_price               37542\n",
       "purchase_count                0\n",
       "first_buying_date         73081\n",
       "country                   44192\n",
       "gender_label                  0\n",
       "gender_female                 0\n",
       "gender_male                   0\n",
       "gender_other                  0\n",
       "country_fr                44192\n",
       "has_tags                      0\n",
       "dtype: int64"
      ]
     },
     "execution_count": 72,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Number of missing values in each column of the customer KPI table\n",
    "df_customerplus_clean.isna().sum()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 29,
   "id": "484979cc-d4a4-4d9d-9701-71a4f353a372",
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/tmp/ipykernel_438/1359829443.py:6: FutureWarning: The argument 'date_parser' is deprecated and will be removed in a future version. Please use 'date_format' instead, or read your data in as 'object' dtype and then call 'to_datetime'.\n",
      "  campaigns = pd.read_csv(file_in, sep=\",\", parse_dates = [\"opened_at\", \"sent_at\", \"delivered_at\"], date_parser=custom_date_parser)\n"
     ]
    }
   ],
   "source": [
    "# Load the campaigns table of Company 10 from S3, parsing its three timestamp columns\n",
    "BUCKET = \"projet-bdc2324-team1\"\n",
    "FILE_KEY_S3 = \"0_Input/Company_10/campaigns_information.csv\"\n",
    "FILE_PATH_S3 = BUCKET + \"/\" + FILE_KEY_S3\n",
    "\n",
    "# NOTE(review): custom_date_parser is not defined in this cell; presumably it comes from the exec'd 0_KPI_functions.py - confirm.\n",
    "# The date_parser argument is deprecated in pandas (see the FutureWarning recorded in this cell's output).\n",
    "with fs.open(FILE_PATH_S3, mode=\"rb\") as file_in:\n",
    "    campaigns = pd.read_csv(file_in, sep=\",\", parse_dates = [\"opened_at\", \"sent_at\", \"delivered_at\"], date_parser=custom_date_parser)\n",
    " \n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 32,
   "id": "553ca2e7-ead4-4508-8247-fcc602abd249",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Load the targets table of Company 10 from S3 (no date parsing is requested for this file)\n",
    "BUCKET = \"projet-bdc2324-team1\"\n",
    "FILE_KEY_S3 = \"0_Input/Company_10/target_information.csv\"\n",
    "FILE_PATH_S3 = BUCKET + \"/\" + FILE_KEY_S3\n",
    "\n",
    "with fs.open(FILE_PATH_S3, mode=\"rb\") as file_in:\n",
    "    targets = pd.read_csv(file_in, sep=\",\")\n",
    " \n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 65,
   "id": "17b89ca1-deea-4139-a6c0-7822cc4e7a90",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>id</th>\n",
       "      <th>customer_id</th>\n",
       "      <th>target_name</th>\n",
       "      <th>target_type_is_import</th>\n",
       "      <th>target_type_name</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>1165098</td>\n",
       "      <td>618562</td>\n",
       "      <td>Newsletter mensuelle</td>\n",
       "      <td>False</td>\n",
       "      <td>manual_static_filter</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>1165100</td>\n",
       "      <td>618559</td>\n",
       "      <td>Newsletter mensuelle</td>\n",
       "      <td>False</td>\n",
       "      <td>manual_static_filter</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>1165101</td>\n",
       "      <td>618561</td>\n",
       "      <td>Newsletter mensuelle</td>\n",
       "      <td>False</td>\n",
       "      <td>manual_static_filter</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>1165102</td>\n",
       "      <td>618560</td>\n",
       "      <td>Newsletter mensuelle</td>\n",
       "      <td>False</td>\n",
       "      <td>manual_static_filter</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>1165103</td>\n",
       "      <td>618558</td>\n",
       "      <td>Newsletter mensuelle</td>\n",
       "      <td>False</td>\n",
       "      <td>manual_static_filter</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>...</th>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>69253</th>\n",
       "      <td>1698158</td>\n",
       "      <td>18580</td>\n",
       "      <td>Newsletter mensuelle</td>\n",
       "      <td>False</td>\n",
       "      <td>manual_static_filter</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>69254</th>\n",
       "      <td>1698159</td>\n",
       "      <td>18569</td>\n",
       "      <td>Newsletter mensuelle</td>\n",
       "      <td>False</td>\n",
       "      <td>manual_static_filter</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>69255</th>\n",
       "      <td>1698160</td>\n",
       "      <td>2962</td>\n",
       "      <td>Newsletter mensuelle</td>\n",
       "      <td>False</td>\n",
       "      <td>manual_static_filter</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>69256</th>\n",
       "      <td>1698161</td>\n",
       "      <td>3825</td>\n",
       "      <td>Newsletter mensuelle</td>\n",
       "      <td>False</td>\n",
       "      <td>manual_static_filter</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>69257</th>\n",
       "      <td>1698162</td>\n",
       "      <td>5731</td>\n",
       "      <td>Newsletter mensuelle</td>\n",
       "      <td>False</td>\n",
       "      <td>manual_static_filter</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>69258 rows × 5 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "            id  customer_id           target_name  target_type_is_import  \\\n",
       "0      1165098       618562  Newsletter mensuelle                  False   \n",
       "1      1165100       618559  Newsletter mensuelle                  False   \n",
       "2      1165101       618561  Newsletter mensuelle                  False   \n",
       "3      1165102       618560  Newsletter mensuelle                  False   \n",
       "4      1165103       618558  Newsletter mensuelle                  False   \n",
       "...        ...          ...                   ...                    ...   \n",
       "69253  1698158        18580  Newsletter mensuelle                  False   \n",
       "69254  1698159        18569  Newsletter mensuelle                  False   \n",
       "69255  1698160         2962  Newsletter mensuelle                  False   \n",
       "69256  1698161         3825  Newsletter mensuelle                  False   \n",
       "69257  1698162         5731  Newsletter mensuelle                  False   \n",
       "\n",
       "           target_type_name  \n",
       "0      manual_static_filter  \n",
       "1      manual_static_filter  \n",
       "2      manual_static_filter  \n",
       "3      manual_static_filter  \n",
       "4      manual_static_filter  \n",
       "...                     ...  \n",
       "69253  manual_static_filter  \n",
       "69254  manual_static_filter  \n",
       "69255  manual_static_filter  \n",
       "69256  manual_static_filter  \n",
       "69257  manual_static_filter  \n",
       "\n",
       "[69258 rows x 5 columns]"
      ]
     },
     "execution_count": 65,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Display the targets table (pandas truncates the rendering to the first and last rows)\n",
    "targets"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 64,
   "id": "27a3c2bf-0541-43b4-b62d-4621692f6c66",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Restore pandas' default row-display limit; reset_option only takes the option name,\n",
    "# so the stray second argument (70000) has been removed\n",
    "pd.reset_option('display.max_rows')\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 68,
   "id": "51e57220-021f-4b0f-a2c9-360d612c9f75",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "0       Newsletter mensuelle\n",
       "1       Newsletter mensuelle\n",
       "2       Newsletter mensuelle\n",
       "3       Newsletter mensuelle\n",
       "4       Newsletter mensuelle\n",
       "                ...         \n",
       "9995    Newsletter mensuelle\n",
       "9996    Newsletter mensuelle\n",
       "9997    Newsletter mensuelle\n",
       "9998    Newsletter mensuelle\n",
       "9999    Newsletter mensuelle\n",
       "Name: target_name, Length: 10000, dtype: object"
      ]
     },
     "execution_count": 68,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Inspect the target_name values of the first 10000 rows\n",
    "targets[\"target_name\"].head(10000)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 88,
   "id": "db3748e6-795e-459c-86dd-3389455af217",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Company identifiers (as strings) grouped by type of company; the keys are the\n",
    "# answers accepted by the company-type prompt\n",
    "companies = {'musee' : ['1', '2', '3', '4', '101'],\n",
    "            'sport': ['5', '6', '7', '8', '9'],\n",
    "            'musique' : ['10', '11', '12', '13', '14']}"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 90,
   "id": "d6767ba6-94ef-43f9-8f67-15ecdb41a70b",
   "metadata": {},
   "outputs": [
    {
     "name": "stdin",
     "output_type": "stream",
     "text": [
      "Choisissez le type de compagnie : sport ? musique ? musee ? musique\n"
     ]
    }
   ],
   "source": [
    "# Ask which type of company to process; the answer is stripped and lower-cased so that\n",
    "# stray spaces or capitals do not break the lookup in `companies`\n",
    "type_of_comp = input('Choisissez le type de compagnie : sport ? musique ? musee ?').strip().lower()\n",
    "if type_of_comp not in companies:\n",
    "    # Same exception type as the bare dictionary lookup, with an explicit message\n",
    "    raise KeyError(f\"Unknown company type {type_of_comp!r}; expected one of {sorted(companies)}\")\n",
    "list_of_comp = companies[type_of_comp]\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 91,
   "id": "050963aa-5cdc-4ff2-a380-16efec89adf0",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Export folder for the selected type of company\n",
    "BUCKET_OUT = f'projet-bdc2324-team1/Generalization/{type_of_comp}'"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 100,
   "id": "21a32b69-de53-45ce-9e31-22c45c223924",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "'projet-bdc2324-team1/Generalization/musique'"
      ]
     },
     "execution_count": 100,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Check the export path that was built from the chosen company type\n",
    "BUCKET_OUT"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 96,
   "id": "177c4742-5ec6-4326-b984-09e673791801",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "'projet-bdc2324-team1/Generalization/musique'"
      ]
     },
     "execution_count": 96,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Scratch cell: literal string identical to the BUCKET_OUT value displayed when 'musique' is chosen\n",
    "'projet-bdc2324-team1/Generalization/musique'"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 98,
   "id": "80c6d397-117e-493d-ab0f-7698dbfa8cc4",
   "metadata": {},
   "outputs": [],
   "source": [
    "def display_covering_time(df, company, datecover):\n",
    "    \"\"\"\n",
    "    This function draws the time coverage of each company\n",
    "    \"\"\"\n",
    "    min_date = df['purchase_date'].min().strftime(\"%Y-%m-%d\")\n",
    "    max_date = df['purchase_date'].max().strftime(\"%Y-%m-%d\")\n",
    "    datecover[company] = [datetime.strptime(min_date, \"%Y-%m-%d\") + timedelta(days=x) for x in range((datetime.strptime(max_date, \"%Y-%m-%d\") - datetime.strptime(min_date, \"%Y-%m-%d\")).days)]\n",
    "    print(f'Couverture Company {company} : {min_date} - {max_date}')\n",
    "    return datecover\n",
    "\n",
    "\n",
    "def compute_time_intersection(datecover):\n",
    "    \"\"\"\n",
    "    This function returns the time coverage for all companies\n",
    "    \"\"\"\n",
    "    timestamps_sets = [set(timestamps) for timestamps in datecover.values()]\n",
    "    intersection = set.intersection(*timestamps_sets)\n",
    "    intersection_list = list(intersection)\n",
    "    formated_dates = [dt.strftime(\"%Y-%m-%d\") for dt in intersection_list]\n",
    "    return sorted(formated_dates)\n",
    "\n",
    "\n",
    "def df_coverage_modelization(sport, coverage_train = 0.7):\n",
    "    \"\"\"\n",
    "    Compute the start_date, end_of_features and final_date used to build\n",
    "    the train and test datasets.\n",
    "\n",
    "    Parameters\n",
    "    ----------\n",
    "    sport : iterable\n",
    "        Company identifiers whose purchases are loaded with display_databases.\n",
    "    coverage_train : float, default 0.7\n",
    "        Share of the common coverage period (between 0 and 1) kept for the\n",
    "        features; end_of_features is the day located at that fraction.\n",
    "\n",
    "    Returns\n",
    "    -------\n",
    "    tuple of str\n",
    "        (start_date, end_of_features, final_date) as 'YYYY-MM-DD' strings.\n",
    "\n",
    "    Raises\n",
    "    ------\n",
    "    IndexError\n",
    "        If the companies share no covered day.\n",
    "    \"\"\"\n",
    "    datecover = {}\n",
    "    for company in sport:\n",
    "        df_products_purchased_reduced = display_databases(company, file_name = \"products_purchased_reduced\",\n",
    "                                                          datetime_col = ['purchase_date'])\n",
    "        datecover = display_covering_time(df_products_purchased_reduced, company, datecover)\n",
    "\n",
    "    dt_coverage = compute_time_intersection(datecover)\n",
    "    start_date = dt_coverage[0]\n",
    "    # Honour coverage_train (previously a hard-coded 0.7) and clamp the index so\n",
    "    # that coverage_train = 1 maps to the last day instead of overflowing\n",
    "    split_index = min(int(coverage_train * len(dt_coverage)), len(dt_coverage) - 1)\n",
    "    end_of_features = dt_coverage[split_index]\n",
    "    final_date = dt_coverage[-1]\n",
    "    return start_date, end_of_features, final_date\n",
    "    \n",
    "\n",
    "def dataset_construction(min_date, end_features_date, max_date, directory_path):\n",
    "    \"\"\"\n",
    "    Build the modelling dataset of one company.\n",
    "\n",
    "    Explanatory variables are KPIs computed on campaigns and purchases observed\n",
    "    between min_date and end_features_date; the explained variable\n",
    "    y_has_purchased flags customers with a purchase after end_features_date\n",
    "    and up to max_date.\n",
    "\n",
    "    Parameters\n",
    "    ----------\n",
    "    min_date, end_features_date, max_date : str or datetime-like\n",
    "        Window bounds, converted to UTC timestamps with\n",
    "        pd.to_datetime(..., utc = True, format = 'ISO8601').\n",
    "    directory_path : str\n",
    "        Company identifier passed to display_databases to load its tables.\n",
    "\n",
    "    Returns\n",
    "    -------\n",
    "    pandas.DataFrame\n",
    "        Customer KPIs merged on customer_id, plus y_has_purchased\n",
    "        (1 if the customer purchased in the target window, 0 otherwise).\n",
    "    \"\"\"\n",
    "    # Import the raw tables of the company\n",
    "    df_customerplus_clean_0 = display_databases(directory_path, file_name = \"customerplus_cleaned\")\n",
    "    df_campaigns_information = display_databases(directory_path, file_name = \"campaigns_information\", datetime_col = ['opened_at', 'sent_at', 'campaign_sent_at'])\n",
    "    df_products_purchased_reduced = display_databases(directory_path, file_name = \"products_purchased_reduced\", datetime_col = ['purchase_date'])\n",
    "\n",
    "    # Consistency filter: express every bound as a UTC timestamp\n",
    "    max_date = pd.to_datetime(max_date, utc = True, format = 'ISO8601')\n",
    "    end_features_date = pd.to_datetime(end_features_date, utc = True, format = 'ISO8601')\n",
    "    min_date = pd.to_datetime(min_date, utc = True, format = 'ISO8601')\n",
    "\n",
    "    # Keep the campaigns sent during the feature window; work on a copy so the\n",
    "    # assignment below does not target a slice of the loaded table\n",
    "    sent_in_window = (df_campaigns_information['sent_at'] <= end_features_date) & (df_campaigns_information['sent_at'] >= min_date)\n",
    "    df_campaigns_information = df_campaigns_information[sent_in_window].copy()\n",
    "    # Openings dated at or after the end of the feature window are blanked out\n",
    "    opened_too_late = df_campaigns_information['opened_at'] >= end_features_date\n",
    "    df_campaigns_information.loc[opened_too_late, 'opened_at'] = pd.NaT\n",
    "\n",
    "    # Split the purchases into the target window and the feature window. The target\n",
    "    # rows must be taken from the full table: selecting them after the feature\n",
    "    # filter would always give an empty frame.\n",
    "    purchase_dates = df_products_purchased_reduced['purchase_date']\n",
    "    in_target_window = (purchase_dates <= max_date) & (purchase_dates > end_features_date)\n",
    "    in_feature_window = (purchase_dates <= end_features_date) & (purchase_dates >= min_date)\n",
    "    df_products_purchased_to_predict = df_products_purchased_reduced[in_target_window]\n",
    "    df_products_purchased_reduced = df_products_purchased_reduced[in_feature_window].copy()\n",
    "\n",
    "    print(\"Data filtering : SUCCESS\")\n",
    "\n",
    "    # 1. Construction of the explanatory variables (merge of all KPIs)\n",
    "\n",
    "    # KPIs on advertising campaigns\n",
    "    df_campaigns_kpi = campaigns_kpi_function(campaigns_information = df_campaigns_information)\n",
    "\n",
    "    # KPIs on purchasing behaviour\n",
    "    df_tickets_kpi = tickets_kpi_function(tickets_information = df_products_purchased_reduced)\n",
    "\n",
    "    # KPIs on socio-demographic data\n",
    "    df_customerplus_clean = customerplus_kpi_function(customerplus_clean = df_customerplus_clean_0)\n",
    "\n",
    "    print(\"KPIs construction : SUCCESS\")\n",
    "\n",
    "    # Merge the customer table with the campaign KPIs\n",
    "    df_customer = pd.merge(df_customerplus_clean, df_campaigns_kpi, on = 'customer_id', how = 'left')\n",
    "\n",
    "    # Customers without any campaign get 0 instead of NaN\n",
    "    campaign_cols = ['nb_campaigns', 'nb_campaigns_opened']\n",
    "    df_customer[campaign_cols] = df_customer[campaign_cols].fillna(0)\n",
    "\n",
    "    # Merge with the purchasing-behaviour KPIs\n",
    "    df_customer_product = pd.merge(df_tickets_kpi, df_customer, on = 'customer_id', how = 'outer')\n",
    "\n",
    "    # Customers without any purchase get 0 instead of NaN\n",
    "    ticket_cols = ['nb_tickets', 'nb_purchases', 'total_amount', 'nb_suppliers', 'vente_internet_max', 'nb_tickets_internet']\n",
    "    df_customer_product[ticket_cols] = df_customer_product[ticket_cols].fillna(0)\n",
    "\n",
    "    print(\"Explanatory variable construction : SUCCESS\")\n",
    "\n",
    "    # 2. Construction of the explained variable: one row per customer who bought\n",
    "    # during the target window, flagged with 1\n",
    "    y = df_products_purchased_to_predict[['customer_id']].drop_duplicates().assign(y_has_purchased = 1)\n",
    "\n",
    "    print(\"Explained variable construction : SUCCESS\")\n",
    "\n",
    "    # 3. Merge between explained and explanatory variables\n",
    "    dataset = pd.merge(df_customer_product, y, on = ['customer_id'], how = 'left')\n",
    "\n",
    "    # 0 if there is no purchase (fillna returns a new object, so assign it back)\n",
    "    dataset['y_has_purchased'] = dataset['y_has_purchased'].fillna(0)\n",
    "\n",
    "    return dataset"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 99,
   "id": "2a746097-0cbf-4bd6-b13b-6ee3e5c36fad",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "File path :  projet-bdc2324-team1/0_Input/Company_10/products_purchased_reduced.csv\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "<string>:13: FutureWarning: The argument 'date_parser' is deprecated and will be removed in a future version. Please use 'date_format' instead, or read your data in as 'object' dtype and then call 'to_datetime'.\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Couverture Company 10 : 2016-03-07 - 2023-09-25\n",
      "File path :  projet-bdc2324-team1/0_Input/Company_11/products_purchased_reduced.csv\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "<string>:13: FutureWarning: The argument 'date_parser' is deprecated and will be removed in a future version. Please use 'date_format' instead, or read your data in as 'object' dtype and then call 'to_datetime'.\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Couverture Company 11 : 2015-06-26 - 2023-11-08\n",
      "File path :  projet-bdc2324-team1/0_Input/Company_12/products_purchased_reduced.csv\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "<string>:13: FutureWarning: The argument 'date_parser' is deprecated and will be removed in a future version. Please use 'date_format' instead, or read your data in as 'object' dtype and then call 'to_datetime'.\n",
      "<string>:13: DtypeWarning: Columns (4,8,10) have mixed types. Specify dtype option on import or set low_memory=False.\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Couverture Company 12 : 2016-06-14 - 2023-11-08\n",
      "File path :  projet-bdc2324-team1/0_Input/Company_13/products_purchased_reduced.csv\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "<string>:13: FutureWarning: The argument 'date_parser' is deprecated and will be removed in a future version. Please use 'date_format' instead, or read your data in as 'object' dtype and then call 'to_datetime'.\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Couverture Company 13 : 2010-07-31 - 2023-11-08\n",
      "File path :  projet-bdc2324-team1/0_Input/Company_14/products_purchased_reduced.csv\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "<string>:13: FutureWarning: The argument 'date_parser' is deprecated and will be removed in a future version. Please use 'date_format' instead, or read your data in as 'object' dtype and then call 'to_datetime'.\n",
      "<string>:13: DtypeWarning: Columns (8,9) have mixed types. Specify dtype option on import or set low_memory=False.\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Couverture Company 14 : 1901-01-01 - 2023-11-08\n",
      "File path :  projet-bdc2324-team1/0_Input/Company_10/customerplus_cleaned.csv\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "<string>:13: FutureWarning: The argument 'date_parser' is deprecated and will be removed in a future version. Please use 'date_format' instead, or read your data in as 'object' dtype and then call 'to_datetime'.\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "File path :  projet-bdc2324-team1/0_Input/Company_10/campaigns_information.csv\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "<string>:13: FutureWarning: The argument 'date_parser' is deprecated and will be removed in a future version. Please use 'date_format' instead, or read your data in as 'object' dtype and then call 'to_datetime'.\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "File path :  projet-bdc2324-team1/0_Input/Company_10/products_purchased_reduced.csv\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "<string>:13: FutureWarning: The argument 'date_parser' is deprecated and will be removed in a future version. Please use 'date_format' instead, or read your data in as 'object' dtype and then call 'to_datetime'.\n",
      "/tmp/ipykernel_438/573049956.py:55: FutureWarning: ChainedAssignmentError: behaviour will change in pandas 3.0!\n",
      "You are setting values through chained assignment. Currently this works in certain cases, but when using Copy-on-Write (which will become the default behaviour in pandas 3.0) this will never work to update the original DataFrame or Series, because the intermediate object on which we are setting values will behave as a copy.\n",
      "A typical example is when you are setting values in a column of a DataFrame, like:\n",
      "\n",
      "df[\"col\"][row_indexer] = value\n",
      "\n",
      "Use `df.loc[row_indexer, \"col\"] = values` instead, to perform the assignment in a single step and ensure this keeps updating the original `df`.\n",
      "\n",
      "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n",
      "\n",
      "  df_campaigns_information['opened_at'][df_campaigns_information['opened_at'] >= end_features_date] = np.datetime64('NaT')\n",
      "/tmp/ipykernel_438/573049956.py:55: FutureWarning: Setting an item of incompatible dtype is deprecated and will raise an error in a future version of pandas. Value 'NaT' has dtype incompatible with datetime64[ns, UTC], please explicitly cast to a compatible dtype first.\n",
      "  df_campaigns_information['opened_at'][df_campaigns_information['opened_at'] >= end_features_date] = np.datetime64('NaT')\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Data filtering : SUCCESS\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "<string>:27: SettingWithCopyWarning: \n",
      "A value is trying to be set on a copy of a slice from a DataFrame\n",
      "\n",
      "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "KPIs construction : SUCCESS\n",
      "Explanatory variable construction : SUCCESS\n",
      "Explained variable construction : SUCCESS\n",
      "File path :  projet-bdc2324-team1/0_Input/Company_11/customerplus_cleaned.csv\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "<string>:13: FutureWarning: The argument 'date_parser' is deprecated and will be removed in a future version. Please use 'date_format' instead, or read your data in as 'object' dtype and then call 'to_datetime'.\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "File path :  projet-bdc2324-team1/0_Input/Company_11/campaigns_information.csv\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "<string>:13: FutureWarning: The argument 'date_parser' is deprecated and will be removed in a future version. Please use 'date_format' instead, or read your data in as 'object' dtype and then call 'to_datetime'.\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "File path :  projet-bdc2324-team1/0_Input/Company_11/products_purchased_reduced.csv\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "<string>:13: FutureWarning: The argument 'date_parser' is deprecated and will be removed in a future version. Please use 'date_format' instead, or read your data in as 'object' dtype and then call 'to_datetime'.\n",
      "/tmp/ipykernel_438/573049956.py:55: FutureWarning: ChainedAssignmentError: behaviour will change in pandas 3.0!\n",
      "You are setting values through chained assignment. Currently this works in certain cases, but when using Copy-on-Write (which will become the default behaviour in pandas 3.0) this will never work to update the original DataFrame or Series, because the intermediate object on which we are setting values will behave as a copy.\n",
      "A typical example is when you are setting values in a column of a DataFrame, like:\n",
      "\n",
      "df[\"col\"][row_indexer] = value\n",
      "\n",
      "Use `df.loc[row_indexer, \"col\"] = values` instead, to perform the assignment in a single step and ensure this keeps updating the original `df`.\n",
      "\n",
      "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n",
      "\n",
      "  df_campaigns_information['opened_at'][df_campaigns_information['opened_at'] >= end_features_date] = np.datetime64('NaT')\n",
      "/tmp/ipykernel_438/573049956.py:55: FutureWarning: Setting an item of incompatible dtype is deprecated and will raise an error in a future version of pandas. Value 'NaT' has dtype incompatible with datetime64[ns, UTC], please explicitly cast to a compatible dtype first.\n",
      "  df_campaigns_information['opened_at'][df_campaigns_information['opened_at'] >= end_features_date] = np.datetime64('NaT')\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Data filtering : SUCCESS\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "<string>:27: SettingWithCopyWarning: \n",
      "A value is trying to be set on a copy of a slice from a DataFrame\n",
      "\n",
      "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "KPIs construction : SUCCESS\n",
      "Explanatory variable construction : SUCCESS\n",
      "Explained variable construction : SUCCESS\n",
      "File path :  projet-bdc2324-team1/0_Input/Company_12/customerplus_cleaned.csv\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "<string>:13: FutureWarning: The argument 'date_parser' is deprecated and will be removed in a future version. Please use 'date_format' instead, or read your data in as 'object' dtype and then call 'to_datetime'.\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "File path :  projet-bdc2324-team1/0_Input/Company_12/campaigns_information.csv\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "<string>:13: FutureWarning: The argument 'date_parser' is deprecated and will be removed in a future version. Please use 'date_format' instead, or read your data in as 'object' dtype and then call 'to_datetime'.\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "File path :  projet-bdc2324-team1/0_Input/Company_12/products_purchased_reduced.csv\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "<string>:13: FutureWarning: The argument 'date_parser' is deprecated and will be removed in a future version. Please use 'date_format' instead, or read your data in as 'object' dtype and then call 'to_datetime'.\n",
      "<string>:13: DtypeWarning: Columns (4,8,10) have mixed types. Specify dtype option on import or set low_memory=False.\n",
      "/tmp/ipykernel_438/573049956.py:55: FutureWarning: ChainedAssignmentError: behaviour will change in pandas 3.0!\n",
      "You are setting values through chained assignment. Currently this works in certain cases, but when using Copy-on-Write (which will become the default behaviour in pandas 3.0) this will never work to update the original DataFrame or Series, because the intermediate object on which we are setting values will behave as a copy.\n",
      "A typical example is when you are setting values in a column of a DataFrame, like:\n",
      "\n",
      "df[\"col\"][row_indexer] = value\n",
      "\n",
      "Use `df.loc[row_indexer, \"col\"] = values` instead, to perform the assignment in a single step and ensure this keeps updating the original `df`.\n",
      "\n",
      "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n",
      "\n",
      "  df_campaigns_information['opened_at'][df_campaigns_information['opened_at'] >= end_features_date] = np.datetime64('NaT')\n",
      "<string>:27: SettingWithCopyWarning: \n",
      "A value is trying to be set on a copy of a slice from a DataFrame\n",
      "\n",
      "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Data filtering : SUCCESS\n",
      "KPIs construction : SUCCESS\n",
      "Explanatory variable construction : SUCCESS\n",
      "Explained variable construction : SUCCESS\n",
      "File path :  projet-bdc2324-team1/0_Input/Company_13/customerplus_cleaned.csv\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "<string>:13: FutureWarning: The argument 'date_parser' is deprecated and will be removed in a future version. Please use 'date_format' instead, or read your data in as 'object' dtype and then call 'to_datetime'.\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "File path :  projet-bdc2324-team1/0_Input/Company_13/campaigns_information.csv\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "<string>:13: FutureWarning: The argument 'date_parser' is deprecated and will be removed in a future version. Please use 'date_format' instead, or read your data in as 'object' dtype and then call 'to_datetime'.\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "File path :  projet-bdc2324-team1/0_Input/Company_13/products_purchased_reduced.csv\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "<string>:13: FutureWarning: The argument 'date_parser' is deprecated and will be removed in a future version. Please use 'date_format' instead, or read your data in as 'object' dtype and then call 'to_datetime'.\n",
      "/tmp/ipykernel_438/573049956.py:55: FutureWarning: ChainedAssignmentError: behaviour will change in pandas 3.0!\n",
      "You are setting values through chained assignment. Currently this works in certain cases, but when using Copy-on-Write (which will become the default behaviour in pandas 3.0) this will never work to update the original DataFrame or Series, because the intermediate object on which we are setting values will behave as a copy.\n",
      "A typical example is when you are setting values in a column of a DataFrame, like:\n",
      "\n",
      "df[\"col\"][row_indexer] = value\n",
      "\n",
      "Use `df.loc[row_indexer, \"col\"] = values` instead, to perform the assignment in a single step and ensure this keeps updating the original `df`.\n",
      "\n",
      "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n",
      "\n",
      "  df_campaigns_information['opened_at'][df_campaigns_information['opened_at'] >= end_features_date] = np.datetime64('NaT')\n",
      "/tmp/ipykernel_438/573049956.py:55: FutureWarning: Setting an item of incompatible dtype is deprecated and will raise an error in a future version of pandas. Value 'NaT' has dtype incompatible with datetime64[ns, UTC], please explicitly cast to a compatible dtype first.\n",
      "  df_campaigns_information['opened_at'][df_campaigns_information['opened_at'] >= end_features_date] = np.datetime64('NaT')\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Data filtering : SUCCESS\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "<string>:27: SettingWithCopyWarning: \n",
      "A value is trying to be set on a copy of a slice from a DataFrame\n",
      "\n",
      "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "KPIs construction : SUCCESS\n",
      "Explanatory variable construction : SUCCESS\n",
      "Explained variable construction : SUCCESS\n",
      "File path :  projet-bdc2324-team1/0_Input/Company_14/customerplus_cleaned.csv\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "<string>:13: FutureWarning: The argument 'date_parser' is deprecated and will be removed in a future version. Please use 'date_format' instead, or read your data in as 'object' dtype and then call 'to_datetime'.\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "File path :  projet-bdc2324-team1/0_Input/Company_14/campaigns_information.csv\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "<string>:13: FutureWarning: The argument 'date_parser' is deprecated and will be removed in a future version. Please use 'date_format' instead, or read your data in as 'object' dtype and then call 'to_datetime'.\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "File path :  projet-bdc2324-team1/0_Input/Company_14/products_purchased_reduced.csv\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "<string>:13: FutureWarning: The argument 'date_parser' is deprecated and will be removed in a future version. Please use 'date_format' instead, or read your data in as 'object' dtype and then call 'to_datetime'.\n",
      "<string>:13: DtypeWarning: Columns (8,9) have mixed types. Specify dtype option on import or set low_memory=False.\n",
      "/tmp/ipykernel_438/573049956.py:55: FutureWarning: ChainedAssignmentError: behaviour will change in pandas 3.0!\n",
      "You are setting values through chained assignment. Currently this works in certain cases, but when using Copy-on-Write (which will become the default behaviour in pandas 3.0) this will never work to update the original DataFrame or Series, because the intermediate object on which we are setting values will behave as a copy.\n",
      "A typical example is when you are setting values in a column of a DataFrame, like:\n",
      "\n",
      "df[\"col\"][row_indexer] = value\n",
      "\n",
      "Use `df.loc[row_indexer, \"col\"] = values` instead, to perform the assignment in a single step and ensure this keeps updating the original `df`.\n",
      "\n",
      "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n",
      "\n",
      "  df_campaigns_information['opened_at'][df_campaigns_information['opened_at'] >= end_features_date] = np.datetime64('NaT')\n",
      "/tmp/ipykernel_438/573049956.py:55: FutureWarning: Setting an item of incompatible dtype is deprecated and will raise an error in a future version of pandas. Value 'NaT' has dtype incompatible with datetime64[ns, UTC], please explicitly cast to a compatible dtype first.\n",
      "  df_campaigns_information['opened_at'][df_campaigns_information['opened_at'] >= end_features_date] = np.datetime64('NaT')\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Data filtering : SUCCESS\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "<string>:27: SettingWithCopyWarning: \n",
      "A value is trying to be set on a copy of a slice from a DataFrame\n",
      "\n",
      "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "KPIs construction : SUCCESS\n",
      "Explanatory variable construction : SUCCESS\n",
      "Explained variable construction : SUCCESS\n"
     ]
    }
   ],
   "source": [
    "# Build the modelling dataset of every company in list_of_comp and export each one to S3.\n",
    "# The export is done inside the loop: `dataset_test` is overwritten at every iteration, so\n",
    "# exporting after the loop would only ever write the last company's dataset.\n",
    "# `fs` and `BUCKET_OUT` must already be defined by earlier cells.\n",
    "\n",
    "# Date boundaries returned by df_coverage_modelization, reused for every company\n",
    "start_date, end_of_features, final_date = df_coverage_modelization(list_of_comp, coverage_train = 0.7)\n",
    "\n",
    "for company in list_of_comp:\n",
    "    dataset_test = dataset_construction(min_date = start_date, end_features_date = end_of_features,\n",
    "                                        max_date = final_date, directory_path = company)\n",
    "\n",
    "    # Exportation: one CSV per company, named after the company identifier\n",
    "    FILE_KEY_OUT_S3 = \"dataset_test\" + company + \".csv\"\n",
    "    FILE_PATH_OUT_S3 = BUCKET_OUT + \"/\" + FILE_KEY_OUT_S3\n",
    "\n",
    "    with fs.open(FILE_PATH_OUT_S3, 'w') as file_out:\n",
    "        dataset_test.to_csv(file_out, index = False)\n",
    "\n",
    "    print(\"Exportation dataset test : SUCCESS\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 103,
   "id": "01900e04-61e7-4a1b-8c9c-b72e42ba9507",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Exportation dataset test : SUCCESS\n"
     ]
    }
   ],
   "source": [
    "# Export `dataset_test` to S3 as a CSV file whose name embeds `company`.\n",
    "# NOTE(review): `company` and `dataset_test` are whatever the loop in the previous cell left\n",
    "# behind (its last iteration) - confirm that exporting only that company is intended.\n",
    "FILE_KEY_OUT_S3 = \"\".join((\"dataset_test\", company, \".csv\"))\n",
    "FILE_PATH_OUT_S3 = \"/\".join((BUCKET_OUT, FILE_KEY_OUT_S3))\n",
    "\n",
    "with fs.open(FILE_PATH_OUT_S3, 'w') as file_out:\n",
    "    dataset_test.to_csv(file_out, index = False)\n",
    "\n",
    "print(\"Exportation dataset test : SUCCESS\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 104,
   "id": "b0de2e18-edff-416c-b623-e3e23016029d",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "'projet-bdc2324-team1/Generalization/musique/dataset_test14.csv'"
      ]
     },
     "execution_count": 104,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Show the value of FILE_PATH_OUT_S3, the S3 path built by the export cell\n",
    "FILE_PATH_OUT_S3"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 105,
   "id": "8f56d6ee-82c9-43e2-813d-33d6aaa458dd",
   "metadata": {},
   "outputs": [
    {
     "ename": "NameError",
     "evalue": "name 'dataset_test14' is not defined",
     "output_type": "error",
     "traceback": [
      "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
      "\u001b[0;31mNameError\u001b[0m                                 Traceback (most recent call last)",
      "Cell \u001b[0;32mIn[105], line 1\u001b[0m\n\u001b[0;32m----> 1\u001b[0m \u001b[43mdataset_test14\u001b[49m\n",
      "\u001b[0;31mNameError\u001b[0m: name 'dataset_test14' is not defined"
     ]
    }
   ],
   "source": [
    "dataset_test14"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "9232a8df-c51a-4f10-9fc8-ce4f8ad8aab4",
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.11.6"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}