From 12427e7b18ea9e8acaf6644d20bacd681fe1b212 Mon Sep 17 00:00:00 2001
From: frodrigue-ensae <fanta.rodrigue@ensae.fr>
Date: Wed, 28 Feb 2024 05:51:50 +0000
Subject: [PATCH] Spectacle notebook: add NaN counts and ticket KPI cells

---
 Spectacle/Exploration_spectacle.ipynb | 1392 +++++++++++++++++++------
 1 file changed, 1089 insertions(+), 303 deletions(-)
diff --git a/Spectacle/Exploration_spectacle.ipynb b/Spectacle/Exploration_spectacle.ipynb
index 6324287..1e42d16 100644
--- a/Spectacle/Exploration_spectacle.ipynb
+++ b/Spectacle/Exploration_spectacle.ipynb
@@ -29,7 +29,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 42,
+   "execution_count": 70,
    "id": "cca62d72-f809-41a9-bb06-1be7d6b09307",
    "metadata": {},
    "outputs": [
@@ -42,7 +42,7 @@
        " 'projet-bdc2324-team1/0_Input/Company_10/target_information.csv']"
       ]
      },
-     "execution_count": 42,
+     "execution_count": 70,
      "metadata": {},
      "output_type": "execute_result"
     }
@@ -58,7 +58,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 49,
+   "execution_count": 71,
    "id": "68fb54f3-8eb3-4cd0-966b-000876912fb5",
    "metadata": {},
    "outputs": [
@@ -83,149 +83,119 @@
        "  <thead>\n",
        "    <tr style=\"text-align: right;\">\n",
        "      <th></th>\n",
+       "      <th>ticket_id</th>\n",
        "      <th>customer_id</th>\n",
-       "      <th>street_id</th>\n",
-       "      <th>structure_id</th>\n",
-       "      <th>mcp_contact_id</th>\n",
-       "      <th>fidelity</th>\n",
-       "      <th>tenant_id</th>\n",
-       "      <th>is_partner</th>\n",
-       "      <th>deleted_at</th>\n",
-       "      <th>gender</th>\n",
-       "      <th>is_email_true</th>\n",
-       "      <th>...</th>\n",
-       "      <th>max_price</th>\n",
-       "      <th>ticket_sum</th>\n",
-       "      <th>average_price</th>\n",
-       "      <th>average_purchase_delay</th>\n",
-       "      <th>average_price_basket</th>\n",
-       "      <th>average_ticket_basket</th>\n",
-       "      <th>total_price</th>\n",
-       "      <th>purchase_count</th>\n",
-       "      <th>first_buying_date</th>\n",
-       "      <th>country</th>\n",
+       "      <th>purchase_id</th>\n",
+       "      <th>event_type_id</th>\n",
+       "      <th>supplier_name</th>\n",
+       "      <th>purchase_date</th>\n",
+       "      <th>amount</th>\n",
+       "      <th>is_full_price</th>\n",
+       "      <th>name_event_types</th>\n",
+       "      <th>name_facilities</th>\n",
+       "      <th>name_categories</th>\n",
+       "      <th>name_events</th>\n",
+       "      <th>name_seasons</th>\n",
+       "      <th>start_date_time</th>\n",
+       "      <th>end_date_time</th>\n",
+       "      <th>open</th>\n",
        "    </tr>\n",
        "  </thead>\n",
        "  <tbody>\n",
        "    <tr>\n",
        "      <th>0</th>\n",
-       "      <td>821538</td>\n",
-       "      <td>139</td>\n",
-       "      <td>NaN</td>\n",
-       "      <td>NaN</td>\n",
-       "      <td>0</td>\n",
-       "      <td>875</td>\n",
-       "      <td>False</td>\n",
-       "      <td>NaN</td>\n",
+       "      <td>1799177</td>\n",
+       "      <td>36984</td>\n",
+       "      <td>409613</td>\n",
        "      <td>2</td>\n",
+       "      <td>guichet</td>\n",
+       "      <td>2016-04-28 17:58:26+02:00</td>\n",
+       "      <td>9.0</td>\n",
+       "      <td>False</td>\n",
+       "      <td>danse</td>\n",
+       "      <td>le grand t</td>\n",
+       "      <td>abo t gourmand jeune</td>\n",
+       "      <td>aringa rossa</td>\n",
+       "      <td>test 2016/2017</td>\n",
+       "      <td>2016-09-27 00:00:00+02:00</td>\n",
+       "      <td>1901-01-01 00:09:21+00:09</td>\n",
        "      <td>True</td>\n",
-       "      <td>...</td>\n",
-       "      <td>NaN</td>\n",
-       "      <td>0</td>\n",
-       "      <td>NaN</td>\n",
-       "      <td>NaN</td>\n",
-       "      <td>NaN</td>\n",
-       "      <td>NaN</td>\n",
-       "      <td>0.0</td>\n",
-       "      <td>0</td>\n",
-       "      <td>NaN</td>\n",
-       "      <td>NaN</td>\n",
        "    </tr>\n",
        "    <tr>\n",
        "      <th>1</th>\n",
-       "      <td>809126</td>\n",
-       "      <td>1063</td>\n",
-       "      <td>NaN</td>\n",
-       "      <td>NaN</td>\n",
-       "      <td>0</td>\n",
-       "      <td>875</td>\n",
+       "      <td>1799178</td>\n",
+       "      <td>36984</td>\n",
+       "      <td>409613</td>\n",
+       "      <td>3</td>\n",
+       "      <td>guichet</td>\n",
+       "      <td>2016-04-28 17:58:26+02:00</td>\n",
+       "      <td>9.0</td>\n",
        "      <td>False</td>\n",
-       "      <td>NaN</td>\n",
-       "      <td>2</td>\n",
+       "      <td>cirque</td>\n",
+       "      <td>le grand t</td>\n",
+       "      <td>abo t gourmand jeune</td>\n",
+       "      <td>5èmes hurlants</td>\n",
+       "      <td>test 2016/2017</td>\n",
+       "      <td>2016-11-18 00:00:00+01:00</td>\n",
+       "      <td>1901-01-01 00:09:21+00:09</td>\n",
        "      <td>True</td>\n",
-       "      <td>...</td>\n",
-       "      <td>NaN</td>\n",
-       "      <td>0</td>\n",
-       "      <td>NaN</td>\n",
-       "      <td>NaN</td>\n",
-       "      <td>NaN</td>\n",
-       "      <td>NaN</td>\n",
-       "      <td>0.0</td>\n",
-       "      <td>0</td>\n",
-       "      <td>NaN</td>\n",
-       "      <td>fr</td>\n",
        "    </tr>\n",
        "    <tr>\n",
        "      <th>2</th>\n",
-       "      <td>11005</td>\n",
-       "      <td>1063</td>\n",
-       "      <td>NaN</td>\n",
-       "      <td>NaN</td>\n",
-       "      <td>0</td>\n",
-       "      <td>875</td>\n",
+       "      <td>1799179</td>\n",
+       "      <td>36984</td>\n",
+       "      <td>409613</td>\n",
+       "      <td>1</td>\n",
+       "      <td>guichet</td>\n",
+       "      <td>2016-04-28 17:58:26+02:00</td>\n",
+       "      <td>9.0</td>\n",
        "      <td>False</td>\n",
-       "      <td>NaN</td>\n",
-       "      <td>2</td>\n",
-       "      <td>False</td>\n",
-       "      <td>...</td>\n",
-       "      <td>NaN</td>\n",
-       "      <td>0</td>\n",
-       "      <td>0.0</td>\n",
-       "      <td>NaN</td>\n",
-       "      <td>NaN</td>\n",
-       "      <td>NaN</td>\n",
-       "      <td>NaN</td>\n",
-       "      <td>14</td>\n",
-       "      <td>NaN</td>\n",
-       "      <td>fr</td>\n",
+       "      <td>théâtre</td>\n",
+       "      <td>le grand t</td>\n",
+       "      <td>abo t gourmand jeune</td>\n",
+       "      <td>dom juan</td>\n",
+       "      <td>test 2016/2017</td>\n",
+       "      <td>2016-12-07 00:00:00+01:00</td>\n",
+       "      <td>1901-01-01 00:09:21+00:09</td>\n",
+       "      <td>True</td>\n",
        "    </tr>\n",
        "    <tr>\n",
        "      <th>3</th>\n",
-       "      <td>17663</td>\n",
-       "      <td>12731</td>\n",
-       "      <td>NaN</td>\n",
-       "      <td>NaN</td>\n",
-       "      <td>0</td>\n",
-       "      <td>875</td>\n",
-       "      <td>False</td>\n",
-       "      <td>NaN</td>\n",
-       "      <td>0</td>\n",
-       "      <td>False</td>\n",
-       "      <td>...</td>\n",
-       "      <td>NaN</td>\n",
-       "      <td>0</td>\n",
-       "      <td>0.0</td>\n",
-       "      <td>NaN</td>\n",
-       "      <td>NaN</td>\n",
-       "      <td>NaN</td>\n",
-       "      <td>NaN</td>\n",
+       "      <td>1799180</td>\n",
+       "      <td>36984</td>\n",
+       "      <td>409613</td>\n",
        "      <td>1</td>\n",
-       "      <td>NaN</td>\n",
-       "      <td>fr</td>\n",
+       "      <td>guichet</td>\n",
+       "      <td>2016-04-28 17:58:26+02:00</td>\n",
+       "      <td>9.0</td>\n",
+       "      <td>False</td>\n",
+       "      <td>théâtre</td>\n",
+       "      <td>le grand t</td>\n",
+       "      <td>abo t gourmand jeune</td>\n",
+       "      <td>vanishing point</td>\n",
+       "      <td>test 2016/2017</td>\n",
+       "      <td>2017-01-04 00:00:00+01:00</td>\n",
+       "      <td>1901-01-01 00:09:21+00:09</td>\n",
+       "      <td>True</td>\n",
        "    </tr>\n",
        "    <tr>\n",
        "      <th>4</th>\n",
-       "      <td>38100</td>\n",
-       "      <td>12395</td>\n",
-       "      <td>NaN</td>\n",
-       "      <td>NaN</td>\n",
-       "      <td>0</td>\n",
-       "      <td>875</td>\n",
+       "      <td>1799181</td>\n",
+       "      <td>36984</td>\n",
+       "      <td>409613</td>\n",
+       "      <td>3</td>\n",
+       "      <td>guichet</td>\n",
+       "      <td>2016-04-28 17:58:26+02:00</td>\n",
+       "      <td>12.0</td>\n",
        "      <td>False</td>\n",
-       "      <td>NaN</td>\n",
-       "      <td>0</td>\n",
+       "      <td>cirque</td>\n",
+       "      <td>la cite des congres</td>\n",
+       "      <td>abo t gourmand jeune</td>\n",
+       "      <td>a o lang pho</td>\n",
+       "      <td>test 2016/2017</td>\n",
+       "      <td>2017-01-03 00:00:00+01:00</td>\n",
+       "      <td>1901-01-01 00:09:21+00:09</td>\n",
        "      <td>True</td>\n",
-       "      <td>...</td>\n",
-       "      <td>NaN</td>\n",
-       "      <td>0</td>\n",
-       "      <td>0.0</td>\n",
-       "      <td>NaN</td>\n",
-       "      <td>NaN</td>\n",
-       "      <td>NaN</td>\n",
-       "      <td>NaN</td>\n",
-       "      <td>1</td>\n",
-       "      <td>NaN</td>\n",
-       "      <td>fr</td>\n",
        "    </tr>\n",
        "    <tr>\n",
        "      <th>...</th>\n",
@@ -245,213 +215,183 @@
        "      <td>...</td>\n",
        "      <td>...</td>\n",
        "      <td>...</td>\n",
-       "      <td>...</td>\n",
-       "      <td>...</td>\n",
-       "      <td>...</td>\n",
-       "      <td>...</td>\n",
-       "      <td>...</td>\n",
        "    </tr>\n",
        "    <tr>\n",
-       "      <th>98789</th>\n",
-       "      <td>766266</td>\n",
-       "      <td>139</td>\n",
-       "      <td>NaN</td>\n",
-       "      <td>181304.0</td>\n",
-       "      <td>0</td>\n",
-       "      <td>875</td>\n",
+       "      <th>492309</th>\n",
+       "      <td>3252232</td>\n",
+       "      <td>621716</td>\n",
+       "      <td>710062</td>\n",
+       "      <td>1</td>\n",
+       "      <td>guichet</td>\n",
+       "      <td>2023-03-09 12:08:45+01:00</td>\n",
+       "      <td>7.0</td>\n",
        "      <td>False</td>\n",
-       "      <td>NaN</td>\n",
-       "      <td>2</td>\n",
+       "      <td>théâtre</td>\n",
+       "      <td>cap nort</td>\n",
+       "      <td>tarif sco co 1 seance scolaire</td>\n",
+       "      <td>sur moi, le temps</td>\n",
+       "      <td>2022/2023</td>\n",
+       "      <td>2023-03-13 14:00:00+01:00</td>\n",
+       "      <td>1901-01-01 00:09:21+00:09</td>\n",
        "      <td>True</td>\n",
-       "      <td>...</td>\n",
-       "      <td>NaN</td>\n",
-       "      <td>0</td>\n",
-       "      <td>NaN</td>\n",
-       "      <td>NaN</td>\n",
-       "      <td>NaN</td>\n",
-       "      <td>NaN</td>\n",
-       "      <td>0.0</td>\n",
-       "      <td>0</td>\n",
-       "      <td>NaN</td>\n",
-       "      <td>NaN</td>\n",
        "    </tr>\n",
        "    <tr>\n",
-       "      <th>98790</th>\n",
-       "      <td>766336</td>\n",
-       "      <td>139</td>\n",
-       "      <td>NaN</td>\n",
-       "      <td>178189.0</td>\n",
-       "      <td>0</td>\n",
-       "      <td>875</td>\n",
+       "      <th>492310</th>\n",
+       "      <td>3252233</td>\n",
+       "      <td>621716</td>\n",
+       "      <td>710062</td>\n",
+       "      <td>1</td>\n",
+       "      <td>guichet</td>\n",
+       "      <td>2023-03-09 12:08:45+01:00</td>\n",
+       "      <td>7.0</td>\n",
        "      <td>False</td>\n",
-       "      <td>NaN</td>\n",
-       "      <td>2</td>\n",
+       "      <td>théâtre</td>\n",
+       "      <td>cap nort</td>\n",
+       "      <td>tarif sco co 1 seance scolaire</td>\n",
+       "      <td>sur moi, le temps</td>\n",
+       "      <td>2022/2023</td>\n",
+       "      <td>2023-03-13 14:00:00+01:00</td>\n",
+       "      <td>1901-01-01 00:09:21+00:09</td>\n",
        "      <td>True</td>\n",
-       "      <td>...</td>\n",
-       "      <td>NaN</td>\n",
-       "      <td>0</td>\n",
-       "      <td>NaN</td>\n",
-       "      <td>NaN</td>\n",
-       "      <td>NaN</td>\n",
-       "      <td>NaN</td>\n",
-       "      <td>0.0</td>\n",
-       "      <td>0</td>\n",
-       "      <td>NaN</td>\n",
-       "      <td>NaN</td>\n",
        "    </tr>\n",
        "    <tr>\n",
-       "      <th>98791</th>\n",
-       "      <td>766348</td>\n",
-       "      <td>139</td>\n",
-       "      <td>NaN</td>\n",
-       "      <td>178141.0</td>\n",
-       "      <td>0</td>\n",
-       "      <td>875</td>\n",
+       "      <th>492311</th>\n",
+       "      <td>3252234</td>\n",
+       "      <td>621716</td>\n",
+       "      <td>710062</td>\n",
+       "      <td>1</td>\n",
+       "      <td>guichet</td>\n",
+       "      <td>2023-03-09 12:08:45+01:00</td>\n",
+       "      <td>7.0</td>\n",
        "      <td>False</td>\n",
-       "      <td>NaN</td>\n",
-       "      <td>2</td>\n",
+       "      <td>théâtre</td>\n",
+       "      <td>cap nort</td>\n",
+       "      <td>tarif sco co 1 seance scolaire</td>\n",
+       "      <td>sur moi, le temps</td>\n",
+       "      <td>2022/2023</td>\n",
+       "      <td>2023-03-13 14:00:00+01:00</td>\n",
+       "      <td>1901-01-01 00:09:21+00:09</td>\n",
        "      <td>True</td>\n",
-       "      <td>...</td>\n",
-       "      <td>NaN</td>\n",
-       "      <td>0</td>\n",
-       "      <td>NaN</td>\n",
-       "      <td>NaN</td>\n",
-       "      <td>NaN</td>\n",
-       "      <td>NaN</td>\n",
-       "      <td>0.0</td>\n",
-       "      <td>0</td>\n",
-       "      <td>NaN</td>\n",
-       "      <td>NaN</td>\n",
        "    </tr>\n",
        "    <tr>\n",
-       "      <th>98792</th>\n",
-       "      <td>766363</td>\n",
-       "      <td>139</td>\n",
-       "      <td>NaN</td>\n",
-       "      <td>176807.0</td>\n",
-       "      <td>0</td>\n",
-       "      <td>875</td>\n",
+       "      <th>492312</th>\n",
+       "      <td>3252235</td>\n",
+       "      <td>621716</td>\n",
+       "      <td>710062</td>\n",
+       "      <td>1</td>\n",
+       "      <td>guichet</td>\n",
+       "      <td>2023-03-09 12:08:45+01:00</td>\n",
+       "      <td>7.0</td>\n",
        "      <td>False</td>\n",
-       "      <td>NaN</td>\n",
-       "      <td>2</td>\n",
+       "      <td>théâtre</td>\n",
+       "      <td>cap nort</td>\n",
+       "      <td>tarif sco co 1 seance scolaire</td>\n",
+       "      <td>sur moi, le temps</td>\n",
+       "      <td>2022/2023</td>\n",
+       "      <td>2023-03-13 14:00:00+01:00</td>\n",
+       "      <td>1901-01-01 00:09:21+00:09</td>\n",
        "      <td>True</td>\n",
-       "      <td>...</td>\n",
-       "      <td>NaN</td>\n",
-       "      <td>0</td>\n",
-       "      <td>NaN</td>\n",
-       "      <td>NaN</td>\n",
-       "      <td>NaN</td>\n",
-       "      <td>NaN</td>\n",
-       "      <td>0.0</td>\n",
-       "      <td>0</td>\n",
-       "      <td>NaN</td>\n",
-       "      <td>NaN</td>\n",
        "    </tr>\n",
        "    <tr>\n",
-       "      <th>98793</th>\n",
-       "      <td>766366</td>\n",
-       "      <td>139</td>\n",
-       "      <td>NaN</td>\n",
-       "      <td>176788.0</td>\n",
-       "      <td>0</td>\n",
-       "      <td>875</td>\n",
+       "      <th>492313</th>\n",
+       "      <td>3252236</td>\n",
+       "      <td>621716</td>\n",
+       "      <td>710062</td>\n",
+       "      <td>1</td>\n",
+       "      <td>guichet</td>\n",
+       "      <td>2023-03-09 12:08:45+01:00</td>\n",
+       "      <td>7.0</td>\n",
        "      <td>False</td>\n",
-       "      <td>NaN</td>\n",
-       "      <td>2</td>\n",
+       "      <td>théâtre</td>\n",
+       "      <td>cap nort</td>\n",
+       "      <td>tarif sco co 1 seance scolaire</td>\n",
+       "      <td>sur moi, le temps</td>\n",
+       "      <td>2022/2023</td>\n",
+       "      <td>2023-03-13 14:00:00+01:00</td>\n",
+       "      <td>1901-01-01 00:09:21+00:09</td>\n",
        "      <td>True</td>\n",
-       "      <td>...</td>\n",
-       "      <td>NaN</td>\n",
-       "      <td>0</td>\n",
-       "      <td>NaN</td>\n",
-       "      <td>NaN</td>\n",
-       "      <td>NaN</td>\n",
-       "      <td>NaN</td>\n",
-       "      <td>0.0</td>\n",
-       "      <td>0</td>\n",
-       "      <td>NaN</td>\n",
-       "      <td>NaN</td>\n",
        "    </tr>\n",
        "  </tbody>\n",
        "</table>\n",
-       "<p>98794 rows × 22 columns</p>\n",
+       "<p>492314 rows × 16 columns</p>\n",
        "</div>"
       ],
       "text/plain": [
-       "       customer_id  street_id  structure_id  mcp_contact_id  fidelity  \\\n",
-       "0           821538        139           NaN             NaN         0   \n",
-       "1           809126       1063           NaN             NaN         0   \n",
-       "2            11005       1063           NaN             NaN         0   \n",
-       "3            17663      12731           NaN             NaN         0   \n",
-       "4            38100      12395           NaN             NaN         0   \n",
-       "...            ...        ...           ...             ...       ...   \n",
-       "98789       766266        139           NaN        181304.0         0   \n",
-       "98790       766336        139           NaN        178189.0         0   \n",
-       "98791       766348        139           NaN        178141.0         0   \n",
-       "98792       766363        139           NaN        176807.0         0   \n",
-       "98793       766366        139           NaN        176788.0         0   \n",
+       "        ticket_id  customer_id  purchase_id  event_type_id supplier_name  \\\n",
+       "0         1799177        36984       409613              2       guichet   \n",
+       "1         1799178        36984       409613              3       guichet   \n",
+       "2         1799179        36984       409613              1       guichet   \n",
+       "3         1799180        36984       409613              1       guichet   \n",
+       "4         1799181        36984       409613              3       guichet   \n",
+       "...           ...          ...          ...            ...           ...   \n",
+       "492309    3252232       621716       710062              1       guichet   \n",
+       "492310    3252233       621716       710062              1       guichet   \n",
+       "492311    3252234       621716       710062              1       guichet   \n",
+       "492312    3252235       621716       710062              1       guichet   \n",
+       "492313    3252236       621716       710062              1       guichet   \n",
        "\n",
-       "       tenant_id  is_partner  deleted_at  gender  is_email_true  ...  \\\n",
-       "0            875       False         NaN       2           True  ...   \n",
-       "1            875       False         NaN       2           True  ...   \n",
-       "2            875       False         NaN       2          False  ...   \n",
-       "3            875       False         NaN       0          False  ...   \n",
-       "4            875       False         NaN       0           True  ...   \n",
-       "...          ...         ...         ...     ...            ...  ...   \n",
-       "98789        875       False         NaN       2           True  ...   \n",
-       "98790        875       False         NaN       2           True  ...   \n",
-       "98791        875       False         NaN       2           True  ...   \n",
-       "98792        875       False         NaN       2           True  ...   \n",
-       "98793        875       False         NaN       2           True  ...   \n",
+       "                    purchase_date  amount  is_full_price name_event_types  \\\n",
+       "0       2016-04-28 17:58:26+02:00     9.0          False            danse   \n",
+       "1       2016-04-28 17:58:26+02:00     9.0          False           cirque   \n",
+       "2       2016-04-28 17:58:26+02:00     9.0          False          théâtre   \n",
+       "3       2016-04-28 17:58:26+02:00     9.0          False          théâtre   \n",
+       "4       2016-04-28 17:58:26+02:00    12.0          False           cirque   \n",
+       "...                           ...     ...            ...              ...   \n",
+       "492309  2023-03-09 12:08:45+01:00     7.0          False          théâtre   \n",
+       "492310  2023-03-09 12:08:45+01:00     7.0          False          théâtre   \n",
+       "492311  2023-03-09 12:08:45+01:00     7.0          False          théâtre   \n",
+       "492312  2023-03-09 12:08:45+01:00     7.0          False          théâtre   \n",
+       "492313  2023-03-09 12:08:45+01:00     7.0          False          théâtre   \n",
        "\n",
-       "       max_price ticket_sum  average_price  average_purchase_delay  \\\n",
-       "0            NaN          0            NaN                     NaN   \n",
-       "1            NaN          0            NaN                     NaN   \n",
-       "2            NaN          0            0.0                     NaN   \n",
-       "3            NaN          0            0.0                     NaN   \n",
-       "4            NaN          0            0.0                     NaN   \n",
-       "...          ...        ...            ...                     ...   \n",
-       "98789        NaN          0            NaN                     NaN   \n",
-       "98790        NaN          0            NaN                     NaN   \n",
-       "98791        NaN          0            NaN                     NaN   \n",
-       "98792        NaN          0            NaN                     NaN   \n",
-       "98793        NaN          0            NaN                     NaN   \n",
+       "            name_facilities                 name_categories  \\\n",
+       "0                le grand t            abo t gourmand jeune   \n",
+       "1                le grand t            abo t gourmand jeune   \n",
+       "2                le grand t            abo t gourmand jeune   \n",
+       "3                le grand t            abo t gourmand jeune   \n",
+       "4       la cite des congres            abo t gourmand jeune   \n",
+       "...                     ...                             ...   \n",
+       "492309             cap nort  tarif sco co 1 seance scolaire   \n",
+       "492310             cap nort  tarif sco co 1 seance scolaire   \n",
+       "492311             cap nort  tarif sco co 1 seance scolaire   \n",
+       "492312             cap nort  tarif sco co 1 seance scolaire   \n",
+       "492313             cap nort  tarif sco co 1 seance scolaire   \n",
        "\n",
-       "       average_price_basket  average_ticket_basket  total_price  \\\n",
-       "0                       NaN                    NaN          0.0   \n",
-       "1                       NaN                    NaN          0.0   \n",
-       "2                       NaN                    NaN          NaN   \n",
-       "3                       NaN                    NaN          NaN   \n",
-       "4                       NaN                    NaN          NaN   \n",
-       "...                     ...                    ...          ...   \n",
-       "98789                   NaN                    NaN          0.0   \n",
-       "98790                   NaN                    NaN          0.0   \n",
-       "98791                   NaN                    NaN          0.0   \n",
-       "98792                   NaN                    NaN          0.0   \n",
-       "98793                   NaN                    NaN          0.0   \n",
+       "              name_events    name_seasons            start_date_time  \\\n",
+       "0            aringa rossa  test 2016/2017  2016-09-27 00:00:00+02:00   \n",
+       "1          5èmes hurlants  test 2016/2017  2016-11-18 00:00:00+01:00   \n",
+       "2                dom juan  test 2016/2017  2016-12-07 00:00:00+01:00   \n",
+       "3         vanishing point  test 2016/2017  2017-01-04 00:00:00+01:00   \n",
+       "4            a o lang pho  test 2016/2017  2017-01-03 00:00:00+01:00   \n",
+       "...                   ...             ...                        ...   \n",
+       "492309  sur moi, le temps       2022/2023  2023-03-13 14:00:00+01:00   \n",
+       "492310  sur moi, le temps       2022/2023  2023-03-13 14:00:00+01:00   \n",
+       "492311  sur moi, le temps       2022/2023  2023-03-13 14:00:00+01:00   \n",
+       "492312  sur moi, le temps       2022/2023  2023-03-13 14:00:00+01:00   \n",
+       "492313  sur moi, le temps       2022/2023  2023-03-13 14:00:00+01:00   \n",
        "\n",
-       "       purchase_count  first_buying_date  country  \n",
-       "0                   0                NaN      NaN  \n",
-       "1                   0                NaN       fr  \n",
-       "2                  14                NaN       fr  \n",
-       "3                   1                NaN       fr  \n",
-       "4                   1                NaN       fr  \n",
-       "...               ...                ...      ...  \n",
-       "98789               0                NaN      NaN  \n",
-       "98790               0                NaN      NaN  \n",
-       "98791               0                NaN      NaN  \n",
-       "98792               0                NaN      NaN  \n",
-       "98793               0                NaN      NaN  \n",
+       "                    end_date_time  open  \n",
+       "0       1901-01-01 00:09:21+00:09  True  \n",
+       "1       1901-01-01 00:09:21+00:09  True  \n",
+       "2       1901-01-01 00:09:21+00:09  True  \n",
+       "3       1901-01-01 00:09:21+00:09  True  \n",
+       "4       1901-01-01 00:09:21+00:09  True  \n",
+       "...                           ...   ...  \n",
+       "492309  1901-01-01 00:09:21+00:09  True  \n",
+       "492310  1901-01-01 00:09:21+00:09  True  \n",
+       "492311  1901-01-01 00:09:21+00:09  True  \n",
+       "492312  1901-01-01 00:09:21+00:09  True  \n",
+       "492313  1901-01-01 00:09:21+00:09  True  \n",
        "\n",
-       "[98794 rows x 22 columns]"
+       "[492314 rows x 16 columns]"
       ]
      },
-     "execution_count": 49,
+     "execution_count": 71,
      "metadata": {},
      "output_type": "execute_result"
     }
    ],
    "source": [
-    "customerplus_cleaned"
+    "products_purchased_reduced"
    ]
   },
   {
@@ -842,9 +782,52 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 53,
+   "execution_count": 73,
    "id": "df124880-1e4f-4eaf-b0ef-72bb4f840d45",
    "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "customer_id              0\n",
+       "nb_tickets               0\n",
+       "nb_purchases             0\n",
+       "total_amount             0\n",
+       "nb_suppliers             0\n",
+       "vente_internet_max       0\n",
+       "purchase_date_min        0\n",
+       "purchase_date_max        0\n",
+       "time_between_purchase    0\n",
+       "nb_tickets_internet      0\n",
+       "dtype: int64"
+      ]
+     },
+     "execution_count": 73,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "df_tickets_kpi.isna().sum()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 76,
+   "id": "7e2ab67d-1cf6-41de-804e-23c14e0be7d5",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# KPIs on purchasing behavior\n",
+    "\n",
+    "df_tickets_kpi = tickets_kpi_function(tickets_information = purchases)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 77,
+   "id": "7be68aa3-16de-4319-93d4-0c28258e3dd8",
+   "metadata": {},
    "outputs": [
     {
      "data": {
@@ -1071,7 +1054,7 @@
        "[26105 rows x 10 columns]"
       ]
      },
-     "execution_count": 53,
+     "execution_count": 77,
      "metadata": {},
      "output_type": "execute_result"
     }
@@ -1084,7 +1067,9 @@
    "cell_type": "code",
    "execution_count": 57,
    "id": "4e8c0d75-117f-4400-8d55-b3ae3f43501b",
-   "metadata": {},
+   "metadata": {
+    "scrolled": true
+   },
    "outputs": [
     {
      "data": {
@@ -1468,6 +1453,55 @@
     "df_customerplus_clean"
    ]
   },
+  {
+   "cell_type": "code",
+   "execution_count": 72,
+   "id": "59e3a6f5-97e6-48c6-b3f8-4333a0d94eb5",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "customer_id                   0\n",
+       "street_id                     0\n",
+       "structure_id              96706\n",
+       "mcp_contact_id            19094\n",
+       "fidelity                      0\n",
+       "tenant_id                     0\n",
+       "is_partner                    0\n",
+       "deleted_at                98794\n",
+       "gender                        0\n",
+       "is_email_true                 0\n",
+       "opt_in                        0\n",
+       "last_buying_date          73081\n",
+       "max_price                 73081\n",
+       "ticket_sum                    0\n",
+       "average_price             35539\n",
+       "average_purchase_delay    73081\n",
+       "average_price_basket      73081\n",
+       "average_ticket_basket     73081\n",
+       "total_price               37542\n",
+       "purchase_count                0\n",
+       "first_buying_date         73081\n",
+       "country                   44192\n",
+       "gender_label                  0\n",
+       "gender_female                 0\n",
+       "gender_male                   0\n",
+       "gender_other                  0\n",
+       "country_fr                44192\n",
+       "has_tags                      0\n",
+       "dtype: int64"
+      ]
+     },
+     "execution_count": 72,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "df_customerplus_clean.isna().sum()"
+   ]
+  },
   {
    "cell_type": "code",
    "execution_count": 29,
@@ -1511,7 +1545,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 33,
+   "execution_count": 65,
    "id": "17b89ca1-deea-4139-a6c0-7822cc4e7a90",
    "metadata": {},
    "outputs": [
@@ -1667,7 +1701,7 @@
        "[69258 rows x 5 columns]"
       ]
      },
-     "execution_count": 33,
+     "execution_count": 65,
      "metadata": {},
      "output_type": "execute_result"
     }
@@ -1678,27 +1712,779 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 40,
-   "id": "c90d94ab-cf0e-4d18-9d5e-cb1d22f4d58b",
+   "execution_count": 64,
+   "id": "27a3c2bf-0541-43b4-b62d-4621692f6c66",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "pd.reset_option('display.max_rows')\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 68,
+   "id": "51e57220-021f-4b0f-a2c9-360d612c9f75",
    "metadata": {},
    "outputs": [
     {
-     "ename": "SyntaxError",
-     "evalue": "f-string: expecting '}' (1665996669.py, line 1)",
-     "output_type": "error",
-     "traceback": [
-      "\u001b[0;36m  Cell \u001b[0;32mIn[40], line 1\u001b[0;36m\u001b[0m\n\u001b[0;31m    BUCKET_OUT = f'projet-bdc2324-team1/Generalization/{'musee'}'\u001b[0m\n\u001b[0m                                                         ^\u001b[0m\n\u001b[0;31mSyntaxError\u001b[0m\u001b[0;31m:\u001b[0m f-string: expecting '}'\n"
+     "data": {
+      "text/plain": [
+       "0       Newsletter mensuelle\n",
+       "1       Newsletter mensuelle\n",
+       "2       Newsletter mensuelle\n",
+       "3       Newsletter mensuelle\n",
+       "4       Newsletter mensuelle\n",
+       "                ...         \n",
+       "9995    Newsletter mensuelle\n",
+       "9996    Newsletter mensuelle\n",
+       "9997    Newsletter mensuelle\n",
+       "9998    Newsletter mensuelle\n",
+       "9999    Newsletter mensuelle\n",
+       "Name: target_name, Length: 10000, dtype: object"
+      ]
+     },
+     "execution_count": 68,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "targets[\"target_name\"].head(10000)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 88,
+   "id": "db3748e6-795e-459c-86dd-3389455af217",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "companies = {'musee' : ['1', '2', '3', '4', '101'],\n",
+    "            'sport': ['5', '6', '7', '8', '9'],\n",
+    "            'musique' : ['10', '11', '12', '13', '14']}"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 90,
+   "id": "d6767ba6-94ef-43f9-8f67-15ecdb41a70b",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdin",
+     "output_type": "stream",
+     "text": [
+      "Choisissez le type de compagnie : sport ? musique ? musee ? musique\n"
      ]
     }
    ],
    "source": [
-    "BUCKET_OUT = f'projet-bdc2324-team1/Generalization/{'musee'}'"
+    "type_of_comp = input('Choisissez le type de compagnie : sport ? musique ? musee ?')\n",
+    "list_of_comp = companies[type_of_comp] \n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 91,
+   "id": "050963aa-5cdc-4ff2-a380-16efec89adf0",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Dossier d'exportation\n",
+    "BUCKET_OUT = f'projet-bdc2324-team1/Generalization/{type_of_comp}'"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 100,
+   "id": "21a32b69-de53-45ce-9e31-22c45c223924",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "'projet-bdc2324-team1/Generalization/musique'"
+      ]
+     },
+     "execution_count": 100,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "BUCKET_OUT"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 96,
+   "id": "177c4742-5ec6-4326-b984-09e673791801",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "'projet-bdc2324-team1/Generalization/musique'"
+      ]
+     },
+     "execution_count": 96,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "'projet-bdc2324-team1/Generalization/musique'"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 98,
+   "id": "80c6d397-117e-493d-ab0f-7698dbfa8cc4",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "def display_covering_time(df, company, datecover):\n",
+    "    \"\"\"\n",
+    "    This function stores the list of days covered by the purchases of `company` in datecover[company] and prints the first and last purchase dates\n",
+    "    \"\"\"\n",
+    "    min_date = df['purchase_date'].min().strftime(\"%Y-%m-%d\")\n",
+    "    max_date = df['purchase_date'].max().strftime(\"%Y-%m-%d\")\n",
+    "    datecover[company] = [datetime.strptime(min_date, \"%Y-%m-%d\") + timedelta(days=x) for x in range((datetime.strptime(max_date, \"%Y-%m-%d\") - datetime.strptime(min_date, \"%Y-%m-%d\")).days + 1)]\n",
+    "    print(f'Couverture Company {company} : {min_date} - {max_date}')\n",
+    "    return datecover\n",
+    "\n",
+    "\n",
+    "def compute_time_intersection(datecover):\n",
+    "    \"\"\"\n",
+    "    This function returns the sorted list of days (as YYYY-MM-DD strings) that are covered by every company in datecover\n",
+    "    \"\"\"\n",
+    "    timestamps_sets = [set(timestamps) for timestamps in datecover.values()]\n",
+    "    intersection = set.intersection(*timestamps_sets)\n",
+    "    intersection_list = list(intersection)\n",
+    "    formated_dates = [dt.strftime(\"%Y-%m-%d\") for dt in intersection_list]\n",
+    "    return sorted(formated_dates)\n",
+    "\n",
+    "\n",
+    "def df_coverage_modelization(sport, coverage_train = 0.7):\n",
+    "    \"\"\"\n",
+    "    This function returns start_date, end_of_features and final dates\n",
+    "    that help to construct train and test datasets\n",
+    "    \"\"\"\n",
+    "    datecover = {}\n",
+    "    for company in sport:\n",
+    "        df_products_purchased_reduced = display_databases(company, file_name = \"products_purchased_reduced\",\n",
+    "                                                          datetime_col = ['purchase_date'])\n",
+    "        datecover = display_covering_time(df_products_purchased_reduced, company, datecover)\n",
+    "    #print(datecover.keys())\n",
+    "    dt_coverage = compute_time_intersection(datecover)\n",
+    "    start_date = dt_coverage[0]\n",
+    "    end_of_features = dt_coverage[min(int(coverage_train * len(dt_coverage)), len(dt_coverage) - 1)]\n",
+    "    final_date = dt_coverage[-1]\n",
+    "    return start_date, end_of_features, final_date\n",
+    "    \n",
+    "\n",
+    "def dataset_construction(min_date, end_features_date, max_date, directory_path):\n",
+    "    \n",
+    "    # Import customerplus\n",
+    "    df_customerplus_clean_0 = display_databases(directory_path, file_name = \"customerplus_cleaned\")\n",
+    "    df_campaigns_information = display_databases(directory_path, file_name = \"campaigns_information\", datetime_col = ['opened_at', 'sent_at', 'campaign_sent_at'])\n",
+    "    df_products_purchased_reduced = display_databases(directory_path, file_name = \"products_purchased_reduced\", datetime_col = ['purchase_date'])\n",
+    "    \n",
+    "    # Filtre de cohérence pour la mise en pratique de notre méthode\n",
+    "    max_date =  pd.to_datetime(max_date, utc = True, format = 'ISO8601') \n",
+    "    end_features_date = pd.to_datetime(end_features_date, utc = True, format = 'ISO8601')\n",
+    "    min_date = pd.to_datetime(min_date, utc = True, format = 'ISO8601')\n",
+    "\n",
+    "    #Filtre de la base df_campaigns_information\n",
+    "    df_campaigns_information = df_campaigns_information[(df_campaigns_information['sent_at'] <= end_features_date) & (df_campaigns_information['sent_at'] >= min_date)].copy()\n",
+    "    df_campaigns_information.loc[df_campaigns_information['opened_at'] >= end_features_date, 'opened_at'] = pd.NaT\n",
+    "    \n",
+    "    # Feature-window purchases; df_products_purchased_reduced is kept unfiltered so the target window can still be extracted from it below\n",
+    "    df_products_purchased_features = df_products_purchased_reduced[(df_products_purchased_reduced['purchase_date'] <= end_features_date) & (df_products_purchased_reduced['purchase_date'] >= min_date)].copy()\n",
+    "\n",
+    "    print(\"Data filtering : SUCCESS\")\n",
+    "    \n",
+    "    # Merge everything and build the KPIs\n",
+    "\n",
+    "    # KPIs on advertising campaigns\n",
+    "    df_campaigns_kpi = campaigns_kpi_function(campaigns_information = df_campaigns_information) \n",
+    "\n",
+    "    # KPIs on purchasing behaviour\n",
+    "    df_tickets_kpi = tickets_kpi_function(tickets_information = df_products_purchased_features)\n",
+    "\n",
+    "    # KPI sur les données socio-démographiques\n",
+    "    df_customerplus_clean = customerplus_kpi_function(customerplus_clean = df_customerplus_clean_0)\n",
+    "    \n",
+    "    print(\"KPIs construction : SUCCESS\")\n",
+    "    \n",
+    "    # Fusion avec KPI liés au customer\n",
+    "    df_customer = pd.merge(df_customerplus_clean, df_campaigns_kpi, on = 'customer_id', how = 'left')\n",
+    "    \n",
+    "    # Fill NaN values\n",
+    "    df_customer[['nb_campaigns', 'nb_campaigns_opened']] = df_customer[['nb_campaigns', 'nb_campaigns_opened']].fillna(0)\n",
+    "    \n",
+    "    # Fusion avec KPI liés au comportement d'achat\n",
+    "    df_customer_product = pd.merge(df_tickets_kpi, df_customer, on = 'customer_id', how = 'outer')\n",
+    "    \n",
+    "    # Fill NaN values\n",
+    "    df_customer_product[['nb_tickets', 'nb_purchases', 'total_amount', 'nb_suppliers', 'vente_internet_max', 'nb_tickets_internet']] = df_customer_product[['nb_tickets', 'nb_purchases', 'total_amount', 'nb_suppliers', 'vente_internet_max', 'nb_tickets_internet']].fillna(0)\n",
+    "\n",
+    "    print(\"Explanatory variable construction : SUCCESS\")\n",
+    "\n",
+    "    # 2. Construction of the explained variable \n",
+    "    df_products_purchased_to_predict = df_products_purchased_reduced[(df_products_purchased_reduced['purchase_date'] <= max_date) & (df_products_purchased_reduced['purchase_date'] > end_features_date)]\n",
+    "\n",
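+    "    # Purchase indicator: one row per customer who bought during the target window\n",
+    "    # (assign() returns a new frame, so nothing is written into a slice of the purchases table)\n",
+    "\n",
+    "    y = df_products_purchased_to_predict[['customer_id']].drop_duplicates().assign(y_has_purchased = 1)\n",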
+    "\n",
+    "    print(\"Explained variable construction : SUCCESS\")\n",
+    "    \n",
+    "    # 3. Merge between explained and explanatory variables\n",
+    "    dataset = pd.merge(df_customer_product, y, on = ['customer_id'], how = 'left')\n",
+    "\n",
+    "    # 0 if there is no purchase\n",
+    "    dataset['y_has_purchased'] = dataset['y_has_purchased'].fillna(0)\n",
+    "    \n",
+    "    return dataset"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 99,
+   "id": "2a746097-0cbf-4bd6-b13b-6ee3e5c36fad",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "File path :  projet-bdc2324-team1/0_Input/Company_10/products_purchased_reduced.csv\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "<string>:13: FutureWarning: The argument 'date_parser' is deprecated and will be removed in a future version. Please use 'date_format' instead, or read your data in as 'object' dtype and then call 'to_datetime'.\n"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Couverture Company 10 : 2016-03-07 - 2023-09-25\n",
+      "File path :  projet-bdc2324-team1/0_Input/Company_11/products_purchased_reduced.csv\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "<string>:13: FutureWarning: The argument 'date_parser' is deprecated and will be removed in a future version. Please use 'date_format' instead, or read your data in as 'object' dtype and then call 'to_datetime'.\n"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Couverture Company 11 : 2015-06-26 - 2023-11-08\n",
+      "File path :  projet-bdc2324-team1/0_Input/Company_12/products_purchased_reduced.csv\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "<string>:13: FutureWarning: The argument 'date_parser' is deprecated and will be removed in a future version. Please use 'date_format' instead, or read your data in as 'object' dtype and then call 'to_datetime'.\n",
+      "<string>:13: DtypeWarning: Columns (4,8,10) have mixed types. Specify dtype option on import or set low_memory=False.\n"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Couverture Company 12 : 2016-06-14 - 2023-11-08\n",
+      "File path :  projet-bdc2324-team1/0_Input/Company_13/products_purchased_reduced.csv\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "<string>:13: FutureWarning: The argument 'date_parser' is deprecated and will be removed in a future version. Please use 'date_format' instead, or read your data in as 'object' dtype and then call 'to_datetime'.\n"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Couverture Company 13 : 2010-07-31 - 2023-11-08\n",
+      "File path :  projet-bdc2324-team1/0_Input/Company_14/products_purchased_reduced.csv\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "<string>:13: FutureWarning: The argument 'date_parser' is deprecated and will be removed in a future version. Please use 'date_format' instead, or read your data in as 'object' dtype and then call 'to_datetime'.\n",
+      "<string>:13: DtypeWarning: Columns (8,9) have mixed types. Specify dtype option on import or set low_memory=False.\n"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Couverture Company 14 : 1901-01-01 - 2023-11-08\n",
+      "File path :  projet-bdc2324-team1/0_Input/Company_10/customerplus_cleaned.csv\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "<string>:13: FutureWarning: The argument 'date_parser' is deprecated and will be removed in a future version. Please use 'date_format' instead, or read your data in as 'object' dtype and then call 'to_datetime'.\n"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "File path :  projet-bdc2324-team1/0_Input/Company_10/campaigns_information.csv\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "<string>:13: FutureWarning: The argument 'date_parser' is deprecated and will be removed in a future version. Please use 'date_format' instead, or read your data in as 'object' dtype and then call 'to_datetime'.\n"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "File path :  projet-bdc2324-team1/0_Input/Company_10/products_purchased_reduced.csv\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "<string>:13: FutureWarning: The argument 'date_parser' is deprecated and will be removed in a future version. Please use 'date_format' instead, or read your data in as 'object' dtype and then call 'to_datetime'.\n",
+      "/tmp/ipykernel_438/573049956.py:55: FutureWarning: ChainedAssignmentError: behaviour will change in pandas 3.0!\n",
+      "You are setting values through chained assignment. Currently this works in certain cases, but when using Copy-on-Write (which will become the default behaviour in pandas 3.0) this will never work to update the original DataFrame or Series, because the intermediate object on which we are setting values will behave as a copy.\n",
+      "A typical example is when you are setting values in a column of a DataFrame, like:\n",
+      "\n",
+      "df[\"col\"][row_indexer] = value\n",
+      "\n",
+      "Use `df.loc[row_indexer, \"col\"] = values` instead, to perform the assignment in a single step and ensure this keeps updating the original `df`.\n",
+      "\n",
+      "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n",
+      "\n",
+      "  df_campaigns_information['opened_at'][df_campaigns_information['opened_at'] >= end_features_date] = np.datetime64('NaT')\n",
+      "/tmp/ipykernel_438/573049956.py:55: FutureWarning: Setting an item of incompatible dtype is deprecated and will raise an error in a future version of pandas. Value 'NaT' has dtype incompatible with datetime64[ns, UTC], please explicitly cast to a compatible dtype first.\n",
+      "  df_campaigns_information['opened_at'][df_campaigns_information['opened_at'] >= end_features_date] = np.datetime64('NaT')\n"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Data filtering : SUCCESS\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "<string>:27: SettingWithCopyWarning: \n",
+      "A value is trying to be set on a copy of a slice from a DataFrame\n",
+      "\n",
+      "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "KPIs construction : SUCCESS\n",
+      "Explanatory variable construction : SUCCESS\n",
+      "Explained variable construction : SUCCESS\n",
+      "File path :  projet-bdc2324-team1/0_Input/Company_11/customerplus_cleaned.csv\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "<string>:13: FutureWarning: The argument 'date_parser' is deprecated and will be removed in a future version. Please use 'date_format' instead, or read your data in as 'object' dtype and then call 'to_datetime'.\n"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "File path :  projet-bdc2324-team1/0_Input/Company_11/campaigns_information.csv\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "<string>:13: FutureWarning: The argument 'date_parser' is deprecated and will be removed in a future version. Please use 'date_format' instead, or read your data in as 'object' dtype and then call 'to_datetime'.\n"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "File path :  projet-bdc2324-team1/0_Input/Company_11/products_purchased_reduced.csv\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "<string>:13: FutureWarning: The argument 'date_parser' is deprecated and will be removed in a future version. Please use 'date_format' instead, or read your data in as 'object' dtype and then call 'to_datetime'.\n",
+      "/tmp/ipykernel_438/573049956.py:55: FutureWarning: ChainedAssignmentError: behaviour will change in pandas 3.0!\n",
+      "You are setting values through chained assignment. Currently this works in certain cases, but when using Copy-on-Write (which will become the default behaviour in pandas 3.0) this will never work to update the original DataFrame or Series, because the intermediate object on which we are setting values will behave as a copy.\n",
+      "A typical example is when you are setting values in a column of a DataFrame, like:\n",
+      "\n",
+      "df[\"col\"][row_indexer] = value\n",
+      "\n",
+      "Use `df.loc[row_indexer, \"col\"] = values` instead, to perform the assignment in a single step and ensure this keeps updating the original `df`.\n",
+      "\n",
+      "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n",
+      "\n",
+      "  df_campaigns_information['opened_at'][df_campaigns_information['opened_at'] >= end_features_date] = np.datetime64('NaT')\n",
+      "/tmp/ipykernel_438/573049956.py:55: FutureWarning: Setting an item of incompatible dtype is deprecated and will raise an error in a future version of pandas. Value 'NaT' has dtype incompatible with datetime64[ns, UTC], please explicitly cast to a compatible dtype first.\n",
+      "  df_campaigns_information['opened_at'][df_campaigns_information['opened_at'] >= end_features_date] = np.datetime64('NaT')\n"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Data filtering : SUCCESS\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "<string>:27: SettingWithCopyWarning: \n",
+      "A value is trying to be set on a copy of a slice from a DataFrame\n",
+      "\n",
+      "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "KPIs construction : SUCCESS\n",
+      "Explanatory variable construction : SUCCESS\n",
+      "Explained variable construction : SUCCESS\n",
+      "File path :  projet-bdc2324-team1/0_Input/Company_12/customerplus_cleaned.csv\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "<string>:13: FutureWarning: The argument 'date_parser' is deprecated and will be removed in a future version. Please use 'date_format' instead, or read your data in as 'object' dtype and then call 'to_datetime'.\n"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "File path :  projet-bdc2324-team1/0_Input/Company_12/campaigns_information.csv\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "<string>:13: FutureWarning: The argument 'date_parser' is deprecated and will be removed in a future version. Please use 'date_format' instead, or read your data in as 'object' dtype and then call 'to_datetime'.\n"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "File path :  projet-bdc2324-team1/0_Input/Company_12/products_purchased_reduced.csv\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "<string>:13: FutureWarning: The argument 'date_parser' is deprecated and will be removed in a future version. Please use 'date_format' instead, or read your data in as 'object' dtype and then call 'to_datetime'.\n",
+      "<string>:13: DtypeWarning: Columns (4,8,10) have mixed types. Specify dtype option on import or set low_memory=False.\n",
+      "/tmp/ipykernel_438/573049956.py:55: FutureWarning: ChainedAssignmentError: behaviour will change in pandas 3.0!\n",
+      "You are setting values through chained assignment. Currently this works in certain cases, but when using Copy-on-Write (which will become the default behaviour in pandas 3.0) this will never work to update the original DataFrame or Series, because the intermediate object on which we are setting values will behave as a copy.\n",
+      "A typical example is when you are setting values in a column of a DataFrame, like:\n",
+      "\n",
+      "df[\"col\"][row_indexer] = value\n",
+      "\n",
+      "Use `df.loc[row_indexer, \"col\"] = values` instead, to perform the assignment in a single step and ensure this keeps updating the original `df`.\n",
+      "\n",
+      "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n",
+      "\n",
+      "  df_campaigns_information['opened_at'][df_campaigns_information['opened_at'] >= end_features_date] = np.datetime64('NaT')\n",
+      "<string>:27: SettingWithCopyWarning: \n",
+      "A value is trying to be set on a copy of a slice from a DataFrame\n",
+      "\n",
+      "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Data filtering : SUCCESS\n",
+      "KPIs construction : SUCCESS\n",
+      "Explanatory variable construction : SUCCESS\n",
+      "Explained variable construction : SUCCESS\n",
+      "File path :  projet-bdc2324-team1/0_Input/Company_13/customerplus_cleaned.csv\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "<string>:13: FutureWarning: The argument 'date_parser' is deprecated and will be removed in a future version. Please use 'date_format' instead, or read your data in as 'object' dtype and then call 'to_datetime'.\n"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "File path :  projet-bdc2324-team1/0_Input/Company_13/campaigns_information.csv\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "<string>:13: FutureWarning: The argument 'date_parser' is deprecated and will be removed in a future version. Please use 'date_format' instead, or read your data in as 'object' dtype and then call 'to_datetime'.\n"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "File path :  projet-bdc2324-team1/0_Input/Company_13/products_purchased_reduced.csv\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "<string>:13: FutureWarning: The argument 'date_parser' is deprecated and will be removed in a future version. Please use 'date_format' instead, or read your data in as 'object' dtype and then call 'to_datetime'.\n",
+      "/tmp/ipykernel_438/573049956.py:55: FutureWarning: ChainedAssignmentError: behaviour will change in pandas 3.0!\n",
+      "You are setting values through chained assignment. Currently this works in certain cases, but when using Copy-on-Write (which will become the default behaviour in pandas 3.0) this will never work to update the original DataFrame or Series, because the intermediate object on which we are setting values will behave as a copy.\n",
+      "A typical example is when you are setting values in a column of a DataFrame, like:\n",
+      "\n",
+      "df[\"col\"][row_indexer] = value\n",
+      "\n",
+      "Use `df.loc[row_indexer, \"col\"] = values` instead, to perform the assignment in a single step and ensure this keeps updating the original `df`.\n",
+      "\n",
+      "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n",
+      "\n",
+      "  df_campaigns_information['opened_at'][df_campaigns_information['opened_at'] >= end_features_date] = np.datetime64('NaT')\n",
+      "/tmp/ipykernel_438/573049956.py:55: FutureWarning: Setting an item of incompatible dtype is deprecated and will raise an error in a future version of pandas. Value 'NaT' has dtype incompatible with datetime64[ns, UTC], please explicitly cast to a compatible dtype first.\n",
+      "  df_campaigns_information['opened_at'][df_campaigns_information['opened_at'] >= end_features_date] = np.datetime64('NaT')\n"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Data filtering : SUCCESS\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "<string>:27: SettingWithCopyWarning: \n",
+      "A value is trying to be set on a copy of a slice from a DataFrame\n",
+      "\n",
+      "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "KPIs construction : SUCCESS\n",
+      "Explanatory variable construction : SUCCESS\n",
+      "Explained variable construction : SUCCESS\n",
+      "File path :  projet-bdc2324-team1/0_Input/Company_14/customerplus_cleaned.csv\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "<string>:13: FutureWarning: The argument 'date_parser' is deprecated and will be removed in a future version. Please use 'date_format' instead, or read your data in as 'object' dtype and then call 'to_datetime'.\n"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "File path :  projet-bdc2324-team1/0_Input/Company_14/campaigns_information.csv\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "<string>:13: FutureWarning: The argument 'date_parser' is deprecated and will be removed in a future version. Please use 'date_format' instead, or read your data in as 'object' dtype and then call 'to_datetime'.\n"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "File path :  projet-bdc2324-team1/0_Input/Company_14/products_purchased_reduced.csv\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "<string>:13: FutureWarning: The argument 'date_parser' is deprecated and will be removed in a future version. Please use 'date_format' instead, or read your data in as 'object' dtype and then call 'to_datetime'.\n",
+      "<string>:13: DtypeWarning: Columns (8,9) have mixed types. Specify dtype option on import or set low_memory=False.\n",
+      "/tmp/ipykernel_438/573049956.py:55: FutureWarning: ChainedAssignmentError: behaviour will change in pandas 3.0!\n",
+      "You are setting values through chained assignment. Currently this works in certain cases, but when using Copy-on-Write (which will become the default behaviour in pandas 3.0) this will never work to update the original DataFrame or Series, because the intermediate object on which we are setting values will behave as a copy.\n",
+      "A typical example is when you are setting values in a column of a DataFrame, like:\n",
+      "\n",
+      "df[\"col\"][row_indexer] = value\n",
+      "\n",
+      "Use `df.loc[row_indexer, \"col\"] = values` instead, to perform the assignment in a single step and ensure this keeps updating the original `df`.\n",
+      "\n",
+      "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n",
+      "\n",
+      "  df_campaigns_information['opened_at'][df_campaigns_information['opened_at'] >= end_features_date] = np.datetime64('NaT')\n",
+      "/tmp/ipykernel_438/573049956.py:55: FutureWarning: Setting an item of incompatible dtype is deprecated and will raise an error in a future version of pandas. Value 'NaT' has dtype incompatible with datetime64[ns, UTC], please explicitly cast to a compatible dtype first.\n",
+      "  df_campaigns_information['opened_at'][df_campaigns_information['opened_at'] >= end_features_date] = np.datetime64('NaT')\n"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Data filtering : SUCCESS\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "<string>:27: SettingWithCopyWarning: \n",
+      "A value is trying to be set on a copy of a slice from a DataFrame\n",
+      "\n",
+      "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "KPIs construction : SUCCESS\n",
+      "Explanatory variable construction : SUCCESS\n",
+      "Explained variable construction : SUCCESS\n"
+     ]
+    }
+   ],
+   "source": [
+    "# Create test dataset and train dataset for the selected type of companies (list_of_comp)\n",
+    "\n",
+    "start_date, end_of_features, final_date = df_coverage_modelization(list_of_comp, coverage_train = 0.7)\n",
+    "\n",
+    "for company in list_of_comp:\n",
+    "    dataset_test = dataset_construction(min_date = start_date, end_features_date = end_of_features,\n",
+    "                                        max_date = final_date, directory_path = company) "
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 103,
+   "id": "01900e04-61e7-4a1b-8c9c-b72e42ba9507",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Exportation dataset test : SUCCESS\n"
+     ]
+    }
+   ],
+   "source": [
+    "# Write `dataset_test` to S3 as a CSV whose name embeds the current `company`\n",
+    "FILE_KEY_OUT_S3 = \"\".join([\"dataset_test\", company, \".csv\"])\n",
+    "FILE_PATH_OUT_S3 = \"/\".join([BUCKET_OUT, FILE_KEY_OUT_S3])\n",
+    "\n",
+    "with fs.open(FILE_PATH_OUT_S3, 'w') as csv_handle:\n",
+    "    dataset_test.to_csv(csv_handle, index = False)\n",
+    "\n",
+    "print(\"Exportation dataset test : SUCCESS\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 104,
+   "id": "b0de2e18-edff-416c-b623-e3e23016029d",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "'projet-bdc2324-team1/Generalization/musique/dataset_test14.csv'"
+      ]
+     },
+     "execution_count": 104,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "FILE_PATH_OUT_S3"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 105,
+   "id": "8f56d6ee-82c9-43e2-813d-33d6aaa458dd",
+   "metadata": {},
+   "outputs": [
+    {
+     "ename": "NameError",
+     "evalue": "name 'dataset_test14' is not defined",
+     "output_type": "error",
+     "traceback": [
+      "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
+      "\u001b[0;31mNameError\u001b[0m                                 Traceback (most recent call last)",
+      "Cell \u001b[0;32mIn[105], line 1\u001b[0m\n\u001b[0;32m----> 1\u001b[0m \u001b[43mdataset_test14\u001b[49m\n",
+      "\u001b[0;31mNameError\u001b[0m: name 'dataset_test14' is not defined"
+     ]
+    }
+   ],
+   "source": [
+    "dataset_test"
    ]
   },
   {
    "cell_type": "code",
    "execution_count": null,
-   "id": "d6767ba6-94ef-43f9-8f67-15ecdb41a70b",
+   "id": "9232a8df-c51a-4f10-9fc8-ce4f8ad8aab4",
    "metadata": {},
    "outputs": [],
    "source": []