Merge remote-tracking branch 'origin/KPI_construction' into events_theme

2024-02-08 09:42:37 +00:00 · 2024-02-08 09:42:37 +00:00 · 07be599d5e
commit 07be599d5e
parent fe049764cb 9b7d6a5785
2 changed files with 1590 additions and 506 deletions
--- a/0_Cleaning_and_merge.ipynb
+++ b/0_Cleaning_and_merge.ipynb
--- a/Exploration_billet_AJ.ipynb
+++ b/Exploration_billet_AJ.ipynb
@ -143,7 +143,7 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 25,
+   "execution_count": 6,
   "id": "dd6a3518-b752-4a1e-b77b-9e03e853c3ed",
   "metadata": {},
   "outputs": [
@ -151,7 +151,7 @@
     "name": "stderr",
     "output_type": "stream",
     "text": [
-      "/tmp/ipykernel_683/4081512283.py:10: DtypeWarning: Columns (1) have mixed types. Specify dtype option on import or set low_memory=False.\n",
+      "/tmp/ipykernel_15285/4081512283.py:10: DtypeWarning: Columns (1) have mixed types. Specify dtype option on import or set low_memory=False.\n",
      "  df = pd.read_csv(file_in)\n"
     ]
    }
@ -2731,7 +2731,7 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 60,
+   "execution_count": 8,
   "id": "da5d4708-7147-4cc8-8686-52d4bcba5a7a",
   "metadata": {},
   "outputs": [
@ -2739,7 +2739,7 @@
     "name": "stderr",
     "output_type": "stream",
     "text": [
-      "/tmp/ipykernel_619/2625134041.py:3: SettingWithCopyWarning: \n",
+      "/tmp/ipykernel_15285/2625134041.py:3: SettingWithCopyWarning: \n",
      "A value is trying to be set on a copy of a slice from a DataFrame\n",
      "\n",
      "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n",
@ -2795,11 +2795,9 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 57,
-   "id": "8072bbb7-1360-4882-bb2b-2f43b6beea0d",
-   "metadata": {
-    "scrolled": true
-   },
+   "execution_count": 10,
+   "id": "c74746de-0bf4-4b83-9a75-f1d3183abf1c",
+   "metadata": {},
   "outputs": [
    {
     "data": {
@ -2831,226 +2829,42 @@
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
-       "      <th>8793</th>\n",
-       "      <td>4584599</td>\n",
-       "      <td>1</td>\n",
-       "      <td>consentement optin jeune public</td>\n",
+       "      <th>0</th>\n",
+       "      <td>1184824</td>\n",
+       "      <td>645400</td>\n",
+       "      <td>DDCP PROMO Réseau livres</td>\n",
       "      <td>False</td>\n",
       "      <td>manual_static_filter</td>\n",
       "    </tr>\n",
       "    <tr>\n",
-       "      <th>13249</th>\n",
-       "      <td>4567465</td>\n",
-       "      <td>1</td>\n",
-       "      <td>DDCP rentrée culturelle 2023</td>\n",
+       "      <th>1</th>\n",
+       "      <td>210571</td>\n",
+       "      <td>2412</td>\n",
+       "      <td>DDCP PROMO Réseau livres</td>\n",
       "      <td>False</td>\n",
       "      <td>manual_static_filter</td>\n",
       "    </tr>\n",
       "    <tr>\n",
-       "      <th>21424</th>\n",
-       "      <td>4544805</td>\n",
-       "      <td>1</td>\n",
-       "      <td>spectateurs cine dimanche_cine concert_2122</td>\n",
+       "      <th>2</th>\n",
+       "      <td>210572</td>\n",
+       "      <td>4536</td>\n",
+       "      <td>DDCP PROMO Réseau livres</td>\n",
       "      <td>False</td>\n",
       "      <td>manual_static_filter</td>\n",
       "    </tr>\n",
       "    <tr>\n",
-       "      <th>21665</th>\n",
-       "      <td>4544911</td>\n",
-       "      <td>1</td>\n",
-       "      <td>DDCP Cine 2023</td>\n",
+       "      <th>3</th>\n",
+       "      <td>210573</td>\n",
+       "      <td>6736</td>\n",
+       "      <td>DDCP PROMO Réseau livres</td>\n",
       "      <td>False</td>\n",
       "      <td>manual_static_filter</td>\n",
       "    </tr>\n",
       "    <tr>\n",
-       "      <th>22811</th>\n",
-       "      <td>4545766</td>\n",
-       "      <td>1</td>\n",
-       "      <td>DDCP OLBJ! 2023</td>\n",
-       "      <td>False</td>\n",
-       "      <td>manual_static_filter</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>57305</th>\n",
-       "      <td>4457909</td>\n",
-       "      <td>1</td>\n",
-       "      <td>ddcp_promo_visiteurs occasionnels_musee_8mois</td>\n",
-       "      <td>False</td>\n",
-       "      <td>manual_dynamic_filter</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>58843</th>\n",
-       "      <td>3688872</td>\n",
-       "      <td>1</td>\n",
-       "      <td>DDCP promo livemag</td>\n",
-       "      <td>False</td>\n",
-       "      <td>manual_static_filter</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>66813</th>\n",
-       "      <td>4313646</td>\n",
-       "      <td>1</td>\n",
-       "      <td>DDCP spectateurs Classique mais pas que 2022</td>\n",
-       "      <td>False</td>\n",
-       "      <td>manual_static_filter</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>68367</th>\n",
-       "      <td>4547662</td>\n",
-       "      <td>1</td>\n",
-       "      <td>ddcp_promo_musee_au moins 3 achats_dps8mois</td>\n",
-       "      <td>False</td>\n",
-       "      <td>manual_dynamic_filter</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>77320</th>\n",
-       "      <td>4285520</td>\n",
-       "      <td>1</td>\n",
-       "      <td>DDCP spectateurs Iminente</td>\n",
-       "      <td>False</td>\n",
-       "      <td>manual_static_filter</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>84350</th>\n",
-       "      <td>4037805</td>\n",
-       "      <td>1</td>\n",
-       "      <td>DDCP spectateurs Marseille Jazz 18-19-21</td>\n",
-       "      <td>False</td>\n",
-       "      <td>manual_static_filter</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>85383</th>\n",
-       "      <td>4569504</td>\n",
-       "      <td>1</td>\n",
-       "      <td>DDCP rendez-vous de septembre offre spéciale</td>\n",
-       "      <td>False</td>\n",
-       "      <td>manual_static_filter</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>92868</th>\n",
-       "      <td>4433064</td>\n",
-       "      <td>1</td>\n",
-       "      <td>ddcp_promo_plein air_ateliers_jardins</td>\n",
-       "      <td>False</td>\n",
-       "      <td>manual_static_filter</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>99670</th>\n",
-       "      <td>3858684</td>\n",
-       "      <td>1</td>\n",
-       "      <td>Acid Arab</td>\n",
-       "      <td>False</td>\n",
-       "      <td>manual_static_filter</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>105477</th>\n",
-       "      <td>4321810</td>\n",
-       "      <td>1</td>\n",
-       "      <td>Arenametrix_bascule tel vers sib</td>\n",
-       "      <td>False</td>\n",
-       "      <td>manual_static_filter</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>169513</th>\n",
-       "      <td>3697992</td>\n",
-       "      <td>1</td>\n",
-       "      <td>ddcp_achats billets nb dps 19052021</td>\n",
-       "      <td>False</td>\n",
-       "      <td>manual_static_filter</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>214421</th>\n",
-       "      <td>2925324</td>\n",
-       "      <td>1</td>\n",
-       "      <td>consentement optout scolaires</td>\n",
-       "      <td>False</td>\n",
-       "      <td>manual_static_filter</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>234546</th>\n",
-       "      <td>4575957</td>\n",
-       "      <td>1</td>\n",
-       "      <td>Portrait de Leila shahid</td>\n",
-       "      <td>False</td>\n",
-       "      <td>manual_static_filter</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>259808</th>\n",
-       "      <td>3722259</td>\n",
-       "      <td>1</td>\n",
-       "      <td>consentement optin b2b</td>\n",
-       "      <td>False</td>\n",
-       "      <td>manual_static_filter</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>274380</th>\n",
-       "      <td>4510423</td>\n",
-       "      <td>1</td>\n",
-       "      <td>DDCP_marseille_jazz_2023</td>\n",
-       "      <td>False</td>\n",
-       "      <td>manual_static_filter</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>307511</th>\n",
-       "      <td>5174466</td>\n",
-       "      <td>1</td>\n",
-       "      <td>ddcp actoral 21-22</td>\n",
-       "      <td>False</td>\n",
-       "      <td>manual_static_filter</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>357509</th>\n",
-       "      <td>4442526</td>\n",
-       "      <td>1</td>\n",
-       "      <td>ddcp musique barvalo</td>\n",
-       "      <td>False</td>\n",
-       "      <td>manual_static_filter</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>392920</th>\n",
-       "      <td>4390642</td>\n",
-       "      <td>1</td>\n",
-       "      <td>ddcp_md_promo_spectateurs theatre contempo</td>\n",
-       "      <td>False</td>\n",
-       "      <td>manual_static_filter</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>449620</th>\n",
-       "      <td>4411897</td>\n",
-       "      <td>1</td>\n",
-       "      <td>FORMATION _ acheteurs optin last year</td>\n",
-       "      <td>False</td>\n",
-       "      <td>manual_dynamic_filter</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>503809</th>\n",
-       "      <td>4734591</td>\n",
-       "      <td>1</td>\n",
-       "      <td>consentement optin mediation specialisee</td>\n",
-       "      <td>False</td>\n",
-       "      <td>manual_static_filter</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>651222</th>\n",
-       "      <td>3554426</td>\n",
-       "      <td>1</td>\n",
-       "      <td>consentement optin b2c</td>\n",
-       "      <td>False</td>\n",
-       "      <td>manual_static_filter</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>654246</th>\n",
-       "      <td>5182212</td>\n",
-       "      <td>1</td>\n",
-       "      <td>DDCP spectateurs Festival de Marseille 2023</td>\n",
-       "      <td>False</td>\n",
-       "      <td>manual_static_filter</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>654395</th>\n",
-       "      <td>5182456</td>\n",
-       "      <td>1</td>\n",
-       "      <td>rencontres_echelle_spectateurs_2021_2023</td>\n",
+       "      <th>4</th>\n",
+       "      <td>210574</td>\n",
+       "      <td>38210</td>\n",
+       "      <td>DDCP PROMO Réseau livres</td>\n",
       "      <td>False</td>\n",
       "      <td>manual_static_filter</td>\n",
       "    </tr>\n",
@ -3059,80 +2873,241 @@
       "</div>"
      ],
      "text/plain": [
-       "             id  customer_id                                    target_name  \\\n",
-       "8793    4584599            1                consentement optin jeune public   \n",
-       "13249   4567465            1                   DDCP rentrée culturelle 2023   \n",
-       "21424   4544805            1    spectateurs cine dimanche_cine concert_2122   \n",
-       "21665   4544911            1                                 DDCP Cine 2023   \n",
-       "22811   4545766            1                                DDCP OLBJ! 2023   \n",
-       "57305   4457909            1  ddcp_promo_visiteurs occasionnels_musee_8mois   \n",
-       "58843   3688872            1                             DDCP promo livemag   \n",
-       "66813   4313646            1   DDCP spectateurs Classique mais pas que 2022   \n",
-       "68367   4547662            1    ddcp_promo_musee_au moins 3 achats_dps8mois   \n",
-       "77320   4285520            1                      DDCP spectateurs Iminente   \n",
-       "84350   4037805            1       DDCP spectateurs Marseille Jazz 18-19-21   \n",
-       "85383   4569504            1   DDCP rendez-vous de septembre offre spéciale   \n",
-       "92868   4433064            1          ddcp_promo_plein air_ateliers_jardins   \n",
-       "99670   3858684            1                                      Acid Arab   \n",
-       "105477  4321810            1               Arenametrix_bascule tel vers sib   \n",
-       "169513  3697992            1            ddcp_achats billets nb dps 19052021   \n",
-       "214421  2925324            1                  consentement optout scolaires   \n",
-       "234546  4575957            1                       Portrait de Leila shahid   \n",
-       "259808  3722259            1                         consentement optin b2b   \n",
-       "274380  4510423            1                       DDCP_marseille_jazz_2023   \n",
-       "307511  5174466            1                             ddcp actoral 21-22   \n",
-       "357509  4442526            1                           ddcp musique barvalo   \n",
-       "392920  4390642            1     ddcp_md_promo_spectateurs theatre contempo   \n",
-       "449620  4411897            1          FORMATION _ acheteurs optin last year   \n",
-       "503809  4734591            1       consentement optin mediation specialisee   \n",
-       "651222  3554426            1                         consentement optin b2c   \n",
-       "654246  5182212            1    DDCP spectateurs Festival de Marseille 2023   \n",
-       "654395  5182456            1       rencontres_echelle_spectateurs_2021_2023   \n",
+       "        id  customer_id               target_name  target_type_is_import  \\\n",
+       "0  1184824       645400  DDCP PROMO Réseau livres                  False   \n",
+       "1   210571         2412  DDCP PROMO Réseau livres                  False   \n",
+       "2   210572         4536  DDCP PROMO Réseau livres                  False   \n",
+       "3   210573         6736  DDCP PROMO Réseau livres                  False   \n",
+       "4   210574        38210  DDCP PROMO Réseau livres                  False   \n",
       "\n",
-       "        target_type_is_import       target_type_name  \n",
-       "8793                    False   manual_static_filter  \n",
-       "13249                   False   manual_static_filter  \n",
-       "21424                   False   manual_static_filter  \n",
-       "21665                   False   manual_static_filter  \n",
-       "22811                   False   manual_static_filter  \n",
-       "57305                   False  manual_dynamic_filter  \n",
-       "58843                   False   manual_static_filter  \n",
-       "66813                   False   manual_static_filter  \n",
-       "68367                   False  manual_dynamic_filter  \n",
-       "77320                   False   manual_static_filter  \n",
-       "84350                   False   manual_static_filter  \n",
-       "85383                   False   manual_static_filter  \n",
-       "92868                   False   manual_static_filter  \n",
-       "99670                   False   manual_static_filter  \n",
-       "105477                  False   manual_static_filter  \n",
-       "169513                  False   manual_static_filter  \n",
-       "214421                  False   manual_static_filter  \n",
-       "234546                  False   manual_static_filter  \n",
-       "259808                  False   manual_static_filter  \n",
-       "274380                  False   manual_static_filter  \n",
-       "307511                  False   manual_static_filter  \n",
-       "357509                  False   manual_static_filter  \n",
-       "392920                  False   manual_static_filter  \n",
-       "449620                  False  manual_dynamic_filter  \n",
-       "503809                  False   manual_static_filter  \n",
-       "651222                  False   manual_static_filter  \n",
-       "654246                  False   manual_static_filter  \n",
-       "654395                  False   manual_static_filter  "
+       "       target_type_name  \n",
+       "0  manual_static_filter  \n",
+       "1  manual_static_filter  \n",
+       "2  manual_static_filter  \n",
+       "3  manual_static_filter  \n",
+       "4  manual_static_filter  "
      ]
     },
-     "execution_count": 57,
+     "execution_count": 10,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
-    "df1_targets_full[df1_targets_full['customer_id'] == 1]"
+    "df1_targets_full.head()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 14,
+   "id": "47c55fa0-b2f3-46f9-9abf-c4ab66bd9fcb",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "[nltk_data] Downloading package punkt to /home/onyxia/nltk_data...\n",
+      "[nltk_data]   Package punkt is already up-to-date!\n",
+      "[nltk_data] Downloading package stopwords to /home/onyxia/nltk_data...\n",
+      "[nltk_data]   Package stopwords is already up-to-date!\n",
+      "[nltk_data] Downloading package wordnet to /home/onyxia/nltk_data...\n",
+      "[nltk_data]   Package wordnet is already up-to-date!\n"
+     ]
+    },
+    {
+     "data": {
+      "text/plain": [
+       "True"
+      ]
+     },
+     "execution_count": 14,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "# Catégorisation des target_name\n",
+    "import pandas as pd\n",
+    "import nltk\n",
+    "from nltk.tokenize import word_tokenize\n",
+    "from nltk.corpus import stopwords\n",
+    "from nltk.stem import WordNetLemmatizer\n",
+    "from nltk.probability import FreqDist\n",
+    "\n",
+    "# Téléchargement des ressources nécessaires\n",
+    "nltk.download('punkt')\n",
+    "nltk.download('stopwords')\n",
+    "nltk.download('wordnet')\n",
+    "\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 19,
+   "id": "8af1aeb9-ebdd-4286-a14c-3b7d801ea172",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Mots les plus fréquents:\n",
+      "consentement: 550777\n",
+      "optin: 463579\n",
+      "jeune: 155103\n",
+      "public: 155103\n",
+      "mediation: 150001\n"
+     ]
+    }
+   ],
+   "source": [
+    "# Tokenisation, stop-word removal and lemmatisation helper\n",
+    "def preprocess_text(texte):\n",
+    "    \"\"\"Turn a text into a list of lower-cased, stop-word-free, lemmatised tokens.\n",
+    "\n",
+    "    Parameters\n",
+    "    ----------\n",
+    "    texte : str or iterable of str\n",
+    "        Either one string, used as is, or several string fragments that are\n",
+    "        joined with single spaces before tokenisation.\n",
+    "\n",
+    "    Returns\n",
+    "    -------\n",
+    "    list of str\n",
+    "        Tokens that are not French stop words, after lemmatisation.\n",
+    "    \"\"\"\n",
+    "    # A plain string must not go through ' '.join(): joining a str iterates over\n",
+    "    # its characters and inserts a space between every letter, so the tokenizer\n",
+    "    # would return single characters (e.g. [e, h, a, ...]) instead of words.\n",
+    "    if isinstance(texte, str):\n",
+    "        texte_concat = texte\n",
+    "    else:\n",
+    "        texte_concat = ' '.join(texte)\n",
+    "    \n",
+    "    # Split the lower-cased text into word tokens\n",
+    "    tokens = word_tokenize(texte_concat.lower())\n",
+    "    \n",
+    "    # Drop French stop words\n",
+    "    stop_words = set(stopwords.words('french'))\n",
+    "    filtered_tokens = [word for word in tokens if word not in stop_words]\n",
+    "    \n",
+    "    # NOTE(review): WordNetLemmatizer is backed by the English WordNet, so French\n",
+    "    # tokens are presumably returned mostly unchanged - confirm whether a French\n",
+    "    # lemmatiser is wanted here.\n",
+    "    lemmatizer = WordNetLemmatizer()\n",
+    "    lemmatized_tokens = [lemmatizer.lemmatize(word) for word in filtered_tokens]\n",
+    "    \n",
+    "    return lemmatized_tokens\n",
+    "\n",
+    "\n",
+    "# Tokenise every target name and keep the token lists in a dedicated column\n",
+    "tokened_names = df1_targets_full['target_name'].map(preprocess_text)\n",
+    "df1_targets_full['target_name_tokened'] = tokened_names\n",
+    "\n",
+    "# Flatten the per-row token lists into one corpus-wide list of words\n",
+    "all_words = []\n",
+    "for row_tokens in tokened_names:\n",
+    "    all_words.extend(row_tokens)\n",
+    "\n",
+    "# Count how many times each word occurs in the corpus\n",
+    "freq_dist = FreqDist(all_words)\n",
+    "\n",
+    "\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 22,
+   "id": "ceb069e5-76c9-46e4-9ea7-8c16eb4ed3cd",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Mots les plus fréquents:\n",
+      "consentement: 550777\n",
+      "optin: 463579\n",
+      "jeune: 155103\n",
+      "public: 155103\n",
+      "mediation: 150001\n",
+      "specialisee: 150001\n",
+      "b2c: 143432\n",
+      "optout: 97683\n",
+      "newsletter: 56022\n",
+      "(: 46084\n",
+      "): 46084\n",
+      "inscrits: 42296\n",
+      "nl: 42294\n",
+      "générale: 41037\n",
+      "generale: 40950\n"
+     ]
+    }
+   ],
+   "source": [
+    "# Show the 15 words with the highest counts, most frequent first\n",
+    "top_words = freq_dist.most_common(15)\n",
+    "print(\"Mots les plus fréquents:\")\n",
+    "for mot, freq in top_words:\n",
+    "    print(f\"{mot}: {freq}\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 18,
+   "id": "8bffef87-542e-4775-bc7c-2c0323fda581",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "                            texte  \\\n",
+      "0  Le chat noir mange une souris.   \n",
+      "1           Le chien blanc aboie.   \n",
+      "\n",
+      "                                 texte_preprocessed  \n",
+      "0  [e, h, a, o, i, r, a, g, e, u, e, o, u, r, i, .]  \n",
+      "1              [e, h, i, e, b, a, a, b, o, i, e, .]  \n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "[nltk_data] Downloading package punkt to /home/onyxia/nltk_data...\n",
+      "[nltk_data]   Package punkt is already up-to-date!\n",
+      "[nltk_data] Downloading package stopwords to /home/onyxia/nltk_data...\n",
+      "[nltk_data]   Package stopwords is already up-to-date!\n",
+      "[nltk_data] Downloading package wordnet to /home/onyxia/nltk_data...\n",
+      "[nltk_data]   Package wordnet is already up-to-date!\n"
+     ]
+    }
+   ],
+   "source": [
+    "import pandas as pd\n",
+    "import nltk\n",
+    "from nltk.tokenize import word_tokenize\n",
+    "from nltk.corpus import stopwords\n",
+    "from nltk.stem import WordNetLemmatizer\n",
+    "\n",
+    "# Téléchargement des ressources nécessaires\n",
+    "nltk.download('punkt')\n",
+    "nltk.download('stopwords')\n",
+    "nltk.download('wordnet')\n",
+    "\n",
+    "# Création de la DataFrame d'exemple\n",
+    "data = {'texte': [\"Le chat noir mange une souris.\", \"Le chien blanc aboie.\"]}\n",
+    "df = pd.DataFrame(data)\n",
+    "\n",
+    "# Text preprocessing helper\n",
+    "# NOTE(review): a function with the same name is also defined in the cell that\n",
+    "# tokenises df1_targets_full; whichever cell runs last wins - consider keeping\n",
+    "# a single definition.\n",
+    "def preprocess_text(texte):\n",
+    "    \"\"\"Turn a text into a list of lower-cased, stop-word-free, lemmatised tokens.\n",
+    "\n",
+    "    Parameters\n",
+    "    ----------\n",
+    "    texte : str or iterable of str\n",
+    "        Either one string, used as is, or several string fragments that are\n",
+    "        joined with single spaces before tokenisation.\n",
+    "\n",
+    "    Returns\n",
+    "    -------\n",
+    "    list of str\n",
+    "        Tokens that are not French stop words, after lemmatisation.\n",
+    "    \"\"\"\n",
+    "    # A plain string must not go through ' '.join(): joining a str iterates over\n",
+    "    # its characters and inserts a space between every letter, so the tokenizer\n",
+    "    # would return single characters (e.g. [e, h, a, ...]) instead of words.\n",
+    "    if isinstance(texte, str):\n",
+    "        texte_concat = texte\n",
+    "    else:\n",
+    "        texte_concat = ' '.join(texte)\n",
+    "    \n",
+    "    # Split the lower-cased text into word tokens\n",
+    "    tokens = word_tokenize(texte_concat.lower())\n",
+    "    \n",
+    "    # Drop French stop words\n",
+    "    stop_words = set(stopwords.words('french'))\n",
+    "    filtered_tokens = [word for word in tokens if word not in stop_words]\n",
+    "    \n",
+    "    # NOTE(review): WordNetLemmatizer is backed by the English WordNet, so French\n",
+    "    # tokens are presumably returned mostly unchanged - confirm whether a French\n",
+    "    # lemmatiser is wanted here.\n",
+    "    lemmatizer = WordNetLemmatizer()\n",
+    "    lemmatized_tokens = [lemmatizer.lemmatize(word) for word in filtered_tokens]\n",
+    "    \n",
+    "    return lemmatized_tokens\n",
+    "\n",
+    "# Tokenise each example sentence and store the tokens next to the raw text\n",
+    "example_tokens = df['texte'].map(preprocess_text)\n",
+    "df['texte_preprocessed'] = example_tokens\n",
+    "\n",
+    "# Print the frame to compare the raw text with its tokens\n",
+    "print(df)\n"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "2f665824-a026-4acd-8358-b408a61854b4",
-   "metadata": {},
+   "metadata": {
+    "jp-MarkdownHeadingCollapsed": true
+   },
   "source": [
    "## Campaign area"
   ]
@ -3902,9 +3877,7 @@
  {
   "cell_type": "markdown",
   "id": "96ea2523-38dc-47ef-a49e-2c2d9ad0b1c6",
-   "metadata": {
-    "jp-MarkdownHeadingCollapsed": true
-   },
+   "metadata": {},
   "source": [
    "## Exploration variables"
   ]