Merge remote-tracking branch 'origin/KPI_construction' into events_theme

This commit is contained in:
Alexis REVELLE 2024-02-08 09:42:37 +00:00
commit 07be599d5e
2 changed files with 1590 additions and 506 deletions

File diff suppressed because it is too large Load Diff

View File

@ -143,7 +143,7 @@
},
{
"cell_type": "code",
"execution_count": 25,
"execution_count": 6,
"id": "dd6a3518-b752-4a1e-b77b-9e03e853c3ed",
"metadata": {},
"outputs": [
@ -151,7 +151,7 @@
"name": "stderr",
"output_type": "stream",
"text": [
"/tmp/ipykernel_683/4081512283.py:10: DtypeWarning: Columns (1) have mixed types. Specify dtype option on import or set low_memory=False.\n",
"/tmp/ipykernel_15285/4081512283.py:10: DtypeWarning: Columns (1) have mixed types. Specify dtype option on import or set low_memory=False.\n",
" df = pd.read_csv(file_in)\n"
]
}
@ -2731,7 +2731,7 @@
},
{
"cell_type": "code",
"execution_count": 60,
"execution_count": 8,
"id": "da5d4708-7147-4cc8-8686-52d4bcba5a7a",
"metadata": {},
"outputs": [
@ -2739,7 +2739,7 @@
"name": "stderr",
"output_type": "stream",
"text": [
"/tmp/ipykernel_619/2625134041.py:3: SettingWithCopyWarning: \n",
"/tmp/ipykernel_15285/2625134041.py:3: SettingWithCopyWarning: \n",
"A value is trying to be set on a copy of a slice from a DataFrame\n",
"\n",
"See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n",
@ -2795,11 +2795,9 @@
},
{
"cell_type": "code",
"execution_count": 57,
"id": "8072bbb7-1360-4882-bb2b-2f43b6beea0d",
"metadata": {
"scrolled": true
},
"execution_count": 10,
"id": "c74746de-0bf4-4b83-9a75-f1d3183abf1c",
"metadata": {},
"outputs": [
{
"data": {
@ -2831,226 +2829,42 @@
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>8793</th>\n",
" <td>4584599</td>\n",
" <td>1</td>\n",
" <td>consentement optin jeune public</td>\n",
" <th>0</th>\n",
" <td>1184824</td>\n",
" <td>645400</td>\n",
" <td>DDCP PROMO Réseau livres</td>\n",
" <td>False</td>\n",
" <td>manual_static_filter</td>\n",
" </tr>\n",
" <tr>\n",
" <th>13249</th>\n",
" <td>4567465</td>\n",
" <td>1</td>\n",
" <td>DDCP rentrée culturelle 2023</td>\n",
" <th>1</th>\n",
" <td>210571</td>\n",
" <td>2412</td>\n",
" <td>DDCP PROMO Réseau livres</td>\n",
" <td>False</td>\n",
" <td>manual_static_filter</td>\n",
" </tr>\n",
" <tr>\n",
" <th>21424</th>\n",
" <td>4544805</td>\n",
" <td>1</td>\n",
" <td>spectateurs cine dimanche_cine concert_2122</td>\n",
" <th>2</th>\n",
" <td>210572</td>\n",
" <td>4536</td>\n",
" <td>DDCP PROMO Réseau livres</td>\n",
" <td>False</td>\n",
" <td>manual_static_filter</td>\n",
" </tr>\n",
" <tr>\n",
" <th>21665</th>\n",
" <td>4544911</td>\n",
" <td>1</td>\n",
" <td>DDCP Cine 2023</td>\n",
" <th>3</th>\n",
" <td>210573</td>\n",
" <td>6736</td>\n",
" <td>DDCP PROMO Réseau livres</td>\n",
" <td>False</td>\n",
" <td>manual_static_filter</td>\n",
" </tr>\n",
" <tr>\n",
" <th>22811</th>\n",
" <td>4545766</td>\n",
" <td>1</td>\n",
" <td>DDCP OLBJ! 2023</td>\n",
" <td>False</td>\n",
" <td>manual_static_filter</td>\n",
" </tr>\n",
" <tr>\n",
" <th>57305</th>\n",
" <td>4457909</td>\n",
" <td>1</td>\n",
" <td>ddcp_promo_visiteurs occasionnels_musee_8mois</td>\n",
" <td>False</td>\n",
" <td>manual_dynamic_filter</td>\n",
" </tr>\n",
" <tr>\n",
" <th>58843</th>\n",
" <td>3688872</td>\n",
" <td>1</td>\n",
" <td>DDCP promo livemag</td>\n",
" <td>False</td>\n",
" <td>manual_static_filter</td>\n",
" </tr>\n",
" <tr>\n",
" <th>66813</th>\n",
" <td>4313646</td>\n",
" <td>1</td>\n",
" <td>DDCP spectateurs Classique mais pas que 2022</td>\n",
" <td>False</td>\n",
" <td>manual_static_filter</td>\n",
" </tr>\n",
" <tr>\n",
" <th>68367</th>\n",
" <td>4547662</td>\n",
" <td>1</td>\n",
" <td>ddcp_promo_musee_au moins 3 achats_dps8mois</td>\n",
" <td>False</td>\n",
" <td>manual_dynamic_filter</td>\n",
" </tr>\n",
" <tr>\n",
" <th>77320</th>\n",
" <td>4285520</td>\n",
" <td>1</td>\n",
" <td>DDCP spectateurs Iminente</td>\n",
" <td>False</td>\n",
" <td>manual_static_filter</td>\n",
" </tr>\n",
" <tr>\n",
" <th>84350</th>\n",
" <td>4037805</td>\n",
" <td>1</td>\n",
" <td>DDCP spectateurs Marseille Jazz 18-19-21</td>\n",
" <td>False</td>\n",
" <td>manual_static_filter</td>\n",
" </tr>\n",
" <tr>\n",
" <th>85383</th>\n",
" <td>4569504</td>\n",
" <td>1</td>\n",
" <td>DDCP rendez-vous de septembre offre spéciale</td>\n",
" <td>False</td>\n",
" <td>manual_static_filter</td>\n",
" </tr>\n",
" <tr>\n",
" <th>92868</th>\n",
" <td>4433064</td>\n",
" <td>1</td>\n",
" <td>ddcp_promo_plein air_ateliers_jardins</td>\n",
" <td>False</td>\n",
" <td>manual_static_filter</td>\n",
" </tr>\n",
" <tr>\n",
" <th>99670</th>\n",
" <td>3858684</td>\n",
" <td>1</td>\n",
" <td>Acid Arab</td>\n",
" <td>False</td>\n",
" <td>manual_static_filter</td>\n",
" </tr>\n",
" <tr>\n",
" <th>105477</th>\n",
" <td>4321810</td>\n",
" <td>1</td>\n",
" <td>Arenametrix_bascule tel vers sib</td>\n",
" <td>False</td>\n",
" <td>manual_static_filter</td>\n",
" </tr>\n",
" <tr>\n",
" <th>169513</th>\n",
" <td>3697992</td>\n",
" <td>1</td>\n",
" <td>ddcp_achats billets nb dps 19052021</td>\n",
" <td>False</td>\n",
" <td>manual_static_filter</td>\n",
" </tr>\n",
" <tr>\n",
" <th>214421</th>\n",
" <td>2925324</td>\n",
" <td>1</td>\n",
" <td>consentement optout scolaires</td>\n",
" <td>False</td>\n",
" <td>manual_static_filter</td>\n",
" </tr>\n",
" <tr>\n",
" <th>234546</th>\n",
" <td>4575957</td>\n",
" <td>1</td>\n",
" <td>Portrait de Leila shahid</td>\n",
" <td>False</td>\n",
" <td>manual_static_filter</td>\n",
" </tr>\n",
" <tr>\n",
" <th>259808</th>\n",
" <td>3722259</td>\n",
" <td>1</td>\n",
" <td>consentement optin b2b</td>\n",
" <td>False</td>\n",
" <td>manual_static_filter</td>\n",
" </tr>\n",
" <tr>\n",
" <th>274380</th>\n",
" <td>4510423</td>\n",
" <td>1</td>\n",
" <td>DDCP_marseille_jazz_2023</td>\n",
" <td>False</td>\n",
" <td>manual_static_filter</td>\n",
" </tr>\n",
" <tr>\n",
" <th>307511</th>\n",
" <td>5174466</td>\n",
" <td>1</td>\n",
" <td>ddcp actoral 21-22</td>\n",
" <td>False</td>\n",
" <td>manual_static_filter</td>\n",
" </tr>\n",
" <tr>\n",
" <th>357509</th>\n",
" <td>4442526</td>\n",
" <td>1</td>\n",
" <td>ddcp musique barvalo</td>\n",
" <td>False</td>\n",
" <td>manual_static_filter</td>\n",
" </tr>\n",
" <tr>\n",
" <th>392920</th>\n",
" <td>4390642</td>\n",
" <td>1</td>\n",
" <td>ddcp_md_promo_spectateurs theatre contempo</td>\n",
" <td>False</td>\n",
" <td>manual_static_filter</td>\n",
" </tr>\n",
" <tr>\n",
" <th>449620</th>\n",
" <td>4411897</td>\n",
" <td>1</td>\n",
" <td>FORMATION _ acheteurs optin last year</td>\n",
" <td>False</td>\n",
" <td>manual_dynamic_filter</td>\n",
" </tr>\n",
" <tr>\n",
" <th>503809</th>\n",
" <td>4734591</td>\n",
" <td>1</td>\n",
" <td>consentement optin mediation specialisee</td>\n",
" <td>False</td>\n",
" <td>manual_static_filter</td>\n",
" </tr>\n",
" <tr>\n",
" <th>651222</th>\n",
" <td>3554426</td>\n",
" <td>1</td>\n",
" <td>consentement optin b2c</td>\n",
" <td>False</td>\n",
" <td>manual_static_filter</td>\n",
" </tr>\n",
" <tr>\n",
" <th>654246</th>\n",
" <td>5182212</td>\n",
" <td>1</td>\n",
" <td>DDCP spectateurs Festival de Marseille 2023</td>\n",
" <td>False</td>\n",
" <td>manual_static_filter</td>\n",
" </tr>\n",
" <tr>\n",
" <th>654395</th>\n",
" <td>5182456</td>\n",
" <td>1</td>\n",
" <td>rencontres_echelle_spectateurs_2021_2023</td>\n",
" <th>4</th>\n",
" <td>210574</td>\n",
" <td>38210</td>\n",
" <td>DDCP PROMO Réseau livres</td>\n",
" <td>False</td>\n",
" <td>manual_static_filter</td>\n",
" </tr>\n",
@ -3059,80 +2873,241 @@
"</div>"
],
"text/plain": [
" id customer_id target_name \\\n",
"8793 4584599 1 consentement optin jeune public \n",
"13249 4567465 1 DDCP rentrée culturelle 2023 \n",
"21424 4544805 1 spectateurs cine dimanche_cine concert_2122 \n",
"21665 4544911 1 DDCP Cine 2023 \n",
"22811 4545766 1 DDCP OLBJ! 2023 \n",
"57305 4457909 1 ddcp_promo_visiteurs occasionnels_musee_8mois \n",
"58843 3688872 1 DDCP promo livemag \n",
"66813 4313646 1 DDCP spectateurs Classique mais pas que 2022 \n",
"68367 4547662 1 ddcp_promo_musee_au moins 3 achats_dps8mois \n",
"77320 4285520 1 DDCP spectateurs Iminente \n",
"84350 4037805 1 DDCP spectateurs Marseille Jazz 18-19-21 \n",
"85383 4569504 1 DDCP rendez-vous de septembre offre spéciale \n",
"92868 4433064 1 ddcp_promo_plein air_ateliers_jardins \n",
"99670 3858684 1 Acid Arab \n",
"105477 4321810 1 Arenametrix_bascule tel vers sib \n",
"169513 3697992 1 ddcp_achats billets nb dps 19052021 \n",
"214421 2925324 1 consentement optout scolaires \n",
"234546 4575957 1 Portrait de Leila shahid \n",
"259808 3722259 1 consentement optin b2b \n",
"274380 4510423 1 DDCP_marseille_jazz_2023 \n",
"307511 5174466 1 ddcp actoral 21-22 \n",
"357509 4442526 1 ddcp musique barvalo \n",
"392920 4390642 1 ddcp_md_promo_spectateurs theatre contempo \n",
"449620 4411897 1 FORMATION _ acheteurs optin last year \n",
"503809 4734591 1 consentement optin mediation specialisee \n",
"651222 3554426 1 consentement optin b2c \n",
"654246 5182212 1 DDCP spectateurs Festival de Marseille 2023 \n",
"654395 5182456 1 rencontres_echelle_spectateurs_2021_2023 \n",
" id customer_id target_name target_type_is_import \\\n",
"0 1184824 645400 DDCP PROMO Réseau livres False \n",
"1 210571 2412 DDCP PROMO Réseau livres False \n",
"2 210572 4536 DDCP PROMO Réseau livres False \n",
"3 210573 6736 DDCP PROMO Réseau livres False \n",
"4 210574 38210 DDCP PROMO Réseau livres False \n",
"\n",
" target_type_is_import target_type_name \n",
"8793 False manual_static_filter \n",
"13249 False manual_static_filter \n",
"21424 False manual_static_filter \n",
"21665 False manual_static_filter \n",
"22811 False manual_static_filter \n",
"57305 False manual_dynamic_filter \n",
"58843 False manual_static_filter \n",
"66813 False manual_static_filter \n",
"68367 False manual_dynamic_filter \n",
"77320 False manual_static_filter \n",
"84350 False manual_static_filter \n",
"85383 False manual_static_filter \n",
"92868 False manual_static_filter \n",
"99670 False manual_static_filter \n",
"105477 False manual_static_filter \n",
"169513 False manual_static_filter \n",
"214421 False manual_static_filter \n",
"234546 False manual_static_filter \n",
"259808 False manual_static_filter \n",
"274380 False manual_static_filter \n",
"307511 False manual_static_filter \n",
"357509 False manual_static_filter \n",
"392920 False manual_static_filter \n",
"449620 False manual_dynamic_filter \n",
"503809 False manual_static_filter \n",
"651222 False manual_static_filter \n",
"654246 False manual_static_filter \n",
"654395 False manual_static_filter "
" target_type_name \n",
"0 manual_static_filter \n",
"1 manual_static_filter \n",
"2 manual_static_filter \n",
"3 manual_static_filter \n",
"4 manual_static_filter "
]
},
"execution_count": 57,
"execution_count": 10,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df1_targets_full[df1_targets_full['customer_id'] == 1]"
"df1_targets_full.head()"
]
},
{
"cell_type": "code",
"execution_count": 14,
"id": "47c55fa0-b2f3-46f9-9abf-c4ab66bd9fcb",
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"[nltk_data] Downloading package punkt to /home/onyxia/nltk_data...\n",
"[nltk_data] Package punkt is already up-to-date!\n",
"[nltk_data] Downloading package stopwords to /home/onyxia/nltk_data...\n",
"[nltk_data] Package stopwords is already up-to-date!\n",
"[nltk_data] Downloading package wordnet to /home/onyxia/nltk_data...\n",
"[nltk_data] Package wordnet is already up-to-date!\n"
]
},
{
"data": {
"text/plain": [
"True"
]
},
"execution_count": 14,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Catégorisation des target_name\n",
"import pandas as pd\n",
"import nltk\n",
"from nltk.tokenize import word_tokenize\n",
"from nltk.corpus import stopwords\n",
"from nltk.stem import WordNetLemmatizer\n",
"from nltk.probability import FreqDist\n",
"\n",
"# Fetch the NLTK resources this notebook relies on.\n",
"# quiet=True keeps the '[nltk_data] ...' progress lines out of the saved cell output.\n",
"for nltk_resource in ('punkt', 'stopwords', 'wordnet'):\n",
"    nltk.download(nltk_resource, quiet=True)\n",
"\n"
]
},
{
"cell_type": "code",
"execution_count": 19,
"id": "8af1aeb9-ebdd-4286-a14c-3b7d801ea172",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Mots les plus fréquents:\n",
"consentement: 550777\n",
"optin: 463579\n",
"jeune: 155103\n",
"public: 155103\n",
"mediation: 150001\n"
]
}
],
"source": [
"# Text preprocessing helper: tokenisation, stop-word removal and lemmatisation\n",
"def preprocess_text(texte):\n",
"    \"\"\"Turn a text into a list of lower-cased, stop-word-free, lemmatised tokens.\n",
"\n",
"    Parameters\n",
"    ----------\n",
"    texte : str or iterable of str\n",
"        A single string is tokenised as is; any other iterable of strings is\n",
"        first joined with single spaces into one string.\n",
"\n",
"    Returns\n",
"    -------\n",
"    list of str\n",
"        Tokens left after French stop-word filtering and lemmatisation.\n",
"    \"\"\"\n",
"    # A str is itself an iterable of characters: ' '.join('chat') gives 'c h a t',\n",
"    # which turned every token into a single letter. Only join real collections.\n",
"    if isinstance(texte, str):\n",
"        texte_concat = texte\n",
"    else:\n",
"        texte_concat = ' '.join(texte)\n",
"\n",
"    # Word tokenisation on the lower-cased text\n",
"    tokens = word_tokenize(texte_concat.lower())\n",
"\n",
"    # Drop French stop words\n",
"    stop_words = set(stopwords.words('french'))\n",
"    filtered_tokens = [word for word in tokens if word not in stop_words]\n",
"\n",
"    # Lemmatise the remaining tokens\n",
"    # NOTE(review): WordNetLemmatizer is backed by the English WordNet - confirm it is\n",
"    # the intended lemmatiser for French labels.\n",
"    lemmatizer = WordNetLemmatizer()\n",
"    lemmatized_tokens = [lemmatizer.lemmatize(word) for word in filtered_tokens]\n",
"\n",
"    return lemmatized_tokens\n",
"\n",
"\n",
"# Tokenise every target name\n",
"df1_targets_full['target_name_tokened'] = df1_targets_full['target_name'].apply(preprocess_text)\n",
"\n",
"# Flatten the per-row token lists into one list covering the whole corpus\n",
"all_words = [word for tokens in df1_targets_full['target_name_tokened'] for word in tokens]\n",
"\n",
"# Count how often each token occurs\n",
"freq_dist = FreqDist(all_words)\n",
"\n",
"\n"
]
},
{
"cell_type": "code",
"execution_count": 22,
"id": "ceb069e5-76c9-46e4-9ea7-8c16eb4ed3cd",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Mots les plus fréquents:\n",
"consentement: 550777\n",
"optin: 463579\n",
"jeune: 155103\n",
"public: 155103\n",
"mediation: 150001\n",
"specialisee: 150001\n",
"b2c: 143432\n",
"optout: 97683\n",
"newsletter: 56022\n",
"(: 46084\n",
"): 46084\n",
"inscrits: 42296\n",
"nl: 42294\n",
"générale: 41037\n",
"generale: 40950\n"
]
}
],
"source": [
"# Print a header line followed by the 15 most frequent tokens, one \"token: count\" per line\n",
"print(\n",
"    \"Mots les plus fréquents:\",\n",
"    *(f\"{mot}: {freq}\" for mot, freq in freq_dist.most_common(15)),\n",
"    sep=\"\\n\",\n",
")"
]
},
{
"cell_type": "code",
"execution_count": 18,
"id": "8bffef87-542e-4775-bc7c-2c0323fda581",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
" texte \\\n",
"0 Le chat noir mange une souris. \n",
"1 Le chien blanc aboie. \n",
"\n",
" texte_preprocessed \n",
"0 [e, h, a, o, i, r, a, g, e, u, e, o, u, r, i, .] \n",
"1 [e, h, i, e, b, a, a, b, o, i, e, .] \n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"[nltk_data] Downloading package punkt to /home/onyxia/nltk_data...\n",
"[nltk_data] Package punkt is already up-to-date!\n",
"[nltk_data] Downloading package stopwords to /home/onyxia/nltk_data...\n",
"[nltk_data] Package stopwords is already up-to-date!\n",
"[nltk_data] Downloading package wordnet to /home/onyxia/nltk_data...\n",
"[nltk_data] Package wordnet is already up-to-date!\n"
]
}
],
"source": [
"import pandas as pd\n",
"import nltk\n",
"from nltk.tokenize import word_tokenize\n",
"from nltk.corpus import stopwords\n",
"from nltk.stem import WordNetLemmatizer\n",
"\n",
"# Fetch the NLTK resources this cell relies on.\n",
"# quiet=True keeps the '[nltk_data] ...' progress lines out of the saved cell output.\n",
"for nltk_resource in ('punkt', 'stopwords', 'wordnet'):\n",
"    nltk.download(nltk_resource, quiet=True)\n",
"\n",
"# Build a small example DataFrame\n",
"data = {'texte': [\"Le chat noir mange une souris.\", \"Le chien blanc aboie.\"]}\n",
"df = pd.DataFrame(data)\n",
"\n",
"# Text preprocessing helper\n",
"def preprocess_text(texte):\n",
"    \"\"\"Turn a text into a list of lower-cased, stop-word-free, lemmatised tokens.\n",
"\n",
"    Parameters\n",
"    ----------\n",
"    texte : str or iterable of str\n",
"        A single string is tokenised as is; any other iterable of strings is\n",
"        first joined with single spaces into one string.\n",
"\n",
"    Returns\n",
"    -------\n",
"    list of str\n",
"        Tokens left after French stop-word filtering and lemmatisation.\n",
"    \"\"\"\n",
"    # A str is itself an iterable of characters: ' '.join('chat') gives 'c h a t',\n",
"    # which turned every token into a single letter. Only join real collections.\n",
"    if isinstance(texte, str):\n",
"        texte_concat = texte\n",
"    else:\n",
"        texte_concat = ' '.join(texte)\n",
"\n",
"    # Word tokenisation on the lower-cased text\n",
"    tokens = word_tokenize(texte_concat.lower())\n",
"\n",
"    # Drop French stop words\n",
"    stop_words = set(stopwords.words('french'))\n",
"    filtered_tokens = [word for word in tokens if word not in stop_words]\n",
"\n",
"    # Lemmatise the remaining tokens\n",
"    # NOTE(review): WordNetLemmatizer is backed by the English WordNet - confirm it is\n",
"    # the intended lemmatiser for French text.\n",
"    lemmatizer = WordNetLemmatizer()\n",
"    lemmatized_tokens = [lemmatizer.lemmatize(word) for word in filtered_tokens]\n",
"\n",
"    return lemmatized_tokens\n",
"\n",
"# Apply the preprocessing to the text column\n",
"df['texte_preprocessed'] = df['texte'].apply(preprocess_text)\n",
"\n",
"# Show the result\n",
"print(df)\n"
]
},
{
"cell_type": "markdown",
"id": "2f665824-a026-4acd-8358-b408a61854b4",
"metadata": {},
"metadata": {
"jp-MarkdownHeadingCollapsed": true
},
"source": [
"## Campaign area"
]
@ -3902,9 +3877,7 @@
{
"cell_type": "markdown",
"id": "96ea2523-38dc-47ef-a49e-2c2d9ad0b1c6",
"metadata": {
"jp-MarkdownHeadingCollapsed": true
},
"metadata": {},
"source": [
"## Exploration variables"
]