Merge remote-tracking branch 'origin/KPI_construction' into events_theme

This commit is contained in:
Alexis REVELLE 2024-02-08 09:42:37 +00:00
commit 07be599d5e
2 changed files with 1590 additions and 506 deletions

File diff suppressed because it is too large Load Diff

View File

@ -143,7 +143,7 @@
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": 25, "execution_count": 6,
"id": "dd6a3518-b752-4a1e-b77b-9e03e853c3ed", "id": "dd6a3518-b752-4a1e-b77b-9e03e853c3ed",
"metadata": {}, "metadata": {},
"outputs": [ "outputs": [
@ -151,7 +151,7 @@
"name": "stderr", "name": "stderr",
"output_type": "stream", "output_type": "stream",
"text": [ "text": [
"/tmp/ipykernel_683/4081512283.py:10: DtypeWarning: Columns (1) have mixed types. Specify dtype option on import or set low_memory=False.\n", "/tmp/ipykernel_15285/4081512283.py:10: DtypeWarning: Columns (1) have mixed types. Specify dtype option on import or set low_memory=False.\n",
" df = pd.read_csv(file_in)\n" " df = pd.read_csv(file_in)\n"
] ]
} }
@ -2731,7 +2731,7 @@
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": 60, "execution_count": 8,
"id": "da5d4708-7147-4cc8-8686-52d4bcba5a7a", "id": "da5d4708-7147-4cc8-8686-52d4bcba5a7a",
"metadata": {}, "metadata": {},
"outputs": [ "outputs": [
@ -2739,7 +2739,7 @@
"name": "stderr", "name": "stderr",
"output_type": "stream", "output_type": "stream",
"text": [ "text": [
"/tmp/ipykernel_619/2625134041.py:3: SettingWithCopyWarning: \n", "/tmp/ipykernel_15285/2625134041.py:3: SettingWithCopyWarning: \n",
"A value is trying to be set on a copy of a slice from a DataFrame\n", "A value is trying to be set on a copy of a slice from a DataFrame\n",
"\n", "\n",
"See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n", "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n",
@ -2795,11 +2795,9 @@
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": 57, "execution_count": 10,
"id": "8072bbb7-1360-4882-bb2b-2f43b6beea0d", "id": "c74746de-0bf4-4b83-9a75-f1d3183abf1c",
"metadata": { "metadata": {},
"scrolled": true
},
"outputs": [ "outputs": [
{ {
"data": { "data": {
@ -2831,226 +2829,42 @@
" </thead>\n", " </thead>\n",
" <tbody>\n", " <tbody>\n",
" <tr>\n", " <tr>\n",
" <th>8793</th>\n", " <th>0</th>\n",
" <td>4584599</td>\n", " <td>1184824</td>\n",
" <td>1</td>\n", " <td>645400</td>\n",
" <td>consentement optin jeune public</td>\n", " <td>DDCP PROMO Réseau livres</td>\n",
" <td>False</td>\n", " <td>False</td>\n",
" <td>manual_static_filter</td>\n", " <td>manual_static_filter</td>\n",
" </tr>\n", " </tr>\n",
" <tr>\n", " <tr>\n",
" <th>13249</th>\n", " <th>1</th>\n",
" <td>4567465</td>\n", " <td>210571</td>\n",
" <td>1</td>\n", " <td>2412</td>\n",
" <td>DDCP rentrée culturelle 2023</td>\n", " <td>DDCP PROMO Réseau livres</td>\n",
" <td>False</td>\n", " <td>False</td>\n",
" <td>manual_static_filter</td>\n", " <td>manual_static_filter</td>\n",
" </tr>\n", " </tr>\n",
" <tr>\n", " <tr>\n",
" <th>21424</th>\n", " <th>2</th>\n",
" <td>4544805</td>\n", " <td>210572</td>\n",
" <td>1</td>\n", " <td>4536</td>\n",
" <td>spectateurs cine dimanche_cine concert_2122</td>\n", " <td>DDCP PROMO Réseau livres</td>\n",
" <td>False</td>\n", " <td>False</td>\n",
" <td>manual_static_filter</td>\n", " <td>manual_static_filter</td>\n",
" </tr>\n", " </tr>\n",
" <tr>\n", " <tr>\n",
" <th>21665</th>\n", " <th>3</th>\n",
" <td>4544911</td>\n", " <td>210573</td>\n",
" <td>1</td>\n", " <td>6736</td>\n",
" <td>DDCP Cine 2023</td>\n", " <td>DDCP PROMO Réseau livres</td>\n",
" <td>False</td>\n", " <td>False</td>\n",
" <td>manual_static_filter</td>\n", " <td>manual_static_filter</td>\n",
" </tr>\n", " </tr>\n",
" <tr>\n", " <tr>\n",
" <th>22811</th>\n", " <th>4</th>\n",
" <td>4545766</td>\n", " <td>210574</td>\n",
" <td>1</td>\n", " <td>38210</td>\n",
" <td>DDCP OLBJ! 2023</td>\n", " <td>DDCP PROMO Réseau livres</td>\n",
" <td>False</td>\n",
" <td>manual_static_filter</td>\n",
" </tr>\n",
" <tr>\n",
" <th>57305</th>\n",
" <td>4457909</td>\n",
" <td>1</td>\n",
" <td>ddcp_promo_visiteurs occasionnels_musee_8mois</td>\n",
" <td>False</td>\n",
" <td>manual_dynamic_filter</td>\n",
" </tr>\n",
" <tr>\n",
" <th>58843</th>\n",
" <td>3688872</td>\n",
" <td>1</td>\n",
" <td>DDCP promo livemag</td>\n",
" <td>False</td>\n",
" <td>manual_static_filter</td>\n",
" </tr>\n",
" <tr>\n",
" <th>66813</th>\n",
" <td>4313646</td>\n",
" <td>1</td>\n",
" <td>DDCP spectateurs Classique mais pas que 2022</td>\n",
" <td>False</td>\n",
" <td>manual_static_filter</td>\n",
" </tr>\n",
" <tr>\n",
" <th>68367</th>\n",
" <td>4547662</td>\n",
" <td>1</td>\n",
" <td>ddcp_promo_musee_au moins 3 achats_dps8mois</td>\n",
" <td>False</td>\n",
" <td>manual_dynamic_filter</td>\n",
" </tr>\n",
" <tr>\n",
" <th>77320</th>\n",
" <td>4285520</td>\n",
" <td>1</td>\n",
" <td>DDCP spectateurs Iminente</td>\n",
" <td>False</td>\n",
" <td>manual_static_filter</td>\n",
" </tr>\n",
" <tr>\n",
" <th>84350</th>\n",
" <td>4037805</td>\n",
" <td>1</td>\n",
" <td>DDCP spectateurs Marseille Jazz 18-19-21</td>\n",
" <td>False</td>\n",
" <td>manual_static_filter</td>\n",
" </tr>\n",
" <tr>\n",
" <th>85383</th>\n",
" <td>4569504</td>\n",
" <td>1</td>\n",
" <td>DDCP rendez-vous de septembre offre spéciale</td>\n",
" <td>False</td>\n",
" <td>manual_static_filter</td>\n",
" </tr>\n",
" <tr>\n",
" <th>92868</th>\n",
" <td>4433064</td>\n",
" <td>1</td>\n",
" <td>ddcp_promo_plein air_ateliers_jardins</td>\n",
" <td>False</td>\n",
" <td>manual_static_filter</td>\n",
" </tr>\n",
" <tr>\n",
" <th>99670</th>\n",
" <td>3858684</td>\n",
" <td>1</td>\n",
" <td>Acid Arab</td>\n",
" <td>False</td>\n",
" <td>manual_static_filter</td>\n",
" </tr>\n",
" <tr>\n",
" <th>105477</th>\n",
" <td>4321810</td>\n",
" <td>1</td>\n",
" <td>Arenametrix_bascule tel vers sib</td>\n",
" <td>False</td>\n",
" <td>manual_static_filter</td>\n",
" </tr>\n",
" <tr>\n",
" <th>169513</th>\n",
" <td>3697992</td>\n",
" <td>1</td>\n",
" <td>ddcp_achats billets nb dps 19052021</td>\n",
" <td>False</td>\n",
" <td>manual_static_filter</td>\n",
" </tr>\n",
" <tr>\n",
" <th>214421</th>\n",
" <td>2925324</td>\n",
" <td>1</td>\n",
" <td>consentement optout scolaires</td>\n",
" <td>False</td>\n",
" <td>manual_static_filter</td>\n",
" </tr>\n",
" <tr>\n",
" <th>234546</th>\n",
" <td>4575957</td>\n",
" <td>1</td>\n",
" <td>Portrait de Leila shahid</td>\n",
" <td>False</td>\n",
" <td>manual_static_filter</td>\n",
" </tr>\n",
" <tr>\n",
" <th>259808</th>\n",
" <td>3722259</td>\n",
" <td>1</td>\n",
" <td>consentement optin b2b</td>\n",
" <td>False</td>\n",
" <td>manual_static_filter</td>\n",
" </tr>\n",
" <tr>\n",
" <th>274380</th>\n",
" <td>4510423</td>\n",
" <td>1</td>\n",
" <td>DDCP_marseille_jazz_2023</td>\n",
" <td>False</td>\n",
" <td>manual_static_filter</td>\n",
" </tr>\n",
" <tr>\n",
" <th>307511</th>\n",
" <td>5174466</td>\n",
" <td>1</td>\n",
" <td>ddcp actoral 21-22</td>\n",
" <td>False</td>\n",
" <td>manual_static_filter</td>\n",
" </tr>\n",
" <tr>\n",
" <th>357509</th>\n",
" <td>4442526</td>\n",
" <td>1</td>\n",
" <td>ddcp musique barvalo</td>\n",
" <td>False</td>\n",
" <td>manual_static_filter</td>\n",
" </tr>\n",
" <tr>\n",
" <th>392920</th>\n",
" <td>4390642</td>\n",
" <td>1</td>\n",
" <td>ddcp_md_promo_spectateurs theatre contempo</td>\n",
" <td>False</td>\n",
" <td>manual_static_filter</td>\n",
" </tr>\n",
" <tr>\n",
" <th>449620</th>\n",
" <td>4411897</td>\n",
" <td>1</td>\n",
" <td>FORMATION _ acheteurs optin last year</td>\n",
" <td>False</td>\n",
" <td>manual_dynamic_filter</td>\n",
" </tr>\n",
" <tr>\n",
" <th>503809</th>\n",
" <td>4734591</td>\n",
" <td>1</td>\n",
" <td>consentement optin mediation specialisee</td>\n",
" <td>False</td>\n",
" <td>manual_static_filter</td>\n",
" </tr>\n",
" <tr>\n",
" <th>651222</th>\n",
" <td>3554426</td>\n",
" <td>1</td>\n",
" <td>consentement optin b2c</td>\n",
" <td>False</td>\n",
" <td>manual_static_filter</td>\n",
" </tr>\n",
" <tr>\n",
" <th>654246</th>\n",
" <td>5182212</td>\n",
" <td>1</td>\n",
" <td>DDCP spectateurs Festival de Marseille 2023</td>\n",
" <td>False</td>\n",
" <td>manual_static_filter</td>\n",
" </tr>\n",
" <tr>\n",
" <th>654395</th>\n",
" <td>5182456</td>\n",
" <td>1</td>\n",
" <td>rencontres_echelle_spectateurs_2021_2023</td>\n",
" <td>False</td>\n", " <td>False</td>\n",
" <td>manual_static_filter</td>\n", " <td>manual_static_filter</td>\n",
" </tr>\n", " </tr>\n",
@ -3059,80 +2873,241 @@
"</div>" "</div>"
], ],
"text/plain": [ "text/plain": [
" id customer_id target_name \\\n", " id customer_id target_name target_type_is_import \\\n",
"8793 4584599 1 consentement optin jeune public \n", "0 1184824 645400 DDCP PROMO Réseau livres False \n",
"13249 4567465 1 DDCP rentrée culturelle 2023 \n", "1 210571 2412 DDCP PROMO Réseau livres False \n",
"21424 4544805 1 spectateurs cine dimanche_cine concert_2122 \n", "2 210572 4536 DDCP PROMO Réseau livres False \n",
"21665 4544911 1 DDCP Cine 2023 \n", "3 210573 6736 DDCP PROMO Réseau livres False \n",
"22811 4545766 1 DDCP OLBJ! 2023 \n", "4 210574 38210 DDCP PROMO Réseau livres False \n",
"57305 4457909 1 ddcp_promo_visiteurs occasionnels_musee_8mois \n",
"58843 3688872 1 DDCP promo livemag \n",
"66813 4313646 1 DDCP spectateurs Classique mais pas que 2022 \n",
"68367 4547662 1 ddcp_promo_musee_au moins 3 achats_dps8mois \n",
"77320 4285520 1 DDCP spectateurs Iminente \n",
"84350 4037805 1 DDCP spectateurs Marseille Jazz 18-19-21 \n",
"85383 4569504 1 DDCP rendez-vous de septembre offre spéciale \n",
"92868 4433064 1 ddcp_promo_plein air_ateliers_jardins \n",
"99670 3858684 1 Acid Arab \n",
"105477 4321810 1 Arenametrix_bascule tel vers sib \n",
"169513 3697992 1 ddcp_achats billets nb dps 19052021 \n",
"214421 2925324 1 consentement optout scolaires \n",
"234546 4575957 1 Portrait de Leila shahid \n",
"259808 3722259 1 consentement optin b2b \n",
"274380 4510423 1 DDCP_marseille_jazz_2023 \n",
"307511 5174466 1 ddcp actoral 21-22 \n",
"357509 4442526 1 ddcp musique barvalo \n",
"392920 4390642 1 ddcp_md_promo_spectateurs theatre contempo \n",
"449620 4411897 1 FORMATION _ acheteurs optin last year \n",
"503809 4734591 1 consentement optin mediation specialisee \n",
"651222 3554426 1 consentement optin b2c \n",
"654246 5182212 1 DDCP spectateurs Festival de Marseille 2023 \n",
"654395 5182456 1 rencontres_echelle_spectateurs_2021_2023 \n",
"\n", "\n",
" target_type_is_import target_type_name \n", " target_type_name \n",
"8793 False manual_static_filter \n", "0 manual_static_filter \n",
"13249 False manual_static_filter \n", "1 manual_static_filter \n",
"21424 False manual_static_filter \n", "2 manual_static_filter \n",
"21665 False manual_static_filter \n", "3 manual_static_filter \n",
"22811 False manual_static_filter \n", "4 manual_static_filter "
"57305 False manual_dynamic_filter \n",
"58843 False manual_static_filter \n",
"66813 False manual_static_filter \n",
"68367 False manual_dynamic_filter \n",
"77320 False manual_static_filter \n",
"84350 False manual_static_filter \n",
"85383 False manual_static_filter \n",
"92868 False manual_static_filter \n",
"99670 False manual_static_filter \n",
"105477 False manual_static_filter \n",
"169513 False manual_static_filter \n",
"214421 False manual_static_filter \n",
"234546 False manual_static_filter \n",
"259808 False manual_static_filter \n",
"274380 False manual_static_filter \n",
"307511 False manual_static_filter \n",
"357509 False manual_static_filter \n",
"392920 False manual_static_filter \n",
"449620 False manual_dynamic_filter \n",
"503809 False manual_static_filter \n",
"651222 False manual_static_filter \n",
"654246 False manual_static_filter \n",
"654395 False manual_static_filter "
] ]
}, },
"execution_count": 57, "execution_count": 10,
"metadata": {}, "metadata": {},
"output_type": "execute_result" "output_type": "execute_result"
} }
], ],
"source": [ "source": [
"df1_targets_full[df1_targets_full['customer_id'] == 1]" "df1_targets_full.head()"
]
},
{
"cell_type": "code",
"execution_count": 14,
"id": "47c55fa0-b2f3-46f9-9abf-c4ab66bd9fcb",
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"[nltk_data] Downloading package punkt to /home/onyxia/nltk_data...\n",
"[nltk_data] Package punkt is already up-to-date!\n",
"[nltk_data] Downloading package stopwords to /home/onyxia/nltk_data...\n",
"[nltk_data] Package stopwords is already up-to-date!\n",
"[nltk_data] Downloading package wordnet to /home/onyxia/nltk_data...\n",
"[nltk_data] Package wordnet is already up-to-date!\n"
]
},
{
"data": {
"text/plain": [
"True"
]
},
"execution_count": 14,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Catégorisation des target_name\n",
"import pandas as pd\n",
"import nltk\n",
"from nltk.tokenize import word_tokenize\n",
"from nltk.corpus import stopwords\n",
"from nltk.stem import WordNetLemmatizer\n",
"from nltk.probability import FreqDist\n",
"\n",
"# Téléchargement des ressources nécessaires\n",
"nltk.download('punkt')\n",
"nltk.download('stopwords')\n",
"nltk.download('wordnet')\n",
"\n"
]
},
{
"cell_type": "code",
"execution_count": 19,
"id": "8af1aeb9-ebdd-4286-a14c-3b7d801ea172",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Mots les plus fréquents:\n",
"consentement: 550777\n",
"optin: 463579\n",
"jeune: 155103\n",
"public: 155103\n",
"mediation: 150001\n"
]
}
],
"source": [
"# Définition des fonctions de tokenisation, suppression des mots vides et lemmatisation\n",
"def preprocess_text(texte):\n",
"    \"\"\"Tokenize a text, drop French stopwords and lemmatize the remaining tokens.\n",
"\n",
"    Args:\n",
"        texte: a single string, or an iterable of strings that is joined with\n",
"            spaces before tokenization. None/NaN is treated as empty text.\n",
"\n",
"    Returns:\n",
"        list[str]: the lowercased, stopword-filtered, lemmatized tokens.\n",
"    \"\"\"\n",
"    # Missing values (None, or the float NaN pandas uses) yield no tokens\n",
"    if texte is None or isinstance(texte, float):\n",
"        return []\n",
"\n",
"    # A plain string must be used as-is: ' '.join() on a str iterates over its\n",
"    # characters and would turn every word into single-letter tokens\n",
"    if isinstance(texte, str):\n",
"        texte_concat = texte\n",
"    else:\n",
"        texte_concat = ' '.join(texte)\n",
"\n",
"    # Word tokenization on the lowercased text\n",
"    tokens = word_tokenize(texte_concat.lower())\n",
"\n",
"    # Stopword removal\n",
"    stop_words = set(stopwords.words('french'))\n",
"    filtered_tokens = [word for word in tokens if word not in stop_words]\n",
"\n",
"    # Lemmatization\n",
"    # NOTE(review): WordNetLemmatizer is backed by the English WordNet, so it\n",
"    # leaves most French words unchanged -- confirm this is intended\n",
"    lemmatizer = WordNetLemmatizer()\n",
"    lemmatized_tokens = [lemmatizer.lemmatize(word) for word in filtered_tokens]\n",
"\n",
"    return lemmatized_tokens\n",
"\n",
"\n",
"# Appliquer le prétraitement à la colonne de texte\n",
"df1_targets_full['target_name_tokened'] = df1_targets_full['target_name'].apply(preprocess_text)\n",
"\n",
"# Concaténer les listes de mots pour obtenir une liste de tous les mots dans le corpus\n",
"all_words = [word for tokens in df1_targets_full['target_name_tokened'] for word in tokens]\n",
"\n",
"# Calculer la fréquence des mots\n",
"freq_dist = FreqDist(all_words)\n",
"\n",
"\n"
]
},
{
"cell_type": "code",
"execution_count": 22,
"id": "ceb069e5-76c9-46e4-9ea7-8c16eb4ed3cd",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Mots les plus fréquents:\n",
"consentement: 550777\n",
"optin: 463579\n",
"jeune: 155103\n",
"public: 155103\n",
"mediation: 150001\n",
"specialisee: 150001\n",
"b2c: 143432\n",
"optout: 97683\n",
"newsletter: 56022\n",
"(: 46084\n",
"): 46084\n",
"inscrits: 42296\n",
"nl: 42294\n",
"générale: 41037\n",
"generale: 40950\n"
]
}
],
"source": [
"# Affichage des mots les plus fréquents\n",
"print(\"Mots les plus fréquents:\")\n",
"for mot, freq in freq_dist.most_common(15):\n",
" print(f\"{mot}: {freq}\")"
]
},
{
"cell_type": "code",
"execution_count": 18,
"id": "8bffef87-542e-4775-bc7c-2c0323fda581",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
" texte \\\n",
"0 Le chat noir mange une souris. \n",
"1 Le chien blanc aboie. \n",
"\n",
" texte_preprocessed \n",
"0 [e, h, a, o, i, r, a, g, e, u, e, o, u, r, i, .] \n",
"1 [e, h, i, e, b, a, a, b, o, i, e, .] \n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"[nltk_data] Downloading package punkt to /home/onyxia/nltk_data...\n",
"[nltk_data] Package punkt is already up-to-date!\n",
"[nltk_data] Downloading package stopwords to /home/onyxia/nltk_data...\n",
"[nltk_data] Package stopwords is already up-to-date!\n",
"[nltk_data] Downloading package wordnet to /home/onyxia/nltk_data...\n",
"[nltk_data] Package wordnet is already up-to-date!\n"
]
}
],
"source": [
"import pandas as pd\n",
"import nltk\n",
"from nltk.tokenize import word_tokenize\n",
"from nltk.corpus import stopwords\n",
"from nltk.stem import WordNetLemmatizer\n",
"\n",
"# Téléchargement des ressources nécessaires\n",
"nltk.download('punkt')\n",
"nltk.download('stopwords')\n",
"nltk.download('wordnet')\n",
"\n",
"# Création de la DataFrame d'exemple\n",
"data = {'texte': [\"Le chat noir mange une souris.\", \"Le chien blanc aboie.\"]}\n",
"df = pd.DataFrame(data)\n",
"\n",
"# Fonction pour prétraiter le texte\n",
"def preprocess_text(texte):\n",
"    \"\"\"Tokenize a text, drop French stopwords and lemmatize the remaining tokens.\n",
"\n",
"    Args:\n",
"        texte: a single string, or an iterable of strings that is joined with\n",
"            spaces before tokenization. None/NaN is treated as empty text.\n",
"\n",
"    Returns:\n",
"        list[str]: the lowercased, stopword-filtered, lemmatized tokens.\n",
"    \"\"\"\n",
"    # Missing values (None, or the float NaN pandas uses) yield no tokens\n",
"    if texte is None or isinstance(texte, float):\n",
"        return []\n",
"\n",
"    # A plain string must be used as-is: ' '.join() on a str iterates over its\n",
"    # characters and would turn every word into single-letter tokens\n",
"    if isinstance(texte, str):\n",
"        texte_concat = texte\n",
"    else:\n",
"        texte_concat = ' '.join(texte)\n",
"\n",
"    # Word tokenization on the lowercased text\n",
"    tokens = word_tokenize(texte_concat.lower())\n",
"\n",
"    # Stopword removal\n",
"    stop_words = set(stopwords.words('french'))\n",
"    filtered_tokens = [word for word in tokens if word not in stop_words]\n",
"\n",
"    # Lemmatization\n",
"    # NOTE(review): WordNetLemmatizer is backed by the English WordNet, so it\n",
"    # leaves most French words unchanged -- confirm this is intended\n",
"    lemmatizer = WordNetLemmatizer()\n",
"    lemmatized_tokens = [lemmatizer.lemmatize(word) for word in filtered_tokens]\n",
"\n",
"    return lemmatized_tokens\n",
"\n",
"# Appliquer la fonction de prétraitement à la colonne de texte\n",
"df['texte_preprocessed'] = df['texte'].apply(preprocess_text)\n",
"\n",
"# Afficher le résultat\n",
"print(df)\n"
] ]
}, },
{ {
"cell_type": "markdown", "cell_type": "markdown",
"id": "2f665824-a026-4acd-8358-b408a61854b4", "id": "2f665824-a026-4acd-8358-b408a61854b4",
"metadata": {}, "metadata": {
"jp-MarkdownHeadingCollapsed": true
},
"source": [ "source": [
"## Campaign area" "## Campaign area"
] ]
@ -3902,9 +3877,7 @@
{ {
"cell_type": "markdown", "cell_type": "markdown",
"id": "96ea2523-38dc-47ef-a49e-2c2d9ad0b1c6", "id": "96ea2523-38dc-47ef-a49e-2c2d9ad0b1c6",
"metadata": { "metadata": {},
"jp-MarkdownHeadingCollapsed": true
},
"source": [ "source": [
"## Exploration variables" "## Exploration variables"
] ]