1296 lines
34 KiB
Plaintext
1296 lines
34 KiB
Plaintext
{
|
|
"cells": [
|
|
{
|
|
"cell_type": "markdown",
|
|
"id": "5bf5c226",
|
|
"metadata": {},
|
|
"source": [
|
|
"# Business Data Challenge - Team 1"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 1,
|
|
"id": "b1a5b9d3",
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"import pandas as pd\n",
|
|
"import numpy as np\n",
|
|
"import os\n",
|
|
"import s3fs\n",
|
|
"import re"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "markdown",
|
|
"id": "ecfa2219",
|
|
"metadata": {},
|
|
"source": [
|
|
"Configuration de l'accès aux données"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 2,
|
|
"id": "1a094277",
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"# Create filesystem object\n",
|
|
"S3_ENDPOINT_URL = \"https://\" + os.environ[\"AWS_S3_ENDPOINT\"]\n",
|
|
"fs = s3fs.S3FileSystem(client_kwargs={'endpoint_url': S3_ENDPOINT_URL})"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "markdown",
|
|
"id": "c437eaec",
|
|
"metadata": {},
|
|
"source": [
|
|
"# Exemple sur Company 1"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "markdown",
|
|
"id": "a1c1fc39",
|
|
"metadata": {},
|
|
"source": [
|
|
"## Chargement données"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 3,
|
|
"id": "66f8c17b",
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"BUCKET = \"bdc2324-data/1\"\n",
|
|
"liste_database = fs.ls(BUCKET)"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 5,
|
|
"id": "c08e6798",
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"liste_database_select = ['suppliers', 'ticket', 'purchase', 'consumption', 'type_ofs']\n",
|
|
"\n",
|
|
"# Filtrer la liste pour les éléments contenant au moins un élément de la liste à tester\n",
|
|
"liste_database_filtered = [element for element in liste_database if any(element_part in element for element_part in liste_database_select)]\n",
|
|
"\n",
|
|
"# Afficher le résultat\n",
|
|
"print(liste_database_filtered)"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 6,
|
|
"id": "675f518d",
|
|
"metadata": {},
|
|
"outputs": [
|
|
{
|
|
"name": "stderr",
|
|
"output_type": "stream",
|
|
"text": [
|
|
"<<<<<<< local <modified: >\n",
|
|
"/tmp/ipykernel_445/4081512283.py:10: DtypeWarning: Columns (1) have mixed types. Specify dtype option on import or set low_memory=False.\n",
|
|
" df = pd.read_csv(file_in)\n",
|
|
"=======\n",
|
|
"/tmp/ipykernel_15285/4081512283.py:10: DtypeWarning: Columns (1) have mixed types. Specify dtype option on import or set low_memory=False.\n",
|
|
" df = pd.read_csv(file_in)\n",
|
|
">>>>>>> remote <modified: >\n"
|
|
]
|
|
}
|
|
],
|
|
"source": [
|
|
"# loop to create dataframes from liste\n",
|
|
"files_path = liste_database\n",
|
|
"\n",
|
|
"client_number = files_path[0].split(\"/\")[1]\n",
|
|
"df_prefix = \"df\" + str(client_number) + \"_\"\n",
|
|
"\n",
|
|
"for i in range(len(files_path)) :\n",
|
|
" current_path = files_path[i]\n",
|
|
" with fs.open(current_path, mode=\"rb\") as file_in:\n",
|
|
" df = pd.read_csv(file_in)\n",
|
|
" # the pattern of the name is df1xxx\n",
|
|
" nom_dataframe = df_prefix + re.search(r'\\/(\\d+)\\/(\\d+)([a-zA-Z_]+)\\.csv$', current_path).group(3)\n",
|
|
" globals()[nom_dataframe] = df"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "markdown",
|
|
"id": "e855f403",
|
|
"metadata": {},
|
|
"source": [
|
|
"## customersplus.csv"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 22,
|
|
"id": "91a8f8c4",
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"a = pd.DataFrame(df1_customersplus.info())"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 31,
|
|
"id": "2fda171d",
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"def info_colonnes_dataframe(df):\n",
|
|
" # Créer une liste pour stocker les informations sur chaque colonne\n",
|
|
" infos_colonnes = []\n",
|
|
"\n",
|
|
" # Parcourir les colonnes du DataFrame\n",
|
|
" for nom_colonne, serie in df.items(): # Utiliser items() au lieu de iteritems()\n",
|
|
" # Calculer le taux de valeurs manquantes\n",
|
|
" taux_na = serie.isna().mean() * 100\n",
|
|
"\n",
|
|
" # Ajouter les informations à la liste\n",
|
|
" infos_colonnes.append({\n",
|
|
" 'Nom_colonne': nom_colonne,\n",
|
|
" 'Type_colonne': str(serie.dtype),\n",
|
|
" 'Taux_NA': taux_na\n",
|
|
" })\n",
|
|
"\n",
|
|
" # Créer une nouvelle DataFrame à partir de la liste d'informations\n",
|
|
" df_infos_colonnes = pd.DataFrame(infos_colonnes)\n",
|
|
"\n",
|
|
" return df_infos_colonnes"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 35,
|
|
"id": "205eeeab",
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"def cleaning_date(df, column_name):\n",
|
|
" \"\"\"\n",
|
|
" Nettoie la colonne spécifiée du DataFrame en convertissant les valeurs en datetime avec le format ISO8601.\n",
|
|
"\n",
|
|
" Parameters:\n",
|
|
" - df: DataFrame\n",
|
|
" Le DataFrame contenant la colonne à nettoyer.\n",
|
|
" - column_name: str\n",
|
|
" Le nom de la colonne à nettoyer.\n",
|
|
"\n",
|
|
" Returns:\n",
|
|
" - DataFrame\n",
|
|
" Le DataFrame modifié avec la colonne nettoyée.\n",
|
|
" \"\"\"\n",
|
|
" df[column_name] = pd.to_datetime(df[column_name], utc = True, format = 'ISO8601')\n",
|
|
" return df"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 32,
|
|
"id": "634282c5",
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"a = info_colonnes_dataframe(df1_customersplus)"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 33,
|
|
"id": "0e8d4133",
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"a"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 16,
|
|
"id": "1268ad5a",
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"a = pd.DataFrame(df1_customersplus.isna().sum()/len(df1_customersplus)*100)"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 40,
|
|
"id": "bd41dc80",
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"# Selection des variables\n",
|
|
"df1_customersplus_clean = df1_customersplus.copy()\n",
|
|
"\n",
|
|
"cleaning_date(df1_customersplus_clean, 'first_buying_date')\n",
|
|
"cleaning_date(df1_customersplus_clean, 'last_visiting_date')\n",
|
|
"\n",
|
|
"df1_customersplus_clean.drop(['lastname', 'firstname', 'email', 'civility', 'note', 'created_at', 'updated_at', 'deleted_at', 'extra', 'reference', 'extra_field', 'identifier', 'need_reload', 'preferred_category', 'preferred_supplier', 'preferred_formula', 'zipcode', 'last_visiting_date'], axis = 1, inplace=True)\n",
|
|
"df1_customersplus_clean.rename(columns = {'id' : 'customer_id'}, inplace = True)\n",
|
|
"\n"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "markdown",
|
|
"id": "64d0f76b",
|
|
"metadata": {
|
|
"jp-MarkdownHeadingCollapsed": true
|
|
},
|
|
"source": [
|
|
"## tickets.csv"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 6,
|
|
"id": "7e683711",
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"df1_tickets"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 7,
|
|
"id": "e7b9a52e",
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"df1_tickets.info()"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 8,
|
|
"id": "568280e8",
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"df1_tickets.isna().sum()/len(df1_tickets)*100"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 9,
|
|
"id": "29ecec90",
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"# Selection des variables\n",
|
|
"df1_tickets_clean = df1_tickets.drop(['lastname', 'firstname', 'email', 'created_at', 'updated_at', 'extra', 'reference', 'extra_field', 'identifier', 'need_reload', 'preferred_category', 'preferred_supplier', 'preferred_formula', 'zipcode'], axis = 1, inplace=True)\n",
|
|
"df1_tickets_clean.rename(columns = {'id' : 'customer_id'}, inplace = True)"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "markdown",
|
|
"id": "22bb5de4",
|
|
"metadata": {
|
|
"jp-MarkdownHeadingCollapsed": true
|
|
},
|
|
"source": [
|
|
"## suppliers.csv"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 10,
|
|
"id": "6a9a91f4",
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"df1_suppliers"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 11,
|
|
"id": "bab4758a",
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"df1_suppliers.info()"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 12,
|
|
"id": "b5fff251",
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"df1_suppliers.isna().sum()/len(df1_suppliers)*100"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 13,
|
|
"id": "8b09e2a3",
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"# Selection des variables\n",
|
|
"df1_suppliers_clean = df1_suppliers[['id', 'name']]\n",
|
|
"df1_suppliers_clean.rename(columns = {'name' : 'supplier_name'}, inplace = True)"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 14,
|
|
"id": "ecee7cdc",
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"df1_suppliers_clean"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "markdown",
|
|
"id": "c8e6e69b",
|
|
"metadata": {
|
|
"jp-MarkdownHeadingCollapsed": true
|
|
},
|
|
"source": [
|
|
"## type_ofs.csv"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 15,
|
|
"id": "1a6cff1f",
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"df1_type_ofs"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 16,
|
|
"id": "93630b41",
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"df1_type_ofs.info()"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 17,
|
|
"id": "4f94481a",
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"# Selection des variables\n",
|
|
"df1_type_ofs_clean = df1_type_ofs[['id', 'name', 'children']]\n",
|
|
"df1_type_ofs_clean.rename(columns = {'name' : 'type_of_ticket_name'}, inplace = True)"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "markdown",
|
|
"id": "1b2811e2",
|
|
"metadata": {
|
|
"jp-MarkdownHeadingCollapsed": true
|
|
},
|
|
"source": [
|
|
"## purchases.csv"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 18,
|
|
"id": "2455d2e1",
|
|
"metadata": {
|
|
"scrolled": true
|
|
},
|
|
"outputs": [],
|
|
"source": [
|
|
"df1_purchases"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 19,
|
|
"id": "5f9a159d",
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"df1_purchases.info()"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 20,
|
|
"id": "db201bf7",
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"# Nettoyage purchase_date\n",
|
|
"df1_purchases['purchase_date'] = pd.to_datetime(df1_purchases['purchase_date'], utc = True)\n",
|
|
"df1_purchases['purchase_date'] = pd.to_datetime(df1_purchases['purchase_date'], format = 'ISO8601')"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 21,
|
|
"id": "bd436fca",
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"df1_purchases.info()"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 22,
|
|
"id": "83435862",
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"# Selection des variables\n",
|
|
"df1_purchases_clean = df1_purchases[['id', 'purchase_date', 'customer_id']]"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "markdown",
|
|
"id": "f210e730",
|
|
"metadata": {
|
|
"jp-MarkdownHeadingCollapsed": true
|
|
},
|
|
"source": [
|
|
"## Fusion de l'ensemble des données billétiques"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 23,
|
|
"id": "1f8b3aa7",
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"# Fusion avec fournisseurs\n",
|
|
"df1_ticket_information = pd.merge(df1_tickets_clean, df1_suppliers_clean, left_on = 'supplier_id', right_on = 'id', how = 'inner')\n",
|
|
"df1_ticket_information.drop(['supplier_id', 'id'], axis = 1, inplace=True)\n",
|
|
"\n",
|
|
"# Fusion avec type de tickets\n",
|
|
"df1_ticket_information = pd.merge(df1_ticket_information, df1_type_ofs_clean, left_on = 'type_of', right_on = 'id', how = 'inner')\n",
|
|
"df1_ticket_information.drop(['type_of', 'id'], axis = 1, inplace=True)\n",
|
|
"\n",
|
|
"# Fusion avec achats\n",
|
|
"df1_ticket_information = pd.merge(df1_ticket_information, df1_purchases_clean, left_on = 'purchase_id', right_on = 'id', how = 'inner')\n",
|
|
"df1_ticket_information.drop(['purchase_id', 'id'], axis = 1, inplace=True)"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 24,
|
|
"id": "83a4d021",
|
|
"metadata": {
|
|
"scrolled": true
|
|
},
|
|
"outputs": [],
|
|
"source": [
|
|
"df1_ticket_information"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "markdown",
|
|
"id": "56e6ebd1",
|
|
"metadata": {
|
|
"jp-MarkdownHeadingCollapsed": true
|
|
},
|
|
"source": [
|
|
"# Utilisation de fonctions"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 51,
|
|
"id": "88fcde4b",
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"# Créer un DataFrame exemple\n",
|
|
"df_not_clean = df1_campaign_stats[['opened_at']].head(20)\n",
|
|
"\n",
|
|
"# Appliquer la fonction pour nettoyer la colonne 'purchase_date' de manière vectorisée\n",
|
|
"df_clean = cleaning_date(df_not_clean, 'opened_at')\n",
|
|
"df_clean.rename(columns = {'opened_at' : 'opened_at_clean'}, inplace = True)\n",
|
|
"\n",
|
|
"test = pd.concat([df1_campaign_stats[['opened_at']].head(20), df_clean], axis=1)\n",
|
|
"\n",
|
|
"test.info()"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "markdown",
|
|
"id": "818f69db",
|
|
"metadata": {},
|
|
"source": [
|
|
"## Nettoyage, selection et fusion"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 23,
|
|
"id": "c9654eda",
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"df1_ticket_information"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 14,
|
|
"id": "7f2b620c",
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"df1_ticket_information.info()"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "markdown",
|
|
"id": "637bdb72",
|
|
"metadata": {},
|
|
"source": [
|
|
"# Customer information"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "markdown",
|
|
"id": "14c52894",
|
|
"metadata": {
|
|
"jp-MarkdownHeadingCollapsed": true
|
|
},
|
|
"source": [
|
|
"## Target area"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 8,
|
|
"id": "d83abfbf",
|
|
"metadata": {},
|
|
"outputs": [
|
|
{
|
|
"name": "stderr",
|
|
"output_type": "stream",
|
|
"text": [
|
|
"/tmp/ipykernel_15285/2625134041.py:3: SettingWithCopyWarning: \n",
|
|
"A value is trying to be set on a copy of a slice from a DataFrame\n",
|
|
"\n",
|
|
"See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n",
|
|
" df1_targets_clean.rename(columns = {'id' : 'target_id' , 'name' : 'target_name'}, inplace = True)\n"
|
|
]
|
|
}
|
|
],
|
|
"source": [
|
|
"# Target.csv cleaning\n",
|
|
"df1_targets_clean = df1_targets[[\"id\", \"target_type_id\", \"name\"]]\n",
|
|
"df1_targets_clean.rename(columns = {'id' : 'target_id' , 'name' : 'target_name'}, inplace = True)\n",
|
|
"\n",
|
|
"# target_type cleaning\n",
|
|
"df1_target_types_clean = df1_target_types[[\"id\",\"is_import\",\"name\"]].add_prefix(\"target_type_\")\n",
|
|
"\n",
|
|
"#customer_target_mappings cleaning\n",
|
|
"df1_customer_target_mappings_clean = df1_customer_target_mappings[[\"id\", \"customer_id\", \"target_id\"]]\n",
|
|
"\n",
|
|
"# Merge target et target_type\n",
|
|
"df1_targets_full = pd.merge(df1_targets_clean, df1_target_types_clean, left_on='target_type_id', right_on='target_type_id', how='inner')\n",
|
|
"df1_targets_full.drop(['target_type_id'], axis = 1, inplace=True)\n",
|
|
"\n",
|
|
"# Merge\n",
|
|
"df1_targets_full = pd.merge(df1_customer_target_mappings_clean, df1_targets_full, left_on='target_id', right_on='target_id', how='inner')\n",
|
|
"df1_targets_full.drop(['target_id'], axis = 1, inplace=True)"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 62,
|
|
"id": "90d71b2c",
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"df1_targets_test = df1_targets_full[['id', 'customer_id']].groupby(['customer_id']).count()\n",
|
|
"len(df1_targets_test[df1_targets_test['id'] > 1]) / len(df1_targets_test)\n",
|
|
"\n",
|
|
"# 99,6% des 151 000 client visés sont catégorisés plusieurs fois et en moyenne 5 fois... \n",
|
|
"df1_targets_test.mean()\n"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 10,
|
|
"id": "2301de1e",
|
|
"metadata": {},
|
|
"outputs": [
|
|
{
|
|
"data": {
|
|
"text/html": [
|
|
"<div>\n",
|
|
"<style scoped>\n",
|
|
" .dataframe tbody tr th:only-of-type {\n",
|
|
" vertical-align: middle;\n",
|
|
" }\n",
|
|
"\n",
|
|
" .dataframe tbody tr th {\n",
|
|
" vertical-align: top;\n",
|
|
" }\n",
|
|
"\n",
|
|
" .dataframe thead th {\n",
|
|
" text-align: right;\n",
|
|
" }\n",
|
|
"</style>\n",
|
|
"<table border=\"1\" class=\"dataframe\">\n",
|
|
" <thead>\n",
|
|
" <tr style=\"text-align: right;\">\n",
|
|
" <th></th>\n",
|
|
" <th>id</th>\n",
|
|
" <th>customer_id</th>\n",
|
|
" <th>target_name</th>\n",
|
|
" <th>target_type_is_import</th>\n",
|
|
" <th>target_type_name</th>\n",
|
|
" </tr>\n",
|
|
" </thead>\n",
|
|
" <tbody>\n",
|
|
" <tr>\n",
|
|
" <th>0</th>\n",
|
|
" <td>1184824</td>\n",
|
|
" <td>645400</td>\n",
|
|
" <td>DDCP PROMO Réseau livres</td>\n",
|
|
" <td>False</td>\n",
|
|
" <td>manual_static_filter</td>\n",
|
|
" </tr>\n",
|
|
" <tr>\n",
|
|
" <th>1</th>\n",
|
|
" <td>210571</td>\n",
|
|
" <td>2412</td>\n",
|
|
" <td>DDCP PROMO Réseau livres</td>\n",
|
|
" <td>False</td>\n",
|
|
" <td>manual_static_filter</td>\n",
|
|
" </tr>\n",
|
|
" <tr>\n",
|
|
" <th>2</th>\n",
|
|
" <td>210572</td>\n",
|
|
" <td>4536</td>\n",
|
|
" <td>DDCP PROMO Réseau livres</td>\n",
|
|
" <td>False</td>\n",
|
|
" <td>manual_static_filter</td>\n",
|
|
" </tr>\n",
|
|
" <tr>\n",
|
|
" <th>3</th>\n",
|
|
" <td>210573</td>\n",
|
|
" <td>6736</td>\n",
|
|
" <td>DDCP PROMO Réseau livres</td>\n",
|
|
" <td>False</td>\n",
|
|
" <td>manual_static_filter</td>\n",
|
|
" </tr>\n",
|
|
" <tr>\n",
|
|
" <th>4</th>\n",
|
|
" <td>210574</td>\n",
|
|
" <td>38210</td>\n",
|
|
" <td>DDCP PROMO Réseau livres</td>\n",
|
|
" <td>False</td>\n",
|
|
" <td>manual_static_filter</td>\n",
|
|
" </tr>\n",
|
|
" </tbody>\n",
|
|
"</table>\n",
|
|
"</div>"
|
|
],
|
|
"text/plain": [
|
|
" id customer_id target_name target_type_is_import \\\n",
|
|
"0 1184824 645400 DDCP PROMO Réseau livres False \n",
|
|
"1 210571 2412 DDCP PROMO Réseau livres False \n",
|
|
"2 210572 4536 DDCP PROMO Réseau livres False \n",
|
|
"3 210573 6736 DDCP PROMO Réseau livres False \n",
|
|
"4 210574 38210 DDCP PROMO Réseau livres False \n",
|
|
"\n",
|
|
" target_type_name \n",
|
|
"0 manual_static_filter \n",
|
|
"1 manual_static_filter \n",
|
|
"2 manual_static_filter \n",
|
|
"3 manual_static_filter \n",
|
|
"4 manual_static_filter "
|
|
]
|
|
},
|
|
"execution_count": 10,
|
|
"metadata": {},
|
|
"output_type": "execute_result"
|
|
}
|
|
],
|
|
"source": [
|
|
"df1_targets_full.head()"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 14,
|
|
"id": "75fbc2f7",
|
|
"metadata": {},
|
|
"outputs": [
|
|
{
|
|
"name": "stderr",
|
|
"output_type": "stream",
|
|
"text": [
|
|
"[nltk_data] Downloading package punkt to /home/onyxia/nltk_data...\n",
|
|
"[nltk_data] Package punkt is already up-to-date!\n",
|
|
"[nltk_data] Downloading package stopwords to /home/onyxia/nltk_data...\n",
|
|
"[nltk_data] Package stopwords is already up-to-date!\n",
|
|
"[nltk_data] Downloading package wordnet to /home/onyxia/nltk_data...\n",
|
|
"[nltk_data] Package wordnet is already up-to-date!\n"
|
|
]
|
|
},
|
|
{
|
|
"data": {
|
|
"text/plain": [
|
|
"True"
|
|
]
|
|
},
|
|
"execution_count": 14,
|
|
"metadata": {},
|
|
"output_type": "execute_result"
|
|
}
|
|
],
|
|
"source": [
|
|
"# Catégorisation des target_name\n",
|
|
"import pandas as pd\n",
|
|
"import nltk\n",
|
|
"from nltk.tokenize import word_tokenize\n",
|
|
"from nltk.corpus import stopwords\n",
|
|
"from nltk.stem import WordNetLemmatizer\n",
|
|
"from nltk.probability import FreqDist\n",
|
|
"\n",
|
|
"# Téléchargement des ressources nécessaires\n",
|
|
"nltk.download('punkt')\n",
|
|
"nltk.download('stopwords')\n",
|
|
"nltk.download('wordnet')\n",
|
|
"\n"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 19,
|
|
"id": "55cddf92",
|
|
"metadata": {},
|
|
"outputs": [
|
|
{
|
|
"name": "stdout",
|
|
"output_type": "stream",
|
|
"text": [
|
|
"Mots les plus fréquents:\n",
|
|
"consentement: 550777\n",
|
|
"optin: 463579\n",
|
|
"jeune: 155103\n",
|
|
"public: 155103\n",
|
|
"mediation: 150001\n"
|
|
]
|
|
}
|
|
],
|
|
"source": [
|
|
"# Définition des fonctions de tokenisation, suppression des mots vides et lemmatisation\n",
|
|
"def preprocess_text(texte):\n",
|
|
" # Concaténation des éléments de la liste en une seule chaîne de caractères\n",
|
|
" texte_concat = ' '.join(texte)\n",
|
|
" \n",
|
|
" # Tokenisation des mots\n",
|
|
" tokens = word_tokenize(texte_concat.lower())\n",
|
|
" \n",
|
|
" # Suppression des mots vides (stopwords)\n",
|
|
" stop_words = set(stopwords.words('french'))\n",
|
|
" filtered_tokens = [word for word in tokens if word not in stop_words]\n",
|
|
" \n",
|
|
" # Lemmatisation des mots\n",
|
|
" lemmatizer = WordNetLemmatizer()\n",
|
|
" lemmatized_tokens = [lemmatizer.lemmatize(word) for word in filtered_tokens]\n",
|
|
" \n",
|
|
" return lemmatized_tokens\n",
|
|
"\n",
|
|
"\n",
|
|
"# Appliquer le prétraitement à la colonne de texte\n",
|
|
"df1_targets_full['target_name_tokened'] = df1_targets_full['target_name'].apply(preprocess_text)\n",
|
|
"\n",
|
|
"# Concaténer les listes de mots pour obtenir une liste de tous les mots dans le corpus\n",
|
|
"all_words = [word for tokens in df1_targets_full['target_name_tokened'] for word in tokens]\n",
|
|
"\n",
|
|
"# Calculer la fréquence des mots\n",
|
|
"freq_dist = FreqDist(all_words)\n",
|
|
"\n",
|
|
"\n"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 22,
|
|
"id": "7fd98a85",
|
|
"metadata": {},
|
|
"outputs": [
|
|
{
|
|
"name": "stdout",
|
|
"output_type": "stream",
|
|
"text": [
|
|
"Mots les plus fréquents:\n",
|
|
"consentement: 550777\n",
|
|
"optin: 463579\n",
|
|
"jeune: 155103\n",
|
|
"public: 155103\n",
|
|
"mediation: 150001\n",
|
|
"specialisee: 150001\n",
|
|
"b2c: 143432\n",
|
|
"optout: 97683\n",
|
|
"newsletter: 56022\n",
|
|
"(: 46084\n",
|
|
"): 46084\n",
|
|
"inscrits: 42296\n",
|
|
"nl: 42294\n",
|
|
"générale: 41037\n",
|
|
"generale: 40950\n"
|
|
]
|
|
}
|
|
],
|
|
"source": [
|
|
"# Affichage des mots les plus fréquents\n",
|
|
"print(\"Mots les plus fréquents:\")\n",
|
|
"for mot, freq in freq_dist.most_common(15):\n",
|
|
" print(f\"{mot}: {freq}\")"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 18,
|
|
"id": "cf94bb1d",
|
|
"metadata": {},
|
|
"outputs": [
|
|
{
|
|
"name": "stdout",
|
|
"output_type": "stream",
|
|
"text": [
|
|
" texte \\\n",
|
|
"0 Le chat noir mange une souris. \n",
|
|
"1 Le chien blanc aboie. \n",
|
|
"\n",
|
|
" texte_preprocessed \n",
|
|
"0 [e, h, a, o, i, r, a, g, e, u, e, o, u, r, i, .] \n",
|
|
"1 [e, h, i, e, b, a, a, b, o, i, e, .] \n"
|
|
]
|
|
},
|
|
{
|
|
"name": "stderr",
|
|
"output_type": "stream",
|
|
"text": [
|
|
"[nltk_data] Downloading package punkt to /home/onyxia/nltk_data...\n",
|
|
"[nltk_data] Package punkt is already up-to-date!\n",
|
|
"[nltk_data] Downloading package stopwords to /home/onyxia/nltk_data...\n",
|
|
"[nltk_data] Package stopwords is already up-to-date!\n",
|
|
"[nltk_data] Downloading package wordnet to /home/onyxia/nltk_data...\n",
|
|
"[nltk_data] Package wordnet is already up-to-date!\n"
|
|
]
|
|
}
|
|
],
|
|
"source": [
|
|
"import pandas as pd\n",
|
|
"import nltk\n",
|
|
"from nltk.tokenize import word_tokenize\n",
|
|
"from nltk.corpus import stopwords\n",
|
|
"from nltk.stem import WordNetLemmatizer\n",
|
|
"\n",
|
|
"# Téléchargement des ressources nécessaires\n",
|
|
"nltk.download('punkt')\n",
|
|
"nltk.download('stopwords')\n",
|
|
"nltk.download('wordnet')\n",
|
|
"\n",
|
|
"# Création de la DataFrame d'exemple\n",
|
|
"data = {'texte': [\"Le chat noir mange une souris.\", \"Le chien blanc aboie.\"]}\n",
|
|
"df = pd.DataFrame(data)\n",
|
|
"\n",
|
|
"# Fonction pour prétraiter le texte\n",
|
|
"def preprocess_text(texte):\n",
|
|
" # Concaténation des éléments de la liste en une seule chaîne de caractères\n",
|
|
" texte_concat = ' '.join(texte)\n",
|
|
" \n",
|
|
" # Tokenisation des mots\n",
|
|
" tokens = word_tokenize(texte_concat.lower())\n",
|
|
" \n",
|
|
" # Suppression des mots vides (stopwords)\n",
|
|
" stop_words = set(stopwords.words('french'))\n",
|
|
" filtered_tokens = [word for word in tokens if word not in stop_words]\n",
|
|
" \n",
|
|
" # Lemmatisation des mots\n",
|
|
" lemmatizer = WordNetLemmatizer()\n",
|
|
" lemmatized_tokens = [lemmatizer.lemmatize(word) for word in filtered_tokens]\n",
|
|
" \n",
|
|
" return lemmatized_tokens\n",
|
|
"\n",
|
|
"# Appliquer la fonction de prétraitement à la colonne de texte\n",
|
|
"df['texte_preprocessed'] = df['texte'].apply(preprocess_text)\n",
|
|
"\n",
|
|
"# Afficher le résultat\n",
|
|
"print(df)\n"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "markdown",
|
|
"id": "711d3884",
|
|
"metadata": {
|
|
"jp-MarkdownHeadingCollapsed": true
|
|
},
|
|
"source": [
|
|
"## Campaign area"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 52,
|
|
"id": "c25b5295",
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"# campaign_stats cleaning \n",
|
|
"df1_campaign_stats_clean = df1_campaign_stats[[\"id\", \"campaign_id\", \"customer_id\", \"opened_at\", \"sent_at\", \"delivered_at\"]]\n",
|
|
"cleaning_date(df1_campaign_stats_clean, 'opened_at')\n",
|
|
"cleaning_date(df1_campaign_stats_clean, 'sent_at')\n",
|
|
"cleaning_date(df1_campaign_stats_clean, 'delivered_at')\n",
|
|
"\n",
|
|
"# campaigns cleaning\n",
|
|
"df1_campaigns_clean = df1_campaigns[[\"id\", \"name\", \"service_id\", \"sent_at\"]].add_prefix(\"campaign_\")\n",
|
|
"cleaning_date(df1_campaigns_clean, 'campaign_sent_at')\n",
|
|
"\n",
|
|
"# Merge \n",
|
|
"df1_campaigns_full = pd.merge(df1_campaign_stats_clean, df1_campaigns_clean, on = \"campaign_id\", how = \"left\")\n",
|
|
"df1_campaigns_full.drop(['campaign_id'], axis = 1, inplace=True)"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 53,
|
|
"id": "2a3de6a5",
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"df1_campaigns_full.info()"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 56,
|
|
"id": "3fc1f446",
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"df1_campaigns_information"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "markdown",
|
|
"id": "20e69ee3",
|
|
"metadata": {
|
|
"jp-MarkdownHeadingCollapsed": true
|
|
},
|
|
"source": [
|
|
"## Link area"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 37,
|
|
"id": "d9cbdbce",
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"df1_campaigns"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 38,
|
|
"id": "c07459f0",
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"df1_link_stats"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "markdown",
|
|
"id": "80ae4c42",
|
|
"metadata": {},
|
|
"source": [
|
|
"## Exploration variables"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 7,
|
|
"id": "b50b8f95",
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"# Fonction d'exploration pour suppliers.csv = label itr et commission inconnues\n",
|
|
"def suppliers_exploration(suppliers = None) : \n",
|
|
" \n",
|
|
" # Taux de NaN pour ces colonnes\n",
|
|
" label_na = suppliers['label'].isna().sum()/len(suppliers)*100\n",
|
|
" itr_na = suppliers['itr'].isna().sum()/len(suppliers)*100\n",
|
|
" commission_na = suppliers['commission'].isna().sum()/len(suppliers)*100\n",
|
|
"\n",
|
|
" suppliers_desc = pd.DataFrame({'nb_suppliers' : [suppliers['name'].nunique()],\n",
|
|
" 'label_na' : [label_na],\n",
|
|
" 'itr_na' : [itr_na],\n",
|
|
" 'commission_na' : [commission_na]})\n",
|
|
"\n",
|
|
" return suppliers_desc"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 8,
|
|
"id": "7e292935",
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"df1_suppliers_desc = suppliers_exploration(suppliers = df1_suppliers)"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 9,
|
|
"id": "05b6f2b0",
|
|
"metadata": {},
|
|
"outputs": [
|
|
{
|
|
"data": {
|
|
"text/html": [
|
|
"<div>\n",
|
|
"<style scoped>\n",
|
|
" .dataframe tbody tr th:only-of-type {\n",
|
|
" vertical-align: middle;\n",
|
|
" }\n",
|
|
"\n",
|
|
" .dataframe tbody tr th {\n",
|
|
" vertical-align: top;\n",
|
|
" }\n",
|
|
"\n",
|
|
" .dataframe thead th {\n",
|
|
" text-align: right;\n",
|
|
" }\n",
|
|
"</style>\n",
|
|
"<table border=\"1\" class=\"dataframe\">\n",
|
|
" <thead>\n",
|
|
" <tr style=\"text-align: right;\">\n",
|
|
" <th></th>\n",
|
|
" <th>nb_suppliers</th>\n",
|
|
" <th>label_na</th>\n",
|
|
" <th>itr_na</th>\n",
|
|
" <th>commission_na</th>\n",
|
|
" </tr>\n",
|
|
" </thead>\n",
|
|
" <tbody>\n",
|
|
" <tr>\n",
|
|
" <th>0</th>\n",
|
|
" <td>9</td>\n",
|
|
" <td>100.0</td>\n",
|
|
" <td>100.0</td>\n",
|
|
" <td>100.0</td>\n",
|
|
" </tr>\n",
|
|
" </tbody>\n",
|
|
"</table>\n",
|
|
"</div>"
|
|
],
|
|
"text/plain": [
|
|
" nb_suppliers label_na itr_na commission_na\n",
|
|
"0 9 100.0 100.0 100.0"
|
|
]
|
|
},
|
|
"execution_count": 9,
|
|
"metadata": {},
|
|
"output_type": "execute_result"
|
|
}
|
|
],
|
|
"source": [
|
|
"df1_suppliers_desc"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 10,
|
|
"id": "c9324d80",
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"BUCKET = \"bdc2324-data\"\n",
|
|
"liste_folders = fs.ls(BUCKET)\n",
|
|
"\n",
|
|
"liste_files = []\n",
|
|
"for company_folder in liste_folders : \n",
|
|
" liste_files.extend(fs.ls(company_folder))"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 11,
|
|
"id": "10304058",
|
|
"metadata": {},
|
|
"outputs": [
|
|
{
|
|
"name": "stdout",
|
|
"output_type": "stream",
|
|
"text": [
|
|
"['bdc2324-data/1/1suppliers.csv', 'bdc2324-data/10/10suppliers.csv', 'bdc2324-data/101/101suppliers.csv', 'bdc2324-data/11/11suppliers.csv', 'bdc2324-data/12/12suppliers.csv', 'bdc2324-data/13/13suppliers.csv', 'bdc2324-data/14/14suppliers.csv', 'bdc2324-data/2/2suppliers.csv', 'bdc2324-data/3/3suppliers.csv', 'bdc2324-data/4/4suppliers.csv', 'bdc2324-data/5/5suppliers.csv', 'bdc2324-data/6/6suppliers.csv', 'bdc2324-data/7/7suppliers.csv', 'bdc2324-data/8/8suppliers.csv', 'bdc2324-data/9/9suppliers.csv']\n"
|
|
]
|
|
}
|
|
],
|
|
"source": [
|
|
"liste_database_select = ['suppliers']\n",
|
|
"\n",
|
|
"# Filtrer la liste pour les éléments contenant au moins un élément de la liste à tester\n",
|
|
"liste_suppliers = [element for element in liste_files if any(element_part in element for element_part in liste_database_select)]\n",
|
|
"\n",
|
|
"# Afficher le résultat\n",
|
|
"print(liste_suppliers)"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 32,
|
|
"id": "ffa423e5",
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"# loop to create dataframes from file 2\n",
|
|
"def database_loading(database_name = None):\n",
|
|
" files_path = database_name\n",
|
|
" \n",
|
|
" client_number = files_path.split(\"/\")[1]\n",
|
|
" df_prefix = \"df\" + str(client_number) + \"_\"\n",
|
|
" \n",
|
|
" current_path = files_path\n",
|
|
" with fs.open(current_path, mode=\"rb\") as file_in:\n",
|
|
" df = pd.read_csv(file_in)\n",
|
|
"\n",
|
|
" return df, client_number"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"id": "70bdc88d",
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": []
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 45,
|
|
"id": "6a0f567d",
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"df_all = pd.DataFrame()\n",
|
|
"\n",
|
|
"for link in liste_suppliers:\n",
|
|
" \n",
|
|
" df_supplier, tenant_id = database_loading(link)\n",
|
|
" \n",
|
|
" df_supplier['tenant_id'] = int(tenant_id)\n",
|
|
"\n",
|
|
" df_all = pd.concat([df_all, df_supplier], axis = 0)\n",
|
|
" "
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 63,
|
|
"id": "1522d8cd",
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"# df_all[df_all['tenant_id'] == 101]['name'].unique()"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 66,
|
|
"id": "b0e42a61",
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"liste_mots = ['en ligne', 'internet', 'web', 'net', 'vad', 'online'] \n",
|
|
"# vad = vente à distance\n",
|
|
"df_all['name'] = df_all['name'].fillna('')\n",
|
|
"\n",
|
|
"df_all['canal_vente_internet'] = df_all['name'].str.contains('|'.join(liste_mots), case=False).astype(int)\n"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 68,
|
|
"id": "d299ae91",
|
|
"metadata": {},
|
|
"outputs": [
|
|
{
|
|
"data": {
|
|
"text/plain": [
|
|
"tenant_id\n",
|
|
"1 1\n",
|
|
"2 1\n",
|
|
"3 1\n",
|
|
"4 1\n",
|
|
"5 1\n",
|
|
"6 1\n",
|
|
"7 1\n",
|
|
"8 1\n",
|
|
"9 1\n",
|
|
"10 1\n",
|
|
"11 1\n",
|
|
"12 1\n",
|
|
"13 1\n",
|
|
"14 1\n",
|
|
"101 1\n",
|
|
"Name: canal_vente_internet, dtype: int64"
|
|
]
|
|
},
|
|
"execution_count": 68,
|
|
"metadata": {},
|
|
"output_type": "execute_result"
|
|
}
|
|
],
|
|
"source": [
|
|
"df_all.groupby('tenant_id')['canal_vente_internet'].max()"
|
|
]
|
|
}
|
|
],
|
|
"metadata": {
|
|
"kernelspec": {
|
|
"display_name": "Python 3 (ipykernel)",
|
|
"language": "python",
|
|
"name": "python3"
|
|
},
|
|
"language_info": {
|
|
"codemirror_mode": {
|
|
"name": "ipython",
|
|
"version": 3
|
|
},
|
|
"file_extension": ".py",
|
|
"mimetype": "text/x-python",
|
|
"name": "python",
|
|
"nbconvert_exporter": "python",
|
|
"pygments_lexer": "ipython3",
|
|
"version": "3.11.6"
|
|
}
|
|
},
|
|
"nbformat": 4,
|
|
"nbformat_minor": 5
|
|
}
|