BDC-team-1/Exploration_billet_AJ.ipynb

1298 lines
34 KiB
Plaintext
Raw Permalink Normal View History

{
"cells": [
{
"cell_type": "markdown",
2024-02-10 22:46:56 +01:00
"id": "5bf5c226",
"metadata": {},
"source": [
"# Business Data Challenge - Team 1"
]
},
{
"cell_type": "code",
2024-01-13 14:14:11 +01:00
"execution_count": 1,
2024-02-10 22:46:56 +01:00
"id": "b1a5b9d3",
"metadata": {},
"outputs": [],
"source": [
"import pandas as pd\n",
2024-01-13 10:38:10 +01:00
"import numpy as np\n",
"import os\n",
"import s3fs\n",
"import re"
]
},
{
"cell_type": "markdown",
2024-02-10 22:46:56 +01:00
"id": "ecfa2219",
"metadata": {},
"source": [
"Configuration de l'accès aux données"
]
},
{
"cell_type": "code",
2024-01-13 14:14:11 +01:00
"execution_count": 2,
2024-02-10 22:46:56 +01:00
"id": "1a094277",
"metadata": {},
"outputs": [],
"source": [
"# Create filesystem object\n",
"S3_ENDPOINT_URL = \"https://\" + os.environ[\"AWS_S3_ENDPOINT\"]\n",
2024-01-13 10:38:10 +01:00
"fs = s3fs.S3FileSystem(client_kwargs={'endpoint_url': S3_ENDPOINT_URL})"
]
},
{
"cell_type": "markdown",
2024-02-10 22:46:56 +01:00
"id": "c437eaec",
"metadata": {},
"source": [
2024-01-13 10:38:10 +01:00
"# Exemple sur Company 1"
]
},
{
"cell_type": "markdown",
2024-02-10 22:46:56 +01:00
"id": "a1c1fc39",
2024-01-13 10:38:10 +01:00
"metadata": {},
"source": [
"## Chargement données"
]
},
{
"cell_type": "code",
2024-01-13 14:14:11 +01:00
"execution_count": 3,
2024-02-10 22:46:56 +01:00
"id": "66f8c17b",
"metadata": {},
2024-01-13 10:38:10 +01:00
"outputs": [],
"source": [
"BUCKET = \"bdc2324-data/1\"\n",
"liste_database = fs.ls(BUCKET)"
]
},
{
"cell_type": "code",
2024-02-05 22:04:02 +01:00
"execution_count": 5,
2024-02-10 22:46:56 +01:00
"id": "c08e6798",
2024-01-13 10:38:10 +01:00
"metadata": {},
2024-02-10 22:46:56 +01:00
"outputs": [],
2024-01-13 10:38:10 +01:00
"source": [
2024-01-13 14:14:11 +01:00
"liste_database_select = ['suppliers', 'ticket', 'purchase', 'consumption', 'type_ofs']\n",
2024-01-13 10:38:10 +01:00
"\n",
"# Filtrer la liste pour les éléments contenant au moins un élément de la liste à tester\n",
"liste_database_filtered = [element for element in liste_database if any(element_part in element for element_part in liste_database_select)]\n",
"\n",
"# Afficher le résultat\n",
"print(liste_database_filtered)"
]
},
{
"cell_type": "code",
2024-02-10 13:23:44 +01:00
"execution_count": 6,
2024-02-10 22:46:56 +01:00
"id": "675f518d",
2024-01-13 10:38:10 +01:00
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
2024-02-10 22:46:56 +01:00
"/tmp/ipykernel_445/4081512283.py:10: DtypeWarning: Columns (1) have mixed types. Specify dtype option on import or set low_memory=False.\n",
"  df = pd.read_csv(file_in)\n"
2024-01-13 10:38:10 +01:00
]
}
],
"source": [
2024-02-04 16:02:01 +01:00
"# loop to create dataframes from liste\n",
"files_path = liste_database\n",
2024-01-13 10:38:10 +01:00
"\n",
"client_number = files_path[0].split(\"/\")[1]\n",
"df_prefix = \"df\" + str(client_number) + \"_\"\n",
"\n",
"for i in range(len(files_path)) :\n",
" current_path = files_path[i]\n",
" with fs.open(current_path, mode=\"rb\") as file_in:\n",
" df = pd.read_csv(file_in)\n",
" # the pattern of the name is df1xxx\n",
" nom_dataframe = df_prefix + re.search(r'\\/(\\d+)\\/(\\d+)([a-zA-Z_]+)\\.csv$', current_path).group(3)\n",
" globals()[nom_dataframe] = df"
]
},
2024-02-05 22:04:02 +01:00
{
"cell_type": "markdown",
2024-02-10 22:46:56 +01:00
"id": "e855f403",
2024-02-10 13:23:44 +01:00
"metadata": {
"jp-MarkdownHeadingCollapsed": true
},
2024-02-05 22:04:02 +01:00
"source": [
"## customersplus.csv"
]
},
{
"cell_type": "code",
"execution_count": 22,
2024-02-10 22:46:56 +01:00
"id": "91a8f8c4",
2024-02-05 22:04:02 +01:00
"metadata": {},
2024-02-10 22:46:56 +01:00
"outputs": [],
2024-02-05 22:04:02 +01:00
"source": [
"a = pd.DataFrame(df1_customersplus.info())"
]
},
{
"cell_type": "code",
"execution_count": 31,
2024-02-10 22:46:56 +01:00
"id": "2fda171d",
2024-02-05 22:04:02 +01:00
"metadata": {},
"outputs": [],
"source": [
"def info_colonnes_dataframe(df):\n",
" # Créer une liste pour stocker les informations sur chaque colonne\n",
" infos_colonnes = []\n",
"\n",
" # Parcourir les colonnes du DataFrame\n",
" for nom_colonne, serie in df.items(): # Utiliser items() au lieu de iteritems()\n",
" # Calculer le taux de valeurs manquantes\n",
" taux_na = serie.isna().mean() * 100\n",
"\n",
" # Ajouter les informations à la liste\n",
" infos_colonnes.append({\n",
" 'Nom_colonne': nom_colonne,\n",
" 'Type_colonne': str(serie.dtype),\n",
" 'Taux_NA': taux_na\n",
" })\n",
"\n",
" # Créer une nouvelle DataFrame à partir de la liste d'informations\n",
" df_infos_colonnes = pd.DataFrame(infos_colonnes)\n",
"\n",
" return df_infos_colonnes"
]
},
{
"cell_type": "code",
"execution_count": 35,
2024-02-10 22:46:56 +01:00
"id": "205eeeab",
2024-02-05 22:04:02 +01:00
"metadata": {},
"outputs": [],
"source": [
"def cleaning_date(df, column_name):\n",
" \"\"\"\n",
" Nettoie la colonne spécifiée du DataFrame en convertissant les valeurs en datetime avec le format ISO8601.\n",
"\n",
" Parameters:\n",
" - df: DataFrame\n",
" Le DataFrame contenant la colonne à nettoyer.\n",
" - column_name: str\n",
" Le nom de la colonne à nettoyer.\n",
"\n",
" Returns:\n",
" - DataFrame\n",
" Le DataFrame modifié avec la colonne nettoyée.\n",
" \"\"\"\n",
" df[column_name] = pd.to_datetime(df[column_name], utc = True, format = 'ISO8601')\n",
" return df"
]
},
{
"cell_type": "code",
"execution_count": 32,
2024-02-10 22:46:56 +01:00
"id": "634282c5",
2024-02-05 22:04:02 +01:00
"metadata": {},
"outputs": [],
"source": [
"a = info_colonnes_dataframe(df1_customersplus)"
]
},
{
"cell_type": "code",
"execution_count": 33,
2024-02-10 22:46:56 +01:00
"id": "0e8d4133",
2024-02-05 22:04:02 +01:00
"metadata": {},
2024-02-10 22:46:56 +01:00
"outputs": [],
2024-02-05 22:04:02 +01:00
"source": [
"a"
]
},
{
"cell_type": "code",
"execution_count": 16,
2024-02-10 22:46:56 +01:00
"id": "1268ad5a",
2024-02-05 22:04:02 +01:00
"metadata": {},
"outputs": [],
"source": [
"a = pd.DataFrame(df1_customersplus.isna().sum()/len(df1_customersplus)*100)"
]
},
{
"cell_type": "code",
"execution_count": 40,
2024-02-10 22:46:56 +01:00
"id": "bd41dc80",
2024-02-05 22:04:02 +01:00
"metadata": {},
2024-02-10 22:46:56 +01:00
"outputs": [],
2024-02-05 22:04:02 +01:00
"source": [
"# Selection des variables\n",
"df1_customersplus_clean = df1_customersplus.copy()\n",
"\n",
"cleaning_date(df1_customersplus_clean, 'first_buying_date')\n",
"cleaning_date(df1_customersplus_clean, 'last_visiting_date')\n",
"\n",
"df1_customersplus_clean.drop(['lastname', 'firstname', 'email', 'civility', 'note', 'created_at', 'updated_at', 'deleted_at', 'extra', 'reference', 'extra_field', 'identifier', 'need_reload', 'preferred_category', 'preferred_supplier', 'preferred_formula', 'zipcode', 'last_visiting_date'], axis = 1, inplace=True)\n",
"df1_customersplus_clean.rename(columns = {'id' : 'customer_id'}, inplace = True)\n",
"\n"
]
},
{
"cell_type": "markdown",
2024-02-10 22:46:56 +01:00
"id": "64d0f76b",
2024-02-04 16:02:01 +01:00
"metadata": {
"jp-MarkdownHeadingCollapsed": true
},
"source": [
"## tickets.csv"
]
},
{
"cell_type": "code",
"execution_count": 6,
2024-02-10 22:46:56 +01:00
"id": "7e683711",
"metadata": {},
2024-02-10 22:46:56 +01:00
"outputs": [],
"source": [
"df1_tickets"
]
},
{
"cell_type": "code",
"execution_count": 7,
2024-02-10 22:46:56 +01:00
"id": "e7b9a52e",
"metadata": {},
2024-02-10 22:46:56 +01:00
"outputs": [],
"source": [
"df1_tickets.info()"
]
},
{
"cell_type": "code",
"execution_count": 8,
2024-02-10 22:46:56 +01:00
"id": "568280e8",
"metadata": {},
2024-02-10 22:46:56 +01:00
"outputs": [],
"source": [
"df1_tickets.isna().sum()/len(df1_tickets)*100"
]
},
{
"cell_type": "code",
"execution_count": 9,
2024-02-10 22:46:56 +01:00
"id": "29ecec90",
"metadata": {},
2024-02-10 22:46:56 +01:00
"outputs": [],
"source": [
"# Selection des variables\n",
2024-02-05 22:04:02 +01:00
"df1_tickets_clean = df1_tickets.drop(['lastname', 'firstname', 'email', 'created_at', 'updated_at', 'extra', 'reference', 'extra_field', 'identifier', 'need_reload', 'preferred_category', 'preferred_supplier', 'preferred_formula', 'zipcode'], axis = 1)\n",
"df1_tickets_clean.rename(columns = {'id' : 'customer_id'}, inplace = True)"
]
},
2024-01-13 10:38:10 +01:00
{
"cell_type": "markdown",
2024-02-10 22:46:56 +01:00
"id": "22bb5de4",
2024-02-10 13:23:44 +01:00
"metadata": {
"jp-MarkdownHeadingCollapsed": true
},
2024-01-13 10:38:10 +01:00
"source": [
"## suppliers.csv"
]
},
{
"cell_type": "code",
"execution_count": 10,
2024-02-10 22:46:56 +01:00
"id": "6a9a91f4",
2024-01-13 10:38:10 +01:00
"metadata": {},
2024-02-10 22:46:56 +01:00
"outputs": [],
2024-01-13 10:38:10 +01:00
"source": [
"df1_suppliers"
]
},
{
"cell_type": "code",
"execution_count": 11,
2024-02-10 22:46:56 +01:00
"id": "bab4758a",
2024-01-13 10:38:10 +01:00
"metadata": {},
2024-02-10 22:46:56 +01:00
"outputs": [],
2024-01-13 10:38:10 +01:00
"source": [
"df1_suppliers.info()"
]
},
{
"cell_type": "code",
"execution_count": 12,
2024-02-10 22:46:56 +01:00
"id": "b5fff251",
2024-01-13 10:38:10 +01:00
"metadata": {},
2024-02-10 22:46:56 +01:00
"outputs": [],
"source": [
"df1_suppliers.isna().sum()/len(df1_suppliers)*100"
2024-01-13 10:38:10 +01:00
]
},
2024-01-13 14:14:11 +01:00
{
"cell_type": "code",
"execution_count": 13,
2024-02-10 22:46:56 +01:00
"id": "8b09e2a3",
2024-01-13 14:14:11 +01:00
"metadata": {},
2024-02-10 22:46:56 +01:00
"outputs": [],
"source": [
"# Selection des variables\n",
"df1_suppliers_clean = df1_suppliers[['id', 'name']].copy()\n",
"df1_suppliers_clean.rename(columns = {'name' : 'supplier_name'}, inplace = True)"
2024-01-13 10:38:10 +01:00
]
},
{
"cell_type": "code",
"execution_count": 14,
2024-02-10 22:46:56 +01:00
"id": "ecee7cdc",
"metadata": {},
2024-02-10 22:46:56 +01:00
"outputs": [],
"source": [
"df1_suppliers_clean"
]
},
{
"cell_type": "markdown",
2024-02-10 22:46:56 +01:00
"id": "c8e6e69b",
2024-02-04 16:02:01 +01:00
"metadata": {
"jp-MarkdownHeadingCollapsed": true
},
"source": [
"## type_ofs.csv"
]
},
{
"cell_type": "code",
"execution_count": 15,
2024-02-10 22:46:56 +01:00
"id": "1a6cff1f",
"metadata": {},
2024-02-10 22:46:56 +01:00
"outputs": [],
"source": [
"df1_type_ofs"
]
},
{
"cell_type": "code",
"execution_count": 16,
2024-02-10 22:46:56 +01:00
"id": "93630b41",
"metadata": {},
2024-02-10 22:46:56 +01:00
"outputs": [],
"source": [
"df1_type_ofs.info()"
]
},
{
"cell_type": "code",
"execution_count": 17,
2024-02-10 22:46:56 +01:00
"id": "4f94481a",
"metadata": {},
2024-02-10 22:46:56 +01:00
"outputs": [],
"source": [
"# Selection des variables\n",
"df1_type_ofs_clean = df1_type_ofs[['id', 'name', 'children']].copy()\n",
"df1_type_ofs_clean.rename(columns = {'name' : 'type_of_ticket_name'}, inplace = True)"
]
},
{
"cell_type": "markdown",
2024-02-10 22:46:56 +01:00
"id": "1b2811e2",
2024-02-04 16:02:01 +01:00
"metadata": {
"jp-MarkdownHeadingCollapsed": true
},
"source": [
"## purchases.csv"
]
},
{
"cell_type": "code",
"execution_count": 18,
2024-02-10 22:46:56 +01:00
"id": "2455d2e1",
"metadata": {
"scrolled": true
},
2024-02-10 22:46:56 +01:00
"outputs": [],
"source": [
"df1_purchases"
]
},
{
"cell_type": "code",
"execution_count": 19,
2024-02-10 22:46:56 +01:00
"id": "5f9a159d",
"metadata": {},
2024-02-10 22:46:56 +01:00
"outputs": [],
"source": [
"df1_purchases.info()"
]
},
{
"cell_type": "code",
"execution_count": 20,
2024-02-10 22:46:56 +01:00
"id": "db201bf7",
"metadata": {},
"outputs": [],
"source": [
"# Nettoyage purchase_date\n",
"df1_purchases['purchase_date'] = pd.to_datetime(df1_purchases['purchase_date'], utc = True, format = 'ISO8601')"
]
},
{
"cell_type": "code",
"execution_count": 21,
2024-02-10 22:46:56 +01:00
"id": "bd436fca",
"metadata": {},
2024-02-10 22:46:56 +01:00
"outputs": [],
"source": [
"df1_purchases.info()"
]
},
{
"cell_type": "code",
"execution_count": 22,
2024-02-10 22:46:56 +01:00
"id": "83435862",
"metadata": {},
"outputs": [],
"source": [
"# Selection des variables\n",
"df1_purchases_clean = df1_purchases[['id', 'purchase_date', 'customer_id']]"
]
},
{
"cell_type": "markdown",
2024-02-10 22:46:56 +01:00
"id": "f210e730",
2024-02-10 13:23:44 +01:00
"metadata": {
"jp-MarkdownHeadingCollapsed": true
},
"source": [
"## Fusion de l'ensemble des données billétiques"
]
},
{
"cell_type": "code",
"execution_count": 23,
2024-02-10 22:46:56 +01:00
"id": "1f8b3aa7",
"metadata": {},
"outputs": [],
"source": [
"# Fusion avec fournisseurs\n",
"df1_ticket_information = pd.merge(df1_tickets_clean, df1_suppliers_clean, left_on = 'supplier_id', right_on = 'id', how = 'inner')\n",
"df1_ticket_information.drop(['supplier_id', 'id'], axis = 1, inplace=True)\n",
"\n",
"# Fusion avec type de tickets\n",
"df1_ticket_information = pd.merge(df1_ticket_information, df1_type_ofs_clean, left_on = 'type_of', right_on = 'id', how = 'inner')\n",
"df1_ticket_information.drop(['type_of', 'id'], axis = 1, inplace=True)\n",
"\n",
"# Fusion avec achats\n",
"df1_ticket_information = pd.merge(df1_ticket_information, df1_purchases_clean, left_on = 'purchase_id', right_on = 'id', how = 'inner')\n",
"df1_ticket_information.drop(['purchase_id', 'id'], axis = 1, inplace=True)"
]
},
{
"cell_type": "code",
"execution_count": 24,
2024-02-10 22:46:56 +01:00
"id": "83a4d021",
"metadata": {
"scrolled": true
},
2024-02-10 22:46:56 +01:00
"outputs": [],
"source": [
"df1_ticket_information"
]
},
{
"cell_type": "markdown",
2024-02-10 22:46:56 +01:00
"id": "56e6ebd1",
2024-02-10 13:23:44 +01:00
"metadata": {
"jp-MarkdownHeadingCollapsed": true
},
"source": [
"# Utilisation de fonctions"
]
},
{
"cell_type": "code",
2024-02-04 16:02:01 +01:00
"execution_count": 51,
2024-02-10 22:46:56 +01:00
"id": "88fcde4b",
"metadata": {},
2024-02-10 22:46:56 +01:00
"outputs": [],
"source": [
2024-02-04 16:02:01 +01:00
"# Créer un DataFrame exemple\n",
"df_not_clean = df1_campaign_stats[['opened_at']].head(20)\n",
"\n",
"# Appliquer la fonction pour nettoyer la colonne 'purchase_date' de manière vectorisée\n",
"df_clean = cleaning_date(df_not_clean, 'opened_at')\n",
"df_clean.rename(columns = {'opened_at' : 'opened_at_clean'}, inplace = True)\n",
"\n",
"test = pd.concat([df1_campaign_stats[['opened_at']].head(20), df_clean], axis=1)\n",
"\n",
"test.info()"
]
},
{
"cell_type": "markdown",
2024-02-10 22:46:56 +01:00
"id": "818f69db",
2024-02-04 16:02:01 +01:00
"metadata": {},
"source": [
"## Nettoyage, selection et fusion"
]
},
{
"cell_type": "code",
2024-02-04 16:02:01 +01:00
"execution_count": 23,
2024-02-10 22:46:56 +01:00
"id": "c9654eda",
2024-02-04 16:02:01 +01:00
"metadata": {},
2024-02-10 22:46:56 +01:00
"outputs": [],
"source": [
"df1_ticket_information"
]
},
2024-02-04 16:02:01 +01:00
{
"cell_type": "code",
"execution_count": 14,
2024-02-10 22:46:56 +01:00
"id": "7f2b620c",
2024-02-04 16:02:01 +01:00
"metadata": {},
2024-02-10 22:46:56 +01:00
"outputs": [],
2024-02-04 16:02:01 +01:00
"source": [
"df1_ticket_information.info()"
]
},
{
"cell_type": "markdown",
2024-02-10 22:46:56 +01:00
"id": "637bdb72",
"metadata": {},
"source": [
2024-02-04 16:02:01 +01:00
"# Customer information"
]
},
{
2024-02-04 16:02:01 +01:00
"cell_type": "markdown",
2024-02-10 22:46:56 +01:00
"id": "14c52894",
2024-02-10 13:23:44 +01:00
"metadata": {
"jp-MarkdownHeadingCollapsed": true
},
"source": [
2024-02-04 16:02:01 +01:00
"## Target area"
]
},
{
"cell_type": "code",
2024-02-07 23:28:55 +01:00
"execution_count": 8,
2024-02-10 22:46:56 +01:00
"id": "d83abfbf",
"metadata": {},
2024-02-04 16:02:01 +01:00
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
2024-02-07 23:28:55 +01:00
"/tmp/ipykernel_15285/2625134041.py:3: SettingWithCopyWarning: \n",
2024-02-04 16:02:01 +01:00
"A value is trying to be set on a copy of a slice from a DataFrame\n",
"\n",
"See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n",
" df1_targets_clean.rename(columns = {'id' : 'target_id' , 'name' : 'target_name'}, inplace = True)\n"
]
}
],
"source": [
2024-02-04 16:02:01 +01:00
"# Target.csv cleaning\n",
"df1_targets_clean = df1_targets[[\"id\", \"target_type_id\", \"name\"]].copy()\n",
"df1_targets_clean.rename(columns = {'id' : 'target_id' , 'name' : 'target_name'}, inplace = True)\n",
"\n",
"# target_type cleaning\n",
"df1_target_types_clean = df1_target_types[[\"id\",\"is_import\",\"name\"]].add_prefix(\"target_type_\")\n",
"\n",
"#customer_target_mappings cleaning\n",
"df1_customer_target_mappings_clean = df1_customer_target_mappings[[\"id\", \"customer_id\", \"target_id\"]]\n",
"\n",
"# Merge target et target_type\n",
"df1_targets_full = pd.merge(df1_targets_clean, df1_target_types_clean, left_on='target_type_id', right_on='target_type_id', how='inner')\n",
"df1_targets_full.drop(['target_type_id'], axis = 1, inplace=True)\n",
"\n",
"# Merge\n",
"df1_targets_full = pd.merge(df1_customer_target_mappings_clean, df1_targets_full, left_on='target_id', right_on='target_id', how='inner')\n",
"df1_targets_full.drop(['target_id'], axis = 1, inplace=True)"
]
},
{
"cell_type": "code",
2024-02-04 16:02:01 +01:00
"execution_count": 62,
2024-02-10 22:46:56 +01:00
"id": "90d71b2c",
"metadata": {},
2024-02-10 22:46:56 +01:00
"outputs": [],
2024-02-04 16:02:01 +01:00
"source": [
"df1_targets_test = df1_targets_full[['id', 'customer_id']].groupby(['customer_id']).count()\n",
"len(df1_targets_test[df1_targets_test['id'] > 1]) / len(df1_targets_test)\n",
"\n",
"# 99,6% des 151 000 client visés sont catégorisés plusieurs fois et en moyenne 5 fois... \n",
"df1_targets_test.mean()\n"
]
},
{
"cell_type": "code",
2024-02-07 23:28:55 +01:00
"execution_count": 10,
2024-02-10 22:46:56 +01:00
"id": "2301de1e",
2024-02-07 23:28:55 +01:00
"metadata": {},
2024-02-04 16:02:01 +01:00
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>id</th>\n",
" <th>customer_id</th>\n",
" <th>target_name</th>\n",
" <th>target_type_is_import</th>\n",
" <th>target_type_name</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
2024-02-07 23:28:55 +01:00
" <th>0</th>\n",
" <td>1184824</td>\n",
" <td>645400</td>\n",
" <td>DDCP PROMO Réseau livres</td>\n",
2024-02-04 16:02:01 +01:00
" <td>False</td>\n",
" <td>manual_static_filter</td>\n",
" </tr>\n",
" <tr>\n",
2024-02-07 23:28:55 +01:00
" <th>1</th>\n",
" <td>210571</td>\n",
" <td>2412</td>\n",
" <td>DDCP PROMO Réseau livres</td>\n",
2024-02-04 16:02:01 +01:00
" <td>False</td>\n",
" <td>manual_static_filter</td>\n",
" </tr>\n",
" <tr>\n",
2024-02-07 23:28:55 +01:00
" <th>2</th>\n",
" <td>210572</td>\n",
" <td>4536</td>\n",
" <td>DDCP PROMO Réseau livres</td>\n",
2024-02-04 16:02:01 +01:00
" <td>False</td>\n",
" <td>manual_static_filter</td>\n",
" </tr>\n",
" <tr>\n",
2024-02-07 23:28:55 +01:00
" <th>3</th>\n",
" <td>210573</td>\n",
" <td>6736</td>\n",
" <td>DDCP PROMO Réseau livres</td>\n",
2024-02-04 16:02:01 +01:00
" <td>False</td>\n",
" <td>manual_static_filter</td>\n",
" </tr>\n",
" <tr>\n",
2024-02-07 23:28:55 +01:00
" <th>4</th>\n",
" <td>210574</td>\n",
" <td>38210</td>\n",
" <td>DDCP PROMO Réseau livres</td>\n",
2024-02-04 16:02:01 +01:00
" <td>False</td>\n",
" <td>manual_static_filter</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
2024-02-07 23:28:55 +01:00
" id customer_id target_name target_type_is_import \\\n",
"0 1184824 645400 DDCP PROMO Réseau livres False \n",
"1 210571 2412 DDCP PROMO Réseau livres False \n",
"2 210572 4536 DDCP PROMO Réseau livres False \n",
"3 210573 6736 DDCP PROMO Réseau livres False \n",
"4 210574 38210 DDCP PROMO Réseau livres False \n",
2024-02-04 16:02:01 +01:00
"\n",
2024-02-07 23:28:55 +01:00
" target_type_name \n",
"0 manual_static_filter \n",
"1 manual_static_filter \n",
"2 manual_static_filter \n",
"3 manual_static_filter \n",
"4 manual_static_filter "
2024-02-04 16:02:01 +01:00
]
},
2024-02-07 23:28:55 +01:00
"execution_count": 10,
2024-02-04 16:02:01 +01:00
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
2024-02-07 23:28:55 +01:00
"df1_targets_full.head()"
2024-02-04 16:02:01 +01:00
]
},
{
2024-02-07 23:28:55 +01:00
"cell_type": "code",
"execution_count": 14,
2024-02-10 22:46:56 +01:00
"id": "75fbc2f7",
2024-02-07 23:28:55 +01:00
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"[nltk_data] Downloading package punkt to /home/onyxia/nltk_data...\n",
"[nltk_data] Package punkt is already up-to-date!\n",
"[nltk_data] Downloading package stopwords to /home/onyxia/nltk_data...\n",
"[nltk_data] Package stopwords is already up-to-date!\n",
"[nltk_data] Downloading package wordnet to /home/onyxia/nltk_data...\n",
"[nltk_data] Package wordnet is already up-to-date!\n"
]
},
{
"data": {
"text/plain": [
"True"
]
},
"execution_count": 14,
"metadata": {},
"output_type": "execute_result"
}
],
2024-02-04 16:02:01 +01:00
"source": [
2024-02-07 23:28:55 +01:00
"# Catégorisation des target_name\n",
"import pandas as pd\n",
"import nltk\n",
"from nltk.tokenize import word_tokenize\n",
"from nltk.corpus import stopwords\n",
"from nltk.stem import WordNetLemmatizer\n",
"from nltk.probability import FreqDist\n",
"\n",
"# Téléchargement des ressources nécessaires\n",
"nltk.download('punkt')\n",
"nltk.download('stopwords')\n",
"nltk.download('wordnet')\n",
"\n"
2024-02-04 16:02:01 +01:00
]
},
{
"cell_type": "code",
2024-02-07 23:28:55 +01:00
"execution_count": 19,
2024-02-10 22:46:56 +01:00
"id": "55cddf92",
2024-02-04 16:02:01 +01:00
"metadata": {},
"outputs": [
{
2024-02-07 23:28:55 +01:00
"name": "stdout",
2024-02-04 16:02:01 +01:00
"output_type": "stream",
"text": [
2024-02-07 23:28:55 +01:00
"Mots les plus fréquents:\n",
"consentement: 550777\n",
"optin: 463579\n",
"jeune: 155103\n",
"public: 155103\n",
"mediation: 150001\n"
2024-02-04 16:02:01 +01:00
]
}
],
"source": [
2024-02-07 23:28:55 +01:00
"# Définition des fonctions de tokenisation, suppression des mots vides et lemmatisation\n",
"def preprocess_text(texte):\n",
" # Concaténation des éléments de la liste en une seule chaîne de caractères\n",
" texte_concat = ' '.join(texte)\n",
" \n",
" # Tokenisation des mots\n",
" tokens = word_tokenize(texte_concat.lower())\n",
" \n",
" # Suppression des mots vides (stopwords)\n",
" stop_words = set(stopwords.words('french'))\n",
" filtered_tokens = [word for word in tokens if word not in stop_words]\n",
" \n",
" # Lemmatisation des mots\n",
" lemmatizer = WordNetLemmatizer()\n",
" lemmatized_tokens = [lemmatizer.lemmatize(word) for word in filtered_tokens]\n",
" \n",
" return lemmatized_tokens\n",
2024-02-04 16:02:01 +01:00
"\n",
"\n",
2024-02-07 23:28:55 +01:00
"# Appliquer le prétraitement à la colonne de texte\n",
"df1_targets_full['target_name_tokened'] = df1_targets_full['target_name'].apply(preprocess_text)\n",
"\n",
"# Concaténer les listes de mots pour obtenir une liste de tous les mots dans le corpus\n",
"all_words = [word for tokens in df1_targets_full['target_name_tokened'] for word in tokens]\n",
"\n",
"# Calculer la fréquence des mots\n",
"freq_dist = FreqDist(all_words)\n",
"\n",
"\n"
2024-02-04 16:02:01 +01:00
]
},
{
"cell_type": "code",
2024-02-07 23:28:55 +01:00
"execution_count": 22,
2024-02-10 22:46:56 +01:00
"id": "7fd98a85",
2024-02-04 16:02:01 +01:00
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
2024-02-07 23:28:55 +01:00
"Mots les plus fréquents:\n",
"consentement: 550777\n",
"optin: 463579\n",
"jeune: 155103\n",
"public: 155103\n",
"mediation: 150001\n",
"specialisee: 150001\n",
"b2c: 143432\n",
"optout: 97683\n",
"newsletter: 56022\n",
"(: 46084\n",
"): 46084\n",
"inscrits: 42296\n",
"nl: 42294\n",
"générale: 41037\n",
"generale: 40950\n"
2024-02-04 16:02:01 +01:00
]
}
],
"source": [
2024-02-07 23:28:55 +01:00
"# Affichage des mots les plus fréquents\n",
"print(\"Mots les plus fréquents:\")\n",
"for mot, freq in freq_dist.most_common(15):\n",
" print(f\"{mot}: {freq}\")"
2024-02-04 16:02:01 +01:00
]
},
{
"cell_type": "code",
2024-02-07 23:28:55 +01:00
"execution_count": 18,
2024-02-10 22:46:56 +01:00
"id": "cf94bb1d",
2024-02-04 16:02:01 +01:00
"metadata": {},
"outputs": [
{
2024-02-07 23:28:55 +01:00
"name": "stdout",
"output_type": "stream",
"text": [
" texte \\\n",
"0 Le chat noir mange une souris. \n",
"1 Le chien blanc aboie. \n",
"\n",
" texte_preprocessed \n",
"0 [e, h, a, o, i, r, a, g, e, u, e, o, u, r, i, .] \n",
"1 [e, h, i, e, b, a, a, b, o, i, e, .] \n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"[nltk_data] Downloading package punkt to /home/onyxia/nltk_data...\n",
"[nltk_data] Package punkt is already up-to-date!\n",
"[nltk_data] Downloading package stopwords to /home/onyxia/nltk_data...\n",
"[nltk_data] Package stopwords is already up-to-date!\n",
"[nltk_data] Downloading package wordnet to /home/onyxia/nltk_data...\n",
"[nltk_data] Package wordnet is already up-to-date!\n"
]
2024-02-04 16:02:01 +01:00
}
],
2024-02-07 23:28:55 +01:00
"source": [
"import pandas as pd\n",
"import nltk\n",
"from nltk.tokenize import word_tokenize\n",
"from nltk.corpus import stopwords\n",
"from nltk.stem import WordNetLemmatizer\n",
"\n",
"# Téléchargement des ressources nécessaires\n",
"nltk.download('punkt')\n",
"nltk.download('stopwords')\n",
"nltk.download('wordnet')\n",
"\n",
"# Création de la DataFrame d'exemple\n",
"data = {'texte': [\"Le chat noir mange une souris.\", \"Le chien blanc aboie.\"]}\n",
"df = pd.DataFrame(data)\n",
"\n",
"# Fonction pour prétraiter le texte\n",
"def preprocess_text(texte):\n",
" # Concaténation des éléments de la liste en une seule chaîne de caractères\n",
" texte_concat = ' '.join(texte)\n",
" \n",
" # Tokenisation des mots\n",
" tokens = word_tokenize(texte_concat.lower())\n",
" \n",
" # Suppression des mots vides (stopwords)\n",
" stop_words = set(stopwords.words('french'))\n",
" filtered_tokens = [word for word in tokens if word not in stop_words]\n",
" \n",
" # Lemmatisation des mots\n",
" lemmatizer = WordNetLemmatizer()\n",
" lemmatized_tokens = [lemmatizer.lemmatize(word) for word in filtered_tokens]\n",
" \n",
" return lemmatized_tokens\n",
"\n",
2024-02-10 22:46:56 +01:00
"# Appliquer la fonction de prétraitement à la colonne de texte\n",
"df['texte_preprocessed'] = df['texte'].apply(preprocess_text)\n",
"\n",
"# Afficher le résultat\n",
"print(df)\n"
]
},
{
"cell_type": "markdown",
"id": "711d3884",
"metadata": {
"jp-MarkdownHeadingCollapsed": true
},
"source": [
"## Campaign area"
]
},
{
"cell_type": "code",
"execution_count": 52,
"id": "c25b5295",
"metadata": {},
"outputs": [],
"source": [
"# campaign_stats cleaning \n",
"df1_campaign_stats_clean = df1_campaign_stats[[\"id\", \"campaign_id\", \"customer_id\", \"opened_at\", \"sent_at\", \"delivered_at\"]].copy()\n",
"cleaning_date(df1_campaign_stats_clean, 'opened_at')\n",
"cleaning_date(df1_campaign_stats_clean, 'sent_at')\n",
"cleaning_date(df1_campaign_stats_clean, 'delivered_at')\n",
"\n",
"# campaigns cleaning\n",
"df1_campaigns_clean = df1_campaigns[[\"id\", \"name\", \"service_id\", \"sent_at\"]].add_prefix(\"campaign_\")\n",
"cleaning_date(df1_campaigns_clean, 'campaign_sent_at')\n",
"\n",
"# Merge \n",
"df1_campaigns_full = pd.merge(df1_campaign_stats_clean, df1_campaigns_clean, on = \"campaign_id\", how = \"left\")\n",
"df1_campaigns_full.drop(['campaign_id'], axis = 1, inplace=True)"
]
},
{
"cell_type": "code",
"execution_count": 53,
"id": "2a3de6a5",
"metadata": {},
"outputs": [],
"source": [
"df1_campaigns_full.info()"
]
},
{
"cell_type": "code",
"execution_count": 56,
"id": "3fc1f446",
"metadata": {},
"outputs": [],
2024-02-04 16:02:01 +01:00
"source": [
"df1_campaigns_full"
]
},
{
"cell_type": "markdown",
2024-02-10 22:46:56 +01:00
"id": "20e69ee3",
2024-02-04 16:02:01 +01:00
"metadata": {
"jp-MarkdownHeadingCollapsed": true
},
"source": [
"## Link area"
]
},
{
"cell_type": "code",
"execution_count": 37,
2024-02-10 22:46:56 +01:00
"id": "d9cbdbce",
2024-02-04 16:02:01 +01:00
"metadata": {},
2024-02-10 22:46:56 +01:00
"outputs": [],
2024-02-04 16:02:01 +01:00
"source": [
"df1_campaigns"
]
},
{
"cell_type": "code",
"execution_count": 38,
2024-02-10 22:46:56 +01:00
"id": "c07459f0",
2024-02-04 16:02:01 +01:00
"metadata": {},
2024-02-10 22:46:56 +01:00
"outputs": [],
2024-02-04 16:02:01 +01:00
"source": [
"df1_link_stats"
]
},
{
"cell_type": "markdown",
2024-02-10 22:46:56 +01:00
"id": "80ae4c42",
2024-02-10 13:23:44 +01:00
"metadata": {},
2024-02-04 16:02:01 +01:00
"source": [
"## Exploration variables"
]
},
{
"cell_type": "code",
2024-02-10 13:23:44 +01:00
"execution_count": 7,
2024-02-10 22:46:56 +01:00
"id": "b50b8f95",
2024-02-04 16:02:01 +01:00
"metadata": {},
"outputs": [],
"source": [
"# Fonction d'exploration pour suppliers.csv = label itr et commission inconnues\n",
"def suppliers_exploration(suppliers = None) : \n",
" \n",
" # Taux de NaN pour ces colonnes\n",
" label_na = suppliers['label'].isna().sum()/len(suppliers)*100\n",
" itr_na = suppliers['itr'].isna().sum()/len(suppliers)*100\n",
" commission_na = suppliers['commission'].isna().sum()/len(suppliers)*100\n",
"\n",
" suppliers_desc = pd.DataFrame({'nb_suppliers' : [suppliers['name'].nunique()],\n",
" 'label_na' : [label_na],\n",
" 'itr_na' : [itr_na],\n",
" 'commission_na' : [commission_na]})\n",
"\n",
" return suppliers_desc"
]
},
{
"cell_type": "code",
2024-02-10 13:23:44 +01:00
"execution_count": 8,
2024-02-10 22:46:56 +01:00
"id": "7e292935",
2024-02-04 16:02:01 +01:00
"metadata": {},
"outputs": [],
"source": [
"df1_suppliers_desc = suppliers_exploration(suppliers = df1_suppliers)"
]
},
{
"cell_type": "code",
2024-02-10 13:23:44 +01:00
"execution_count": 9,
2024-02-10 22:46:56 +01:00
"id": "05b6f2b0",
2024-02-04 16:02:01 +01:00
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>nb_suppliers</th>\n",
" <th>label_na</th>\n",
" <th>itr_na</th>\n",
" <th>commission_na</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>9</td>\n",
" <td>100.0</td>\n",
" <td>100.0</td>\n",
" <td>100.0</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" nb_suppliers label_na itr_na commission_na\n",
"0 9 100.0 100.0 100.0"
]
},
2024-02-10 13:23:44 +01:00
"execution_count": 9,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df1_suppliers_desc"
]
},
{
"cell_type": "code",
2024-02-10 13:23:44 +01:00
"execution_count": 10,
2024-02-10 22:46:56 +01:00
"id": "c9324d80",
"metadata": {},
"outputs": [],
"source": [
"BUCKET = \"bdc2324-data\"\n",
"liste_folders = fs.ls(BUCKET)\n",
"\n",
"liste_files = []\n",
"for company_folder in liste_folders : \n",
" liste_files.extend(fs.ls(company_folder))"
]
},
{
"cell_type": "code",
2024-02-10 13:23:44 +01:00
"execution_count": 11,
2024-02-10 22:46:56 +01:00
"id": "10304058",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"['bdc2324-data/1/1suppliers.csv', 'bdc2324-data/10/10suppliers.csv', 'bdc2324-data/101/101suppliers.csv', 'bdc2324-data/11/11suppliers.csv', 'bdc2324-data/12/12suppliers.csv', 'bdc2324-data/13/13suppliers.csv', 'bdc2324-data/14/14suppliers.csv', 'bdc2324-data/2/2suppliers.csv', 'bdc2324-data/3/3suppliers.csv', 'bdc2324-data/4/4suppliers.csv', 'bdc2324-data/5/5suppliers.csv', 'bdc2324-data/6/6suppliers.csv', 'bdc2324-data/7/7suppliers.csv', 'bdc2324-data/8/8suppliers.csv', 'bdc2324-data/9/9suppliers.csv']\n"
]
}
],
"source": [
"liste_database_select = ['suppliers']\n",
"\n",
"# Filtrer la liste pour les éléments contenant au moins un élément de la liste à tester\n",
"liste_suppliers = [element for element in liste_files if any(element_part in element for element_part in liste_database_select)]\n",
"\n",
"# Afficher le résultat\n",
"print(liste_suppliers)"
]
},
{
"cell_type": "code",
2024-02-10 13:23:44 +01:00
"execution_count": 32,
2024-02-10 22:46:56 +01:00
"id": "ffa423e5",
"metadata": {},
"outputs": [],
"source": [
"# loop to create dataframes from file 2\n",
"def database_loading(database_name = None):\n",
" files_path = database_name\n",
" \n",
2024-02-10 13:23:44 +01:00
" client_number = files_path.split(\"/\")[1]\n",
" df_prefix = \"df\" + str(client_number) + \"_\"\n",
" \n",
2024-02-10 13:23:44 +01:00
" current_path = files_path\n",
" with fs.open(current_path, mode=\"rb\") as file_in:\n",
" df = pd.read_csv(file_in)\n",
"\n",
" return df, client_number"
]
},
{
"cell_type": "code",
"execution_count": null,
2024-02-10 22:46:56 +01:00
"id": "70bdc88d",
2024-02-10 13:23:44 +01:00
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": 45,
2024-02-10 22:46:56 +01:00
"id": "6a0f567d",
2024-02-10 13:23:44 +01:00
"metadata": {},
"outputs": [],
"source": [
"df_all = pd.DataFrame()\n",
"\n",
"for link in liste_suppliers:\n",
" \n",
" df_supplier, tenant_id = database_loading(link)\n",
" \n",
" df_supplier['tenant_id'] = int(tenant_id)\n",
"\n",
2024-02-10 13:23:44 +01:00
" df_all = pd.concat([df_all, df_supplier], axis = 0)\n",
" "
]
2024-02-10 13:23:44 +01:00
},
{
"cell_type": "code",
"execution_count": 63,
2024-02-10 22:46:56 +01:00
"id": "1522d8cd",
2024-02-10 13:23:44 +01:00
"metadata": {},
"outputs": [],
"source": [
"# df_all[df_all['tenant_id'] == 101]['name'].unique()"
]
},
{
"cell_type": "code",
"execution_count": 66,
2024-02-10 22:46:56 +01:00
"id": "b0e42a61",
2024-02-10 13:23:44 +01:00
"metadata": {},
"outputs": [],
"source": [
"liste_mots = ['en ligne', 'internet', 'web', 'net', 'vad', 'online'] \n",
"# vad = vente à distance\n",
"df_all['name'] = df_all['name'].fillna('')\n",
"\n",
"df_all['canal_vente_internet'] = df_all['name'].str.contains('|'.join(liste_mots), case=False).astype(int)\n"
]
},
{
"cell_type": "code",
"execution_count": 68,
2024-02-10 22:46:56 +01:00
"id": "d299ae91",
2024-02-10 13:23:44 +01:00
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"tenant_id\n",
"1 1\n",
"2 1\n",
"3 1\n",
"4 1\n",
"5 1\n",
"6 1\n",
"7 1\n",
"8 1\n",
"9 1\n",
"10 1\n",
"11 1\n",
"12 1\n",
"13 1\n",
"14 1\n",
"101 1\n",
"Name: canal_vente_internet, dtype: int64"
]
},
"execution_count": 68,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df_all.groupby('tenant_id')['canal_vente_internet'].max()"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.10.13"
}
},
"nbformat": 4,
"nbformat_minor": 5
}