From 5f621c23525765d8e7a6dc90d6ff2ebdab381b3e Mon Sep 17 00:00:00 2001
From: ajoubrel-ensae
Date: Sun, 11 Feb 2024 22:55:11 +0000
Subject: [PATCH] Add .py files for cleaning and merging

---
 0_Cleaning_and_merge.py           |  89 ++++++++++
 0_Cleaning_and_merge_functions.py | 261 ++++++++++++++++++++++++++++++
 0_KPI_functions.py                |  82 ++++++++++
 2_Regression_logistique.ipynb     |  33 ++++
 4 files changed, 465 insertions(+)
 create mode 100644 0_Cleaning_and_merge.py
 create mode 100644 0_Cleaning_and_merge_functions.py
 create mode 100644 0_KPI_functions.py
 create mode 100644 2_Regression_logistique.ipynb

diff --git a/0_Cleaning_and_merge.py b/0_Cleaning_and_merge.py
new file mode 100644
index 0000000..f461547
--- /dev/null
+++ b/0_Cleaning_and_merge.py
@@ -0,0 +1,89 @@
+# Business Data Challenge - Team 1
+
+import pandas as pd
+import numpy as np
+import os
+import s3fs
+import re
+import warnings
+
+# Import cleaning and merge functions
+exec(open('BDC-team-1/0_Cleaning_and_merge_functions.py').read())
+exec(open('BDC-team-1/0_KPI_functions.py').read())
+
+# Create filesystem object
+S3_ENDPOINT_URL = "https://" + os.environ["AWS_S3_ENDPOINT"]
+fs = s3fs.S3FileSystem(client_kwargs={'endpoint_url': S3_ENDPOINT_URL})
+
+# Ignore warnings
+warnings.filterwarnings('ignore')
+
+# Data loading
+BUCKET = "bdc2324-data/1"
+liste_database = fs.ls(BUCKET)
+
+# Loop to create one dataframe per file in the list
+client_number = liste_database[0].split("/")[1]
+df_prefix = "df" + str(client_number) + "_"
+
+for i in range(len(liste_database)):
+    current_path = liste_database[i]
+    with fs.open(current_path, mode="rb") as file_in:
+        df = pd.read_csv(file_in)
+        # The pattern of the name is df1xxx
+        nom_dataframe = df_prefix + re.search(r'\/(\d+)\/(\d+)([a-zA-Z_]+)\.csv$', current_path).group(3)
+        globals()[nom_dataframe] = df
+
+# Cleaning customerplus
+df1_customerplus_clean = preprocessing_customerplus(df1_customersplus)
+
+# Cleaning ticket area
+df1_ticket_information = preprocessing_tickets_area(tickets = df1_tickets, purchases = df1_purchases, suppliers = df1_suppliers, type_ofs = df1_type_ofs)
+
+# Cleaning target area
+df1_target_information = preprocessing_target_area(targets = df1_targets, target_types = df1_target_types, customer_target_mappings = df1_customer_target_mappings)
+
+# Cleaning campaign area
+df1_campaigns_information = preprocessing_campaigns_area(campaign_stats = df1_campaign_stats, campaigns = df1_campaigns)
+
+# Cleaning product area
+BUCKET = "bdc2324-data"
+directory_path = '1'
+
+products_theme = create_products_table()
+events_theme = create_events_table()
+representation_theme = create_representations_table()
+products_global = uniform_product_df()
+
+# Merge with product data
+df1_products_purchased = pd.merge(df1_ticket_information, products_global, left_on = 'product_id', right_on = 'id_products', how = 'inner')
+
+# Select the variables of interest
+df1_products_purchased_reduced = df1_products_purchased[['ticket_id', 'customer_id', 'purchase_id' ,'event_type_id', 'supplier_name', 'purchase_date', 'type_of_ticket_name', 'amount', 'children', 'is_full_price', 'name_event_types', 'name_facilities', 'name_categories', 'name_events', 'name_seasons']]
+
+# Merge everything and build the KPIs
+df1_campaigns_kpi = campaigns_kpi_function(campaigns_information = df1_campaigns_information)
+
+df1_tickets_kpi = tickets_kpi_function(tickets_information = df1_products_purchased_reduced)
+
+# Merge customer data with the campaign KPIs
+df1_customer = pd.merge(df1_customerplus_clean, df1_campaigns_kpi, on = 'customer_id', how = 'left')
+
+# Fill NaN values
+df1_customer[['nb_campaigns', 'nb_campaigns_opened']] = df1_customer[['nb_campaigns', 'nb_campaigns_opened']].fillna(0)
+
+# Merge with the purchase-behaviour KPIs
+df1_customer_product = pd.merge(df1_tickets_kpi, df1_customer, on = 'customer_id', how = 'outer')
+
+# Fill NaN values
+df1_customer_product[['nb_tickets', 'nb_purchases', 'total_amount', 'nb_suppliers', 'vente_internet_max', 'nb_tickets_internet']] = df1_customer_product[['nb_tickets', 'nb_purchases', 'total_amount', 'nb_suppliers', 'vente_internet_max', 'nb_tickets_internet']].fillna(0)
+
+## Export
+
+# Export to 'projet-bdc2324-team1'
+BUCKET_OUT = "projet-bdc2324-team1"
+FILE_KEY_OUT_S3 = "1_Output/Company 1 - Segmentation base.csv"
+FILE_PATH_OUT_S3 = BUCKET_OUT + "/" + FILE_KEY_OUT_S3
+
+with fs.open(FILE_PATH_OUT_S3, 'w') as file_out:
+    df1_customer_product.to_csv(file_out, index = False)
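
Note (illustrative aside, not part of the patch): the script above pulls its helpers in with exec(open(...).read()), likely because file names starting with a digit cannot be imported as modules, and it materialises each CSV as a dynamically named global via globals(). A minimal sketch of a more conventional layout, assuming the helper files were renamed to importable names such as cleaning_and_merge_functions.py and kpi_functions.py (hypothetical names) and that each helper module imports pandas/numpy itself:

    # Regular imports instead of exec(); the module names are hypothetical.
    from cleaning_and_merge_functions import (
        preprocessing_customerplus, preprocessing_tickets_area,
        preprocessing_target_area, preprocessing_campaigns_area,
        create_products_table, create_events_table,
        create_representations_table, uniform_product_df,
    )
    from kpi_functions import campaigns_kpi_function, tickets_kpi_function

    # Collecting the per-client CSVs in a dict instead of writing to globals()
    # keeps the loaded tables easy to enumerate and to pass around.
    dataframes = {}
    for current_path in liste_database:
        table_name = re.search(r'/(\d+)/(\d+)([a-zA-Z_]+)\.csv$', current_path).group(3)
        with fs.open(current_path, mode="rb") as file_in:
            dataframes[df_prefix + table_name] = pd.read_csv(file_in)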
diff --git a/0_Cleaning_and_merge_functions.py b/0_Cleaning_and_merge_functions.py
new file mode 100644
index 0000000..15a24dc
--- /dev/null
+++ b/0_Cleaning_and_merge_functions.py
@@ -0,0 +1,261 @@
+# Cleaning and merge functions
+
+# Cleaning function
+def cleaning_date(df, column_name):
+    """
+    Cleans the specified column of the DataFrame by converting its values to datetime with the ISO8601 format.
+
+    Parameters:
+    - df: DataFrame
+        The DataFrame containing the column to clean.
+    - column_name: str
+        The name of the column to clean.
+
+    Returns:
+    - DataFrame
+        The modified DataFrame with the cleaned column.
+    """
+    df[column_name] = pd.to_datetime(df[column_name], utc = True, format = 'ISO8601')
+    return df
+
+def preprocessing_customerplus(customerplus = None):
+
+    customerplus_copy = customerplus.copy()
+
+    # Convert to datetime format
+    cleaning_date(customerplus_copy, 'first_buying_date')
+    cleaning_date(customerplus_copy, 'last_visiting_date')
+
+    # Select the variables
+    customerplus_copy.drop(['lastname', 'firstname', 'email', 'civility', 'note', 'created_at', 'updated_at', 'deleted_at', 'extra', 'reference', 'extra_field', 'identifier', 'need_reload', 'preferred_category', 'preferred_supplier', 'preferred_formula', 'zipcode', 'last_visiting_date'], axis = 1, inplace=True)
+    customerplus_copy.rename(columns = {'id' : 'customer_id'}, inplace = True)
+
+    return customerplus_copy
+
+def preprocessing_tickets_area(tickets = None, purchases = None, suppliers = None, type_ofs = None):
+    # Tickets table
+    tickets = tickets[['id', 'purchase_id', 'product_id', 'is_from_subscription', 'type_of', 'supplier_id']]
+    tickets.rename(columns = {'id' : 'ticket_id'}, inplace = True)
+
+    # Suppliers table
+    suppliers = suppliers[['id', 'name']]
+    suppliers.rename(columns = {'name' : 'supplier_name'}, inplace = True)
+    suppliers['supplier_name'] = suppliers['supplier_name'].fillna('')
+
+    # Ticket types table
+    type_ofs = type_ofs[['id', 'name', 'children']]
+    type_ofs.rename(columns = {'name' : 'type_of_ticket_name'}, inplace = True)
+
+    # Purchases table
+    # Clean the purchase date
+    cleaning_date(purchases, 'purchase_date')
+    # Select the variables
+    purchases = purchases[['id', 'purchase_date', 'customer_id']]
+
+    # Merges
+    # Merge with suppliers
+    ticket_information = pd.merge(tickets, suppliers, left_on = 'supplier_id', right_on = 'id', how = 'inner')
+    ticket_information.drop(['supplier_id', 'id'], axis = 1, inplace=True)
+
+    # Merge with ticket types
+    ticket_information = pd.merge(ticket_information, type_ofs, left_on = 'type_of', right_on = 'id', how = 'inner')
+    ticket_information.drop(['type_of', 'id'], axis = 1, inplace=True)
+
+    # Merge with purchases
+    ticket_information = pd.merge(ticket_information, purchases, left_on = 'purchase_id', right_on = 'id', how = 'inner')
+    ticket_information.drop(['id'], axis = 1, inplace=True)
+
+    return ticket_information
+
+def preprocessing_target_area(targets = None, target_types = None, customer_target_mappings = None):
+    # Target.csv cleaning
+    targets = targets[["id", "target_type_id", "name"]]
+    targets.rename(columns = {'id' : 'target_id' , 'name' : 'target_name'}, inplace = True)
+
+    # target_type cleaning
+    target_types = target_types[["id","is_import","name"]].add_prefix("target_type_")
+
+    # customer_target_mappings cleaning
+    customer_target_mappings = customer_target_mappings[["id", "customer_id", "target_id"]]
+
+    # Merge target and target_type
+    targets_full = pd.merge(targets, target_types, left_on='target_type_id', right_on='target_type_id', how='inner')
+    targets_full.drop(['target_type_id'], axis = 1, inplace=True)
+
+    # Merge
+    targets_full = pd.merge(customer_target_mappings, targets_full, left_on='target_id', right_on='target_id', how='inner')
+    targets_full.drop(['target_id'], axis = 1, inplace=True)
+
+    return targets_full
+
+def preprocessing_campaigns_area(campaign_stats = None, campaigns = None):
+    # campaign_stats cleaning
+    campaign_stats = campaign_stats[["id", "campaign_id", "customer_id", "opened_at", "sent_at", "delivered_at"]]
+    cleaning_date(campaign_stats, 'opened_at')
+    cleaning_date(campaign_stats, 'sent_at')
+    cleaning_date(campaign_stats, 'delivered_at')
+
+    # campaigns cleaning
+    campaigns = campaigns[["id", "name", "service_id", "sent_at"]].add_prefix("campaign_")
+    cleaning_date(campaigns, 'campaign_sent_at')
+
+    # Merge
+    campaigns_full = pd.merge(campaign_stats, campaigns, on = "campaign_id", how = "left")
+    campaigns_full.drop(['campaign_id'], axis = 1, inplace=True)
+
+    return campaigns_full
+
+def display_databases(file_name):
+    """
+    This function returns the requested file from S3 storage as a DataFrame
+    """
+    file_path = BUCKET + "/" + directory_path + "/" + file_name
+    print("File path : ", file_path)
+    with fs.open(file_path, mode="rb") as file_in:
+        df = pd.read_csv(file_in, sep=",")
+
+    print("Shape : ", df.shape)
+    return df
+
+
+def remove_horodates(df):
+    """
+    This function removes timestamp columns such as created_at and updated_at
+    """
+    df = df.drop(columns = ["created_at", "updated_at"])
+    return df
+
+
+def order_columns_id(df):
+    """
+    This function puts all id columns at the beginning to make the dataset easier to read
+    """
+    substring = 'id'
+    id_columns = [col for col in df.columns if substring in col]
+    remaining_col = [col for col in df.columns if substring not in col]
+    new_order = id_columns + remaining_col
+    return df[new_order]
+
+
+def process_df_2(df):
+    """
+    This function organizes the dataframe
+    """
+    df = remove_horodates(df)
+    print("Number of columns : ", len(df.columns))
+    df = order_columns_id(df)
+    print("Columns : ", df.columns)
+    return df
+
+def load_dataset(name):
+    """
+    This function loads a csv file
+    """
+    df = display_databases(name)
+    df = process_df_2(df)
+    # drop na :
+    #df = df.dropna(axis=1, thresh=len(df))
+    # if identifier is in the table : delete it
+    if 'identifier' in df.columns:
+        df = df.drop(columns = 'identifier')
+    return df
+
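Aside (illustration only, not lines of the patch): order_columns_id above simply moves every column whose name contains the substring 'id' to the front while preserving relative order, which is what the loaders below rely on for readability. A toy example, assuming the function has been defined as above:

    import pandas as pd

    toy = pd.DataFrame({'name': ['a'], 'category_id': [3], 'amount': [10.0], 'id': [1]})
    print(order_columns_id(toy).columns.tolist())
    # ['category_id', 'id', 'name', 'amount']
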
+def create_products_table():
+    # First merge products and categories
+    print("first merge products and categories")
+    products = load_dataset("1products.csv")
+    categories = load_dataset("1categories.csv")
+    # Drop useless columns
+    products = products.drop(columns = ['apply_price', 'extra_field', 'amount_consumption'])
+    categories = categories.drop(columns = ['extra_field', 'quota'])
+
+    # Merge
+    products_theme = products.merge(categories, how = 'left', left_on = 'category_id',
+                                    right_on = 'id', suffixes=('_products', '_categories'))
+    products_theme = products_theme.rename(columns = {"name" : "name_categories"})
+
+    # Second merge products_theme and type of categories
+    print("Second merge products_theme and type of categories")
+    type_of_categories = load_dataset("1type_of_categories.csv")
+    type_of_categories = type_of_categories.drop(columns = 'id')
+    products_theme = products_theme.merge(type_of_categories, how = 'left', left_on = 'category_id',
+                                          right_on = 'category_id')
+
+    # Index cleaning
+    products_theme = products_theme.drop(columns = ['id_categories'])
+    products_theme = order_columns_id(products_theme)
+    return products_theme
+
+
+def create_events_table():
+    # First merge events and seasons
+    print("first merge events and seasons : ")
+    events = load_dataset("1events.csv")
+    seasons = load_dataset("1seasons.csv")
+
+    # Drop useless columns
+    events = events.drop(columns = ['manual_added', 'is_display'])
+    seasons = seasons.drop(columns = ['start_date_time'])
+
+    events_theme = events.merge(seasons, how = 'left', left_on = 'season_id', right_on = 'id', suffixes=('_events', '_seasons'))
+
+    # Secondly merge events_theme and event_types
+    print("Secondly merge events_theme and event_types : ")
+    event_types = load_dataset("1event_types.csv")
+    event_types = event_types.drop(columns = ['fidelity_delay'])
+
+    events_theme = events_theme.merge(event_types, how = 'left', left_on = 'event_type_id', right_on = 'id', suffixes=('_events', '_event_type'))
+    events_theme = events_theme.rename(columns = {"name" : "name_event_types"})
+    events_theme = events_theme.drop(columns = 'id')
+
+    # Thirdly merge events_theme and facilities
+    print("thirdly merge events_theme and facilities : ")
+    facilities = load_dataset("1facilities.csv")
+    facilities = facilities.drop(columns = ['fixed_capacity'])
+
+    events_theme = events_theme.merge(facilities, how = 'left', left_on = 'facility_id', right_on = 'id', suffixes=('_events', '_facility'))
+    events_theme = events_theme.rename(columns = {"name" : "name_facilities", "id_events" : "event_id"})
+    events_theme = events_theme.drop(columns = 'id')
+
+    # Index cleaning
+    events_theme = events_theme.drop(columns = ['id_seasons'])
+    events_theme = order_columns_id(events_theme)
+    return events_theme
+
+
+def create_representations_table():
+    representations = load_dataset("1representations.csv")
+    representations = representations.drop(columns = ['serial', 'open', 'satisfaction', 'is_display', 'expected_filling',
+                                                      'max_filling', 'extra_field', 'start_date_time', 'end_date_time', 'name',
+                                                      'representation_type_id'])
+
+    representations_capacity = load_dataset("1representation_category_capacities.csv")
+    representations_capacity = representations_capacity.drop(columns = ['expected_filling', 'max_filling'])
+
+    representations_theme = representations.merge(representations_capacity, how='left',
+                                                  left_on='id', right_on='representation_id',
+                                                  suffixes=('_representation', '_representation_cap'))
+    # Index cleaning
+    representations_theme = representations_theme.drop(columns = ["id_representation"])
+    representations_theme = order_columns_id(representations_theme)
+    return representations_theme
+
+def uniform_product_df():
+    """
+    This function returns the uniform product dataset
+    """
+    print("Products theme columns : ", products_theme.columns)
+    print("\n Representation theme columns : ", representation_theme.columns)
+    print("\n Events theme columns : ", events_theme.columns)
+
+    products_global = products_theme.merge(representation_theme, how='left',
+                                           on= ["representation_id", "category_id"])
+
+    products_global = products_global.merge(events_theme, how='left', on='event_id',
+                                            suffixes = ("_representation", "_event"))
+
+    products_global = order_columns_id(products_global)
+
+    # Remove useless columns
+    products_global = products_global.drop(columns = ['type_of_id'])  # 'name_events', 'name_seasons', 'name_categories'
+    return products_global
\ No newline at end of file
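
Note (editorial aside, not part of the patch): the product-area helpers above depend on module-level names rather than arguments: display_databases reads fs, BUCKET and directory_path, and uniform_product_df reads products_theme, representation_theme and events_theme, all of which are created in 0_Cleaning_and_merge.py before the calls, so the call order there matters. A minimal sketch of an explicit-parameter variant of uniform_product_df, shown only to make that dependency visible (it is not the patch's API):

    def uniform_product_df(products_theme, representation_theme, events_theme):
        """Merge the three theme tables into a single product-level table."""
        products_global = products_theme.merge(representation_theme, how='left',
                                               on=["representation_id", "category_id"])
        products_global = products_global.merge(events_theme, how='left', on='event_id',
                                                suffixes=("_representation", "_event"))
        return order_columns_id(products_global.drop(columns=['type_of_id']))

The corresponding call in 0_Cleaning_and_merge.py would then pass the three theme tables explicitly.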
diff --git a/0_KPI_functions.py b/0_KPI_functions.py
new file mode 100644
index 0000000..d79638a
--- /dev/null
+++ b/0_KPI_functions.py
@@ -0,0 +1,82 @@
+# KPI construction functions
+
+def campaigns_kpi_function(campaigns_information = None):
+    # Number of email campaigns
+    nb_campaigns = campaigns_information[['customer_id', 'campaign_name']].groupby('customer_id').count().reset_index()
+    nb_campaigns.rename(columns = {'campaign_name' : 'nb_campaigns'}, inplace = True)
+    # Average time to open (as a timedelta)
+    campaigns_information['time_to_open'] = campaigns_information['opened_at'] - campaigns_information['delivered_at']
+    time_to_open = campaigns_information[['customer_id', 'time_to_open']].groupby('customer_id').mean().reset_index()
+
+    # Number of emails opened
+    opened_campaign = campaigns_information[['customer_id', 'campaign_name', 'opened_at']].copy()
+    opened_campaign.dropna(subset=['opened_at'], inplace=True)
+    opened_campaign = opened_campaign[['customer_id', 'campaign_name']].groupby('customer_id').count().reset_index()
+    opened_campaign.rename(columns = {'campaign_name' : 'nb_campaigns_opened' }, inplace = True)
+
+    # Merge the indicators
+    campaigns_reduced = pd.merge(nb_campaigns, opened_campaign, on = 'customer_id', how = 'left')
+    campaigns_reduced = pd.merge(campaigns_reduced, time_to_open, on = 'customer_id', how = 'left')
+
+    # Fill the NaN values: nb_campaigns_opened
+    campaigns_reduced['nb_campaigns_opened'] = campaigns_reduced['nb_campaigns_opened'].fillna(0)
+
+    # Fill the NaT values: time_to_open (still to be decided)
+
+    return campaigns_reduced
+
+
+def tickets_kpi_function(tickets_information = None):
+
+    tickets_information_copy = tickets_information.copy()
+
+    # Dummy: online sales channel
+    liste_mots = ['en ligne', 'internet', 'web', 'net', 'vad', 'online']  # 'vad' = 'vente à distance' (distance selling)
+    tickets_information_copy['vente_internet'] = tickets_information_copy['supplier_name'].str.contains('|'.join(liste_mots), case=False).astype(int)
+
+    # Number of tickets bought online
+    prop_vente_internet = tickets_information_copy[tickets_information_copy['vente_internet'] == 1].groupby(['customer_id', 'event_type_id'])['ticket_id'].count().reset_index()
+    prop_vente_internet.rename(columns = {'ticket_id' : 'nb_tickets_internet'}, inplace = True)
+
+    # Average amount
+    avg_amount = (tickets_information_copy.groupby(["event_type_id", 'name_event_types'])
+                  .agg({"amount" : "mean"}).reset_index()
+                  .rename(columns = {'amount' : 'avg_amount'}))
+
+
+    tickets_kpi = (tickets_information_copy[['event_type_id', 'customer_id', 'purchase_id' ,'ticket_id','supplier_name', 'purchase_date', 'amount', 'vente_internet']]
+                   .groupby(['customer_id', 'event_type_id'])
+                   .agg({'ticket_id': 'count',
+                         'purchase_id' : 'nunique',
+                         'amount' : 'sum',
+                         'supplier_name': 'nunique',
+                         'vente_internet' : 'max',
+                         'purchase_date' : ['min', 'max']})
+                   .reset_index()
+                  )
+
+    tickets_kpi.columns = tickets_kpi.columns.map('_'.join)
+
+    tickets_kpi.rename(columns = {'ticket_id_count' : 'nb_tickets',
+                                  'purchase_id_nunique' : 'nb_purchases',
+                                  'amount_sum' : 'total_amount',
+                                  'supplier_name_nunique' : 'nb_suppliers',
+                                  'customer_id_' : 'customer_id',
+                                  'event_type_id_' : 'event_type_id'}, inplace = True)
+
+    tickets_kpi['time_between_purchase'] = tickets_kpi['purchase_date_max'] - tickets_kpi['purchase_date_min']
+    tickets_kpi['time_between_purchase'] = tickets_kpi['time_between_purchase'] / np.timedelta64(1, 'D')  # in days
+
+    # Convert the purchase dates into numbers of days before the most recent purchase
+    max_date = tickets_kpi['purchase_date_max'].max()
+    tickets_kpi['purchase_date_max'] = (max_date - tickets_kpi['purchase_date_max']) / np.timedelta64(1, 'D')
+    tickets_kpi['purchase_date_min'] = (max_date - tickets_kpi['purchase_date_min']) / np.timedelta64(1, 'D')
+
+
+    tickets_kpi = tickets_kpi.merge(prop_vente_internet, on = ['customer_id', 'event_type_id'], how = 'left')
+    tickets_kpi['nb_tickets_internet'] = tickets_kpi['nb_tickets_internet'].fillna(0)
+
+    tickets_kpi = tickets_kpi.merge(avg_amount, how='left', on= 'event_type_id')
+
+    return tickets_kpi
+
\ No newline at end of file
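
Note (illustrative aside, not part of the patch): the rename step in tickets_kpi_function works because a groupby/agg that mixes single aggregations with a list ('purchase_date': ['min', 'max']) returns MultiIndex columns; joining each tuple with '_' yields names such as ticket_id_count and purchase_date_min, and the grouping keys come out as customer_id_ and event_type_id_, hence the trailing-underscore entries in the rename mapping. A toy reproduction of that mechanism:

    import pandas as pd

    toy = pd.DataFrame({'customer_id': [1, 1, 2],
                        'ticket_id': [10, 11, 12],
                        'amount': [5.0, 7.0, 3.0]})
    agg = (toy.groupby(['customer_id'])
              .agg({'ticket_id': 'count', 'amount': ['sum', 'max']})
              .reset_index())
    agg.columns = agg.columns.map('_'.join)
    print(agg.columns.tolist())
    # ['customer_id_', 'ticket_id_count', 'amount_sum', 'amount_max']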
diff --git a/2_Regression_logistique.ipynb b/2_Regression_logistique.ipynb
new file mode 100644
index 0000000..2cbcba7
--- /dev/null
+++ b/2_Regression_logistique.ipynb
@@ -0,0 +1,33 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "id": "ac01a6ea-bef6-4ace-89ff-1dc03a4215c2",
+   "metadata": {},
+   "source": [
+    "# Customer segmentation with logistic regression"
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3 (ipykernel)",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.10.13"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}
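
Note (illustrative aside, not part of the patch): the notebook added above currently contains only a title cell. For orientation, here is a minimal sketch of the kind of pipeline its title announces, using the segmentation base exported by 0_Cleaning_and_merge.py; the feature list and the binary target (whether the customer opened at least one campaign) are placeholder assumptions, not the team's specification:

    import pandas as pd
    from sklearn.linear_model import LogisticRegression
    from sklearn.model_selection import train_test_split

    # Assumed: the CSV exported at the end of 0_Cleaning_and_merge.py, fetched locally.
    base = pd.read_csv("Company 1 - Segmentation base.csv")

    features = ['nb_tickets', 'nb_purchases', 'total_amount', 'nb_suppliers', 'nb_campaigns']
    X = base[features].fillna(0)
    y = (base['nb_campaigns_opened'].fillna(0) > 0).astype(int)  # placeholder target

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)
    model = LogisticRegression(max_iter=1000)
    model.fit(X_train, y_train)
    print("Test accuracy:", model.score(X_test, y_test))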