Compare commits

...

243 Commits

Author SHA1 Message Date
4ed6bd809d Removed exploratory notebooks and drafts 2024-04-09 20:20:57 +00:00
9ca22fb9e7 changed names 2024-04-04 18:48:46 +00:00
6da3467108 fixed typos 2024-04-04 18:46:38 +00:00
473f8100b0 added packages versions 2024-04-04 14:57:39 +00:00
68b68ed3da added functions documentation 2024-04-04 14:29:16 +00:00
f5b6075431 adjust font size 2024-04-04 11:46:15 +00:00
1ebb83e3c4 fix mailing consent 2024-04-04 08:58:48 +00:00
e54e6c3b10 add type of variables 2024-04-04 08:58:34 +00:00
df4c28bdd8 add function description 2024-04-04 08:39:43 +00:00
09f4bd3fe4 push coefficient 2024-04-04 06:50:49 +00:00
b9aa0d7578 fix typo 2024-04-03 19:30:04 +00:00
5fa57cb4b9 final changes (I hope so) 2024-04-03 19:28:52 +00:00
0f5c9cb70f test 2024-04-03 19:25:43 +00:00
7bf011e2ed test 2024-04-03 19:24:59 +00:00
f4b430dbc1 test 2024-04-03 19:22:34 +00:00
7d7683b0a9 test 2024-04-03 19:21:06 +00:00
d14174dc07 test 2024-04-03 19:16:29 +00:00
c5aca36640 some changes 2024-04-03 19:15:52 +00:00
a3caa64c95 completed readme 2024-04-03 19:12:06 +00:00
15f950d87f test some changes 2024-04-03 18:37:19 +00:00
acf7621d9a fixed forecasting issues 2024-04-03 10:36:47 +00:00
14953b031a Add better dpi 2024-04-02 21:27:28 +00:00
ea3dcbb015 Improved the lazy customer plot + better framing + removed the title 2024-04-02 21:12:07 +00:00
091693c889 added printing options for business KPIs tables 2024-04-02 12:09:01 +00:00
197703a857 final changes for spider chart 2024-04-02 11:59:06 +00:00
41decc7acd last minor adjustment for spider chart 2024-04-02 11:47:26 +00:00
a21805db9b minor change : adjusted size of spider chart 2024-04-02 11:36:34 +00:00
21bf0c8408 use cv logit instead of benchmark for the segmentation 2024-04-02 11:26:06 +00:00
4e74483a69 increased plot resolution 2024-04-01 10:19:59 +00:00
c96e1b5f0c logit cv instead of benchmark 2024-04-01 01:18:53 +00:00
52b39e03be final changes for spider charts 2024-03-31 21:59:52 +00:00
b9a3d05a2f tests to prepare changes in code 06 2024-03-31 17:57:10 +00:00
1a62d2b60a Changed path 2024-03-31 17:16:46 +00:00
e5c99f09ab Changed folder 2024-03-31 17:03:49 +00:00
1577cc3291 Fixed path 2024-03-31 17:02:33 +00:00
ad1e9034f7 Changed name and path 2024-03-31 16:54:46 +00:00
8e61e9d2a4 Added marketing personae description 2024-03-31 16:35:58 +00:00
7341752be0 Changed file name 2024-03-31 16:35:21 +00:00
35638f2a2d Switched from an input prompt to a loop over activities 2024-03-31 16:34:55 +00:00
0a7900c07f take new databases as input 2024-03-30 11:00:49 +00:00
78aab14164 added age importation 2024-03-29 12:43:36 +00:00
8485bd755e Merge pull request 'generalization' (#16) from generalization into main
Reviewed-on: #16
2024-03-29 11:15:55 +01:00
354f6847b6 standard model 2024-03-29 10:15:28 +00:00
d6e2b2c57a fix path 2024-03-29 10:14:14 +00:00
42b4414a16 Changed architecture p1 2024-03-28 21:18:08 +00:00
3d6414728c Merge branch 'main' into segment_value 2024-03-28 20:44:06 +00:00
7be4179de4 added README 2024-03-28 16:48:22 +00:00
4facf5567c Merge pull request 'fix time to open' (#15) from generalization into main
Reviewed-on: #15
2024-03-28 15:23:42 +01:00
7ed8516009 fix time to open 2024-03-28 13:17:45 +00:00
56ee61e25f adjusted graphic options 2024-03-28 13:13:13 +00:00
0aed0911a1 Merge pull request 'fix loading' (#14) from generalization into main
Reviewed-on: #14
2024-03-28 12:39:50 +01:00
d5ab3c2d68 fix loading 2024-03-28 11:39:16 +00:00
0bd29e3a81 Merge pull request 'fix preproc' (#13) from generalization into main
Reviewed-on: #13
2024-03-28 12:38:00 +01:00
840ce876e2 fix preproc 2024-03-28 11:37:33 +00:00
b268cd980d Merge pull request 'fix premium' (#12) from generalization into main
Reviewed-on: #12
2024-03-28 12:19:21 +01:00
02a4ea20dd fix premium 2024-03-28 11:19:05 +00:00
25a356d6a4 tested adjustment of scores for different models 2024-03-28 10:37:23 +00:00
f0f69d710a added activity in the titles of graphics 2024-03-28 09:27:29 +00:00
eb87cc6998 Merge pull request 'generalization' (#11) from generalization into main
Reviewed-on: #11
2024-03-28 09:40:04 +01:00
7debe6590e fix conflicts 2024-03-28 08:38:43 +00:00
122c4c1f82 fix features 2024-03-28 08:35:02 +00:00
ebdbacbe34 fix features 2024-03-28 07:56:36 +00:00
adc62dd056 save at different steps 2024-03-28 07:37:10 +00:00
6d40cfe261 Added is_partner exploration 2024-03-27 22:21:26 +00:00
10824e5e24 CA estimation by segment works well 2024-03-27 18:59:05 +00:00
be0bcda0ba Completed the draft 2024-03-27 18:55:11 +00:00
0ffbe06b12 Added plots on targets 2024-03-27 18:39:54 +00:00
d3e13f4c56 completed CA projection 2024-03-27 17:58:30 +00:00
cf0b33c940 added input to select type of activity 2024-03-27 15:57:24 +00:00
905072b1db now file works well 2024-03-27 15:42:27 +00:00
bfa941f0a3 minor change 2024-03-27 15:20:28 +00:00
8d33c74d2f activity changed to type of activity 2024-03-27 15:10:57 +00:00
f55ade48b4 minor change 2024-03-27 15:06:31 +00:00
10fde045e5 from notebook to .py for segment analysis 2024-03-27 14:59:33 +00:00
133eb83e84 add path premium 2024-03-27 14:08:40 +00:00
f4b6f23394 Merge branch 'main' into generalization 2024-03-27 09:07:21 +00:00
38c3fc3148 Added a summary table of detected targets 2024-03-26 22:01:33 +00:00
e2d55e557e Unnecessary, already covered in the pipeline 2024-03-26 22:01:11 +00:00
dd5c3f416b v2 2024-03-26 21:07:29 +00:00
28cc7b94ea added options to save plots in S3 2024-03-26 15:00:39 +00:00
2165c7c16e completed segment mp analysis sport 2024-03-26 11:20:03 +00:00
5e37dd4d3d Added graph 2024-03-26 10:51:02 +00:00
e1f6f1ba68 Added graph for targets 2024-03-26 10:49:09 +00:00
c620f23507 Removed drop for segmentation description 2024-03-26 10:48:50 +00:00
a32cbe70e4 Add variables for segmentation description 2024-03-25 17:44:31 +00:00
c86c43cc7e New datasets 2024-03-24 19:01:29 +00:00
f5f993aba0 Removed the pre-computed variables 2024-03-24 18:43:30 +00:00
dbd87dadd9 fixed export hist.png issue 2024-03-24 10:44:22 +00:00
ba6c4a8a24 update notebook 2024-03-24 10:05:28 +00:00
c549752ba7 added exportation to MinIo option 2024-03-24 09:42:44 +00:00
ca30d1daa3 update CA segment analysis 2024-03-23 16:23:59 +00:00
7a9548f295 Added target KPI variables 2024-03-23 11:51:18 +00:00
1a0a5a40cf Added analyses 2024-03-23 09:48:47 +00:00
c1cb3ab396 added utils for CA estimation 2024-03-23 09:18:43 +00:00
1c8e19a70d clean notebook : adjust scores and estimates ca 2024-03-22 23:04:49 +00:00
a88c2df8f5 exploratory step : CA by segment 2024-03-22 09:15:59 +00:00
33df2fda4f added summary with weights 2024-03-21 13:21:40 +00:00
52fd738fe5 fix errors 2024-03-21 10:47:40 +00:00
a85036ad23 added summary for logit with penalty 2024-03-21 08:18:31 +00:00
089a8fd3d6 fix labels 2024-03-21 08:16:29 +00:00
b1e877508b Merge branch 'main' into generalization 2024-03-21 07:14:51 +00:00
9763dfe7f9 add result by companies 2024-03-21 07:10:10 +00:00
a0256c551b Merge pull request 'generalization' (#9) from generalization into main
Reviewed-on: #9
2024-03-20 21:26:04 +01:00
5cd1bcc222 add probability 2024-03-20 13:07:33 +00:00
3d03965084 commit segmentation 2024-03-20 12:07:24 +00:00
605876dfb1 save model to pickle 2024-03-20 12:06:47 +00:00
fbfc03a572 look at graph 2024-03-20 09:27:03 +00:00
95c4c6c4bf fix errors 2024-03-20 08:33:56 +00:00
0a41641956 stat desc 2024-03-19 13:54:31 +00:00
57cc7d077d Merge branch 'main' into generalization 2024-03-19 12:06:58 +00:00
d328caa665 add pipeline ML 2024-03-19 11:46:04 +00:00
ee86fcaf84 reduce random forest param grid 2024-03-19 11:43:44 +00:00
ef23181a05 Added purchases per month 2024-03-18 21:50:30 +00:00
9e5e364aa3 add steps 2024-03-18 19:38:01 +00:00
969cb8ec43 add machine learning automatisation 2024-03-18 16:23:52 +00:00
9155b397e9 utils_ml 2024-03-18 16:22:29 +00:00
6ac62d9957 Merge branch 'main' into generalization 2024-03-18 16:21:42 +00:00
52119c4354 added segment 2024-03-18 15:58:38 +00:00
2bd3edb444 added segmentation to the model 2024-03-18 15:49:36 +00:00
b892ca79c7 added segmentation to the model 2024-03-18 15:47:05 +00:00
9a0ac320d0 add benchmark random forest 2024-03-18 09:35:48 +00:00
5408ce677b add calibration curve 2024-03-18 09:10:28 +00:00
6eddec93bc completed with random forest + naive bayes 2024-03-17 11:49:48 +00:00
cc30d7deb9 Added KPIs on customerplus 2024-03-16 17:20:47 +00:00
ab3b033f09 delete former logit pipeline file 2024-03-16 15:18:21 +00:00
746f764973 finished : logit pipeline + visu (sports) 2024-03-16 15:16:34 +00:00
4c7bdf712b Fixed and ran customerplus 2024-03-16 15:01:52 +00:00
f8dc99df99 Merge branch 'correction_variables' 2024-03-16 14:48:32 +00:00
53f32000b5 Changes to customerplus 2024-03-16 14:47:46 +00:00
14423b1d34 init full modelization 2024-03-16 10:43:11 +00:00
83a3c039ec baseline logit - exploratory study of variables 2024-03-16 09:42:41 +00:00
15c102682a fix errors 2024-03-14 23:02:50 +00:00
3670299a0b Added a draft 2024-03-14 22:35:25 +00:00
dc5e3d0df1 fixed, renamed, filled NaN for tickets and mail 2024-03-14 22:34:36 +00:00
db6eaaaa8d debug 2024-03-14 21:14:40 +00:00
54fbad0344 identify target customer 2024-03-14 21:00:14 +00:00
6d0f67bd31 fix filter customer 2024-03-14 19:11:09 +00:00
4ac11c6b37 fix some plots 2024-03-14 19:04:03 +00:00
d42e81449a generalize statistics 2024-03-14 18:35:03 +00:00
ac6a3b365f created logit statsmodels (spectacle) 2024-03-14 11:10:15 +00:00
6db4af19ce Added start of target name cleaning 2024-03-13 22:24:38 +00:00
1c9e7e1778 Stat desc 101 2024-03-13 13:25:21 +00:00
96088d19cb Update 2024-03-12 19:15:04 +00:00
35d9965671 added sales trend graphic 2024-03-12 15:16:28 +00:00
03acb304f4 harmonize gender plot 2024-03-12 12:48:42 +00:00
8acc32de70 fixed graphic saving issues 2024-03-11 18:10:46 +00:00
99c7836182 added saving options for the graphics 2024-03-11 17:43:56 +00:00
ab83b7c20c update stats desc music 2024-03-11 10:40:29 +00:00
b35e04e307 run pipeline on music companies 2024-03-11 08:58:12 +00:00
c502e2fd64 logistic baseline 2024-03-11 08:36:25 +00:00
f0e9973533 Added modelling and segmentation 2024-03-10 21:31:37 +00:00
e96e5a2b08 stat 2024-03-10 20:00:29 +00:00
921c2c796b stat 2024-03-10 19:30:08 +00:00
858d1d2111 stat 2024-03-10 18:49:34 +00:00
a4d4803a1c stat 2024-03-10 18:08:50 +00:00
75664a33d7 stat 2024-03-10 16:41:43 +00:00
4aa781daf0 Fixed a methodological issue in the construction 2024-03-10 16:04:16 +00:00
f40ae6ead0 Fixed an error in building the modelling datasets 2024-03-10 15:30:23 +00:00
27ef78a486 fix pipeline 2024-03-10 12:30:10 +00:00
c2c749be3d Merge pull request 'generalization' (#8) from generalization into main
Reviewed-on: #8
2024-03-10 12:31:36 +01:00
aabf858c6c update stats desc spectacles 2024-03-10 11:31:28 +00:00
adc1da3e49 adjust pipeline 2024-03-10 11:30:57 +00:00
58c7cac17f work on pipeline 2024-03-10 10:09:53 +00:00
198ef45247 Merge branch 'main' into generalization 2024-03-10 08:46:23 +00:00
0eedea6e26 Changed folder 2024-03-09 17:50:46 +00:00
c6abfbe76e Fixed the construction 2024-03-09 17:50:32 +00:00
0b56e4e696 Added museum folder + started modelling 2024-03-09 17:49:45 +00:00
14922dccfa Added the common plots 2024-03-09 16:20:32 +00:00
11e2e86583 Added and saved 2024-03-09 14:50:58 +00:00
d3fa9f6870 prepare Pipeline 2024-03-08 13:48:38 +00:00
bb684633d7 work on stat desc 2024-03-08 13:26:27 +00:00
ced4747372 work on stat desc 2024-03-08 13:13:18 +00:00
3088e5f337 update stats desc 2024-03-08 09:30:12 +00:00
9a2316e843 update stats desc 2024-03-08 07:44:28 +00:00
e426a86b55 Old .py file 2024-03-07 18:33:41 +00:00
af84a57c54 Fix conflicts 2024-03-07 13:05:49 +00:00
a49d14853e Merge pull request 'generalization' (#7) from generalization into main
Reviewed-on: #7
2024-03-07 13:58:03 +01:00
c7ca8c560e fix conflicts 2024-03-07 12:57:21 +00:00
20fa01647a test train 2024-03-06 12:42:55 +00:00
bed6a5c901 fix condition 2024-03-06 12:42:39 +00:00
41f49edd1c explore sport 2024-03-06 11:49:51 +00:00
d8e2da70cb fix path + test and train customer allocation' 2024-03-06 11:49:37 +00:00
4503114435 work on stat 2024-03-06 10:56:52 +00:00
3ec803d0a6 Modification with 101 2024-03-05 19:26:41 +00:00
1308484706 Fixed the dataset dates 2024-03-05 17:46:06 +00:00
23551d29d2 stat 2024-03-05 14:50:46 +00:00
473afd9a89 added indicator 2024-03-05 14:37:29 +00:00
eaf1884bb6 anova 2024-03-05 13:36:03 +00:00
29ac99df14 test_anova 2024-03-05 13:34:43 +00:00
de342a2f77 Merge branch 'main' into generalization 2024-03-05 11:27:41 +00:00
da1f16d8ec Added statistics on tags 2024-03-05 10:57:40 +00:00
1667f99a83 stat 2024-03-05 02:57:08 +00:00
2bf81015ac stat 2024-03-05 02:51:39 +00:00
0052d4e78f stat 2024-03-05 02:25:59 +00:00
8f5abf52fd stat 2024-03-05 02:15:03 +00:00
1ec5b8743f code 2024-03-05 01:44:01 +00:00
dbb90fb364 stat 2024-03-05 01:43:40 +00:00
66754f957e base_test_train 2024-03-04 23:36:48 +00:00
71a5cb2a3e Added exploration 2024-03-04 22:30:25 +00:00
688410299f work on stat desc 2024-03-04 18:29:21 +00:00
286bd9cb85 work on stat desc sport 2024-03-04 15:55:58 +00:00
6e5383f594 Added site observations and a graph on the proportion of free tickets 2024-03-03 22:33:35 +00:00
228e626ba8 stats + graphs for campaigns_info and customer_plus 2024-03-03 08:32:45 +00:00
51b7844358 Added descriptive statistics on museums 2024-03-02 16:29:14 +00:00
020b092b04 stat 2024-03-02 13:05:48 +00:00
9dd9bd45e2 stat_desc_finale 2024-03-02 12:32:54 +00:00
169ce53c88 base_stat_desc 2024-03-02 12:05:51 +00:00
a3016ce78e stat_des 2024-03-02 11:16:24 +00:00
3d4e661be9 stat 2024-03-02 10:37:44 +00:00
b54b726cf3 moved files 2024-03-02 08:59:32 +00:00
9cca31377f moved files 2024-03-02 08:58:05 +00:00
ccddaf2f12 Merge pull request 'generalization' (#6) from generalization into main
Reviewed-on: #6
2024-02-29 20:26:00 +01:00
1d1594fc26 SPECT 2024-02-29 14:17:36 +00:00
9c0aff85e5 fix path 2024-02-29 11:12:46 +00:00
766463acaa work on DS dataset 2024-02-29 10:14:16 +00:00
b840b2403c Generate global modelization datasets 2024-02-29 09:25:40 +00:00
b71f842fe4 add prefix to customer_id 2024-02-29 09:09:04 +00:00
979c0fe6a8 fix path 2024-02-29 08:33:05 +00:00
2fabf98413 A few observations 2024-02-28 20:57:28 +00:00
80a8642484 add fill NaN 2024-02-28 20:38:42 +00:00
12427e7b18 stat 2024-02-28 05:51:50 +00:00
9097a1194d KPI creation 2024-02-28 04:06:07 +00:00
bba4820dd8 base_spectacle 2024-02-28 02:31:01 +00:00
9a06cbe96f added open function 2024-02-28 01:57:28 +00:00
e522615a8f spectacle 2024-02-28 01:52:06 +00:00
3c4f851d16 Fixed an error for company 101 2024-02-27 21:52:13 +00:00
23981e3cbc Modified the product purchased part: added start and end date, open + cleaned the ticket_1 database of company 101 2024-02-27 21:01:20 +00:00
d0c980f788 Added tag exploration 2024-02-26 21:47:36 +00:00
c9089de56c added folder + tidied notebooks 2024-02-26 15:51:31 +00:00
027ba3671e added folders - tidied notebooks 2024-02-26 15:49:40 +00:00
283c675448 Added target observation 2024-02-25 22:53:10 +00:00
716002bdcf Explored tags and target 2024-02-25 17:33:24 +00:00
fa9c1c790e add kpi function for customerplus 2024-02-25 17:31:14 +00:00
27e266c58e added random forest + performance visualisation + pickles 2024-02-23 18:57:05 +00:00
282d6cd8a5 Merge pull request 'generalization' (#5) from generalization into main
Reviewed-on: #5
2024-02-22 23:01:53 +01:00
79dc4f13ff generate train and test dataset for all companies 2024-02-22 14:57:34 +00:00
44fef6d618 investigate sport companies 2024-02-22 14:56:54 +00:00
71c5d86679 handle na for supplier 2024-02-22 14:56:00 +00:00
c26b5b11d8 Update 2024-02-21 22:08:33 +00:00
29eafcc6b2 Merge branch 'data_construction' 2024-02-20 22:46:14 +00:00
1f0892434f Missing values 2024-02-20 01:27:57 +00:00
2b4723e271 IDENTIFICATION 2024-02-20 01:27:30 +00:00
06d0223235 identification_entreprise_type_event 2024-02-20 01:03:23 +00:00
30 changed files with 2762 additions and 34698 deletions

@@ -1,128 +0,0 @@
# Business Data Challenge - Team 1
import pandas as pd
import numpy as np
import os
import s3fs
import re
import warnings
# Create filesystem object
S3_ENDPOINT_URL = "https://" + os.environ["AWS_S3_ENDPOINT"]
fs = s3fs.S3FileSystem(client_kwargs={'endpoint_url': S3_ENDPOINT_URL})
# Import cleaning and merge functions
exec(open('0_KPI_functions.py').read())
# Ignore warning
warnings.filterwarnings('ignore')
def dataset_construction(min_date, end_features_date, max_date, directory_path):
# Import customerplus
df_customerplus_clean = display_databases(directory_path, file_name = "customerplus_cleaned")
df_campaigns_information = display_databases(directory_path, file_name = "campaigns_information", datetime_col = ['opened_at', 'sent_at', 'campaign_sent_at'])
df_products_purchased_reduced = display_databases(directory_path, file_name = "products_purchased_reduced", datetime_col = ['purchase_date'])
# Consistency filter for applying our method
max_date = pd.to_datetime(max_date, utc = True, format = 'ISO8601')
end_features_date = pd.to_datetime(end_features_date, utc = True, format = 'ISO8601')
min_date = pd.to_datetime(min_date, utc = True, format = 'ISO8601')
# Filter the df_campaigns_information database
df_campaigns_information = df_campaigns_information[(df_campaigns_information['sent_at'] <= end_features_date) & (df_campaigns_information['sent_at'] >= min_date)]
df_campaigns_information['opened_at'][df_campaigns_information['opened_at'] >= end_features_date] = np.datetime64('NaT')
# Filter the df_products_purchased_reduced database
df_products_purchased_reduced = df_products_purchased_reduced[(df_products_purchased_reduced['purchase_date'] <= end_features_date) & (df_products_purchased_reduced['purchase_date'] >= min_date)]
print("Data filtering : SUCCESS")
# Merge everything and build the KPIs
# KPIs on advertising campaigns
df_campaigns_kpi = campaigns_kpi_function(campaigns_information = df_campaigns_information)
# KPIs on purchasing behaviour
df_tickets_kpi = tickets_kpi_function(tickets_information = df_products_purchased_reduced)
# KPIs on socio-demographic data
## Gender
df_customerplus_clean["gender_label"] = df_customerplus_clean["gender"].map({
0: 'female',
1: 'male',
2: 'other'
})
gender_dummies = pd.get_dummies(df_customerplus_clean["gender_label"], prefix='gender').astype(int)
df_customerplus_clean = pd.concat([df_customerplus_clean, gender_dummies], axis=1)
## Indicator for whether the individual lives in France
df_customerplus_clean["country_fr"] = df_customerplus_clean["country"].apply(lambda x : int(x=="fr") if pd.notna(x) else np.nan)
print("KPIs construction : SUCCESS")
# Merge with customer-related KPIs
df_customer = pd.merge(df_customerplus_clean, df_campaigns_kpi, on = 'customer_id', how = 'left')
# Fill NaN values
df_customer[['nb_campaigns', 'nb_campaigns_opened']] = df_customer[['nb_campaigns', 'nb_campaigns_opened']].fillna(0)
# Merge with purchasing-behaviour KPIs
df_customer_product = pd.merge(df_tickets_kpi, df_customer, on = 'customer_id', how = 'outer')
# Fill NaN values
df_customer_product[['nb_tickets', 'nb_purchases', 'total_amount', 'nb_suppliers', 'vente_internet_max', 'nb_tickets_internet']] = df_customer_product[['nb_tickets', 'nb_purchases', 'total_amount', 'nb_suppliers', 'vente_internet_max', 'nb_tickets_internet']].fillna(0)
print("Explanatory variable construction : SUCCESS")
# 2. Construction of the explained variable
df_products_purchased_to_predict = df_products_purchased_reduced[(df_products_purchased_reduced['purchase_date'] <= max_date) & (df_products_purchased_reduced['purchase_date'] > end_features_date)]
# Purchase indicator
df_products_purchased_to_predict['y_has_purchased'] = 1
y = df_products_purchased_to_predict[['customer_id', 'y_has_purchased']].drop_duplicates()
print("Explained variable construction : SUCCESS")
# 3. Merge between explained and explanatory variables
dataset = pd.merge(df_customer_product, y, on = ['customer_id'], how = 'left')
# 0 if there is no purchase
dataset[['y_has_purchased']].fillna(0)
return dataset
## Export
# Export folder
BUCKET_OUT = "projet-bdc2324-team1/1_Output/Logistique Regression databases - First approach"
# Dataset test
dataset_test = dataset_construction(min_date = "2021-08-01", end_features_date = "2023-08-01", max_date = "2023-11-01", directory_path = "1")
# # Exportation
# FILE_KEY_OUT_S3 = "dataset_test.csv"
# FILE_PATH_OUT_S3 = BUCKET_OUT + "/" + FILE_KEY_OUT_S3
# with fs.open(FILE_PATH_OUT_S3, 'w') as file_out:
# dataset_test.to_csv(file_out, index = False)
# print("Exportation dataset test : SUCCESS")
# Dataset train
dataset_train = dataset_construction(min_date = "2021-05-01", end_features_date = "2023-05-01", max_date = "2023-08-01", directory_path = "1")
# Exportation
# FILE_KEY_OUT_S3 = "dataset_train.csv"
# FILE_PATH_OUT_S3 = BUCKET_OUT + "/" + FILE_KEY_OUT_S3
# with fs.open(FILE_PATH_OUT_S3, 'w') as file_out:
# dataset_train.to_csv(file_out, index = False)
# print("Exportation dataset train : SUCCESS")
print("FIN DE LA GENERATION DES DATASETS : SUCCESS")

File diff suppressed because it is too large.

@@ -1,97 +0,0 @@
# KPI construction functions
def custom_date_parser(date_string):
return pd.to_datetime(date_string, utc = True, format = 'ISO8601')
def display_databases(directory_path, file_name, datetime_col = None):
"""
This function returns the file from s3 storage
"""
file_path = "projet-bdc2324-team1" + "/0_Input/Company_" + directory_path + "/" + file_name + ".csv"
print("File path : ", file_path)
with fs.open(file_path, mode="rb") as file_in:
df = pd.read_csv(file_in, sep=",", parse_dates = datetime_col, date_parser=custom_date_parser)
return df
def campaigns_kpi_function(campaigns_information = None):
# Number of e-mail campaigns
nb_campaigns = campaigns_information[['customer_id', 'campaign_name']].groupby('customer_id').count().reset_index()
nb_campaigns.rename(columns = {'campaign_name' : 'nb_campaigns'}, inplace = True)
# Average opening time in minutes
campaigns_information['time_to_open'] = pd.to_datetime(campaigns_information['opened_at'], utc = True, format = 'ISO8601') - pd.to_datetime(campaigns_information['delivered_at'], utc = True, format = 'ISO8601')
time_to_open = campaigns_information[['customer_id', 'time_to_open']].groupby('customer_id').mean().reset_index()
# Number of e-mails opened
opened_campaign = campaigns_information[['customer_id', 'campaign_name', 'opened_at']]
opened_campaign.dropna(subset=['opened_at'], inplace=True)
opened_campaign = opened_campaign[['customer_id', 'campaign_name']].groupby('customer_id').count().reset_index()
opened_campaign.rename(columns = {'campaign_name' : 'nb_campaigns_opened' }, inplace = True)
# Merge the indicators
campaigns_reduced = pd.merge(nb_campaigns, opened_campaign, on = 'customer_id', how = 'left')
campaigns_reduced = pd.merge(campaigns_reduced, time_to_open, on = 'customer_id', how = 'left')
# Fill the NaN values : nb_campaigns_opened
campaigns_reduced['nb_campaigns_opened'].fillna(0)
# Fill the NaT values : time_to_open (??)
return campaigns_reduced
def tickets_kpi_function(tickets_information = None):
tickets_information_copy = tickets_information.copy()
# Dummy : online sales channel
liste_mots = ['en ligne', 'internet', 'web', 'net', 'vad', 'online'] # 'vad' = 'vente à distance' (distance selling)
tickets_information_copy['vente_internet'] = tickets_information_copy['supplier_name'].str.contains('|'.join(liste_mots), case=False).astype(int)
# Proportion of online sales
prop_vente_internet = tickets_information_copy[tickets_information_copy['vente_internet'] == 1].groupby(['customer_id'])['ticket_id'].count().reset_index()
prop_vente_internet.rename(columns = {'ticket_id' : 'nb_tickets_internet'}, inplace = True)
# Average amount
# avg_amount = (tickets_information_copy.groupby(["event_type_id", 'name_event_types'])
# .agg({"amount" : "mean"}).reset_index()
# .rename(columns = {'amount' : 'avg_amount'}))
tickets_kpi = (tickets_information_copy[['customer_id', 'purchase_id' ,'ticket_id','supplier_name', 'purchase_date', 'amount', 'vente_internet']]
.groupby(['customer_id'])
.agg({'ticket_id': 'count',
'purchase_id' : 'nunique',
'amount' : 'sum',
'supplier_name': 'nunique',
'vente_internet' : 'max',
'purchase_date' : ['min', 'max']})
.reset_index()
)
tickets_kpi.columns = tickets_kpi.columns.map('_'.join)
tickets_kpi.rename(columns = {'ticket_id_count' : 'nb_tickets',
'purchase_id_nunique' : 'nb_purchases',
'amount_sum' : 'total_amount',
'supplier_name_nunique' : 'nb_suppliers',
'customer_id_' : 'customer_id'}, inplace = True)
tickets_kpi['time_between_purchase'] = tickets_kpi['purchase_date_max'] - tickets_kpi['purchase_date_min']
tickets_kpi['time_between_purchase'] = tickets_kpi['time_between_purchase'] / np.timedelta64(1, 'D') # in number of days
# Convert the dates into numbers
max_date = tickets_kpi['purchase_date_max'].max()
tickets_kpi['purchase_date_max'] = (max_date - tickets_kpi['purchase_date_max']) / np.timedelta64(1, 'D')
tickets_kpi['purchase_date_min'] = (max_date - tickets_kpi['purchase_date_min']) / np.timedelta64(1, 'D')
tickets_kpi = tickets_kpi.merge(prop_vente_internet, on = ['customer_id'], how = 'left')
tickets_kpi['nb_tickets_internet'] = tickets_kpi['nb_tickets_internet'].fillna(0)
# tickets_kpi = tickets_kpi.merge(avg_amount, how='left', on= 'event_type_id')
return tickets_kpi
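Note on the column flattening in tickets_kpi_function above: after groupby().agg() with several aggregation functions, pandas returns MultiIndex columns, and tickets_kpi.columns.map('_'.join) is what produces names such as 'ticket_id_count' and the trailing underscore in 'customer_id_' that the rename then cleans up. A minimal, self-contained sketch on made-up data (illustration only, not part of the repository):

import pandas as pd

# Toy purchases table (made-up data)
df = pd.DataFrame({
    'customer_id': [1, 1, 2],
    'ticket_id':   [10, 11, 12],
    'amount':      [20.0, 15.0, 30.0],
})

# Mixing single and multiple aggregations yields MultiIndex columns such as
# ('ticket_id', 'count') and ('amount', 'sum'); ('customer_id', '') keeps an
# empty second level because it comes from reset_index().
kpi = (df.groupby('customer_id')
         .agg({'ticket_id': 'count', 'amount': ['sum', 'max']})
         .reset_index())

# '_'.join over each column tuple gives 'ticket_id_count', 'amount_sum', and
# the trailing underscore in 'customer_id_' that the script renames afterwards.
kpi.columns = kpi.columns.map('_'.join)
print(kpi.columns.tolist())
# ['customer_id_', 'ticket_id_count', 'amount_sum', 'amount_max']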

File diff suppressed because one or more lines are too long

@@ -6,13 +6,14 @@ import os
import s3fs
import re
import warnings
import time
# Create filesystem object
S3_ENDPOINT_URL = "https://" + os.environ["AWS_S3_ENDPOINT"]
fs = s3fs.S3FileSystem(client_kwargs={'endpoint_url': S3_ENDPOINT_URL})
# Import cleaning and merge functions
exec(open('0_Cleaning_and_merge_functions.py').read())
exec(open('utils_cleaning_and_merge.py').read())
# Output folder
BUCKET_OUT = "projet-bdc2324-team1"
@@ -20,15 +21,20 @@ BUCKET_OUT = "projet-bdc2324-team1"
# Ignore warning
warnings.filterwarnings('ignore')
start_all = time.time()
def export_dataset(df, output_name):
print('Exportation of dataset :', output_name)
print('Export of dataset :', output_name)
FILE_PATH_OUT_S3 = BUCKET_OUT + "/" + output_name
with fs.open(FILE_PATH_OUT_S3, 'w') as file_out:
df.to_csv(file_out, index = False)
## 1 - Cleaning of the datasets
for tenant_id in ("1", "2", "3", "4", "5", "6", "7", "8", "9", "10", "11", "12", "13", "14", "101"):
for tenant_id in ["1", "2", "3", "4", "5", "6", "7", "8", "9", "10", "11", "12", "13", "14"]:#, "101"
# Timer
start = time.time()
# Cleaning customerplus
df1_customerplus_clean = preprocessing_customerplus(directory_path = tenant_id)
@@ -45,14 +51,22 @@ for tenant_id in ("1", "2", "3", "4", "5", "6", "7", "8", "9", "10", "11", "12",
## Exportation
export_dataset(df = df1_campaigns_information, output_name = "0_Input/Company_"+ tenant_id +"/campaigns_information.csv")
## Exportation
# export_dataset(df = df1_campaigns_information, output_name = "0_Temp/Company 1 - Campaigns dataset clean.csv")
if tenant_id == "101":
# Cleaning product area
products_purchased_reduced, products_purchased_reduced_1 = uniform_product_df(directory_path = tenant_id)
# Exportation
export_dataset(df = products_purchased_reduced, output_name = "0_Input/Company_"+ tenant_id +"/products_purchased_reduced.csv")
export_dataset(df = products_purchased_reduced_1, output_name = "0_Input/Company_"+ tenant_id +"/products_purchased_reduced_1.csv")
else :
# Cleaning product area
products_purchased_reduced = uniform_product_df(directory_path = tenant_id)
# Exportation
export_dataset(df = products_purchased_reduced, output_name = "0_Input/Company_"+ tenant_id +"/products_purchased_reduced.csv")
# Cleaning product area
df1_products_purchased_reduced = uniform_product_df(directory_path = tenant_id)
## Exportation
export_dataset(df = df1_products_purchased_reduced, output_name = "0_Input/Company_"+ tenant_id +"/products_purchased_reduced.csv")
#Exportation
# export_dataset(df = df1_products_purchased_reduced, output_name = "0_Temp/Company 1 - Purchases.csv")
# export_dataset(df = df1_products_purchased_reduced, output_name = "1_Temp/Company 1 - Purchases.csv")
print("Time to run the cleaning of company ", tenant_id , " : " ,time.time() - start)
print("\n ------------------------------------------------------------------ \n --------------------- END CLEANING COMPANY " + tenant_id + " --------------------- \n ------------------------------------------------------------------")
print("Time to run the cleaning of all used datasets : " , time.time() - start_all)

2_Datasets_Generation.py (new file, 176 lines)
@@ -0,0 +1,176 @@
# Purpose of the script : Construction of training and test datasets for modelling by company
# Input : KPI construction function and clean databases in the 0_Input folder
# Output : Train and test datasets by companies
# Packages
import pandas as pd
import numpy as np
import os
import s3fs
import re
import warnings
from datetime import date, timedelta, datetime
from sklearn.model_selection import train_test_split
# Create filesystem object
S3_ENDPOINT_URL = "https://" + os.environ["AWS_S3_ENDPOINT"]
fs = s3fs.S3FileSystem(client_kwargs={'endpoint_url': S3_ENDPOINT_URL})
# Import KPI construction functions
exec(open('utils_features_construction.py').read())
# Ignore warning
warnings.filterwarnings('ignore')
def dataset_construction(min_date, end_features_date, max_date, directory_path):
# Import of cleaned and merged datasets
df_customerplus_clean_0 = display_input_databases(directory_path, file_name = "customerplus_cleaned")
df_campaigns_information = display_input_databases(directory_path, file_name = "campaigns_information", datetime_col = ['opened_at', 'sent_at', 'campaign_sent_at'])
df_products_purchased_reduced = display_input_databases(directory_path, file_name = "products_purchased_reduced", datetime_col = ['purchase_date'])
df_target_information = display_input_databases(directory_path, file_name = "target_information")
# Dates in datetime format
max_date = pd.to_datetime(max_date, utc = True, format = 'ISO8601')
end_features_date = pd.to_datetime(end_features_date, utc = True, format = 'ISO8601')
min_date = pd.to_datetime(min_date, utc = True, format = 'ISO8601')
# Filter for database df_campaigns_information
df_campaigns_information = df_campaigns_information[(df_campaigns_information['sent_at'] < end_features_date) & (df_campaigns_information['sent_at'] >= min_date)]
df_campaigns_information['opened_at'][df_campaigns_information['opened_at'] >= end_features_date] = np.datetime64('NaT')
# Filter for database df_products_purchased_reduced
df_products_purchased_features = df_products_purchased_reduced[(df_products_purchased_reduced['purchase_date'] < end_features_date) & (df_products_purchased_reduced['purchase_date'] >= min_date)]
print("Data filtering : SUCCESS")
# Building and merging features
# Campaigns features
df_campaigns_kpi = campaigns_kpi_function(campaigns_information = df_campaigns_information, max_date = end_features_date)
# Purchasing behavior features
df_tickets_kpi = tickets_kpi_function(tickets_information = df_products_purchased_features)
# Socio-demographic features
df_customerplus_clean = customerplus_kpi_function(customerplus_clean = df_customerplus_clean_0)
# Targets features
df_targets_kpi = targets_KPI(df_target = df_target_information)
print("KPIs construction : SUCCESS")
# Merge - campaigns features
df_customer = pd.merge(df_customerplus_clean, df_campaigns_kpi, on = 'customer_id', how = 'left')
# Fill NaN values
df_customer[['nb_campaigns', 'nb_campaigns_opened', 'taux_ouverture_mail']] = df_customer[['nb_campaigns', 'nb_campaigns_opened', 'taux_ouverture_mail']].fillna(0)
df_customer['time_to_open'] = df_customer['time_to_open'].fillna(df_customer['time_to_open'].mean())
# Merge - targets features
df_customer = pd.merge(df_customer, df_targets_kpi, on = 'customer_id', how = 'left')
# Fill NaN values
targets_columns = list(df_targets_kpi.columns)
targets_columns.remove('customer_id')
df_customer[targets_columns] = df_customer[targets_columns].fillna(0)
# We standardise the number of targets closely linked to the company's operations
df_customer['nb_targets'] = (df_customer['nb_targets'] - (df_customer['nb_targets'].mean())) / (df_customer['nb_targets'].std())
# Merge - purchasing behavior features
df_customer_product = pd.merge(df_customer, df_tickets_kpi, on = 'customer_id', how = 'left')
# Fill NaN values
special_fill_nan = ['customer_id', 'purchase_date_min', 'purchase_date_max', 'time_between_purchase']
simple_fill_nan = [column for column in list(df_tickets_kpi.columns) if column not in special_fill_nan]
df_customer_product[simple_fill_nan] = df_customer_product[simple_fill_nan].fillna(0)
max_interval = (end_features_date - min_date) / np.timedelta64(1, 'D') + 1
df_customer_product[['purchase_date_max', 'purchase_date_min']] = df_customer_product[['purchase_date_max', 'purchase_date_min']].fillna(max_interval)
df_customer_product[['time_between_purchase']] = df_customer_product[['time_between_purchase']].fillna(-1)
# Customers who have neither received an e-mail nor made a purchase during the feature estimation period are removed
df_customer_product = df_customer_product[(df_customer_product['nb_purchases'] > 0) | (df_customer_product['nb_campaigns'] > 0)]
print("Explanatory variable construction : SUCCESS")
# 2. Construction of the explained variable
df_products_purchased_to_predict = df_products_purchased_reduced[(df_products_purchased_reduced['purchase_date'] < max_date) & (df_products_purchased_reduced['purchase_date'] >= end_features_date)]
# Construction of the dependent variable
df_products_purchased_to_predict['y_has_purchased'] = 1
y = df_products_purchased_to_predict[['customer_id', 'y_has_purchased']].drop_duplicates()
print("Explained variable construction : SUCCESS")
# 3. Merge between explained and explanatory variables
dataset = pd.merge(df_customer_product, y, on = ['customer_id'], how = 'left')
# 0 if there is no purchase
dataset[['y_has_purchased']] = dataset[['y_has_purchased']].fillna(0)
# add id_company prefix to customer_id
dataset['customer_id'] = directory_path + '_' + dataset['customer_id'].astype('str')
return dataset
## Exportation
# Sectors
companies = {'musee' : ['1', '2', '3', '4'], # , '101'
'sport': ['5', '6', '7', '8', '9'],
'musique' : ['10', '11', '12', '13', '14']}
# Chosen sector
type_of_comp = input('Choisissez le type de compagnie : sport ? musique ? musee ?')
list_of_comp = companies[type_of_comp]
# Export folder
BUCKET_OUT = f'projet-bdc2324-team1/1_Temp/1_0_Modelling_Datasets/{type_of_comp}'
# Dates used for the construction of features and the dependent variable
start_date = "2021-05-01"
end_of_features = "2022-11-01"
final_date = "2023-11-01"
# Anonymous customers to be deleted from the datasets
anonymous_customer = {'1' : '1_1', '2' : '2_12184', '3' : '3_1', '4' : '4_2', '101' : '101_1',
'5' : '5_191835', '6' : '6_591412', '7' : '7_49632', '8' : '8_1942', '9' : '9_19683',
'10' : '10_19521', '11' : '11_36', '12' : '12_1706757', '13' : '13_8422', '14' : '14_6354'}
for company in list_of_comp:
dataset = dataset_construction(min_date = start_date, end_features_date = end_of_features,
max_date = final_date, directory_path = company)
# Deletion of the anonymous customer
dataset = dataset[dataset['customer_id'] != anonymous_customer[company]]
# Split between train and test
dataset_train, dataset_test = train_test_split(dataset, test_size=0.3, random_state=42)
# Dataset Test
# Export
FILE_KEY_OUT_S3 = "dataset_test" + company + ".csv"
FILE_PATH_OUT_S3 = BUCKET_OUT + "/Test_set/" + FILE_KEY_OUT_S3
with fs.open(FILE_PATH_OUT_S3, 'w') as file_out:
dataset_test.to_csv(file_out, index = False)
print("Export of dataset test : SUCCESS")
# Dataset train
# Export
FILE_KEY_OUT_S3 = "dataset_train" + company + ".csv"
FILE_PATH_OUT_S3 = BUCKET_OUT + "/Train_set/" + FILE_KEY_OUT_S3
with fs.open(FILE_PATH_OUT_S3, 'w') as file_out:
dataset_train.to_csv(file_out, index = False)
print("Export of dataset train : SUCCESS")
print("End of dataset generation for ", type_of_comp," compagnies : SUCCESS")

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

3_Modelling_Datasets.py (new file, 68 lines)
@@ -0,0 +1,68 @@
# Business Data Challenge - Team 1
import pandas as pd
import numpy as np
import os
import s3fs
import re
import warnings
from datetime import date, timedelta, datetime
# Create filesystem object
S3_ENDPOINT_URL = "https://" + os.environ["AWS_S3_ENDPOINT"]
fs = s3fs.S3FileSystem(client_kwargs={'endpoint_url': S3_ENDPOINT_URL})
# Import KPI construction functions
exec(open('utils_features_construction.py').read())
# Ignore warning
warnings.filterwarnings('ignore')
# functions
def generate_test_set(type_of_comp):
file_path_list = fs.ls(f"projet-bdc2324-team1/1_Temp/1_0_Modelling_Datasets/{type_of_comp}/Test_set")
test_set = pd.DataFrame()
for file in file_path_list:
print(file)
with fs.open(file, mode="rb") as file_in:
df = pd.read_csv(file_in, sep=",")
test_set = pd.concat([test_set, df], ignore_index = True)
return test_set
def generate_train_set(type_of_comp):
file_path_list = fs.ls(f"projet-bdc2324-team1/1_Temp/1_0_Modelling_Datasets/{type_of_comp}/Train_set")
train_set = pd.DataFrame()
for file in file_path_list:
print(file)
with fs.open(file, mode="rb") as file_in:
df = pd.read_csv(file_in, sep=",")
train_set = pd.concat([train_set, df], ignore_index = True)
return train_set
type_of_comp = input('Choisissez le type de compagnie : sport ? musique ? musee ?')
BUCKET_OUT = f'projet-bdc2324-team1/1_Temp/1_0_Modelling_Datasets/{type_of_comp}/'
# create test and train datasets
test_set = generate_test_set(type_of_comp)
train_set = generate_train_set(type_of_comp)
# Exportation test set
FILE_KEY_OUT_S3 = "Test_set.csv"
FILE_PATH_OUT_S3 = BUCKET_OUT + FILE_KEY_OUT_S3
with fs.open(FILE_PATH_OUT_S3, 'w') as file_out:
test_set.to_csv(file_out, index = False)
print("Exportation dataset test : SUCCESS")
# Exportation train set
FILE_KEY_OUT_S3 = "Train_set.csv"
FILE_PATH_OUT_S3 = BUCKET_OUT + FILE_KEY_OUT_S3
with fs.open(FILE_PATH_OUT_S3, 'w') as file_out:
train_set.to_csv(file_out, index = False)
print("Exportation dataset train : SUCCESS")

@@ -0,0 +1,82 @@
import pandas as pd
import numpy as np
import os
import io
import s3fs
import re
import warnings
from datetime import date, timedelta, datetime
import matplotlib.pyplot as plt
import matplotlib.dates as mdates
import seaborn as sns
# Ignore warning
warnings.filterwarnings('ignore')
exec(open('utils_features_construction.py').read())
exec(open('utils_stat_desc.py').read())
# Create filesystem object
S3_ENDPOINT_URL = "https://" + os.environ["AWS_S3_ENDPOINT"]
fs = s3fs.S3FileSystem(client_kwargs={'endpoint_url': S3_ENDPOINT_URL})
companies = {'musee' : ['1', '2', '3', '4'], # , '101'
'sport': ['5', '6', '7', '8', '9'],
'musique' : ['10', '11', '12', '13', '14']}
# type_of_activity = input('Choisissez le type de compagnie : sport ? musique ? musee ?')
for type_of_activity in ['musee', 'sport', 'musique'] :
list_of_comp = companies[type_of_activity]
# Load files
customer, campaigns_kpi, campaigns_brut, tickets, products, targets = load_files(list_of_comp)
# Identify anonymous customer for each company and remove them from our datasets
outlier_list = outlier_detection(tickets, list_of_comp)
# Identify valid customer (customer who bought tickets after starting date or received mails after starting date)
customer_valid_list = valid_customer_detection(products, campaigns_brut)
databases = [customer, campaigns_kpi, campaigns_brut, tickets, products]
for dataset in databases:
dataset['customer_id'] = dataset['customer_id'].apply(lambda x: remove_elements(x, outlier_list))# remove outlier
dataset = dataset[dataset['customer_id'].isin(customer_valid_list)] # keep only valid customer
#print(f'shape of {dataset} : ', dataset.shape)
# Identify customer who bought during the period of y
customer_target_period = identify_purchase_during_target_periode(products)
customer['has_purchased_target_period'] = np.where(customer['customer_id'].isin(customer_target_period), 1, 0)
# Generate the graphs and automatically save them to the bucket
compute_nb_clients(customer, type_of_activity)
#maximum_price_paid(customer, type_of_activity)
target_proportion(customer, type_of_activity)
mailing_consent(customer, type_of_activity)
mailing_consent_by_target(customer, type_of_activity)
gender_bar(customer, type_of_activity)
country_bar(customer, type_of_activity)
lazy_customer_plot(campaigns_kpi, type_of_activity)
campaigns_effectiveness(customer, type_of_activity)
sale_dynamics(products, campaigns_brut, type_of_activity)
tickets_internet(tickets, type_of_activity)
already_bought_online(tickets, type_of_activity)
box_plot_price_tickets(tickets, type_of_activity)
target_description(targets, type_of_activity)

5_Modelling.py (new file, 87 lines)
@@ -0,0 +1,87 @@
import pandas as pd
import numpy as np
import os
import io
import s3fs
import re
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report, recall_score
from sklearn.utils import class_weight
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.calibration import calibration_curve
from sklearn.preprocessing import OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import StandardScaler, MaxAbsScaler, MinMaxScaler
from sklearn.metrics import make_scorer, f1_score, balanced_accuracy_score
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.metrics import roc_curve, auc, precision_recall_curve, average_precision_score
from sklearn.exceptions import ConvergenceWarning, DataConversionWarning
import pickle
import warnings
exec(open('utils_ml.py').read())
warnings.filterwarnings('ignore')
warnings.filterwarnings("ignore", category=ConvergenceWarning)
warnings.filterwarnings("ignore", category=DataConversionWarning)
# choose the type of companies for which you want to run the pipeline
type_of_activity = input('Choisissez le type de compagnie : sport ? musique ? musee ?')
# choose the type of model
type_of_model = input('Choisissez le type de model : standard ? premium ?')
# load train and test set
# Create filesystem object
S3_ENDPOINT_URL = "https://" + os.environ["AWS_S3_ENDPOINT"]
fs = s3fs.S3FileSystem(client_kwargs={'endpoint_url': S3_ENDPOINT_URL})
dataset_train, dataset_test = load_train_test(type_of_activity, type_of_model)
X_train, X_test, y_train, y_test = features_target_split(dataset_train, dataset_test)
print("Shape train : ", X_train.shape)
print("Shape test : ", X_test.shape)
# processing
weights = class_weight.compute_class_weight(class_weight = 'balanced', classes = np.unique(y_train['y_has_purchased']),
y = y_train['y_has_purchased'])
weight_dict = {np.unique(y_train['y_has_purchased'])[i]: weights[i] for i in range(len(np.unique(y_train['y_has_purchased'])))}
preproc = preprocess(type_of_model, type_of_activity)
# Object for storing results
model_result = pd.DataFrame(columns= ["Model", "Accuracy", "Recall", "F1_score", "AUC"])
# Naive Bayes
model_result = pipeline_naiveBayes_benchmark(X_train, y_train, X_test, y_test, model_result)
save_result_set_s3(model_result , "resultat", type_of_activity, type_of_model)
print("Naive Bayes : Done")
# Logistic Regression
model_result = pipeline_logreg_benchmark(X_train, y_train, X_test, y_test, model_result)
print("Logistic : Done")
model_result = pipeline_logreg_cv(X_train, y_train, X_test, y_test, model_result)
save_result_set_s3(model_result , "resultat", type_of_activity, type_of_model)
print("Logistic CV : Done")
# Random Forest
model_result = pipeline_randomF_benchmark(X_train, y_train, X_test, y_test, model_result)
save_result_set_s3(model_result , "resultat", type_of_activity, type_of_model)
print("Random Forest : Done")
model_result = pipeline_randomF_cv(X_train, y_train, X_test, y_test, model_result)
save_result_set_s3(model_result , "resultat", type_of_activity, type_of_model)
print("Random Forest CV: Done")
# Save result
save_result_set_s3(model_result , "resultat", type_of_activity, type_of_model)

@@ -0,0 +1,86 @@
# Packages
import pandas as pd
import numpy as np
import os
import io
import s3fs
import re
import pickle
import warnings
import matplotlib.pyplot as plt
from tabulate import tabulate
###################################
# choose the model we use for the segmentation
# model_name = "LogisticRegression_Benchmark"
model_name = "LogisticRegression_cv"
###################################
# execute file including functions we need
exec(open('utils_segmentation.py').read())
warnings.filterwarnings('ignore')
# Create filesystem object
S3_ENDPOINT_URL = "https://" + os.environ["AWS_S3_ENDPOINT"]
fs = s3fs.S3FileSystem(client_kwargs={'endpoint_url': S3_ENDPOINT_URL})
# choose the type of companies for which you want to run the pipeline
# type_of_activity = input('Choisissez le type de compagnie : sport ? musique ? musee ?')
for type_of_activity in ['musee', 'sport', 'musique'] :
# load test set
dataset_test = load_test_file(type_of_activity)
# Load Model
model = load_model(type_of_activity, model_name)
### Preprocessing of data
X_test = dataset_test.drop(columns = 'y_has_purchased')
y_test = dataset_test[['y_has_purchased']]
X_test_segment = X_test
# add y_has_purchased to X_test
X_test_segment["has_purchased"] = y_test
# Add prediction and probability to dataset_test
y_pred = model.predict(X_test)
X_test_segment["has_purchased_estim"] = y_pred
y_pred_prob = model.predict_proba(X_test)[:, 1]
X_test_segment['score'] = y_pred_prob
X_test_segment["segment"] = np.where(X_test_segment['score']<0.25, '1',
np.where(X_test_segment['score']<0.5, '2',
np.where(X_test_segment['score']<0.75, '3', '4')))
### 1. business KPIs
business_var = ["nb_tickets", "nb_purchases", "total_amount", "nb_campaigns"]
X_test_business_fig = df_business_fig(X_test_segment, "segment", business_var)
print(f"business figures for {type_of_activity} companies :\n")
print(X_test_business_fig)
print("\n")
# save histogram to Minio
hist_segment_business_KPIs(X_test_business_fig, "segment", "size", "nb_tickets",
"nb_purchases", "total_amount", "nb_campaigns", type_of_activity)
save_file_s3_mp(File_name = "segments_business_KPI_", type_of_activity = type_of_activity)
### 2. description of marketing personae
## A. Spider chart
radar_mp_plot_all(df = X_test_segment, type_of_activity = type_of_activity)
save_file_s3_mp(File_name = "spider_chart_all_", type_of_activity = type_of_activity)
## B. Latex table
known_sociodemo_caracteristics(df = X_test_segment, type_of_activity = type_of_activity)
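The nested np.where above assigns customers to four segments at fixed score thresholds (0.25, 0.5, 0.75). For reference, an equivalent formulation with pd.cut on made-up scores (illustration only, not part of the repository's code):

import numpy as np
import pandas as pd

# Made-up predicted purchase probabilities
scores = pd.Series([0.05, 0.30, 0.55, 0.90])

# Left-closed bins [0, .25), [.25, .5), [.5, .75), [.75, inf) reproduce the
# nested np.where thresholds, including the boundary handling.
segments = pd.cut(scores, bins=[0, 0.25, 0.5, 0.75, np.inf],
                  right=False, labels=['1', '2', '3', '4'])
print(segments.tolist())  # ['1', '2', '3', '4']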

7_Sales_Forecast.py (new file, 112 lines)
@@ -0,0 +1,112 @@
# imports
import pandas as pd
from pandas import DataFrame
import numpy as np
import os
import s3fs
import matplotlib.pyplot as plt
from scipy.optimize import fsolve
import pickle
import warnings
import io
# ignore warnings
warnings.filterwarnings('ignore')
# Create filesystem object
S3_ENDPOINT_URL = "https://" + os.environ["AWS_S3_ENDPOINT"]
fs = s3fs.S3FileSystem(client_kwargs={'endpoint_url': S3_ENDPOINT_URL})
# import the functions defined in utils_sales_forecast.py
exec(open('utils_sales_forecast.py').read())
# from utils_CA_segment import *
# define type of activity
type_of_activity = input('Choisissez le type de compagnie : sport ? musique ? musee ?')
PATH = f"projet-bdc2324-team1/2_Output/2_3_Sales_Forecast/{type_of_activity}/"
# type of model for the score
type_of_model = "LogisticRegression_cv"
# type_of_model = "LogisticRegression_Benchmark"
# load train and test sets
dataset_train, dataset_test = load_train_test(type_of_activity)
# make features - define X train and X test
X_train, X_test, y_train, y_test = features_target_split(dataset_train, dataset_test)
# choose model - logit cross validated
model = load_model(type_of_activity, type_of_model)
# create table X test segment from X test
X_test_segment = df_segment(X_test, y_test, model)
# comparison with bias of the train set - X train to be defined
X_train_score = model.predict_proba(X_train)[:, 1]
bias_train_set = find_bias(odd_ratios = odd_ratio(adjust_score_1(X_train_score)),
y_objective = y_train["y_has_purchased"].sum(),
initial_guess=10)
print("Bias estimated :", np.log(bias_train_set))
# create a score adjusted with the bias computed
score_adjusted_train = adjusted_score(odd_ratio(adjust_score_1(X_test_segment["score"])), bias = bias_train_set)
X_test_segment["score_adjusted"] = score_adjusted_train
print("The score was successfully adjusted")
MAE_score = abs(X_test_segment["score"]-X_test_segment["has_purchased"]).mean()
MAE_ajusted_score = abs(X_test_segment["score_adjusted"]-X_test_segment["has_purchased"]).mean()
print(f"MAE for score : {MAE_score}")
print(f"MAE for adjusted score : {MAE_ajusted_score}")
### 1. plot adjusted scores and save (to be tested)
plot_hist_scores(X_test_segment, score = "score", score_adjusted = "score_adjusted", type_of_activity = type_of_activity)
save_file_s3_ca("hist_score_adjusted_", type_of_activity)
### 2. comparison between score and adjusted score
X_test_table_adjusted_scores = (100 * X_test_segment.groupby("quartile")[["score","score_adjusted", "has_purchased"]].mean()).round(2).reset_index()
X_test_table_adjusted_scores = X_test_table_adjusted_scores.rename(columns = {col : f"{col} (%)" for col in X_test_table_adjusted_scores.columns if col in ["score","score_adjusted", "has_purchased"]})
print("Table of scores :\n")
print(X_test_table_adjusted_scores)
print("\n")
# save table
file_name = "table_adjusted_score_"
FILE_PATH_OUT_S3 = PATH + file_name + type_of_activity + ".csv"
with fs.open(FILE_PATH_OUT_S3, 'w') as file_out:
X_test_table_adjusted_scores.to_csv(file_out, index = False)
# project revenue
X_test_segment = project_tickets_CA (X_test_segment, "nb_purchases", "nb_tickets", "total_amount", "score_adjusted",
duration_ref=17, duration_projection=12)
### 3. table summarizing projections (nb tickets, revenue)
"""
X_test_expected_CA = round(summary_expected_CA(df=X_test_segment, segment="quartile",
nb_tickets_expected="nb_tickets_expected", total_amount_expected="total_amount_expected",
total_amount="total_amount", pace_purchase="pace_purchase"),2)
"""
X_test_expected_CA = round(summary_expected_CA(df=X_test_segment, segment="quartile",
nb_tickets_expected="nb_tickets_expected", total_amount_expected="total_amount_expected",
total_amount="total_amount_corrected", pace_purchase="pace_purchase"),2)
# rename columns
mapping_dict = {col: col.replace("perct", "(%)").replace("_", " ") for col in X_test_expected_CA.columns}
X_test_expected_CA = X_test_expected_CA.rename(columns=mapping_dict)
print("Summary of forecast :\n")
print(X_test_expected_CA)
print("\n")
# save table
file_name = "table_expected_CA_"
FILE_PATH_OUT_S3 = PATH + file_name + type_of_activity + ".csv"
with fs.open(FILE_PATH_OUT_S3, 'w') as file_out:
X_test_expected_CA.to_csv(file_out, index = False)
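The bias-correction block above calls odd_ratio, adjust_score_1, find_bias and adjusted_score, which are defined in utils_sales_forecast.py and are not shown in this diff. Purely as a speculative sketch of how such an odds-scaling recalibration could be written (matching the call signatures used above, but not the repository's implementation):

# Speculative sketch, not the repository's utils_sales_forecast.py
import numpy as np
from scipy.optimize import fsolve

def odd_ratio(p):
    # probability -> odds
    return p / (1 - p)

def adjust_score_1(p, eps=1e-6):
    # keep probabilities strictly below 1 so the odds stay finite
    return np.minimum(p, 1 - eps)

def adjusted_score(odds, bias):
    # scale the odds by a constant factor, then map back to a probability
    return (bias * odds) / (1 + bias * odds)

def find_bias(odd_ratios, y_objective, initial_guess=10):
    # choose the factor so that the adjusted scores sum to the observed
    # number of buyers on the training set
    equation = lambda b: adjusted_score(odd_ratios, b).sum() - y_objective
    return fsolve(equation, initial_guess)[0]

# Tiny made-up example: raw scores that over-predict purchases
train_scores = np.array([0.9, 0.8, 0.7, 0.6, 0.2])
bias = find_bias(odd_ratio(adjust_score_1(train_scores)), y_objective=1.0)
print(adjusted_score(odd_ratio(adjust_score_1(train_scores)), bias).sum())  # ~1.0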

File diff suppressed because it is too large.

File diff suppressed because one or more lines are too long

@@ -1,825 +0,0 @@
{
"cells": [
{
"cell_type": "markdown",
"id": "aa74dbe0-f974-4b5c-94f4-4dba9fbc64fa",
"metadata": {},
"source": [
"# Business Data Challenge - Team 1"
]
},
{
"cell_type": "code",
"execution_count": 1,
"id": "94c498e7-7c50-45f9-b3f4-a1ab19b7ccc4",
"metadata": {},
"outputs": [],
"source": [
"import pandas as pd\n",
"import numpy as np\n",
"\n",
"\n"
]
},
{
"cell_type": "markdown",
"id": "7a3b50ac-b1ff-4f3d-9938-e048fdc8e027",
"metadata": {},
"source": [
"Configuration de l'accès aux données"
]
},
{
"cell_type": "code",
"execution_count": 2,
"id": "0b029d42-fb02-481e-a407-7e41886198a6",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"['bdc2324-data/1',\n",
" 'bdc2324-data/10',\n",
" 'bdc2324-data/101',\n",
" 'bdc2324-data/11',\n",
" 'bdc2324-data/12',\n",
" 'bdc2324-data/13',\n",
" 'bdc2324-data/14',\n",
" 'bdc2324-data/2',\n",
" 'bdc2324-data/3',\n",
" 'bdc2324-data/4',\n",
" 'bdc2324-data/5',\n",
" 'bdc2324-data/6',\n",
" 'bdc2324-data/7',\n",
" 'bdc2324-data/8',\n",
" 'bdc2324-data/9']"
]
},
"execution_count": 2,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"import os\n",
"import s3fs\n",
"# Create filesystem object\n",
"S3_ENDPOINT_URL = \"https://\" + os.environ[\"AWS_S3_ENDPOINT\"]\n",
"fs = s3fs.S3FileSystem(client_kwargs={'endpoint_url': S3_ENDPOINT_URL})\n",
"\n",
"BUCKET = \"bdc2324-data\"\n",
"fs.ls(BUCKET)"
]
},
{
"cell_type": "code",
"execution_count": 4,
"id": "fbaf9aa7-ff70-4dbe-a969-b801c593510b",
"metadata": {},
"outputs": [],
"source": [
"# Chargement des fichiers campaign_stats.csv\n",
"FILE_PATH_S3 = 'bdc2324-data/1/1campaign_stats.csv'\n",
"\n",
"with fs.open(FILE_PATH_S3, mode=\"rb\") as file_in:\n",
" campaign_stats_1 = pd.read_csv(file_in, sep=\",\")\n",
"\n",
"FILE_PATH_S3 = 'bdc2324-data/2/2campaign_stats.csv'\n",
"\n",
"with fs.open(FILE_PATH_S3, mode=\"rb\") as file_in:\n",
" campaign_stats_2 = pd.read_csv(file_in, sep=\",\")\n",
"\n",
"FILE_PATH_S3 = 'bdc2324-data/3/3campaign_stats.csv'\n",
"\n",
"with fs.open(FILE_PATH_S3, mode=\"rb\") as file_in:\n",
" campaign_stats_3 = pd.read_csv(file_in, sep=\",\")"
]
},
{
"cell_type": "code",
"execution_count": 5,
"id": "1e0418bc-8e97-4a04-b7f3-bda3bef7d36e",
"metadata": {},
"outputs": [],
"source": [
"# Conversion des dates 'sent_at'\n",
"campaign_stats_1['sent_at'] = pd.to_datetime(campaign_stats_1['sent_at'], format = 'ISO8601', utc = True)\n",
"campaign_stats_2['sent_at'] = pd.to_datetime(campaign_stats_2['sent_at'], format = 'ISO8601', utc = True)\n",
"campaign_stats_3['sent_at'] = pd.to_datetime(campaign_stats_3['sent_at'], format = 'ISO8601', utc = True)"
]
},
{
"cell_type": "code",
"execution_count": 6,
"id": "cc5c20ba-e827-4e5a-97a5-7f3947e0621c",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"2023-11-09 18:10:45+00:00\n",
"2020-06-02 08:24:08+00:00\n",
"2023-10-12 01:39:48+00:00\n",
"2023-10-10 17:06:29+00:00\n",
"2023-11-01 09:20:48+00:00\n",
"2021-03-31 14:59:02+00:00\n"
]
}
],
"source": [
"# Chaque unites correspond à une période ? --> Non, les dossiers ont juste pour but de réduire la taille des fichiers\n",
"print(campaign_stats_1['sent_at'].max())\n",
"print(campaign_stats_1['sent_at'].min())\n",
"\n",
"print(campaign_stats_2['sent_at'].max())\n",
"print(campaign_stats_2['sent_at'].min())\n",
"\n",
"print(campaign_stats_3['sent_at'].max())\n",
"print(campaign_stats_3['sent_at'].min())"
]
},
{
"cell_type": "code",
"execution_count": 7,
"id": "c75632df-b018-4bb8-a99d-83f15af94369",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"0 2021-03-28 16:01:09+00:00\n",
"1 2021-03-28 16:01:09+00:00\n",
"2 2021-03-28 16:00:59+00:00\n",
"3 2021-03-28 16:00:59+00:00\n",
"4 2021-03-28 16:01:06+00:00\n",
" ... \n",
"6214803 2023-10-23 09:32:33+00:00\n",
"6214804 2023-10-23 09:32:49+00:00\n",
"6214805 2023-10-23 09:33:28+00:00\n",
"6214806 2023-10-23 09:31:53+00:00\n",
"6214807 2023-10-23 09:33:54+00:00\n",
"Name: sent_at, Length: 6214808, dtype: datetime64[ns, UTC]"
]
},
"execution_count": 7,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"campaign_stats_1['sent_at']"
]
},
{
"cell_type": "markdown",
"id": "f4c0c63e-0418-4cfe-a57d-7af57bca0c22",
"metadata": {},
"source": [
"### Customersplus.csv"
]
},
{
"cell_type": "code",
"execution_count": 8,
"id": "d3bf880d-1065-4d5b-9954-1830aa5081af",
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"/tmp/ipykernel_1362/4118060109.py:9: DtypeWarning: Columns (20) have mixed types. Specify dtype option on import or set low_memory=False.\n",
" customers_plus_2 = pd.read_csv(file_in, sep=\",\")\n"
]
}
],
"source": [
"FILE_PATH_S3 = 'bdc2324-data/1/1customersplus.csv'\n",
"\n",
"with fs.open(FILE_PATH_S3, mode=\"rb\") as file_in:\n",
" customers_plus_1 = pd.read_csv(file_in, sep=\",\")\n",
"\n",
"FILE_PATH_S3 = 'bdc2324-data/2/2customersplus.csv'\n",
"\n",
"with fs.open(FILE_PATH_S3, mode=\"rb\") as file_in:\n",
" customers_plus_2 = pd.read_csv(file_in, sep=\",\")"
]
},
{
"cell_type": "code",
"execution_count": 10,
"id": "7368f381-db8e-4a4d-9fe2-5947eb55be58",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"Index(['id', 'lastname', 'firstname', 'birthdate', 'email', 'street_id',\n",
" 'created_at', 'updated_at', 'civility', 'is_partner', 'extra',\n",
" 'deleted_at', 'reference', 'gender', 'is_email_true', 'extra_field',\n",
" 'identifier', 'opt_in', 'structure_id', 'note', 'profession',\n",
" 'language', 'mcp_contact_id', 'need_reload', 'last_buying_date',\n",
" 'max_price', 'ticket_sum', 'average_price', 'fidelity',\n",
" 'average_purchase_delay', 'average_price_basket',\n",
" 'average_ticket_basket', 'total_price', 'preferred_category',\n",
" 'preferred_supplier', 'preferred_formula', 'purchase_count',\n",
" 'first_buying_date', 'last_visiting_date', 'zipcode', 'country', 'age',\n",
" 'tenant_id'],\n",
" dtype='object')"
]
},
"execution_count": 10,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"customers_plus_1.columns"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "08091935-b159-47fa-806c-e1444f3b227e",
"metadata": {},
"outputs": [],
"source": [
"customers_plus_1.shape"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "9f8c8868-c1ac-4cee-af08-533d928f6764",
"metadata": {},
"outputs": [],
"source": [
"customers_plus_1['id'].nunique()"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "bf95daf2-4852-4718-b474-207a1ebd8ac4",
"metadata": {},
"outputs": [],
"source": [
"customers_plus_2['id'].nunique()"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "1425c385-3216-4e4f-ae8f-a121624721ba",
"metadata": {},
"outputs": [],
"source": [
"common_id = set(customers_plus_2['id']).intersection(customers_plus_1['id'])"
]
},
{
"cell_type": "code",
"execution_count": 61,
"id": "92533026-e27c-4f1f-81ca-64eda32a34c0",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"1"
]
},
"execution_count": 61,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"common_id = set(customers_plus_2['id']).intersection(customers_plus_1['id'])\n",
"# Exemple id commun = caractéristiques communes\n",
"print(customers_plus_2[customers_plus_2['id'] == list(common_id)[0]])\n",
"\n",
"print(customers_plus_1[customers_plus_1['id'] == list(common_id)[0]])"
]
},
{
"cell_type": "code",
"execution_count": 49,
"id": "bf9ebc94-0ba6-443d-8e53-22477a6e79a7",
"metadata": {
"scrolled": true
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"id 0.000000\n",
"lastname 43.461341\n",
"firstname 44.995588\n",
"birthdate 96.419870\n",
"email 8.622075\n",
"street_id 0.000000\n",
"created_at 0.000000\n",
"updated_at 0.000000\n",
"civility 100.000000\n",
"is_partner 0.000000\n",
"extra 100.000000\n",
"deleted_at 100.000000\n",
"reference 100.000000\n",
"gender 0.000000\n",
"is_email_true 0.000000\n",
"extra_field 100.000000\n",
"identifier 0.000000\n",
"opt_in 0.000000\n",
"structure_id 88.072380\n",
"note 99.403421\n",
"profession 95.913503\n",
"language 99.280945\n",
"mcp_contact_id 34.876141\n",
"need_reload 0.000000\n",
"last_buying_date 51.653431\n",
"max_price 51.653431\n",
"ticket_sum 0.000000\n",
"average_price 8.639195\n",
"fidelity 0.000000\n",
"average_purchase_delay 51.653431\n",
"average_price_basket 51.653431\n",
"average_ticket_basket 51.653431\n",
"total_price 43.014236\n",
"preferred_category 100.000000\n",
"preferred_supplier 100.000000\n",
"preferred_formula 100.000000\n",
"purchase_count 0.000000\n",
"first_buying_date 51.653431\n",
"last_visiting_date 100.000000\n",
"zipcode 71.176564\n",
"country 5.459418\n",
"age 96.419870\n",
"tenant_id 0.000000\n",
"dtype: float64\n"
]
}
],
"source": [
"pd.DataFrame(customers_plus_1.isna().mean()*100)"
]
},
{
"cell_type": "code",
"execution_count": 11,
"id": "6d62e73f-3925-490f-9fd4-d0e838903cb2",
"metadata": {},
"outputs": [],
"source": [
"# Chargement de toutes les données\n",
"liste_base = ['customer_target_mappings', 'customersplus', 'target_types', 'tags', 'events', 'tickets', 'representations', 'purchases', 'products']\n",
"\n",
"for nom_base in liste_base:\n",
" FILE_PATH_S3 = 'bdc2324-data/11/11' + nom_base + '.csv'\n",
" with fs.open(FILE_PATH_S3, mode=\"rb\") as file_in:\n",
" globals()[nom_base] = pd.read_csv(file_in, sep=\",\")"
]
},
{
"cell_type": "code",
"execution_count": 12,
"id": "12b24f1c-eb3e-45be-aaf3-b9273180caa3",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>id</th>\n",
" <th>lastname</th>\n",
" <th>firstname</th>\n",
" <th>birthdate</th>\n",
" <th>email</th>\n",
" <th>street_id</th>\n",
" <th>created_at</th>\n",
" <th>updated_at</th>\n",
" <th>civility</th>\n",
" <th>is_partner</th>\n",
" <th>...</th>\n",
" <th>tenant_id</th>\n",
" <th>id_x</th>\n",
" <th>customer_id</th>\n",
" <th>purchase_date</th>\n",
" <th>type_of</th>\n",
" <th>is_from_subscription</th>\n",
" <th>amount</th>\n",
" <th>is_full_price</th>\n",
" <th>start_date_time</th>\n",
" <th>event_name</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>405082</td>\n",
" <td>lastname405082</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>6</td>\n",
" <td>2023-01-12 06:30:31.197484+01:00</td>\n",
" <td>2023-01-12 06:30:31.197484+01:00</td>\n",
" <td>NaN</td>\n",
" <td>False</td>\n",
" <td>...</td>\n",
" <td>1556</td>\n",
" <td>992423</td>\n",
" <td>405082</td>\n",
" <td>2023-01-11 17:08:41+01:00</td>\n",
" <td>3</td>\n",
" <td>False</td>\n",
" <td>13.0</td>\n",
" <td>False</td>\n",
" <td>2023-02-06 20:00:00+01:00</td>\n",
" <td>zaide</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>405082</td>\n",
" <td>lastname405082</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>6</td>\n",
" <td>2023-01-12 06:30:31.197484+01:00</td>\n",
" <td>2023-01-12 06:30:31.197484+01:00</td>\n",
" <td>NaN</td>\n",
" <td>False</td>\n",
" <td>...</td>\n",
" <td>1556</td>\n",
" <td>992423</td>\n",
" <td>405082</td>\n",
" <td>2023-01-11 17:08:41+01:00</td>\n",
" <td>3</td>\n",
" <td>False</td>\n",
" <td>13.0</td>\n",
" <td>False</td>\n",
" <td>2023-02-06 20:00:00+01:00</td>\n",
" <td>zaide</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>411168</td>\n",
" <td>lastname411168</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>6</td>\n",
" <td>2023-03-17 06:30:35.431967+01:00</td>\n",
" <td>2023-03-17 06:30:35.431967+01:00</td>\n",
" <td>NaN</td>\n",
" <td>False</td>\n",
" <td>...</td>\n",
" <td>1556</td>\n",
" <td>1053934</td>\n",
" <td>411168</td>\n",
" <td>2023-03-16 16:23:10+01:00</td>\n",
" <td>3</td>\n",
" <td>False</td>\n",
" <td>62.0</td>\n",
" <td>False</td>\n",
" <td>2023-03-19 16:00:00+01:00</td>\n",
" <td>luisa miller</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>411168</td>\n",
" <td>lastname411168</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>6</td>\n",
" <td>2023-03-17 06:30:35.431967+01:00</td>\n",
" <td>2023-03-17 06:30:35.431967+01:00</td>\n",
" <td>NaN</td>\n",
" <td>False</td>\n",
" <td>...</td>\n",
" <td>1556</td>\n",
" <td>1053934</td>\n",
" <td>411168</td>\n",
" <td>2023-03-16 16:23:10+01:00</td>\n",
" <td>3</td>\n",
" <td>False</td>\n",
" <td>62.0</td>\n",
" <td>False</td>\n",
" <td>2023-03-19 16:00:00+01:00</td>\n",
" <td>luisa miller</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>4380</td>\n",
" <td>lastname4380</td>\n",
" <td>firstname4380</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>1</td>\n",
" <td>2021-04-22 14:51:55.432952+02:00</td>\n",
" <td>2022-04-14 11:41:33.738500+02:00</td>\n",
" <td>NaN</td>\n",
" <td>False</td>\n",
" <td>...</td>\n",
" <td>1556</td>\n",
" <td>1189141</td>\n",
" <td>4380</td>\n",
" <td>2020-11-26 13:12:53+01:00</td>\n",
" <td>3</td>\n",
" <td>False</td>\n",
" <td>51.3</td>\n",
" <td>False</td>\n",
" <td>2020-12-01 20:00:00+01:00</td>\n",
" <td>iphigenie en tauride</td>\n",
" </tr>\n",
" <tr>\n",
" <th>...</th>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>318964</th>\n",
" <td>19095</td>\n",
" <td>lastname19095</td>\n",
" <td>firstname19095</td>\n",
" <td>1979-07-16</td>\n",
" <td>email19095</td>\n",
" <td>6</td>\n",
" <td>2021-04-22 15:06:30.120537+02:00</td>\n",
" <td>2023-09-12 18:27:36.904104+02:00</td>\n",
" <td>NaN</td>\n",
" <td>False</td>\n",
" <td>...</td>\n",
" <td>1556</td>\n",
" <td>1090839</td>\n",
" <td>19095</td>\n",
" <td>2019-05-19 21:18:36+02:00</td>\n",
" <td>1</td>\n",
" <td>False</td>\n",
" <td>4.5</td>\n",
" <td>False</td>\n",
" <td>2019-05-27 20:00:00+02:00</td>\n",
" <td>entre femmes</td>\n",
" </tr>\n",
" <tr>\n",
" <th>318965</th>\n",
" <td>19095</td>\n",
" <td>lastname19095</td>\n",
" <td>firstname19095</td>\n",
" <td>1979-07-16</td>\n",
" <td>email19095</td>\n",
" <td>6</td>\n",
" <td>2021-04-22 15:06:30.120537+02:00</td>\n",
" <td>2023-09-12 18:27:36.904104+02:00</td>\n",
" <td>NaN</td>\n",
" <td>False</td>\n",
" <td>...</td>\n",
" <td>1556</td>\n",
" <td>1090839</td>\n",
" <td>19095</td>\n",
" <td>2019-05-19 21:18:36+02:00</td>\n",
" <td>1</td>\n",
" <td>False</td>\n",
" <td>4.5</td>\n",
" <td>False</td>\n",
" <td>2019-05-27 20:00:00+02:00</td>\n",
" <td>entre femmes</td>\n",
" </tr>\n",
" <tr>\n",
" <th>318966</th>\n",
" <td>19095</td>\n",
" <td>lastname19095</td>\n",
" <td>firstname19095</td>\n",
" <td>1979-07-16</td>\n",
" <td>email19095</td>\n",
" <td>6</td>\n",
" <td>2021-04-22 15:06:30.120537+02:00</td>\n",
" <td>2023-09-12 18:27:36.904104+02:00</td>\n",
" <td>NaN</td>\n",
" <td>False</td>\n",
" <td>...</td>\n",
" <td>1556</td>\n",
" <td>1090839</td>\n",
" <td>19095</td>\n",
" <td>2019-05-19 21:18:36+02:00</td>\n",
" <td>1</td>\n",
" <td>False</td>\n",
" <td>4.5</td>\n",
" <td>False</td>\n",
" <td>2019-05-27 20:00:00+02:00</td>\n",
" <td>entre femmes</td>\n",
" </tr>\n",
" <tr>\n",
" <th>318967</th>\n",
" <td>19095</td>\n",
" <td>lastname19095</td>\n",
" <td>firstname19095</td>\n",
" <td>1979-07-16</td>\n",
" <td>email19095</td>\n",
" <td>6</td>\n",
" <td>2021-04-22 15:06:30.120537+02:00</td>\n",
" <td>2023-09-12 18:27:36.904104+02:00</td>\n",
" <td>NaN</td>\n",
" <td>False</td>\n",
" <td>...</td>\n",
" <td>1556</td>\n",
" <td>1244277</td>\n",
" <td>19095</td>\n",
" <td>2019-12-31 11:04:07+01:00</td>\n",
" <td>1</td>\n",
" <td>False</td>\n",
" <td>5.5</td>\n",
" <td>False</td>\n",
" <td>2020-02-03 20:00:00+01:00</td>\n",
" <td>a boire et a manger</td>\n",
" </tr>\n",
" <tr>\n",
" <th>318968</th>\n",
" <td>19095</td>\n",
" <td>lastname19095</td>\n",
" <td>firstname19095</td>\n",
" <td>1979-07-16</td>\n",
" <td>email19095</td>\n",
" <td>6</td>\n",
" <td>2021-04-22 15:06:30.120537+02:00</td>\n",
" <td>2023-09-12 18:27:36.904104+02:00</td>\n",
" <td>NaN</td>\n",
" <td>False</td>\n",
" <td>...</td>\n",
" <td>1556</td>\n",
" <td>1244277</td>\n",
" <td>19095</td>\n",
" <td>2019-12-31 11:04:07+01:00</td>\n",
" <td>1</td>\n",
" <td>False</td>\n",
" <td>5.5</td>\n",
" <td>False</td>\n",
" <td>2020-02-03 20:00:00+01:00</td>\n",
" <td>a boire et a manger</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"<p>318969 rows × 52 columns</p>\n",
"</div>"
],
"text/plain": [
" id lastname firstname birthdate email \\\n",
"0 405082 lastname405082 NaN NaN NaN \n",
"1 405082 lastname405082 NaN NaN NaN \n",
"2 411168 lastname411168 NaN NaN NaN \n",
"3 411168 lastname411168 NaN NaN NaN \n",
"4 4380 lastname4380 firstname4380 NaN NaN \n",
"... ... ... ... ... ... \n",
"318964 19095 lastname19095 firstname19095 1979-07-16 email19095 \n",
"318965 19095 lastname19095 firstname19095 1979-07-16 email19095 \n",
"318966 19095 lastname19095 firstname19095 1979-07-16 email19095 \n",
"318967 19095 lastname19095 firstname19095 1979-07-16 email19095 \n",
"318968 19095 lastname19095 firstname19095 1979-07-16 email19095 \n",
"\n",
" street_id created_at \\\n",
"0 6 2023-01-12 06:30:31.197484+01:00 \n",
"1 6 2023-01-12 06:30:31.197484+01:00 \n",
"2 6 2023-03-17 06:30:35.431967+01:00 \n",
"3 6 2023-03-17 06:30:35.431967+01:00 \n",
"4 1 2021-04-22 14:51:55.432952+02:00 \n",
"... ... ... \n",
"318964 6 2021-04-22 15:06:30.120537+02:00 \n",
"318965 6 2021-04-22 15:06:30.120537+02:00 \n",
"318966 6 2021-04-22 15:06:30.120537+02:00 \n",
"318967 6 2021-04-22 15:06:30.120537+02:00 \n",
"318968 6 2021-04-22 15:06:30.120537+02:00 \n",
"\n",
" updated_at civility is_partner ... \\\n",
"0 2023-01-12 06:30:31.197484+01:00 NaN False ... \n",
"1 2023-01-12 06:30:31.197484+01:00 NaN False ... \n",
"2 2023-03-17 06:30:35.431967+01:00 NaN False ... \n",
"3 2023-03-17 06:30:35.431967+01:00 NaN False ... \n",
"4 2022-04-14 11:41:33.738500+02:00 NaN False ... \n",
"... ... ... ... ... \n",
"318964 2023-09-12 18:27:36.904104+02:00 NaN False ... \n",
"318965 2023-09-12 18:27:36.904104+02:00 NaN False ... \n",
"318966 2023-09-12 18:27:36.904104+02:00 NaN False ... \n",
"318967 2023-09-12 18:27:36.904104+02:00 NaN False ... \n",
"318968 2023-09-12 18:27:36.904104+02:00 NaN False ... \n",
"\n",
" tenant_id id_x customer_id purchase_date type_of \\\n",
"0 1556 992423 405082 2023-01-11 17:08:41+01:00 3 \n",
"1 1556 992423 405082 2023-01-11 17:08:41+01:00 3 \n",
"2 1556 1053934 411168 2023-03-16 16:23:10+01:00 3 \n",
"3 1556 1053934 411168 2023-03-16 16:23:10+01:00 3 \n",
"4 1556 1189141 4380 2020-11-26 13:12:53+01:00 3 \n",
"... ... ... ... ... ... \n",
"318964 1556 1090839 19095 2019-05-19 21:18:36+02:00 1 \n",
"318965 1556 1090839 19095 2019-05-19 21:18:36+02:00 1 \n",
"318966 1556 1090839 19095 2019-05-19 21:18:36+02:00 1 \n",
"318967 1556 1244277 19095 2019-12-31 11:04:07+01:00 1 \n",
"318968 1556 1244277 19095 2019-12-31 11:04:07+01:00 1 \n",
"\n",
" is_from_subscription amount is_full_price start_date_time \\\n",
"0 False 13.0 False 2023-02-06 20:00:00+01:00 \n",
"1 False 13.0 False 2023-02-06 20:00:00+01:00 \n",
"2 False 62.0 False 2023-03-19 16:00:00+01:00 \n",
"3 False 62.0 False 2023-03-19 16:00:00+01:00 \n",
"4 False 51.3 False 2020-12-01 20:00:00+01:00 \n",
"... ... ... ... ... \n",
"318964 False 4.5 False 2019-05-27 20:00:00+02:00 \n",
"318965 False 4.5 False 2019-05-27 20:00:00+02:00 \n",
"318966 False 4.5 False 2019-05-27 20:00:00+02:00 \n",
"318967 False 5.5 False 2020-02-03 20:00:00+01:00 \n",
"318968 False 5.5 False 2020-02-03 20:00:00+01:00 \n",
"\n",
" event_name \n",
"0 zaide \n",
"1 zaide \n",
"2 luisa miller \n",
"3 luisa miller \n",
"4 iphigenie en tauride \n",
"... ... \n",
"318964 entre femmes \n",
"318965 entre femmes \n",
"318966 entre femmes \n",
"318967 a boire et a manger \n",
"318968 a boire et a manger \n",
"\n",
"[318969 rows x 52 columns]"
]
},
"execution_count": 12,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Jointure\n",
"merge_1 = pd.merge(purchases, tickets, left_on='id', right_on='purchase_id', how='inner')[['id_x', 'customer_id','product_id', 'purchase_date', 'type_of', 'is_from_subscription']]\n",
"merge_2 = pd.merge(products, merge_1, left_on='id', right_on='product_id', how='inner')[['id_x', 'customer_id', 'representation_id', 'purchase_date', 'type_of', 'is_from_subscription', 'amount', 'is_full_price']]\n",
"merge_3 = pd.merge(representations, merge_2, left_on='id', right_on='representation_id', how='inner')[['id_x', 'customer_id', 'event_id', 'purchase_date', 'type_of', 'is_from_subscription', 'amount', 'is_full_price', 'start_date_time']]\n",
"merge_4 = pd.merge(events, merge_3, left_on='id', right_on='event_id', how='inner')[['id_x', 'customer_id', 'purchase_date', 'type_of', 'is_from_subscription', 'amount', 'is_full_price', 'start_date_time', 'name']]\n",
"merge_4 = merge_4.rename(columns={'name': 'event_name'})\n",
"df_customer_event = pd.merge(customersplus, merge_4, left_on = 'id', right_on = 'customer_id', how = 'inner')[['id_x', 'purchase_date', 'type_of', 'is_from_subscription', 'amount', 'is_full_price', 'start_date_time', 'event_name']]\n",
"df_customer_event"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.10.13"
}
},
"nbformat": 4,
"nbformat_minor": 5
}

69
README.md Normal file
View File

@@ -0,0 +1,69 @@
# Business data challenge 2023-2024 | ENSAE Paris
# Arenametrix: customer segmentation
<p align="center">
<img src="https://dev.arenametrix.fr/assets/logo_ax-806e8204f49bcc2c5e8cd34e9748d16a6038404e37fdb2dc9d61455bb06c6461.png" width=300>
</p>
## Team 1
* Antoine JOUBREL
* Alexis REVELLE
* Fanta RODRIGUE
* Thomas PIQUÉ
## Coaches
* Elia LAPENTA
* Michael VISSER
## Support team
* Patrice MICHEL (Datastorm)
* Hassan MAISSORO (Datastorm)
* Alexandre PRINC (Arenametrix)
## Microeconomics coordinator
* Yuanzhe TANG
### Description of the problem
The goal of this project is to create customer segments for 15 companies belonging to 3 different types of activities (sports companies, museums, and music companies).
### More detailed instructions provided by Arenametrix
- Definition of “marketing personae” that can be matched with a probability of buying a future event
- Matching between future events and people in the database (with, for instance, a probability of buying a future event)
- And thus, a forecast of the quantity of tickets sold per event, by “marketing persona” or by segment of the database
- BONUS: What is the best timing to send a communication to each contact in the database and each “marketing persona”?
- BONUS: What should we tell each contact in the database and each “marketing persona” to make them come back?
### Our approach
We opted for a sector-based approach, which means that 3 segmentations have been performed (one for each type of activity).
As the segments have to be linked to a probability of future purchase, we build them directly from the estimated probability of purchasing during the coming year. The first step of the modeling is a pipeline that fits 3 ML models (naive Bayes, random forest, and logistic regression) on the data to predict whether a customer will purchase during the year. We then use the estimated purchase probability to split the customers into 4 segments. For each segment, we can estimate the potential number of tickets sold and the revenue for the coming year.
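As a minimal illustration (not the project's exact pipeline, which also fits naive Bayes and random forest models with cross-validation), the scoring-and-segmentation step could look like the sketch below; the target column `y_has_purchased` appears in the project code, while the feature list, the preprocessing steps and the quartile-based split are assumptions made for this example.
```python
import pandas as pd
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

def score_and_segment(train: pd.DataFrame, test: pd.DataFrame, features: list[str]) -> pd.DataFrame:
    """Fit a purchase-propensity model on the train set and split test customers into 4 segments."""
    model = Pipeline(steps=[
        ("imputer", SimpleImputer(strategy="constant", fill_value=0)),
        ("scaler", StandardScaler()),
        ("clf", LogisticRegression(max_iter=5000, class_weight="balanced")),
    ])
    model.fit(train[features], train["y_has_purchased"])

    scored = test.copy()
    # Propensity score = estimated probability of purchasing during the coming year
    scored["score"] = model.predict_proba(test[features])[:, 1]
    # One possible segmentation rule: quartiles of the propensity score
    scored["segment"] = pd.qcut(scored["score"], q=4, labels=[1, 2, 3, 4])
    return scored
```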
### How to run the code
Scripts must be run in the order given by their numbers. Each of them is described below:
- `1_Input_cleaning.py` \
Clean raw data and generate dataframes that will be used to build datasets with insightful variables. Datasets are exported to location 0_Input/.
- `2_Datasets_generation.py` \
Use the dataframes previously created and aggregate them to build the test and train sets for each company. Databases are exported to location 1_Temp/1_0_Modelling_Datasets/ in a folder containing all 5 databases for a type of activity.
- `3_Modelling_datasets.py` \
For each type of activity, the test and train sets of the 5 tenants are concatenated. Databases are exported to location 1_Temp/1_0_Modelling_Datasets/.
- `4_Descriptive_statistics.py` \
Generate graphics providing some descriptive statistics about the data at the activity level. All graphics are exported to location 2_Output/2_0_Descriptive_Statistics/.
- `5_Modelling.py` \
3 ML models are fitted on the data, and the results are exported for all 3 types of activities. \
3 pipelines are built, one per type of model (Naive Bayes, Random Forest, Logistic Regression). For the latter 2 methods, cross-validation is performed to ensure generalization. Graphics displaying the quality of the training are provided. The optimal parameters found are saved in a pickle file (used in the 6th step to add propensity scores to the test set and then determine the customer segments). All these files are exported to location 2_Output/2_1_Modeling_results/
- `6_Segmentation_and_Marketing_Personae.py` \
The model with the optimal parameters computed previously is applied to the test set, and a propensity score (probability of a future purchase) is assigned to each customer of this dataset. Segmentation is performed according to these scores. Graphics describing the marketing personae associated with the segments, as well as their business value, are exported to location 2_Output/2_2_Segmentation_and_Marketing_Personae/.
- `7_Sales_Forecast.py` \
To ensure a decent recall, and because the target variable y is imbalanced (the overall probability of purchase lies between 4 and 14 %), the predicted purchase probabilities are overestimated. The scores are therefore adjusted so that their mean approximates the overall probability of a purchase (see the sketch below). This adjusted score is used to estimate, for each customer, the number of tickets sold and the revenue generated during the coming year. Results are aggregated at the segment level. A histogram displaying the adjusted propensity scores and 2 tables summarizing the forecast outcome are exported to location 2_Output/2_3_Sales_Forecast/.
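For illustration only (the exact implementation lives in `7_Sales_Forecast.py`), the score adjustment and the per-customer forecast can be sketched as below; the multiplicative rescaling rule, the columns `nb_tickets_per_year` and `avg_ticket_price`, and the value `0.08` for the observed purchase rate are assumptions made for this example.
```python
import pandas as pd

def adjust_and_forecast(scored: pd.DataFrame, observed_purchase_rate: float) -> pd.DataFrame:
    """Rescale propensity scores so their mean matches the observed purchase rate,
    then derive a per-customer ticket and revenue forecast."""
    out = scored.copy()
    # Multiplicative correction of the overestimated probabilities, capped at 1
    correction = observed_purchase_rate / out["score"].mean()
    out["score_adjusted"] = (out["score"] * correction).clip(upper=1.0)
    # Expected tickets and revenue for the coming year (hypothetical historical columns)
    out["expected_nb_tickets"] = out["score_adjusted"] * out["nb_tickets_per_year"]
    out["expected_revenue"] = out["expected_nb_tickets"] * out["avg_ticket_price"]
    return out

# Aggregate the forecast at segment level, as in the exported summary tables:
# forecast = adjust_and_forecast(scored_test_set, observed_purchase_rate=0.08)
# forecast.groupby("segment")[["expected_nb_tickets", "expected_revenue"]].sum()
```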

File diff suppressed because it is too large

File diff suppressed because it is too large

File diff suppressed because it is too large

File diff suppressed because one or more lines are too long

File diff suppressed because it is too large

249
all_packages_versions.txt Normal file
View File

@@ -0,0 +1,249 @@
Package Version
------------------------- ---------------
aiohttp 3.9.1
aiosignal 1.3.1
alembic 1.13.1
anyio 4.2.0
archspec 0.2.2
argon2-cffi 23.1.0
argon2-cffi-bindings 21.2.0
arrow 1.3.0
astroid 3.0.2
asttokens 2.4.1
async-lru 2.0.4
attrs 23.2.0
Babel 2.14.0
bcrypt 4.1.2
beautifulsoup4 4.12.3
bleach 6.1.0
blinker 1.7.0
bokeh 3.3.4
boltons 23.1.1
boto3 1.34.29
botocore 1.34.29
branca 0.7.0
Brotli 1.1.0
cached-property 1.5.2
cachetools 5.3.2
certifi 2023.11.17
cffi 1.16.0
charset-normalizer 3.3.2
click 8.1.7
click-plugins 1.1.1
cligj 0.7.2
cloudpickle 3.0.0
colorama 0.4.6
comm 0.2.1
conda 23.11.0
conda-libmamba-solver 23.12.0
conda-package-handling 2.2.0
conda_package_streaming 0.9.0
configparser 5.3.0
contourpy 1.2.0
cryptography 41.0.7
cycler 0.12.1
cytoolz 0.12.2
dask 2024.1.1
databricks-cli 0.18.0
debugpy 1.8.0
decorator 5.1.1
defusedxml 0.7.1
dill 0.3.8
distributed 2024.1.1
distro 1.8.0
docker 7.0.0
duckdb 0.9.2
entrypoints 0.4
exceptiongroup 1.2.0
executing 2.0.1
fastjsonschema 2.19.1
fiona 1.9.5
flake8 7.0.0
Flask 3.0.1
folium 0.15.1
fonttools 4.47.2
fqdn 1.5.1
frozenlist 1.4.1
fsspec 2023.12.2
GDAL 3.8.3
gensim 4.3.2
geopandas 0.14.2
gitdb 4.0.11
GitPython 3.1.41
google-auth 2.27.0
greenlet 3.0.3
gunicorn 21.2.0
hvac 2.1.0
idna 3.6
importlib-metadata 7.0.1
importlib-resources 6.1.1
ipykernel 6.29.0
ipython 8.20.0
ipywidgets 8.1.1
isoduration 20.11.0
isort 5.13.2
itsdangerous 2.1.2
jedi 0.19.1
Jinja2 3.1.3
jmespath 1.0.1
joblib 1.3.2
json5 0.9.14
jsonpatch 1.33
jsonpointer 2.4
jsonschema 4.21.1
jsonschema-specifications 2023.12.1
jupyter-cache 1.0.0
jupyter_client 8.6.0
jupyter_core 5.7.1
jupyter-events 0.9.0
jupyter-lsp 2.2.2
jupyter_server 2.12.5
jupyter-server-mathjax 0.2.6
jupyter_server_terminals 0.5.2
jupyterlab 4.0.11
jupyterlab_git 0.50.0
jupyterlab_pygments 0.3.0
jupyterlab_server 2.25.2
jupyterlab-widgets 3.0.9
kiwisolver 1.4.5
kubernetes 29.0.0
libmambapy 1.5.5
llvmlite 0.41.1
locket 1.0.0
lz4 4.3.3
Mako 1.3.1
mamba 1.5.5
mapclassify 2.6.1
Markdown 3.5.2
MarkupSafe 2.1.4
matplotlib 3.8.2
matplotlib-inline 0.1.6
mccabe 0.7.0
menuinst 2.0.2
mistune 3.0.2
mlflow 2.10.0
msgpack 1.0.7
multidict 6.0.4
munkres 1.1.4
mypy 1.8.0
mypy-extensions 1.0.0
nbclient 0.8.0
nbconvert 7.14.2
nbdime 4.0.1
nbformat 5.9.2
nest_asyncio 1.6.0
networkx 3.2.1
nltk 3.8.1
notebook_shim 0.2.3
numba 0.58.1
numpy 1.26.3
oauthlib 3.2.2
opencv-python-headless 4.9.0.80
overrides 7.7.0
packaging 23.2
pandas 2.2.0
pandocfilters 1.5.0
paramiko 3.4.0
parso 0.8.3
partd 1.4.1
patsy 0.5.6
pexpect 4.9.0
pickleshare 0.7.5
pillow 10.2.0
pip 23.3.2
pkgutil_resolve_name 1.3.10
platformdirs 4.1.0
plotly 5.18.0
pluggy 1.3.0
polars 0.20.6
prometheus-client 0.19.0
prometheus-flask-exporter 0.23.0
prompt-toolkit 3.0.42
protobuf 4.24.4
psutil 5.9.8
ptyprocess 0.7.0
pure-eval 0.2.2
pyarrow 14.0.2
pyarrow-hotfix 0.6
pyasn1 0.5.1
pyasn1-modules 0.3.0
pycodestyle 2.11.1
pycosat 0.6.6
pycparser 2.21
pyflakes 3.2.0
Pygments 2.17.2
PyJWT 2.8.0
pylint 3.0.3
PyNaCl 1.5.0
pyOpenSSL 23.3.0
pyparsing 3.1.1
pyproj 3.6.1
PySocks 1.7.1
python-dateutil 2.8.2
python-json-logger 2.0.7
pytz 2023.3.post1
pyu2f 0.1.5
PyYAML 6.0.1
pyzmq 25.1.2
querystring-parser 1.2.4
referencing 0.32.1
regex 2023.12.25
requests 2.31.0
requests-oauthlib 1.3.1
rfc3339-validator 0.1.4
rfc3986-validator 0.1.1
rpds-py 0.17.1
rsa 4.9
Rtree 1.2.0
ruamel.yaml 0.18.5
ruamel.yaml.clib 0.2.7
s3fs 0.4.2
s3transfer 0.10.0
scikit-learn 1.4.0
scipy 1.12.0
seaborn 0.13.2
Send2Trash 1.8.2
setuptools 68.2.2
shapely 2.0.2
six 1.16.0
smart-open 6.4.0
smmap 5.0.0
sniffio 1.3.0
sortedcontainers 2.4.0
soupsieve 2.5
SQLAlchemy 2.0.25
sqlparse 0.4.4
stack-data 0.6.2
statsmodels 0.14.1
tabulate 0.9.0
tblib 3.0.0
tenacity 8.2.3
terminado 0.18.0
threadpoolctl 3.2.0
tinycss2 1.2.1
tomli 2.0.1
tomlkit 0.12.3
toolz 0.12.1
tornado 6.3.3
tqdm 4.66.1
traitlets 5.14.1
truststore 0.8.0
types-python-dateutil 2.8.19.20240106
typing_extensions 4.9.0
typing-utils 0.1.0
tzdata 2023.4
uri-template 1.3.0
urllib3 1.26.18
wcwidth 0.2.13
webcolors 1.13
webencodings 0.5.1
websocket-client 1.7.0
Werkzeug 3.0.1
wheel 0.42.0
widgetsnbextension 4.0.9
xgboost 2.0.3
xyzservices 2023.10.1
yarl 1.9.4
zict 3.0.0
zipp 3.17.0
zstandard 0.22.0

File diff suppressed because it is too large

View File

@@ -74,53 +74,11 @@ def preprocessing_customerplus(directory_path):
cleaning_date(customerplus_copy, 'last_visiting_date')
# Select variables
customerplus_copy.drop(['lastname', 'firstname', 'birthdate', 'profession', 'language', 'age', 'email', 'civility', 'note', 'extra', 'reference', 'extra_field', 'need_reload', 'preferred_category', 'preferred_supplier', 'preferred_formula', 'zipcode', 'last_visiting_date'], axis = 1, inplace=True)
customerplus_copy.drop(['lastname', 'firstname', 'birthdate', 'language', 'email', 'civility', 'note', 'extra', 'reference', 'extra_field', 'need_reload'], axis = 1, inplace=True) # 'preferred_category', 'preferred_supplier', 'preferred_formula', 'mcp_contact_id', 'last_visiting_date', 'deleted_at', 'last_buying_date', 'max_price', 'ticket_sum', 'average_price', 'average_purchase_delay' , 'average_price_basket', 'average_ticket_basket', 'total_price', 'purchase_count', 'first_buying_date', 'fidelity'
customerplus_copy.rename(columns = {'id' : 'customer_id'}, inplace = True)
return customerplus_copy
def preprocessing_tickets_area(directory_path):
# Datasets loading
tickets = load_dataset(directory_path, name = "tickets")
purchases = load_dataset(directory_path, name = "purchases")
suppliers = load_dataset(directory_path, name = "suppliers")
type_ofs = load_dataset(directory_path, name = "type_ofs")
# Tickets table
tickets = tickets[['id', 'purchase_id', 'product_id', 'is_from_subscription', 'type_of', 'supplier_id']]
tickets.rename(columns = {'id' : 'ticket_id'}, inplace = True)
# Suppliers table
suppliers = suppliers[['id', 'name']]
suppliers.rename(columns = {'name' : 'supplier_name'}, inplace = True)
suppliers['supplier_name'] = suppliers['supplier_name'].fillna('')
# Ticket types table
type_ofs = type_ofs[['id', 'name', 'children']]
type_ofs.rename(columns = {'name' : 'type_of_ticket_name'}, inplace = True)
# Purchases table
# Clean the purchase date
# cleaning_date(purchases, 'purchase_date')
# Select variables
purchases = purchases[['id', 'purchase_date', 'customer_id']]
# Merges
# Merge with suppliers
ticket_information = pd.merge(tickets, suppliers, left_on = 'supplier_id', right_on = 'id', how = 'inner')
ticket_information.drop(['supplier_id', 'id'], axis = 1, inplace=True)
# Merge with ticket types
ticket_information = pd.merge(ticket_information, type_ofs, left_on = 'type_of', right_on = 'id', how = 'inner')
ticket_information.drop(['type_of', 'id'], axis = 1, inplace=True)
# Merge with purchases
ticket_information = pd.merge(ticket_information, purchases, left_on = 'purchase_id', right_on = 'id', how = 'inner')
ticket_information.drop(['id'], axis = 1, inplace=True)
return ticket_information
def preprocessing_target_area(directory_path):
# Datasets loading
@@ -169,6 +127,69 @@ def preprocessing_campaigns_area(directory_path):
return campaigns_full
def preprocessing_tickets_area(directory_path):
# Datasets loading
tickets = load_dataset(directory_path, name = "tickets")
# Supplementary tickets dataset for tenant 101
if directory_path == '101':
tickets_1 = load_dataset(directory_path, name = "tickets_1")
purchases = load_dataset(directory_path, name = "purchases")
suppliers = load_dataset(directory_path, name = "suppliers")
# type_ofs = load_dataset(directory_path, name = "type_ofs")
# Tickets table
tickets = tickets[['id', 'purchase_id', 'product_id', 'is_from_subscription', 'type_of', 'supplier_id']]
tickets.rename(columns = {'id' : 'ticket_id'}, inplace = True)
if directory_path == '101':
tickets_1 = tickets_1[['id', 'purchase_id', 'product_id', 'is_from_subscription', 'type_of', 'supplier_id']]
tickets_1.rename(columns = {'id' : 'ticket_id'}, inplace = True)
# Suppliers table
suppliers = suppliers[['id', 'name']]
suppliers.rename(columns = {'name' : 'supplier_name'}, inplace = True)
suppliers['supplier_name'] = suppliers['supplier_name'].fillna('')
# Ticket types table (kept commented out)
# type_ofs = type_ofs[['id', 'name', 'children']]
# type_ofs.rename(columns = {'name' : 'type_of_ticket_name'}, inplace = True)
# Purchases table
# Clean the purchase date
# cleaning_date(purchases, 'purchase_date')
# Select variables
purchases = purchases[['id', 'purchase_date', 'customer_id']]
# Merges
# Merge with suppliers
ticket_information = pd.merge(tickets, suppliers, left_on = 'supplier_id', right_on = 'id', how = 'inner')
ticket_information.drop(['supplier_id', 'id'], axis = 1, inplace=True)
# Merge with ticket types (kept commented out)
# ticket_information = pd.merge(ticket_information, type_ofs, left_on = 'type_of', right_on = 'id', how = 'inner')
# ticket_information.drop(['type_of', 'id'], axis = 1, inplace=True)
# Merge with purchases
ticket_information = pd.merge(ticket_information, purchases, left_on = 'purchase_id', right_on = 'id', how = 'inner')
ticket_information.drop(['id'], axis = 1, inplace=True)
if directory_path == '101':
# Merge with suppliers
ticket_information_1 = pd.merge(tickets_1, suppliers, left_on = 'supplier_id', right_on = 'id', how = 'inner')
ticket_information_1.drop(['supplier_id', 'id'], axis = 1, inplace=True)
# Merge with purchases
ticket_information_1 = pd.merge(ticket_information_1, purchases, left_on = 'purchase_id', right_on = 'id', how = 'inner')
ticket_information_1.drop(['id'], axis = 1, inplace=True)
return ticket_information, ticket_information_1
else :
return ticket_information
def create_products_table(directory_path):
# first merge products and categories
print("first merge products and categories")
@@ -179,23 +200,21 @@ def create_products_table(directory_path):
categories = categories.drop(columns = ['extra_field', 'quota'])
#Merge
products_theme = products.merge(categories, how = 'left', left_on = 'category_id',
right_on = 'id', suffixes=('_products', '_categories'))
products_theme = products.merge(categories, how = 'left', left_on = 'category_id', right_on = 'id', suffixes=('_products', '_categories'))
products_theme = products_theme.rename(columns = {"name" : "name_categories"})
# Second merge products_theme and type of categories
print("Second merge products_theme and type of categories")
type_of_categories = load_dataset(directory_path, name = "type_of_categories")
type_of_categories = type_of_categories.drop(columns = 'id')
products_theme = products_theme.merge(type_of_categories, how = 'left', left_on = 'category_id',
right_on = 'category_id' )
# print("Second merge products_theme and type of categories")
# type_of_categories = load_dataset(directory_path, name = "type_of_categories")
# type_of_categories = type_of_categories.drop(columns = 'id')
# products_theme = products_theme.merge(type_of_categories, how = 'left', left_on = 'category_id',
# right_on = 'category_id' )
# Index cleaning
products_theme = products_theme.drop(columns = ['id_categories'])
products_theme = order_columns_id(products_theme)
return products_theme
def create_events_table(directory_path):
# first merge events and seasons :
print("first merge events and seasons : ")
@@ -233,16 +252,12 @@ def create_events_table(directory_path):
def create_representations_table(directory_path):
representations = load_dataset(directory_path, name = "representations")
representations = representations.drop(columns = ['serial', 'open', 'satisfaction', 'is_display', 'expected_filling',
'max_filling', 'extra_field', 'start_date_time', 'end_date_time', 'name',
'representation_type_id'])
representations = representations.drop(columns = ['serial', 'satisfaction', 'is_display', 'expected_filling', 'max_filling', 'extra_field', 'name', 'representation_type_id']) # 'start_date_time', 'end_date_time', 'open'
representations_capacity = load_dataset(directory_path, name = "representation_category_capacities")
representations_capacity = representations_capacity.drop(columns = ['expected_filling', 'max_filling'])
representations_theme = representations.merge(representations_capacity, how='left',
left_on='id', right_on='representation_id',
suffixes=('_representation', '_representation_cap'))
representations_theme = representations.merge(representations_capacity, how='left', left_on='id', right_on='representation_id', suffixes=('_representation', '_representation_cap'))
# index cleaning
representations_theme = representations_theme.drop(columns = ["id_representation"])
representations_theme = order_columns_id(representations_theme)
@@ -255,20 +270,29 @@ def uniform_product_df(directory_path):
products_theme = create_products_table(directory_path)
representation_theme = create_representations_table(directory_path)
events_theme = create_events_table(directory_path)
ticket_information = preprocessing_tickets_area(directory_path)
if directory_path == '101':
ticket_information, ticket_information_1 = preprocessing_tickets_area(directory_path)
else :
ticket_information = preprocessing_tickets_area(directory_path)
print("Products theme columns : ", products_theme.columns)
print("\n Representation theme columns : ", representation_theme.columns)
print("\n Events theme columns : ", events_theme.columns)
products_global = pd.merge(products_theme, representation_theme, how='left',
on= ["representation_id", "category_id"])
products_global = pd.merge(products_theme, representation_theme, how='left', on= ["representation_id", "category_id"])
products_global = pd.merge(products_global, events_theme, how='left', on='event_id',
suffixes = ("_representation", "_event"))
products_global = pd.merge(products_global, events_theme, how='left', on='event_id', suffixes = ("_representation", "_event"))
products_purchased = pd.merge(ticket_information, products_global, left_on = 'product_id', right_on = 'id_products', how = 'inner')
products_purchased_reduced = products_purchased[['ticket_id', 'customer_id', 'purchase_id' ,'event_type_id', 'supplier_name', 'purchase_date', 'type_of_ticket_name', 'amount', 'children', 'is_full_price', 'name_event_types', 'name_facilities', 'name_categories', 'name_events', 'name_seasons']]
products_purchased_reduced = products_purchased[['ticket_id', 'customer_id', 'purchase_id' ,'event_type_id', 'supplier_name', 'purchase_date', 'amount', 'is_full_price', 'name_event_types', 'name_facilities', 'name_categories', 'name_events', 'name_seasons', 'start_date_time', 'end_date_time', 'open']] # 'type_of_ticket_name', 'children',
return products_purchased_reduced
if directory_path == '101':
products_purchased_1 = pd.merge(ticket_information_1, products_global, left_on = 'product_id', right_on = 'id_products', how = 'inner')
products_purchased_reduced_1 = products_purchased_1[['ticket_id', 'customer_id', 'purchase_id' ,'event_type_id', 'supplier_name', 'purchase_date', 'amount', 'is_full_price', 'name_event_types', 'name_facilities', 'name_categories', 'name_events', 'name_seasons', 'start_date_time', 'end_date_time', 'open']] # 'type_of_ticket_name', 'children',
return products_purchased_reduced, products_purchased_reduced_1
else :
return products_purchased_reduced

View File

@@ -0,0 +1,165 @@
# KPI construction functions
def custom_date_parser(date_string):
return pd.to_datetime(date_string, utc = True, format = 'ISO8601')
def display_input_databases(directory_path, file_name, datetime_col = None):
"""
This function returns the file from s3 storage
"""
file_path = "projet-bdc2324-team1" + "/0_Input/Company_" + directory_path + "/" + file_name + ".csv"
print("File path : ", file_path)
with fs.open(file_path, mode="rb") as file_in:
df = pd.read_csv(file_in, sep=",", parse_dates = datetime_col, date_parser=custom_date_parser)
return df
def campaigns_kpi_function(campaigns_information = None, max_date = "2023-12-01"):
# Number of email campaigns received per customer
nb_campaigns = campaigns_information[['customer_id', 'campaign_name']].groupby('customer_id').count().reset_index()
nb_campaigns.rename(columns = {'campaign_name' : 'nb_campaigns'}, inplace = True)
# Average time to open (in hours)
campaigns_information['time_to_open'] = ((pd.to_datetime(campaigns_information['opened_at'], utc = True, format = 'ISO8601') - pd.to_datetime(campaigns_information['delivered_at'], utc = True, format = 'ISO8601')) / np.timedelta64(1, 'h'))
campaigns_information['time_to_open'] = campaigns_information['time_to_open'].fillna((pd.to_datetime(campaigns_information['delivered_at'], utc = True, format = 'ISO8601') - pd.to_datetime(max_date, utc = True, format = 'ISO8601')) / np.timedelta64(1, 'h'))
time_to_open = campaigns_information[['customer_id', 'time_to_open']].groupby('customer_id').mean().reset_index()
# Number of emails opened
opened_campaign = campaigns_information[['customer_id', 'campaign_name', 'opened_at']]
opened_campaign.dropna(subset=['opened_at'], inplace=True)
opened_campaign = opened_campaign[['customer_id', 'campaign_name']].groupby('customer_id').count().reset_index()
opened_campaign.rename(columns = {'campaign_name' : 'nb_campaigns_opened' }, inplace = True)
# Merge the indicators
campaigns_reduced = pd.merge(nb_campaigns, opened_campaign, on = 'customer_id', how = 'left')
campaigns_reduced = pd.merge(campaigns_reduced, time_to_open, on = 'customer_id', how = 'left')
# Email open rate
campaigns_reduced['taux_ouverture_mail'] = campaigns_reduced['nb_campaigns_opened'] / campaigns_reduced['nb_campaigns']
# Fill NaN values
campaigns_reduced[['nb_campaigns', 'nb_campaigns_opened', 'taux_ouverture_mail']] = campaigns_reduced[['nb_campaigns', 'nb_campaigns_opened', 'taux_ouverture_mail']].fillna(0)
# Fill the NaT values: time_to_open (??)
return campaigns_reduced
def tickets_kpi_function(tickets_information = None):
tickets_information_copy = tickets_information.copy()
# Dummy: online sales channel
liste_mots = ['en ligne', 'internet', 'web', 'net', 'vad', 'online'] # 'vad' = vente à distance (distance selling)
tickets_information_copy['vente_internet'] = tickets_information_copy['supplier_name'].fillna('').str.contains('|'.join(liste_mots), case=False).astype(int)
# Share of online purchases
prop_vente_internet = tickets_information_copy[tickets_information_copy['vente_internet'] == 1].groupby(['customer_id'])['purchase_id'].nunique().reset_index()
prop_vente_internet.rename(columns = {'purchase_id' : 'nb_purchases_internet'}, inplace = True)
# Combined purchase-behaviour KPIs
tickets_kpi = (tickets_information_copy[['customer_id', 'purchase_id' ,'ticket_id','supplier_name', 'purchase_date', 'amount', 'vente_internet']]
.groupby(['customer_id'])
.agg(nb_tickets=('ticket_id', 'nunique'),
nb_purchases=('purchase_id', 'nunique'),
total_amount=('amount', 'sum'),
nb_suppliers=('supplier_name', 'nunique'),
achat_internet=('vente_internet', 'max'),
purchase_date_min=('purchase_date', 'min'),
purchase_date_max=('purchase_date', 'max'))
.reset_index())
tickets_kpi['time_between_purchase'] = tickets_kpi['purchase_date_max'] - tickets_kpi['purchase_date_min']
tickets_kpi['time_between_purchase'] = tickets_kpi['time_between_purchase'] / np.timedelta64(1, 'D') # In days
# Convert dates to numbers (days elapsed relative to the most recent purchase)
max_date = tickets_kpi['purchase_date_max'].max()
tickets_kpi['purchase_date_max'] = (max_date - tickets_kpi['purchase_date_max']) / np.timedelta64(1, 'D')
tickets_kpi['purchase_date_min'] = (max_date - tickets_kpi['purchase_date_min']) / np.timedelta64(1, 'D')
# Share of internet purchases
tickets_kpi = tickets_kpi.merge(prop_vente_internet, on = ['customer_id'], how = 'left')
tickets_kpi['nb_purchases_internet'] = tickets_kpi['nb_purchases_internet'].fillna(0)
tickets_kpi['prop_purchases_internet'] = tickets_kpi['nb_purchases_internet'] / tickets_kpi['nb_purchases']
# Number of purchases per month and year
tickets_information_copy['month_year_purchase'] = 'purchases_' + tickets_information_copy['purchase_date'].dt.month.astype(str) + '_' + tickets_information_copy['purchase_date'].dt.year.astype(str)
purchases_by_month = tickets_information_copy.pivot_table(index='customer_id', columns='month_year_purchase', values='purchase_id', aggfunc='nunique', fill_value=0)
tickets_kpi = pd.merge(tickets_kpi, purchases_by_month, on = 'customer_id', how = 'left')
return tickets_kpi
def customerplus_kpi_function(customerplus_clean = None):
# KPIs on socio-demographic data
# Gender
customerplus_clean["gender_label"] = customerplus_clean["gender"].map({
0: 'female',
1: 'male',
2: 'other'
})
gender_dummies = pd.get_dummies(customerplus_clean["gender_label"], prefix='gender').astype(int)
customerplus_clean = pd.concat([customerplus_clean, gender_dummies], axis=1)
customerplus_clean.drop(columns = "gender", inplace = True)
# Age
customerplus_clean['categorie_age_0_10'] = ((customerplus_clean['age'] >= 0) & (customerplus_clean['age'] < 10)).astype(int)
customerplus_clean['categorie_age_10_20'] = ((customerplus_clean['age'] >= 10) & (customerplus_clean['age'] < 20)).astype(int)
customerplus_clean['categorie_age_20_30'] = ((customerplus_clean['age'] >= 20) & (customerplus_clean['age'] < 30)).astype(int)
customerplus_clean['categorie_age_30_40'] = ((customerplus_clean['age'] >= 30) & (customerplus_clean['age'] < 40)).astype(int)
customerplus_clean['categorie_age_40_50'] = ((customerplus_clean['age'] >= 40) & (customerplus_clean['age'] < 50)).astype(int)
customerplus_clean['categorie_age_50_60'] = ((customerplus_clean['age'] >= 50) & (customerplus_clean['age'] < 60)).astype(int)
customerplus_clean['categorie_age_60_70'] = ((customerplus_clean['age'] >= 60) & (customerplus_clean['age'] < 70)).astype(int)
customerplus_clean['categorie_age_70_80'] = ((customerplus_clean['age'] >= 70) & (customerplus_clean['age'] < 80)).astype(int)
customerplus_clean['categorie_age_plus_80'] = (customerplus_clean['age'] >= 80).astype(int)
customerplus_clean['categorie_age_inconnue'] = customerplus_clean['age'].apply(lambda x: 1 if pd.isna(x) else 0)
# customerplus_clean.drop(columns = "age", inplace = True)
# Mailing consent
customerplus_clean['opt_in'] = customerplus_clean['opt_in'].astype(int)
# Indicator: whether the individual lives in France
customerplus_clean["country_fr"] = customerplus_clean["country"].apply(lambda x : int(x=="fr") if pd.notna(x) else np.nan)
# customerplus_clean.drop(columns = "country", inplace = True)
customerplus_clean['is_profession_known'] = customerplus_clean['profession'].notna().astype(int)
# customerplus_clean.drop(columns = "profession", inplace = True)
customerplus_clean['is_zipcode_known'] = customerplus_clean['zipcode'].notna().astype(int)
# customerplus_clean.drop(columns = "zipcode", inplace = True)
return customerplus_clean
def targets_KPI(df_target = None):
df_target['target_name'] = df_target['target_name'].fillna('').str.lower()
# Target name categories for museums
df_target['target_jeune'] = df_target['target_name'].str.contains('|'.join(['jeune', 'pass_culture', 'etudiant', '12-25 ans', 'student', 'jeunesse']), case=False).astype(int)
df_target['target_optin'] = df_target['target_name'].str.contains('|'.join(['optin' ,'opt-in']), case=False).astype(int)
df_target['target_optout'] = df_target['target_name'].str.contains('|'.join(['optout', 'unsubscribed']), case=False).astype(int)
df_target['target_scolaire'] = df_target['target_name'].str.contains('|'.join(['scolaire' , 'enseignant', 'chercheur', 'schulen', 'école']), case=False).astype(int)
df_target['target_entreprise'] = df_target['target_name'].str.contains('|'.join(['b2b', 'btob', 'cse']), case=False).astype(int)
df_target['target_famille'] = df_target['target_name'].str.contains('|'.join(['famille', 'enfants', 'family']), case=False).astype(int)
df_target['target_newsletter'] = df_target['target_name'].str.contains('|'.join(['nl', 'newsletter']), case=False).astype(int)
# Target name categories for sports companies
df_target['target_abonne'] = ((
df_target['target_name']
.str.contains('|'.join(['abo', 'adh']), case=False)
& ~df_target['target_name'].str.contains('|'.join(['hors abo', 'anciens abo']), case=False)
).astype(int))
df_target_categorie = df_target.groupby('customer_id')[['target_jeune', 'target_optin', 'target_optout', 'target_scolaire', 'target_entreprise', 'target_famille', 'target_newsletter', 'target_abonne']].max()
target_agg = df_target.groupby('customer_id').agg(
nb_targets=('target_name', 'nunique') # Named aggregation: tuples specify the output column names
# all_targets=('target_name', concatenate_names),
# all_target_types=('target_type_name', concatenate_names)
).reset_index()
target_agg = pd.merge(target_agg, df_target_categorie, how='left', on='customer_id')
return target_agg

425
utils_ml.py Normal file
View File

@@ -0,0 +1,425 @@
import pandas as pd
import numpy as np
import os
import s3fs
import re
import io
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report, recall_score
from sklearn.utils import class_weight
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.calibration import calibration_curve
from sklearn.preprocessing import OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import StandardScaler, MaxAbsScaler, MinMaxScaler
from sklearn.metrics import make_scorer, f1_score, balanced_accuracy_score
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.metrics import roc_curve, auc, precision_recall_curve, average_precision_score
from sklearn.exceptions import ConvergenceWarning, DataConversionWarning
import pickle
import warnings
def load_train_test(type_of_activity, type_of_model):
BUCKET = f"projet-bdc2324-team1/1_Temp/1_0_Modelling_Datasets/{type_of_activity}"
File_path_train = BUCKET + "/Train_set.csv"
File_path_test = BUCKET + "/Test_set.csv"
with fs.open( File_path_train, mode="rb") as file_in:
dataset_train = pd.read_csv(file_in, sep=",")
# dataset_train['y_has_purchased'] = dataset_train['y_has_purchased'].fillna(0)
with fs.open(File_path_test, mode="rb") as file_in:
dataset_test = pd.read_csv(file_in, sep=",")
# dataset_test['y_has_purchased'] = dataset_test['y_has_purchased'].fillna(0)
if type_of_model=='premium':
dataset_train['company'] = dataset_train['customer_id'].apply(lambda x: x.split('_')[0])
dataset_test['company'] = dataset_test['customer_id'].apply(lambda x: x.split('_')[0])
dataset_train = dataset_train[dataset_train['company'].isin(['1', '3', '4', '5', '6', '7', '8', '10', '11', '13'])]
dataset_test = dataset_test[dataset_test['company'].isin(['1', '3', '4', '5', '6', '7', '8', '10', '11', '13'])]
return dataset_train, dataset_test
def save_file_s3(File_name, type_of_activity, type_of_model, model):
"""
save plot into s3 storage
"""
image_buffer = io.BytesIO()
plt.savefig(image_buffer, format='png')
image_buffer.seek(0)
FILE_PATH = f"projet-bdc2324-team1/2_Output/2_1_Modeling_results/{type_of_model}/{type_of_activity}/{model}/"
FILE_PATH_OUT_S3 = FILE_PATH + File_name + type_of_activity + '_' + model + '.png'
with fs.open(FILE_PATH_OUT_S3, 'wb') as s3_file:
s3_file.write(image_buffer.read())
plt.close()
def save_result_set_s3(result_set, File_name, type_of_activity, type_of_model, model=None, model_path=False):
"""
save result into s3 storage
"""
if model_path:
FILE_PATH_OUT_S3 = f"projet-bdc2324-team1/2_Output/2_1_Modeling_results/{type_of_model}/{type_of_activity}/{model}/" + File_name + '.csv'
else:
FILE_PATH_OUT_S3 = f"projet-bdc2324-team1/2_Output/2_1_Modeling_results/{type_of_model}/{type_of_activity}/" + File_name + '.csv'
with fs.open(FILE_PATH_OUT_S3, 'w') as file_out:
result_set.to_csv(file_out, index = False)
def save_model_s3(File_name, type_of_activity, type_of_model, model, classifier):
"""
save model into pickle file
"""
model_bytes = pickle.dumps(classifier)
FILE_PATH_OUT_S3 = f"projet-bdc2324-team1/2_Output/2_1_Modeling_results/{type_of_model}/{type_of_activity}/{model}/" + File_name + '.pkl'
with fs.open(FILE_PATH_OUT_S3, 'wb') as f:
f.write(model_bytes)
def compute_recall(group):
return recall_score(group['y_has_purchased'], group['prediction'])
def compute_recall_companies(dataset_test, y_pred, type_of_activity, model):
test = dataset_test.copy()
test['prediction'] = y_pred
test['company'] = dataset_test['customer_id'].str.split('_', expand=True)[0]
recall_scores_by_company = test.groupby('company').apply(compute_recall).reset_index(name='recall_score')
save_result_set_s3(recall_scores_by_company, 'recall_scores_by_company', type_of_activity, type_of_model, model=model, model_path=True)
def features_target_split(dataset_train, dataset_test):
"""
return train and test set
"""
features_l = ['nb_campaigns', 'taux_ouverture_mail', 'prop_purchases_internet', 'nb_tickets', 'nb_purchases', 'total_amount', 'nb_suppliers', 'time_to_open',
'purchases_10_2021','purchases_10_2022', 'purchases_11_2021', 'purchases_12_2021','purchases_1_2022', 'purchases_2_2022', 'purchases_3_2022',
'purchases_4_2022', 'purchases_5_2021', 'purchases_5_2022', 'purchases_6_2021', 'purchases_6_2022', 'purchases_7_2021', 'purchases_7_2022', 'purchases_8_2021',
'purchases_8_2022','purchases_9_2021', 'purchases_9_2022', 'purchase_date_min', 'purchase_date_max', 'nb_targets', 'gender_female', 'gender_male',
'achat_internet', 'categorie_age_0_10', 'categorie_age_10_20', 'categorie_age_20_30','categorie_age_30_40',
'categorie_age_40_50', 'categorie_age_50_60', 'categorie_age_60_70', 'categorie_age_70_80', 'categorie_age_plus_80','categorie_age_inconnue',
'country_fr', 'is_profession_known', 'is_zipcode_known', 'opt_in', 'target_optin', 'target_newsletter', 'target_scolaire', 'target_entreprise', 'target_famille',
'target_jeune', 'target_abonne']
X_train = dataset_train[features_l]
y_train = dataset_train[['y_has_purchased']]
X_test = dataset_test[features_l]
y_test = dataset_test[['y_has_purchased']]
return X_train, X_test, y_train, y_test
def preprocess(type_of_model, type_of_activity):
"""
preprocess variables before running machine learning pipeline
"""
numeric_features = ['nb_campaigns', 'taux_ouverture_mail', 'prop_purchases_internet', 'nb_tickets', 'nb_purchases', 'total_amount', 'nb_suppliers',
'purchases_10_2021','purchases_10_2022', 'purchases_11_2021', 'purchases_12_2021','purchases_1_2022', 'purchases_2_2022', 'purchases_3_2022',
'purchases_4_2022', 'purchases_5_2021', 'purchases_5_2022', 'purchases_6_2021', 'purchases_6_2022', 'purchases_7_2021', 'purchases_7_2022', 'purchases_8_2021',
'purchases_8_2022','purchases_9_2021', 'purchases_9_2022', 'purchase_date_min', 'purchase_date_max', 'nb_targets', 'time_to_open']
binary_features = ['gender_female', 'gender_male', 'achat_internet', 'categorie_age_0_10', 'categorie_age_10_20', 'categorie_age_20_30','categorie_age_30_40',
'categorie_age_40_50', 'categorie_age_50_60', 'categorie_age_60_70', 'categorie_age_70_80', 'categorie_age_plus_80','categorie_age_inconnue',
'country_fr', 'is_profession_known', 'is_zipcode_known', 'opt_in']
if type_of_activity=='musee':
numeric_features.remove('time_to_open')
if type_of_model=='premium':
if type_of_activity=='musique':
binary_features.extend(['target_optin', 'target_newsletter'])
elif type_of_activity=='sport':
binary_features.extend(['target_jeune', 'target_entreprise', 'target_abonne'])
else:
binary_features.extend([ 'target_scolaire', 'target_entreprise', 'target_famille', 'target_newsletter'])
numeric_transformer = Pipeline(steps=[
("imputer", SimpleImputer(strategy="constant", fill_value=0)),
("scaler", StandardScaler())
])
binary_transformer = Pipeline(steps=[
("imputer", SimpleImputer(strategy="most_frequent")),
])
preproc = ColumnTransformer(
transformers=[
("num", numeric_transformer, numeric_features),
("bin", binary_transformer, binary_features)
]
)
return preproc
def draw_confusion_matrix(y_test, y_pred, model):
conf_matrix = confusion_matrix(y_test, y_pred)
sns.heatmap(conf_matrix, annot=True, fmt='d', cmap='Blues', xticklabels=['Class 0', 'Class 1'], yticklabels=['Class 0', 'Class 1'], annot_kws={"size": 14})
plt.xlabel('Predicted')
plt.ylabel('Actual')
plt.title('Confusion Matrix')
plt.show()
save_file_s3("Confusion_matrix_", type_of_activity, type_of_model, model)
def draw_roc_curve(X_test, y_pred_prob, model):
# Compute the false positive rate (FPR) and true positive rate (TPR)
fpr, tpr, thresholds = roc_curve(y_test, y_pred_prob, pos_label=1)
# Compute the area under the ROC curve (AUC)
roc_auc = auc(fpr, tpr)
plt.figure(figsize = (14, 8))
plt.plot(fpr, tpr, label="ROC curve(area = %0.3f)" % roc_auc)
plt.plot([0, 1], [0, 1], color="red",label="Random Baseline", linestyle="--")
plt.grid(color='gray', linestyle='--', linewidth=0.5)
plt.xlabel("False Positive Rate", fontsize=14)
plt.ylabel("True Positive Rate", fontsize=14)
plt.title("ROC Curve", size=18)
plt.legend(loc="lower right", fontsize=14)
plt.show()
save_file_s3("Roc_curve_", type_of_activity, type_of_model, model)
def draw_calibration_curve(X_test, y_pred_prob, model):
frac_pos, mean_pred = calibration_curve(y_test, y_pred_prob, n_bins=10)
# Plot the calibration curve
plt.plot(mean_pred, frac_pos, 's-', label=model)
plt.plot([0, 1], [0, 1], 'k--', label='Perfectly calibrated')
plt.xlabel('Mean predicted value')
plt.ylabel('Fraction of positive predictions')
plt.title("Calibration Curve")
plt.legend()
plt.show()
save_file_s3("Calib_curve_", type_of_activity, type_of_model, model)
def draw_features_importance(pipeline, model, randomF = False):
if randomF:
coefficients = pipeline.named_steps[model].feature_importances_
else:
coefficients = pipeline.named_steps[model].coef_[0]
feature_names = pipeline.named_steps['preprocessor'].get_feature_names_out()
# Plot feature importances
plt.figure(figsize=(12, 8))
plt.barh(feature_names, coefficients, color='skyblue')
plt.xlabel("Features' Importance")
plt.ylabel('Caractéristiques')
plt.title("Features' Importance")
plt.grid(True)
plt.show()
save_file_s3("Features_", type_of_activity, type_of_model, model)
def draw_prob_distribution(y_pred_prob, model):
plt.figure(figsize=(10, 8))
plt.hist(y_pred_prob, bins=10, range=(0, 1), color='blue', alpha=0.7)
plt.xlim(0, 1)
plt.ylim(0, None)
plt.title('Histogramme des probabilités pour la classe 1')
plt.xlabel('Probability')
plt.ylabel('Frequency')
plt.grid(True)
plt.show()
save_file_s3("prob_dist_", type_of_activity, type_of_model, model)
def draw_prob_distribution_companies(y_pred_prob, model):
test = dataset_test.copy()
test['probability to buy'] = y_pred_prob
test['company'] = test['customer_id'].str.split('_', expand=True)[0]
sns.histplot(data=test, x='probability to buy', hue='company', element='step',
stat='count', common_norm=False, bins=10, palette='Set1', alpha=1)
plt.xlim(0, 1)
plt.ylim(0, None)
plt.title('Histogram of probabilities for class 1 by company')
plt.xlabel('Probability')
plt.ylabel('Frequency')
plt.grid(True)
plt.show()
save_file_s3("prob_dist_companies_", type_of_activity, type_of_model, model)
def pipeline_logreg_benchmark(X_train, y_train, X_test, y_test, model_result):
pipeline = Pipeline(steps=[
('preprocessor', preproc),
('LogisticRegression_Benchmark', LogisticRegression(solver='saga', class_weight = weight_dict,
max_iter=5000, n_jobs=-1))
])
pipeline.fit(X_train, y_train)
y_pred = pipeline.predict(X_test)
y_pred_prob = pipeline.predict_proba(X_test)[:, 1]
fpr, tpr, thresholds = roc_curve(y_test, y_pred_prob, pos_label = 1)
model = "LogisticRegression_Benchmark"
result = pd.DataFrame({"Model" : [model],
"Accuracy" : [accuracy_score(y_test, y_pred)],
"Recall" : [recall_score(y_test, y_pred)],
"F1_score" : [f1_score(y_test, y_pred, average="macro")],
"AUC" : [auc(fpr, tpr)]}
)
model_result = pd.concat([model_result, result])
compute_recall_companies(dataset_test, y_pred, type_of_activity, model)
draw_confusion_matrix(y_test, y_pred, model)
draw_roc_curve(y_test, y_pred_prob, model)
draw_features_importance(pipeline, 'LogisticRegression_Benchmark')
draw_prob_distribution(y_pred_prob, model)
draw_prob_distribution_companies(y_pred_prob, model)
draw_calibration_curve(y_test, y_pred_prob, model)
save_model_s3('LogisticRegression_Benchmark', type_of_activity, type_of_model, model, pipeline)
return model_result
def pipeline_logreg_cv(X_train, y_train, X_test, y_test, model_result):
y_train = y_train['y_has_purchased']
param_grid = {'LogisticRegression_cv__C': np.logspace(-10, 6, 17, base=2),
'LogisticRegression_cv__penalty': ['l1', 'l2'],
'LogisticRegression_cv__class_weight': ['balanced', weight_dict]}
pipeline = Pipeline(steps=[
('preprocessor', preproc),
('LogisticRegression_cv', LogisticRegression(solver='saga', max_iter=5000))
])
grid_search = GridSearchCV(pipeline, param_grid, cv=3, scoring=make_scorer(recall_score), error_score='raise',
n_jobs=-1)
grid_search.fit(X_train, y_train)
y_pred = grid_search.predict(X_test)
y_pred_prob = grid_search.predict_proba(X_test)[:, 1]
best_pipeline = grid_search.best_estimator_
fpr, tpr, thresholds = roc_curve(y_test, y_pred_prob, pos_label = 1)
model = "LogisticRegression_cv"
result = pd.DataFrame({"Model" : [model],
"Accuracy" : [accuracy_score(y_test, y_pred)],
"Recall" : [recall_score(y_test, y_pred)],
"F1_score" : [f1_score(y_test, y_pred, average="macro")],
"AUC" : [auc(fpr, tpr)]}
)
model_result = pd.concat([model_result, result])
compute_recall_companies(dataset_test, y_pred, type_of_activity, model)
draw_confusion_matrix(y_test, y_pred, model)
draw_roc_curve(y_test, y_pred_prob, model)
draw_features_importance(best_pipeline, 'LogisticRegression_cv')
draw_prob_distribution(y_pred_prob, model)
draw_prob_distribution_companies(y_pred_prob, model)
draw_calibration_curve(y_test, y_pred_prob, model)
save_model_s3('LogisticRegression_cv', type_of_activity, type_of_model, model, grid_search)
return model_result
def pipeline_randomF_benchmark(X_train, y_train, X_test, y_test, model_result):
pipeline = Pipeline(steps=[
('preprocessor', preproc),
('randomF', RandomForestClassifier(class_weight = weight_dict,
n_jobs=-1))
])
pipeline.fit(X_train, y_train)
y_pred = pipeline.predict(X_test)
y_pred_prob = pipeline.predict_proba(X_test)[:, 1]
fpr, tpr, thresholds = roc_curve(y_test, y_pred_prob, pos_label = 1)
model = "randomF"
result = pd.DataFrame({"Model" : [model],
"Accuracy" : [accuracy_score(y_test, y_pred)],
"Recall" : [recall_score(y_test, y_pred)],
"F1_score" : [f1_score(y_test, y_pred, average="macro")],
"AUC" : [auc(fpr, tpr)]}
)
model_result = pd.concat([model_result, result])
compute_recall_companies(dataset_test, y_pred, type_of_activity, model)
draw_confusion_matrix(y_test, y_pred, model)
draw_roc_curve(y_test, y_pred_prob, model)
draw_features_importance(pipeline, 'randomF', randomF=True)
draw_prob_distribution(y_pred_prob, model)
draw_prob_distribution_companies(y_pred_prob, model)
draw_calibration_curve(y_test, y_pred_prob, model)
save_model_s3('randomF_Benchmark', type_of_activity, type_of_model, model, pipeline)
return model_result
def pipeline_randomF_cv(X_train, y_train, X_test, y_test, model_result):
y_train = y_train['y_has_purchased']
param_grid = {
'randomF_cv__n_estimators': [100, 300],
'randomF_cv__max_features': ['sqrt', 'log2'],
'randomF_cv__min_samples_split': [2, 10],
'randomF_cv__min_samples_leaf': [1, 4],
'randomF_cv__class_weight': [weight_dict]
}
pipeline = Pipeline(steps=[
('preprocessor', preproc),
('randomF_cv', RandomForestClassifier(n_jobs=-1))
])
grid_search = GridSearchCV(pipeline, param_grid, cv=3, scoring=make_scorer(recall_score), error_score='raise',
n_jobs=-1)
grid_search.fit(X_train, y_train)
y_pred = grid_search.predict(X_test)
y_pred_prob = grid_search.predict_proba(X_test)[:, 1]
best_pipeline = grid_search.best_estimator_
fpr, tpr, thresholds = roc_curve(y_test, y_pred_prob, pos_label = 1)
model = "randomF_cv"
result = pd.DataFrame({"Model" : [model],
"Accuracy" : [accuracy_score(y_test, y_pred)],
"Recall" : [recall_score(y_test, y_pred)],
"F1_score" : [f1_score(y_test, y_pred, average="macro")],
"AUC" : [auc(fpr, tpr)]}
)
model_result = pd.concat([model_result, result])
compute_recall_companies(dataset_test, y_pred, type_of_activity, model)
draw_confusion_matrix(y_test, y_pred, model)
draw_roc_curve(y_test, y_pred_prob, model)
draw_features_importance(best_pipeline, 'randomF_cv', randomF=True)
draw_prob_distribution(y_pred_prob, model)
draw_prob_distribution_companies(y_pred_prob, model)
draw_calibration_curve(y_test, y_pred_prob, model)
save_model_s3('randomF_cv', type_of_activity, type_of_model, model, grid_search)
return model_result
def pipeline_naiveBayes_benchmark(X_train, y_train, X_test, y_test, model_result):
unique_classes, counts = np.unique(y_train, return_counts=True)
class_priors = counts / counts.sum()
pipeline = Pipeline(steps=[
('preprocessor', preproc),
('Naive_Bayes', GaussianNB(priors=class_priors))
])
pipeline.fit(X_train, y_train)
y_pred = pipeline.predict(X_test)
y_pred_prob = pipeline.predict_proba(X_test)[:, 1]
fpr, tpr, thresholds = roc_curve(y_test, y_pred_prob, pos_label = 1)
model = "Naive_Bayes"
result = pd.DataFrame({"Model" : [model],
"Accuracy" : [accuracy_score(y_test, y_pred)],
"Recall" : [recall_score(y_test, y_pred)],
"F1_score" : [f1_score(y_test, y_pred, average="macro")],
"AUC" : [auc(fpr, tpr)]}
)
model_result = pd.concat([model_result, result])
compute_recall_companies(dataset_test, y_pred, type_of_activity, model)
draw_confusion_matrix(y_test, y_pred, model)
draw_roc_curve(y_test, y_pred_prob, model)
draw_prob_distribution(y_pred_prob, model)
draw_calibration_curve(y_test, y_pred_prob, model)
save_model_s3('Naive_Bayes_Benchmark', type_of_activity, type_of_model, model, pipeline)
return model_result
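# Example usage sketch (illustrative only): one way the pipeline_* functions above
# could be chained, assuming the module-level objects they rely on (preproc,
# weight_dict, dataset_test, type_of_activity, type_of_model, fs) are already
# defined and the train/test sets are loaded.
# model_result = pd.DataFrame(columns=["Model", "Accuracy", "Recall", "F1_score", "AUC"])
# model_result = pipeline_logreg_benchmark(X_train, y_train, X_test, y_test, model_result)
# model_result = pipeline_randomF_benchmark(X_train, y_train, X_test, y_test, model_result)
# model_result = pipeline_naiveBayes_benchmark(X_train, y_train, X_test, y_test, model_result)
# print(model_result.sort_values("AUC", ascending=False))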

utils_sales_forecast.py (new file, 325 lines)
@@ -0,0 +1,325 @@
# imports
import pandas as pd
from pandas import DataFrame
import numpy as np
import os
import s3fs
import matplotlib.pyplot as plt
from scipy.optimize import fsolve
import pickle
import warnings
import io
# functions
def load_train_test(type_of_activity):
"""
Loads the training and test datasets from S3 storage for the type of activity specified.
Args:
- type_of_activity (str)
Returns:
DataFrame: Training dataset.
DataFrame: Test dataset.
"""
# BUCKET = f"projet-bdc2324-team1/Generalization/{type_of_activity}"
BUCKET = f"projet-bdc2324-team1/1_Temp/1_0_Modelling_Datasets/{type_of_activity}"
File_path_train = BUCKET + "/Train_set.csv"
File_path_test = BUCKET + "/Test_set.csv"
with fs.open( File_path_train, mode="rb") as file_in:
dataset_train = pd.read_csv(file_in, sep=",")
# dataset_train['y_has_purchased'] = dataset_train['y_has_purchased'].fillna(0)
with fs.open(File_path_test, mode="rb") as file_in:
dataset_test = pd.read_csv(file_in, sep=",")
# dataset_test['y_has_purchased'] = dataset_test['y_has_purchased'].fillna(0)
return dataset_train, dataset_test
def features_target_split(dataset_train, dataset_test):
"""
Splits the dataset into features and target variables for training and testing.
Args:
- dataset_train (DataFrame): Training dataset.
- dataset_test (DataFrame): Test dataset.
Returns:
DataFrame: Features of the training dataset.
DataFrame: Features of the test dataset.
DataFrame: Target variable of the training dataset.
DataFrame: Target variable of the test dataset.
"""
features_l = ['nb_tickets', 'nb_purchases', 'total_amount', 'nb_suppliers', 'vente_internet_max', 'purchase_date_min', 'purchase_date_max',
'time_between_purchase', 'fidelity', 'is_email_true', 'opt_in', #'is_partner', 'nb_tickets_internet',
'gender_female', 'gender_male', 'gender_other', 'nb_campaigns', 'nb_campaigns_opened']
X_train = dataset_train # [features_l]
y_train = dataset_train[['y_has_purchased']]
X_test = dataset_test # [features_l]
y_test = dataset_test[['y_has_purchased']]
return X_train, X_test, y_train, y_test
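# Example usage sketch (illustrative only): loading the modelling datasets and
# splitting them, assuming `fs` (an s3fs.S3FileSystem) has been created by the
# calling script and that "sport" is a valid type of activity.
# dataset_train, dataset_test = load_train_test("sport")
# X_train, X_test, y_train, y_test = features_target_split(dataset_train, dataset_test)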
def load_model(type_of_activity, model):
"""
Loads from S3 storage the optimal parameters of the chosen ML model saved in a pickle file.
Args:
- type_of_activity (str)
- model (str)
Returns:
Model: machine learning model pre-trained with a scikit learn pipeline.
"""
# BUCKET = f"projet-bdc2324-team1/Output_model/{type_of_activity}/{model}/"
BUCKET = f"projet-bdc2324-team1/2_Output/2_1_Modeling_results/standard/{type_of_activity}/{model}/"
filename = model + '.pkl'
file_path = BUCKET + filename
with fs.open(file_path, mode="rb") as f:
model_bytes = f.read()
model = pickle.loads(model_bytes)
return model
def df_segment(df, y, model) :
"""
Segments customers into 4 groups based on the propensity scores given by a previously-loaded ML model.
Args:
- df (DataFrame): DataFrame to be segmented.
- y (Series): True target variable.
- model (Model): Pre-trained machine learning model for prediction.
Returns:
DataFrame: Segmented DataFrame with predicted values and true values for y.
"""
y_pred = model.predict(df)
y_pred_prob = model.predict_proba(df)[:, 1]
df_segment = df
df_segment["has_purchased"] = y
df_segment["has_purchased_estim"] = y_pred
df_segment["score"] = y_pred_prob
df_segment["quartile"] = np.where(df_segment['score']<0.25, '1',
np.where(df_segment['score']<0.5, '2',
np.where(df_segment['score']<0.75, '3', '4')))
return df_segment
def odd_ratio(score) :
"""
Args:
- score (Union[float, int])
Returns:
float: Odd ratio value.
"""
return score / (1 - score)
def adjust_score_1(score) :
"""
Adjust scores by replacing values equal to 1 with the second-highest value,
so that odds ratios can then be computed (the odds ratio is undefined for a score of 1).
Args:
- score (List[Union[float, int]])
Returns:
np.ndarray: Adjusted score values.
"""
second_best_score = np.array([element for element in score if element !=1]).max()
new_score = np.array([element if element!=1 else second_best_score for element in score])
return new_score
def adjusted_score(odd_ratio, bias) :
"""
Adjust the score based on the odd ratio and bias.
Args:
- odd_ratio (Union[float, int])
- bias (Union[float, int])
Returns:
float: Adjusted score value.
"""
adjusted_score = odd_ratio/(bias+odd_ratio)
return adjusted_score
def find_bias(odd_ratios, y_objective, initial_guess=10) :
"""
Find the bias needed to adjust scores so that their sum is equal to the total number of purchases observed.
Args:
- odd_ratios (List[float]): List of odds ratios associated with the scores to be adjusted.
- y_objective (Union[float, int]): Target value, i.e. the total number of purchases observed.
- initial_guess (Union[float, int], optional): Initial guess for the bias. Default is 10 (the bias is approximately 6 for sports, 10 for music and 22 for museums).
Returns:
float: Estimated bias value.
"""
bias_estimated = fsolve(lambda bias : sum([adjusted_score(element, bias) for element in list(odd_ratios)]) - y_objective, x0=initial_guess)
return bias_estimated[0]
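# Example usage sketch (illustrative only): how the adjustment chain above fits
# together, assuming X_test_segment is the output of df_segment.
# scores = adjust_score_1(X_test_segment["score"].values)  # remove scores exactly equal to 1
# odd_ratios = odd_ratio(scores)                           # element-wise odds ratios
# bias = find_bias(odd_ratios, y_objective=X_test_segment["has_purchased"].sum())
# X_test_segment["score_adjusted"] = adjusted_score(odd_ratios, bias)
# # by construction, the adjusted scores now sum approximately to the observed number of purchases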
def plot_hist_scores(df, score, score_adjusted, type_of_activity) :
"""
Plot a histogram comparing scores and adjusted scores.
Args:
- df (DataFrame): DataFrame containing score data.
- score (str): Name of the column in df representing the original scores.
- score_adjusted (str): Name of the column in df representing the adjusted scores.
- type_of_activity (str) : type of activity of the companies considered.
Returns:
None
"""
plt.figure()
plt.hist(df[score], label = "score", alpha=0.6)
plt.hist(df[score_adjusted], label="adjusted score", alpha=0.6)
plt.legend()
plt.xlabel("probability of a future purchase")
plt.ylabel("count")
plt.title(f"Comparison between score and adjusted score for {type_of_activity} companies")
# plt.show()
def project_tickets_CA (df, nb_purchases, nb_tickets, total_amount, score_adjusted, duration_ref, duration_projection) :
"""
Project tickets sold and total amount based on the adjusted scores and the duration of periods of study / projection.
Args:
- df (DataFrame): DataFrame containing information about past sales.
- nb_purchases (str) : Name of the column in df representing the number of purchases.
- nb_tickets (str): Name of the column in df representing the number of tickets.
- total_amount (str): Name of the column in df representing the total amount.
- score_adjusted (str): Name of the column in df representing the adjusted score.
- duration_ref (int or float): Duration of the period of reference for the construction of the variables X.
- duration_projection (int or float): Duration of the period of projection of sales / revenue.
Returns:
DataFrame: DataFrame completed with sales and total amount projections.
"""
duration_ratio = duration_ref/duration_projection
df_output = df
# project number of tickets : at least 1 ticket purchased if the customer purchased
df_output.loc[:,"nb_tickets_projected"] = df_output.loc[:,nb_tickets].apply(lambda x : max(1, x /duration_ratio))
# project amount : if the customer buys a ticket, we expect the amount to be at least the average price of tickets
# for customers purchasing exactly one ticket
if df_output.loc[df_output[nb_tickets]==1].shape[0] > 0 :
avg_price = df_output.loc[df_output[nb_tickets]==1][total_amount].mean()
else :
avg_price = df_output[total_amount].mean()
# we compute the avg price of ticket for each customer
df_output["avg_ticket_price"] = df_output[total_amount]/df_output[nb_tickets]
# correct negative total amounts
df_output.loc[:,"total_amount_corrected"] = np.where(df_output[total_amount] < 0,
avg_price * df_output[nb_tickets],
df_output[total_amount])
df_output.loc[:,"total_amount_projected"] = np.where(
# if no ticket bought in the past, we take the average price
df_output[nb_tickets]==0, avg_price,
# if avg prices of tickets are negative, we recompute the expected amount based on the avg price of a ticket
# observed on the whole population
np.where(df_output["avg_ticket_price"] < 0, avg_price * df_output.loc[:,"nb_tickets_projected"],
# else, the amount projected is the average price of tickets bought by the customer * nb tickets projected
df_output["avg_ticket_price"] * df_output.loc[:,"nb_tickets_projected"])
)
df_output.loc[:,"nb_tickets_expected"] = df_output.loc[:,score_adjusted] * df_output.loc[:,"nb_tickets_projected"]
df_output.loc[:,"total_amount_expected"] = df_output.loc[:,score_adjusted] * df_output.loc[:,"total_amount_projected"]
df_output.loc[:,"pace_purchase"] = (duration_ref/df_output.loc[:,nb_purchases]).apply(lambda x : np.nan if x==np.inf else x)
return df_output
def summary_expected_CA(df, segment, nb_tickets_expected, total_amount_expected, total_amount, pace_purchase,
duration_ref=17, duration_projection=12) :
"""
Generate a summary of expected customer sales based on segments.
Args:
- df (DataFrame): DataFrame containing customer data.
- segment (str): Name of the column in df representing customer segments.
- nb_tickets_expected (str): Name of the column in df representing the expected number of tickets.
- total_amount_expected (str): Name of the column in df representing the expected total amount.
- total_amount (str): Name of the column in df representing the total amount.
- pace_purchase (str) : Name of the column in df representing the average time between 2 purchases in months.
- duration_ref (int or float): Duration of the period of reference for the construction of the variables X.
- duration_projection (int or float): Duration of the period of projection of sales / revenue.
Returns:
DataFrame: Summary DataFrame containing expected customer sales metrics.
"""
# compute nb tickets estimated and total amount expected
df_expected_CA = df.groupby(segment)[[nb_tickets_expected, total_amount_expected]].sum().reset_index()
# number of customers by segment
df_expected_CA.insert(1, "size", df.groupby(segment).size().values)
# size in percent of all customers
df_expected_CA.insert(2, "size_perct", 100 * df_expected_CA["size"]/df_expected_CA["size"].sum())
# compute share of CA recovered
duration_ratio=duration_ref/duration_projection
df_expected_CA["revenue_recovered_perct"] = 100 * duration_ratio * df_expected_CA[total_amount_expected] / \
df.groupby(segment)[total_amount].sum().values
df_expected_CA["share_future_revenue_perct"] = 100 * duration_ratio * df_expected_CA[total_amount_expected] / \
df[total_amount].sum()
df_drop_null_pace = df.dropna(subset=[pace_purchase])
df_expected_CA["pace_purchase"] = df_drop_null_pace.groupby(segment)[pace_purchase].mean().values
return df_expected_CA
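# Example usage sketch (illustrative only): projecting sales and summarising them
# by segment, assuming X_test_segment carries the columns named below.
# X_test_segment = project_tickets_CA(X_test_segment, "nb_purchases", "nb_tickets",
#                                     "total_amount", "score_adjusted",
#                                     duration_ref=17, duration_projection=12)
# summary = summary_expected_CA(X_test_segment, segment="quartile",
#                               nb_tickets_expected="nb_tickets_expected",
#                               total_amount_expected="total_amount_expected",
#                               total_amount="total_amount",
#                               pace_purchase="pace_purchase")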
def save_file_s3_ca(File_name, type_of_activity):
"""
Saves a file in S3 storage.
Args:
- File_name (str)
- type_of_activity (str)
"""
image_buffer = io.BytesIO()
plt.savefig(image_buffer, format='png', dpi=120)
image_buffer.seek(0)
PATH = f"projet-bdc2324-team1/2_Output/2_3_Sales_Forecast/{type_of_activity}/"
FILE_PATH_OUT_S3 = PATH + File_name + type_of_activity + '.png'
with fs.open(FILE_PATH_OUT_S3, 'wb') as s3_file:
s3_file.write(image_buffer.read())
plt.close()

utils_segmentation.py (new file, 335 lines)
@@ -0,0 +1,335 @@
# functions for segmentation and associated graphics
# imports needed by the functions below; the s3fs filesystem object `fs` is
# expected to be created by the calling script
import io
import pickle
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from tabulate import tabulate
def load_model(type_of_activity, model):
"""
Loads from S3 storage the optimal parameters of the chosen ML model saved in a pickle file.
Args:
- type_of_activity (str)
- model (str)
Returns:
Model: machine learning model pre-trained with a scikit learn pipeline.
"""
BUCKET = f"projet-bdc2324-team1/2_Output/2_1_Modeling_results/standard/{type_of_activity}/{model}/"
filename = model + '.pkl'
file_path = BUCKET + filename
with fs.open(file_path, mode="rb") as f:
model_bytes = f.read()
model = pickle.loads(model_bytes)
return model
def load_test_file(type_of_activity):
"""
Load the test dataset from S3 storage for the type of activity specified.
Args:
- type_of_activity (str)
Returns:
DataFrame: Test dataset.
"""
file_path_test = f"projet-bdc2324-team1/1_Temp/1_0_Modelling_Datasets/{type_of_activity}/Test_set.csv"
with fs.open(file_path_test, mode="rb") as file_in:
dataset_test = pd.read_csv(file_in, sep=",")
return dataset_test
def save_file_s3_mp(File_name, type_of_activity):
"""
Save a matplotlib figure to S3 storage to the location assigned for the type of activity specified.
Args:
- File_name (str)
- type_of_activity (str)
Returns:
None
"""
image_buffer = io.BytesIO()
plt.savefig(image_buffer, format='png', dpi=110)
image_buffer.seek(0)
PATH = f"projet-bdc2324-team1/2_Output/2_2_Segmentation_and_Marketing_Personae/{type_of_activity}/"
FILE_PATH_OUT_S3 = PATH + File_name + type_of_activity + '.png'
with fs.open(FILE_PATH_OUT_S3, 'wb') as s3_file:
s3_file.write(image_buffer.read())
plt.close()
def save_txt_file_s3(file_name, type_of_activity, content):
"""
Save a text file to S3 storage to the location assigned for the type of activity specified.
Args:
- file_name (str)
- type_of_activity (str)
- content (str)
Returns:
None
"""
FILE_PATH = f"projet-bdc2324-team1/2_Output/2_2_Segmentation_and_Marketing_Personae/{type_of_activity}/"
FILE_PATH_OUT_S3 = FILE_PATH + file_name + type_of_activity + '.txt'
with fs.open(FILE_PATH_OUT_S3, 'w') as s3_file:
s3_file.write(content)
def df_business_fig(df, segment, list_var) :
"""
Compute business key performance indicators (KPIs) based on segment-wise aggregation of variables.
Args:
- df (DataFrame): The DataFrame containing data.
- segment (str): The column name representing segments.
- list_var (list of str): The list of variable names to be aggregated.
Returns:
DataFrame: The DataFrame containing business KPIs.
"""
df_business_kpi = df.groupby(segment)[list_var].sum().reset_index()
df_business_kpi.insert(1, "size", df.groupby(segment).size().values)
all_var = ["size"] + list_var
df_business_kpi[all_var] = 100 * df_business_kpi[all_var] / df_business_kpi[all_var].sum()
return df_business_kpi
def hist_segment_business_KPIs(df, segment, size, nb_tickets, nb_purchases, total_amount, nb_campaigns, type_of_activity) :
"""
Plot a histogram stacking the relative weight of each segment regarding some key business indicators.
Args:
- df (DataFrame): The DataFrame containing pre-aggregated data about some key business indicators.
- segment (str): The column name representing segments.
- size (str): The column name representing the size.
- nb_tickets (str): The column name representing the number of tickets.
- nb_purchases (str): The column name representing the number of purchases.
- total_amount (str): The column name representing the total amount.
- nb_campaigns (str): The column name representing the number of campaigns.
- type_of_activity (str)
Returns:
None
"""
plt.figure()
df_plot = df[[segment, size, nb_tickets, nb_purchases, total_amount, nb_campaigns]]
x = ["number of\ncustomers", "number of\ntickets", "number of\npurchases", "total\namount",
"number of\ncampaigns"]
bottom = np.zeros(5)
# types of blue color
colors = plt.cm.Blues(np.linspace(0.1, 0.9, 4))
for i in range(4) :
height = list(df_plot.loc[i,size:].values)
plt.bar(x=x, height=height, label = str(df_plot[segment][i]), bottom=bottom, color=colors[i])
bottom+=height
# Adjust margins
plt.subplots_adjust(left = 0.125, right = 0.8, bottom = 0.1, top = 0.9)
plt.legend(title = "segment", loc = "upper right", bbox_to_anchor=(1.2, 1))
plt.ylabel("Fraction represented by the segment (%)")
plt.title(f"Relative weight of each segment regarding business KPIs\nfor {type_of_activity} companies", size=12)
# plt.show()
# def df_segment_mp(df) :
# df_mp = df.groupby("segment")[["gender_female", "gender_male", "gender_other", "country_fr"]].mean().reset_index()
# df_mp.insert(3, "share_known_gender", df_mp["gender_female"]+df_mp["gender_male"])
# df_mp.insert(4, "share_of_women", df_mp["gender_female"]/(df_mp["share_known_gender"]))
# return df_mp
# def df_segment_pb (df) :
# df_pb = df.groupby("segment")[["prop_purchases_internet", "taux_ouverture_mail", "opt_in"]].mean().reset_index()
# return df_pb
def radar_mp_plot(df, categories, index) :
"""
Plot a radar chart describing the marketing personae of the segment associated with the given index, for the given categories.
Args:
- df (DataFrame): The DataFrame containing data about categories describing the marketing personae associated to each segment
- categories (list of str): Names of the columns used as axes of the radar chart.
- index (int): The index (between 0 and 3) identifying the segment. Here, index = number of the segment - 1
Returns:
None
"""
# true values are used to print the true value in parenthesis
tvalues = list(df.loc[index,categories])
max_values = df[categories].max()
# values are true values / max among the 4 segments, which puts each value
# in relation with the values of the other segments:
# if a point reaches the maximal radius, the value is maximal for the segment
# considered, even if it is not equal to 1
values = list(df.loc[index,categories]/max_values)
# values normalized are used to adjust the value around the circle
# for instance if the maximum of values is equal to 0.8, we want the point to be
# at 8/10th of the circle radius, not at the edge
values_normalized = [ max(values) * elt for elt in values]
# Nb of categories
num_categories = len(categories)
angles = np.linspace(0, 2 * np.pi, num_categories, endpoint=False).tolist()
# Initialize graphic
fig, ax = plt.subplots(figsize=(6, 6), subplot_kw=dict(polar=True))
# we have to draw first a transparent line (alpha=0) of values to adjust the radius of the circle
# which is based on max(value)
# if we don't plot this transparent line, the radius of the circle will be too small
ax.plot(angles + angles[:1], values + values[:1], color='skyblue', alpha=0, linewidth=1.5)
ax.plot(angles + angles[:1], values_normalized + values_normalized[:1], color='black', alpha = 0.5, linewidth=1.2)
# fill the sector
ax.fill(angles, values_normalized, color='orange', alpha=0.4)
# labels
ax.set_yticklabels([])
ax.set_xticks(angles)
ticks = [categories[i].replace("_"," ") + f"\n({round(100 * tvalues[i],2)}%)" for i in range(len(categories))]
ax.set_xticklabels(ticks, color="black")
ax.spines['polar'].set_visible(False)
plt.title(f'Characteristics of the segment {index+1}\n')
# plt.show()
def radar_mp_plot_all(df, type_of_activity) :
"""
Plot exactly the same radar charts as radar_mp_plot, but for all segments.
Args:
- df (DataFrame)
- type_of_activity (str)
Returns:
None
"""
# table summarizing variables relative to marketing personae
df_mp = df.groupby("segment")[["gender_female", "gender_male", "gender_other", "age"]].mean().reset_index()
#df_mp.insert(3, "share_known_gender", df_mp["gender_female"]+df_mp["gender_male"])
df_mp.insert(4, "share_of_women", df_mp["gender_female"]/(df_mp["gender_female"]+df_mp["gender_male"]))
# table relative to purchasing behaviour
df_pb = df.groupby("segment")[["prop_purchases_internet", "taux_ouverture_mail", "opt_in"]].mean().reset_index()
# concatenation of tables to prepare the plot
df_used = pd.concat([df_pb, df_mp[[ 'share_of_women', 'age']]], axis=1)
# rename columns for the plot
df_used = df_used.rename(columns={'taux_ouverture_mail': 'mails_opened', 'prop_purchases_internet': 'purchases_internet'})
# visualization
nb_segments = df_used.shape[0]
categories = list(df_used.drop("segment", axis=1).columns)
var_not_perc = ["age"]
# Initialize graphic
fig, ax = plt.subplots(2,2, figsize=(20, 21), subplot_kw=dict(polar=True))
for index in range(nb_segments) :
row = index // 2 # integer division to get the row number
col = index % 2
# true values are used to print the true value in parenthesis
tvalues = list(df_used.loc[index,categories])
max_values = df_used[categories].max()
# values are true values / max among the 4 segments, which puts each value
# in relation with the values of the other segments:
# if a point reaches the maximal radius, the value is maximal for the segment
# considered, even if it is not equal to 1
values = list(df_used.loc[index,categories]/max_values)
# values normalized are used to adjust the value around the circle
# for instance if the maximum of values is equal to 0.8, we want the point to be
# at 8/10th of the circle radius, not at the edge
values_normalized = [ max(values) * elt for elt in values]
# Nb of categories
num_categories = len(categories)
angles = np.linspace(0, 2 * np.pi, num_categories, endpoint=False).tolist()
# we have to draw first a transparent line (alpha=0) of values to adjust the radius of the circle
# which is based on max(value)
# if we don't plot this transparent line, the radius of the circle will be too small
ax[row, col].plot(angles + angles[:1], values + values[:1], color='skyblue', alpha=0, linewidth=1.5)
ax[row, col].plot(angles + angles[:1], values_normalized + values_normalized[:1], color='black', alpha = 0.5,
linewidth=1.2)
# fill the sector
ax[row, col].fill(angles, values_normalized, color='orange', alpha=0.4, label = index)
# labels
ax[row, col].set_yticklabels([])
ax[row, col].set_xticks(angles)
# define the ticks
values_printed = [str(round(tvalues[i],2)) if categories[i] in var_not_perc else f"{round(100 * tvalues[i],2)}%" for i in range(len(categories))]
ticks = [categories[i].replace("_"," ") + f"\n({values_printed[i]})" for i in range(len(categories))]
ax[row, col].set_xticklabels(ticks, color="black", size = 20)
ax[row, col].spines['polar'].set_visible(False)
ax[row, col].set_title(f'Segment {index+1}\n', size = 24)
fig.suptitle(f"Characteristics of marketing personae of {type_of_activity} companies", size=32)
plt.tight_layout()
# plt.show()
def known_sociodemo_caracteristics(df, type_of_activity) :
"""
Compute the share of non-missing values for some sociodemographic characteristics and save the result as a LaTeX table.
Args:
- df (DataFrame)
- type_of_activity (str)
Returns:
None
"""
table_share_known = df.groupby("segment")[["is_profession_known", "is_zipcode_known", "categorie_age_inconnue", "gender_other"]].mean().mul(100).reset_index()
table_share_known.columns = ['Segment', 'Share of Known Profession (%)', 'Share of Known Zipcode (%)', 'Share of Unknown Age (%)', 'Share of Unknown Gender (%)']
table_share_known= table_share_known.pivot_table(index=None, columns='Segment')
# Round the DataFrame values to one decimal place
table_share_known_rounded = table_share_known.round(1)
# Convert the DataFrame to LaTeX format with the rounded values and the '%' symbol
latex_table = tabulate(table_share_known_rounded, headers='keys', tablefmt='latex_raw', floatfmt=".1f")
latex_table = latex_table.replace('%', '\\%')
save_txt_file_s3("table_known_socio_demo_caracteristics", type_of_activity, latex_table)

utils_stat_desc.py (new file, 467 lines)
@@ -0,0 +1,467 @@
# imports needed by the functions below; the project-level helpers
# (display_input_databases, campaigns_kpi_function, tickets_kpi_function,
# customerplus_kpi_function, targets_KPI) and the s3fs filesystem object `fs`
# are expected to be provided by the calling script
import io
from datetime import datetime
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib.dates as mdates
import seaborn as sns
def load_files(nb_compagnie):
"""
Load the raw databases of each company, compute the associated KPIs and
return the concatenated customer, campaign, ticket, product and target DataFrames.
"""
customer = pd.DataFrame()
campaigns_brut = pd.DataFrame()
campaigns_kpi = pd.DataFrame()
products = pd.DataFrame()
tickets = pd.DataFrame()
targets = pd.DataFrame()
for directory_path in nb_compagnie:
df_customerplus_clean_0 = display_input_databases(directory_path, file_name = "customerplus_cleaned")
df_campaigns_brut = display_input_databases(directory_path, file_name = "campaigns_information", datetime_col = ['opened_at', 'sent_at', 'campaign_sent_at'])
df_products_purchased_reduced = display_input_databases(directory_path, file_name = "products_purchased_reduced", datetime_col = ['purchase_date'])
df_target_information = display_input_databases(directory_path, file_name = "target_information")
df_campaigns_kpi = campaigns_kpi_function(campaigns_information = df_campaigns_brut, max_date=pd.Timestamp.now(tz='UTC'))
df_tickets_kpi = tickets_kpi_function(tickets_information = df_products_purchased_reduced)
df_customerplus_clean = customerplus_kpi_function(customerplus_clean = df_customerplus_clean_0)
df_target_KPI = targets_KPI(df_target = df_target_information)
# Merge target KPIs with the customer table and fill missing values with 0
df_target_KPI = pd.merge(df_customerplus_clean_0[['customer_id']], df_target_KPI, how = 'left', on = 'customer_id')
targets_columns = list(df_target_KPI.columns)
targets_columns.remove('customer_id')
df_target_KPI[targets_columns] = df_target_KPI[targets_columns].fillna(0)
# Create company identifier
df_tickets_kpi["number_company"]=int(directory_path)
df_campaigns_brut["number_company"]=int(directory_path)
df_campaigns_kpi["number_company"]=int(directory_path)
df_customerplus_clean["number_company"]=int(directory_path)
df_target_information["number_company"]=int(directory_path)
df_target_KPI["number_company"]=int(directory_path)
# Clean index
df_tickets_kpi["customer_id"]= directory_path + '_' + df_tickets_kpi['customer_id'].astype('str')
df_campaigns_brut["customer_id"]= directory_path + '_' + df_campaigns_brut['customer_id'].astype('str')
df_campaigns_kpi["customer_id"]= directory_path + '_' + df_campaigns_kpi['customer_id'].astype('str')
df_customerplus_clean["customer_id"]= directory_path + '_' + df_customerplus_clean['customer_id'].astype('str')
df_products_purchased_reduced["customer_id"]= directory_path + '_' + df_products_purchased_reduced['customer_id'].astype('str')
# Remove companies' outliers
df_tickets_kpi = remove_outlier_total_amount(df_tickets_kpi)
# harmonize set of customers across databases
customer_id = df_tickets_kpi['customer_id'].to_list()
# filter each DataFrame explicitly (reassigning a loop variable would leave the originals unchanged)
df_campaigns_brut = df_campaigns_brut[df_campaigns_brut['customer_id'].isin(customer_id)]
df_campaigns_kpi = df_campaigns_kpi[df_campaigns_kpi['customer_id'].isin(customer_id)]
df_customerplus_clean = df_customerplus_clean[df_customerplus_clean['customer_id'].isin(customer_id)]
df_target_information = df_target_information[df_target_information['customer_id'].isin(customer_id)]
df_target_KPI["customer_id"]= directory_path + '_' + df_target_KPI['customer_id'].astype('str')
# Concatenation
customer = pd.concat([customer, df_customerplus_clean], ignore_index=True)
campaigns_kpi = pd.concat([campaigns_kpi, df_campaigns_kpi], ignore_index=True)
campaigns_brut = pd.concat([campaigns_brut, df_campaigns_brut], ignore_index=True)
tickets = pd.concat([tickets, df_tickets_kpi], ignore_index=True)
products = pd.concat([products, df_products_purchased_reduced], ignore_index=True)
targets = pd.concat([targets, df_target_KPI], ignore_index=True)
return customer, campaigns_kpi, campaigns_brut, tickets, products, targets
def remove_outlier_total_amount(tickets : pd.DataFrame):
"""
Remove the customers whose total amount spent lies above Q3 + 1.5 * IQR.
"""
Q1 = tickets['total_amount'].quantile(0.25)
Q3 = tickets['total_amount'].quantile(0.75)
IQR = Q3 - Q1
upper = Q3 +1.5*IQR
outliers = tickets[tickets['total_amount'] > upper]['customer_id'].to_list()
tickets = tickets[~tickets['customer_id'].isin(outliers)]
return tickets
def save_file_s3(File_name, type_of_activity):
"""
Save the current matplotlib figure to S3 storage for the type of activity specified.
"""
image_buffer = io.BytesIO()
plt.savefig(image_buffer, format='png', pad_inches=1, bbox_inches="tight", dpi = 150)
image_buffer.seek(0)
FILE_PATH = f"projet-bdc2324-team1/2_Output/2_0_Descriptive_Statistics/{type_of_activity}/"
FILE_PATH_OUT_S3 = FILE_PATH + File_name + type_of_activity + '.png'
with fs.open(FILE_PATH_OUT_S3, 'wb') as s3_file:
s3_file.write(image_buffer.read())
plt.close()
def outlier_detection(tickets : pd.DataFrame, company_list, show_diagram=False):
"""
Detect 'anonymous' customers, identified for each company as the customer with the largest total amount, and optionally plot their share of total revenue.
"""
outlier_list = list()
for company in company_list:
total_amount_share = tickets[tickets['number_company']==int(company)].groupby('customer_id')['total_amount'].sum().reset_index()
total_amount_share['CA'] = total_amount_share['total_amount'].sum()
total_amount_share['share_total_amount'] = total_amount_share['total_amount']/total_amount_share['CA']
total_amount_share_index = total_amount_share.set_index('customer_id')
df_circulaire = total_amount_share_index['total_amount'].sort_values(axis = 0, ascending = False)
#print('df circulaire : ', df_circulaire.head())
top = df_circulaire[:1]
#print('top : ', top)
outlier_list.append(top.index[0])
rest = df_circulaire[1:]
rest_sum = rest.sum()
new_series = pd.concat([top, pd.Series([rest_sum], index=['Autre'])])
if show_diagram:
plt.figure(figsize=(3, 3))
plt.pie(new_series, labels=new_series.index, autopct='%1.1f%%', startangle=140, pctdistance=0.5)
plt.axis('equal')
# plt.title(f'Répartition des montants totaux pour la compagnie {company}')
plt.show()
return outlier_list
def valid_customer_detection(products : pd.DataFrame, campaigns_brut : pd.DataFrame):
"""
Identify customers that fall within the study period (purchase made or campaign received since 2021-05-01).
"""
products_valid = products[products['purchase_date']>="2021-05-01"]
consumer_valid_product = products_valid['customer_id'].to_list()
campaigns_valid = campaigns_brut[campaigns_brut["sent_at"]>="2021-05-01"]
consumer_valid_campaigns = campaigns_valid['customer_id'].to_list()
consumer_valid = consumer_valid_product + consumer_valid_campaigns
return consumer_valid
def identify_purchase_during_target_periode(products : pd.DataFrame):
"""
Identify customers who purchased a ticket during the target period (2022-11-01 to 2023-11-01).
"""
products_target_period = products[(products['purchase_date']>="2022-11-01")
& (products['purchase_date']<="2023-11-01")]
customer_target_period = products_target_period['customer_id'].to_list()
return customer_target_period
def remove_elements(lst, elements_to_remove):
return ''.join([x for x in lst if x not in elements_to_remove])
def compute_nb_clients(customer: pd.DataFrame, type_of_activity: str):
company_nb_clients = customer[customer["purchase_count"]>0].groupby("number_company")["customer_id"].count().reset_index()
plt.figure(figsize=(4,3))
plt.bar(company_nb_clients["number_company"], company_nb_clients["customer_id"]/1000)
plt.xlabel('Company Number')
plt.ylabel("Number of clients (thousands)")
# plt.title(f"Number of clients Across {type_of_activity} Companies")
plt.xticks(company_nb_clients["number_company"], ["{}".format(i) for i in company_nb_clients["number_company"]])
plt.show()
save_file_s3("nb_clients_", type_of_activity)
def maximum_price_paid(customer: pd.DataFrame, type_of_activity: str):
company_max_price = customer.groupby("number_company")["max_price"].max().reset_index()
plt.bar(company_max_price["number_company"], company_max_price["max_price"])
plt.xlabel('Company Number')
plt.ylabel("Maximal price of a ticket Prix")
# plt.title(f"Maximal price of a ticket Across {type_of_activity} Companies")
plt.xticks(company_max_price["number_company"], ["{}".format(i) for i in company_max_price["number_company"]])
plt.show()
save_file_s3("Maximal_price_", type_of_activity)
def target_proportion(customer: pd.DataFrame, type_of_activity: str):
df_y = customer.groupby(["number_company"]).agg({"has_purchased_target_period" : 'sum',
'customer_id' : 'nunique'}).reset_index()
df_y['prop_has_purchased_target_period'] = (df_y["has_purchased_target_period"]/df_y['customer_id'])*100
plt.bar(df_y["number_company"], df_y["prop_has_purchased_target_period"])
plt.xlabel('Company Number')
plt.ylabel('Share (%)')
# plt.title(f'Share of Customers who Bought during the Target Period Across {type_of_activity} Companies')
plt.xticks(df_y["number_company"], ["{}".format(i) for i in df_y["number_company"]])
plt.show()
save_file_s3("share_target_", type_of_activity)
def mailing_consent(customer: pd.DataFrame, type_of_activity: str):
mailing_consent = customer.groupby("number_company")["opt_in"].mean().reset_index()
mailing_consent["opt_in"] *= 100
plt.bar(mailing_consent["number_company"], mailing_consent["opt_in"])
plt.xlabel('Company Number')
plt.ylabel('Mailing Consent (%)')
# plt.title(f'Consent of mailing Across {type_of_activity} Companies')
plt.xticks(mailing_consent["number_company"], ["{}".format(i) for i in mailing_consent["number_company"]])
plt.show()
save_file_s3("mailing_consent_", type_of_activity)
def mailing_consent_by_target(customer: pd.DataFrame, type_of_activity: str):
df_graph = customer.groupby(["number_company", "has_purchased_target_period"])["opt_in"].mean().reset_index()
# Create the grouped barplot
fig, ax = plt.subplots(figsize=(5, 3))
categories = df_graph["number_company"].unique()
bar_width = 0.35
bar_positions = np.arange(len(categories))
# Group the data by label and create the grouped bars
for label in df_graph["has_purchased_target_period"].unique():
label_data = df_graph[df_graph['has_purchased_target_period'] == label]
values = [label_data[label_data['number_company'] == category]['opt_in'].values[0]*100 for category in categories]
label_printed = "Purchase" if label else "No purchase"
ax.bar(bar_positions, values, bar_width, label=label_printed)
# Update bar positions for the next group
bar_positions = [pos + bar_width for pos in bar_positions]
# Add labels, legend, etc.
ax.set_xlabel('Company Number')
ax.set_ylabel('Mailing Consent (%)')
# ax.set_title(f'Consent of mailing according to target Across {type_of_activity} Companies')
ax.set_xticks([pos + bar_width / 2 for pos in np.arange(len(categories))])
ax.set_xticklabels(categories)
ax.legend()
# Display the plot
plt.show()
save_file_s3("mailing_consent_target_", type_of_activity)
def gender_bar(customer: pd.DataFrame, type_of_activity: str):
company_genders = customer.groupby("number_company")[["gender_male", "gender_female", "gender_other"]].mean().reset_index()
company_genders["gender_male"] *= 100
company_genders["gender_female"] *= 100
company_genders["gender_other"] *= 100
# Create the barplot
plt.figure(figsize=(4,3))
plt.bar(company_genders["number_company"], company_genders["gender_male"], label = "Male")
plt.bar(company_genders["number_company"], company_genders["gender_female"],
bottom = company_genders["gender_male"], label = "Female")
plt.bar(company_genders["number_company"], company_genders["gender_other"],
bottom = company_genders["gender_male"] + company_genders["gender_female"], label = "Unknown")
plt.xlabel('Company Number')
plt.ylabel("Frequency (%)")
# plt.title(f"Gender Distribution of Customers Across {type_of_activity} Companies")
plt.legend()
plt.xticks(company_genders["number_company"], ["{}".format(i) for i in company_genders["number_company"]])
plt.show()
save_file_s3("gender_bar_", type_of_activity)
def country_bar(customer: pd.DataFrame, type_of_activity: str):
company_country_fr = customer.groupby("number_company")["country_fr"].mean().reset_index()
company_country_fr["country_fr"] *= 100
plt.figure(figsize=(4,3))
plt.bar(company_country_fr["number_company"], company_country_fr["country_fr"])
plt.xlabel('Company Number')
plt.ylabel("Share of French Customer (%)")
# plt.title(f"Share of French Customer Across {type_of_activity} Companies")
plt.xticks(company_country_fr["number_company"], ["{}".format(i) for i in company_country_fr["number_company"]])
plt.show()
save_file_s3("country_bar_", type_of_activity)
def lazy_customer_plot(campaigns_kpi: pd.DataFrame, type_of_activity: str):
company_lazy_customers = campaigns_kpi.groupby("number_company")[["nb_campaigns", "taux_ouverture_mail"]].mean().reset_index()
company_lazy_customers["taux_ouverture_mail"] *= 100
# Initialize the figure
fig, ax1 = plt.subplots(figsize=(6, 3))
width = 0.4
x = range(len(company_lazy_customers))
# Plot the bars for "nb_campaigns" on the first y-axis
ax1.bar([i - width/2 for i in x], company_lazy_customers['nb_campaigns'], width=width, align='center', label='Amount of Campaigns', color = 'steelblue')
# Set labels and title for the first y-axis
ax1.set_ylabel('Number of Mails Received', color='steelblue')
ax1.tick_params(axis='y', labelcolor='steelblue')
# Create another y-axis for "taux_ouverture_mail"
ax2 = ax1.twinx()
# Plot the bars for "taux_ouverture_mail" on the second y-axis
ax2.bar([i + width/2 for i in x], company_lazy_customers['taux_ouverture_mail'], width=width, align='center', label='Open Mail Rate', color = 'darkorange')
# Set labels and title for the second y-axis
ax2.set_ylabel('Open Mail Rate (%)', color='darkorange')
ax2.tick_params(axis='y', labelcolor='darkorange')
# Set x-axis ticks and labels
ax1.set_xticks(x)
ax1.set_xticklabels(company_lazy_customers['number_company'])
plt.show()
save_file_s3("lazy_customer_", type_of_activity)
def campaigns_effectiveness(customer: pd.DataFrame, type_of_activity: str):
campaigns_effectiveness = customer.groupby(["number_company", "has_purchased_target_period"])["opt_in"].mean().reset_index()
fig, ax = plt.subplots(figsize=(5, 3))
categories = campaigns_effectiveness["number_company"].unique()
bar_width = 0.35
bar_positions = np.arange(len(categories))
# Group the data by label and create the grouped bars
for label in campaigns_effectiveness["has_purchased_target_period"].unique():
label_data = campaigns_effectiveness[campaigns_effectiveness['has_purchased_target_period'] == label]
values = [label_data[label_data['number_company'] == category]['opt_in'].values[0]*100 for category in categories]
label_printed = "Purchase" if label else "No purchase"
ax.bar(bar_positions, values, bar_width, label=label_printed)
# Update bar positions for the next group
bar_positions = [pos + bar_width for pos in bar_positions]
# Add labels, legend, etc.
ax.set_xlabel('Company Number')
ax.set_ylabel('Share of Consent (%)')
# ax.set_title(f"Proportion of customers who have given their consent to receive emails, by customer class ({type_of_activity} companies)")
ax.set_xticks([pos + bar_width / 2 for pos in np.arange(len(categories))])
ax.set_xticklabels(categories)
ax.legend()
plt.show()
save_file_s3("campaigns_effectiveness_", type_of_activity)
def sale_dynamics(products : pd.DataFrame, campaigns_brut : pd.DataFrame, type_of_activity):
purchase_min = products.groupby(['customer_id'])['purchase_date'].min().reset_index()
purchase_min.rename(columns = {'purchase_date' : 'first_purchase_event'}, inplace = True)
purchase_min['first_purchase_event'] = pd.to_datetime(purchase_min['first_purchase_event'])
purchase_min['first_purchase_month'] = pd.to_datetime(purchase_min['first_purchase_event'].dt.strftime('%Y-%m'))
# Month of the first email received
first_mail_received = campaigns_brut.groupby('customer_id')['sent_at'].min().reset_index()
first_mail_received.rename(columns = {'sent_at' : 'first_email_reception'}, inplace = True)
first_mail_received['first_email_reception'] = pd.to_datetime(first_mail_received['first_email_reception'])
first_mail_received['first_email_month'] = pd.to_datetime(first_mail_received['first_email_reception'].dt.strftime('%Y-%m'))
# Merge
known_customer = pd.merge(purchase_min[['customer_id', 'first_purchase_month']],
first_mail_received[['customer_id', 'first_email_month']], on = 'customer_id', how = 'outer')
# Month from which the customer is considered as known
known_customer['known_date'] = pd.to_datetime(known_customer[['first_email_month', 'first_purchase_month']].min(axis = 1), utc = True, format = 'ISO8601')
# Number of purchases per month
purchases_count = pd.merge(products[['customer_id', 'purchase_id', 'purchase_date']].drop_duplicates(), known_customer[['customer_id', 'known_date']], on = ['customer_id'], how = 'inner')
purchases_count['is_customer_known'] = purchases_count['purchase_date'] > purchases_count['known_date'] + pd.DateOffset(months=1)
purchases_count['purchase_date_month'] = pd.to_datetime(purchases_count['purchase_date'].dt.strftime('%Y-%m'))
purchases_count = purchases_count[purchases_count['customer_id'] != 1]
# Number of purchases per month by customer type
nb_purchases_graph = purchases_count.groupby(['purchase_date_month', 'is_customer_known'])['purchase_id'].count().reset_index()
nb_purchases_graph.rename(columns = {'purchase_id' : 'nb_purchases'}, inplace = True)
nb_purchases_graph_2 = purchases_count.groupby(['purchase_date_month', 'is_customer_known'])['customer_id'].nunique().reset_index()
nb_purchases_graph_2.rename(columns = {'customer_id' : 'nb_new_customer'}, inplace = True)
# Plot of the number of purchases
purchases_graph = nb_purchases_graph
purchases_graph_used = purchases_graph[purchases_graph["purchase_date_month"] >= datetime(2021,3,1)]
purchases_graph_used_0 = purchases_graph_used[purchases_graph_used["is_customer_known"]==False]
purchases_graph_used_1 = purchases_graph_used[purchases_graph_used["is_customer_known"]==True]
merged_data = pd.merge(purchases_graph_used_0, purchases_graph_used_1, on="purchase_date_month", suffixes=("_new", "_old"))
plt.figure(figsize=(5.5,4))
plt.bar(merged_data["purchase_date_month"], merged_data["nb_purchases_new"], width=12, label="New Customers")
plt.bar(merged_data["purchase_date_month"], merged_data["nb_purchases_old"],
bottom=merged_data["nb_purchases_new"], width=12, label="Existing Customers")
# format the x-axis to display only month and year
plt.gca().xaxis.set_major_formatter(mdates.DateFormatter('%b%y'))
plt.xlabel('Month')
plt.ylabel("Number of Sales")
# plt.title(f"Number of Sales Across {type_of_activity} Companies")
plt.legend()
plt.show()
save_file_s3("sale_dynamics_", type_of_activity)
def tickets_internet(tickets: pd.DataFrame, type_of_activity: str):
nb_tickets_internet = tickets.groupby("number_company")['prop_purchases_internet'].mean().reset_index()
nb_tickets_internet['prop_purchases_internet'] *=100
plt.bar(nb_tickets_internet["number_company"], nb_tickets_internet["prop_purchases_internet"])
plt.xlabel('Company Number')
plt.ylabel("Share of Purchases Bought Online (%)")
# plt.title(f"Share of Online Purchases Across {type_of_activity} Companies")
plt.xticks(nb_tickets_internet["number_company"], ["{}".format(i) for i in nb_tickets_internet["number_company"]])
plt.show()
save_file_s3("tickets_internet_", type_of_activity)
def already_bought_online(tickets: pd.DataFrame, type_of_activity: str):
nb_consumers_online = (tickets.groupby("number_company").agg({'achat_internet' : 'sum',
'customer_id' : 'nunique'}
).reset_index())
nb_consumers_online["Share_consumers_internet"] = (nb_consumers_online["achat_internet"]/ nb_consumers_online["customer_id"])*100
plt.bar(nb_consumers_online["number_company"], nb_consumers_online["Share_consumers_internet"])
plt.xlabel('Company Number')
plt.ylabel("Share of Customer who Bought Online at least once (%)")
# plt.title(f"Share of Customer who Bought Online at least once Across {type_of_activity} Companies")
plt.xticks(nb_consumers_online["number_company"], ["{}".format(i) for i in nb_consumers_online["number_company"]])
plt.show()
save_file_s3("First_buy_internet_", type_of_activity)
def box_plot_price_tickets(tickets: pd.DataFrame, type_of_activity: str):
price_tickets = tickets[(tickets['total_amount'] > 0)]
plt.figure(figsize=(4,3))
sns.boxplot(data=price_tickets, y="total_amount", x="number_company", showfliers=False, showmeans=True)
# plt.title(f"Box plot of price tickets Across {type_of_activity} Companies")
plt.xlabel('Company Number')
plt.ylabel("Total Amount Spent")
plt.show()
save_file_s3("box_plot_price_tickets_", type_of_activity)
def target_description(targets : pd.DataFrame, type_of_activity: str):
describe_target = targets.groupby('number_company').agg(
prop_target_jeune=('target_jeune', lambda x: (x.sum() / x.count())*100),
prop_target_scolaire=('target_scolaire', lambda x: (x.sum() / x.count())*100),
prop_target_entreprise=('target_entreprise', lambda x: (x.sum() / x.count())*100),
prop_target_famille=('target_famille', lambda x: (x.sum() / x.count())*100),
prop_target_optin=('target_optin', lambda x: (x.sum() / x.count())*100),
prop_target_optout=('target_optout', lambda x: (x.sum() / x.count())*100),
prop_target_newsletter=('target_newsletter', lambda x: (x.sum() / x.count())*100),
prop_target_abonne=('target_abonne', lambda x: (x.sum() / x.count())*100))
plot = describe_target.plot.bar()
# Adding a title
# plot.set_title(f"Distribution of Targets by Category for {type_of_activity} companies")
# Adding labels for x and y axes
plot.set_xlabel("Company Number")
plot.set_ylabel("Target Proportion")
plot.set_xticklabels(plot.get_xticklabels(), rotation=0, horizontalalignment='center')
# Adding a legend
plot.legend(["Youth", "School", "Enterprise", "Family", "Optin", "Optout", "Newsletter", "Subscriber"], title="Target Category")
save_file_s3("target_category_proportion_", type_of_activity)