Merge branch 'main' into segment_value

This commit is contained in:
Antoine JOUBREL 2024-03-28 20:44:06 +00:00
commit 3d6414728c
19 changed files with 2908 additions and 21144 deletions

View File

@ -1,5 +1,8 @@
# Business Data Challenge - Team 1
# Purpose of the script : Construction of training and test datasets for modelling by company
# Input : KPI construction function and clean databases in the 0_Input folder
# Output : Train and test datasets by companies
# Packages
import pandas as pd
import numpy as np
import os
@ -9,12 +12,10 @@ import warnings
from datetime import date, timedelta, datetime
from sklearn.model_selection import train_test_split
# Create filesystem object
S3_ENDPOINT_URL = "https://" + os.environ["AWS_S3_ENDPOINT"]
fs = s3fs.S3FileSystem(client_kwargs={'endpoint_url': S3_ENDPOINT_URL})
# Import KPI construction functions
exec(open('0_KPI_functions.py').read())
@ -24,53 +25,69 @@ warnings.filterwarnings('ignore')
def dataset_construction(min_date, end_features_date, max_date, directory_path):
# Import of cleaned and merged datasets
df_customerplus_clean_0 = display_input_databases(directory_path, file_name = "customerplus_cleaned")
df_campaigns_information = display_input_databases(directory_path, file_name = "campaigns_information", datetime_col = ['opened_at', 'sent_at', 'campaign_sent_at'])
df_products_purchased_reduced = display_input_databases(directory_path, file_name = "products_purchased_reduced", datetime_col = ['purchase_date'])
df_target_information = display_input_databases(directory_path, file_name = "target_information")
# if directory_path == "101":
# df_products_purchased_reduced_1 = display_databases(directory_path, file_name = "products_purchased_reduced_1", datetime_col = ['purchase_date'])
# df_products_purchased_reduced = pd.concat([df_products_purchased_reduced, df_products_purchased_reduced_1])
# Consistency filter for the application of our method
# Dates in datetime format
max_date = pd.to_datetime(max_date, utc = True, format = 'ISO8601')
end_features_date = pd.to_datetime(end_features_date, utc = True, format = 'ISO8601')
min_date = pd.to_datetime(min_date, utc = True, format = 'ISO8601')
# Filter for database df_campaigns_information
df_campaigns_information = df_campaigns_information[(df_campaigns_information['sent_at'] < end_features_date) & (df_campaigns_information['sent_at'] >= min_date)]
df_campaigns_information['opened_at'][df_campaigns_information['opened_at'] >= end_features_date] = np.datetime64('NaT')
# Filter for database df_products_purchased_reduced
df_products_purchased_features = df_products_purchased_reduced[(df_products_purchased_reduced['purchase_date'] < end_features_date) & (df_products_purchased_reduced['purchase_date'] >= min_date)]
print("Data filtering : SUCCESS")
# Building and merging features
# Campaigns features
df_campaigns_kpi = campaigns_kpi_function(campaigns_information = df_campaigns_information, max_date = end_features_date)
# Purchasing behavior features
df_tickets_kpi = tickets_kpi_function(tickets_information = df_products_purchased_features)
# Socio-demographic features
df_customerplus_clean = customerplus_kpi_function(customerplus_clean = df_customerplus_clean_0)
# Targets features
df_targets_kpi = targets_KPI(df_target = df_target_information)
print("KPIs construction : SUCCESS")
# Merge - campaigns features
df_customer = pd.merge(df_customerplus_clean, df_campaigns_kpi, on = 'customer_id', how = 'left')
# Fill NaN values
df_customer[['nb_campaigns', 'nb_campaigns_opened', 'taux_ouverture_mail']] = df_customer[['nb_campaigns', 'nb_campaigns_opened', 'taux_ouverture_mail']].fillna(0)
df_customer['time_to_open'] = df_customer['time_to_open'].fillna(df_customer['time_to_open'].mean())
# Merge - targets features
df_customer = pd.merge(df_customer, df_targets_kpi, on = 'customer_id', how = 'left')
# Fill NaN values
targets_columns = list(df_targets_kpi.columns)
targets_columns.remove('customer_id')
df_customer[targets_columns] = df_customer[targets_columns].fillna(0)
# We standardize the number of targets, a feature closely linked to the company's operations
df_customer['nb_targets'] = (df_customer['nb_targets'] - (df_customer['nb_targets'].mean())) / (df_customer['nb_targets'].std())
# Merge - purchasing behavior features
df_customer_product = pd.merge(df_customer, df_tickets_kpi, on = 'customer_id', how = 'left')
# Fill NaN values
special_fill_nan = ['customer_id', 'purchase_date_min', 'purchase_date_max', 'time_between_purchase']
simple_fill_nan = [column for column in list(df_tickets_kpi.columns) if column not in special_fill_nan]
df_customer_product[simple_fill_nan] = df_customer_product[simple_fill_nan].fillna(0)
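# Editor's note: customers with no purchase have purchase_date_min/max missing; they are
# filled below with max_interval, the full length of the feature window in days, so that
# they read as the least recent possible buyers after the later date-to-days conversion.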
max_interval = (end_features_date - min_date) / np.timedelta64(1, 'D') + 1
df_customer_product[['purchase_date_max', 'purchase_date_min']] = df_customer_product[['purchase_date_max', 'purchase_date_min']].fillna(max_interval)
@ -82,9 +99,9 @@ def dataset_construction(min_date, end_features_date, max_date, directory_path):
print("Explanatory variable construction : SUCCESS")
# 2. Construction of the explained variable
df_products_purchased_to_predict = df_products_purchased_reduced[(df_products_purchased_reduced['purchase_date'] < max_date) & (df_products_purchased_reduced['purchase_date'] >= end_features_date)]
# Construction of the dependent variable
df_products_purchased_to_predict['y_has_purchased'] = 1
y = df_products_purchased_to_predict[['customer_id', 'y_has_purchased']].drop_duplicates()
@ -103,28 +120,24 @@ def dataset_construction(min_date, end_features_date, max_date, directory_path):
return dataset
## Export
# Sectors
companies = {'musee' : ['1', '2', '3', '4'], # , '101'
'sport': ['5', '6', '7', '8', '9'],
'musique' : ['10', '11', '12', '13', '14']}
# Chosen sector
type_of_comp = input('Choisissez le type de compagnie : sport ? musique ? musee ?')
list_of_comp = companies[type_of_comp]
# Create test and train datasets
# start_date, end_of_features, final_date = df_coverage_modelization(list_of_comp, coverage_train = 0.7)
# Export folder
BUCKET_OUT = f'projet-bdc2324-team1/Generalization_v2/{type_of_comp}'
# Dates used for the construction of features and the dependent variable
start_date = "2021-05-01"
end_of_features = "2022-11-01"
final_date = "2023-11-01"
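# Editor's sketch (not in the original script): the three dates define an 18-month feature
# window followed by a 12-month prediction window.
_d = pd.to_datetime([start_date, end_of_features, final_date], utc = True, format = 'ISO8601')
assert (_d[1] - _d[0]).days == 549 and (_d[2] - _d[1]).days == 365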
# Anonymous customer to be deleted from the datasets
anonymous_customer = {'1' : '1_1', '2' : '2_12184', '3' : '3_1', '4' : '4_2', '101' : '101_1',
'5' : '5_191835', '6' : '6_591412', '7' : '7_49632', '8' : '8_1942', '9' : '9_19683',
'10' : '10_19521', '11' : '11_36', '12' : '12_1706757', '13' : '13_8422', '14' : '14_6354'}
@ -133,33 +146,23 @@ for company in list_of_comp:
dataset = dataset_construction(min_date = start_date, end_features_date = end_of_features,
max_date = final_date, directory_path = company)
# Deletion of the anonymous customer
dataset = dataset[dataset['customer_id'] != anonymous_customer[company]]
# Split between train and test
dataset_train, dataset_test = train_test_split(dataset, test_size=0.3, random_state=42)
# Dataset Test
# Export
FILE_KEY_OUT_S3 = "dataset_test" + company + ".csv"
FILE_PATH_OUT_S3 = BUCKET_OUT + "/Test_set/" + FILE_KEY_OUT_S3
with fs.open(FILE_PATH_OUT_S3, 'w') as file_out:
dataset_test.to_csv(file_out, index = False)
print("Exportation dataset test : SUCCESS")
print("Export of dataset test : SUCCESS")
# Dataset train
# Export
FILE_KEY_OUT_S3 = "dataset_train" + company + ".csv"
FILE_PATH_OUT_S3 = BUCKET_OUT + "/Train_set/" + FILE_KEY_OUT_S3
@ -167,7 +170,7 @@ for company in list_of_comp:
with fs.open(FILE_PATH_OUT_S3, 'w') as file_out:
dataset_train.to_csv(file_out, index = False)
print("Exportation dataset train : SUCCESS")
print("Export of dataset train : SUCCESS")
print("FIN DE LA GENERATION DES DATASETS : SUCCESS")
print("End of dataset generation for ", type_of_comp," compagnies : SUCCESS")

View File

@ -21,7 +21,7 @@ warnings.filterwarnings('ignore')
# functions
def generate_test_set(type_of_comp):
file_path_list = fs.ls(f"projet-bdc2324-team1/Generalization/{type_of_comp}/Test_set")
file_path_list = fs.ls(f"projet-bdc2324-team1/Generalization_v2/{type_of_comp}/Test_set")
test_set = pd.DataFrame()
for file in file_path_list:
print(file)
@ -32,7 +32,7 @@ def generate_test_set(type_of_comp):
def generate_train_set(type_of_comp):
file_path_list = fs.ls(f"projet-bdc2324-team1/Generalization/{type_of_comp}/Train_set")
file_path_list = fs.ls(f"projet-bdc2324-team1/Generalization_v2/{type_of_comp}/Train_set")
train_set = pd.DataFrame()
for file in file_path_list:
print(file)
@ -43,7 +43,7 @@ def generate_train_set(type_of_comp):
type_of_comp = input('Choisissez le type de compagnie : sport ? musique ? musee ?')
BUCKET_OUT = f'projet-bdc2324-team1/Generalization_v2/{type_of_comp}/'
# create test and train datasets
test_set = generate_test_set(type_of_comp)

0_4_Generate_stat_desc.py (new file, 74 additions)
View File

@ -0,0 +1,74 @@
import pandas as pd
import numpy as np
import os
import io
import s3fs
import re
import warnings
# Ignore warning
warnings.filterwarnings('ignore')
exec(open('0_KPI_functions.py').read())
exec(open('utils_stat_desc.py').read())
# Create filesystem object
S3_ENDPOINT_URL = "https://" + os.environ["AWS_S3_ENDPOINT"]
fs = s3fs.S3FileSystem(client_kwargs={'endpoint_url': S3_ENDPOINT_URL})
companies = {'musee' : ['1', '2', '3', '4'], # , '101'
'sport': ['5', '6', '7', '8', '9'],
'musique' : ['10', '11', '12', '13', '14']}
type_of_activity = input('Choisissez le type de compagnie : sport ? musique ? musee ?')
list_of_comp = companies[type_of_activity]
# Load files
customer, campaigns_kpi, campaigns_brut, tickets, products, targets = load_files(list_of_comp)
# Identify the anonymous customer of each company and remove them from our datasets
outlier_list = outlier_detection(tickets, list_of_comp)
# Identify valid customers (customers who bought tickets or received emails after the starting date)
customer_valid_list = valid_customer_detection(products, campaigns_brut)
databases = [customer, campaigns_kpi, campaigns_brut, tickets, products]
for i, dataset in enumerate(databases):
    dataset['customer_id'] = dataset['customer_id'].apply(lambda x: remove_elements(x, outlier_list)) # remove outliers
    databases[i] = dataset[dataset['customer_id'].isin(customer_valid_list)] # keep only valid customers
    #print(f'shape of {dataset} : ', dataset.shape)
# the loop variable is rebound on each iteration, so unpack the filtered frames back into their names
customer, campaigns_kpi, campaigns_brut, tickets, products = databases
# Identify customers who bought during the target period (y)
customer_target_period = identify_purchase_during_target_periode(products)
customer['has_purchased_target_period'] = np.where(customer['customer_id'].isin(customer_target_period), 1, 0)
# Generate graphs and automatically save them in the bucket
compute_nb_clients(customer, type_of_activity)
#maximum_price_paid(customer, type_of_activity)
target_proportion(customer, type_of_activity)
mailing_consent(customer, type_of_activity)
mailing_consent_by_target(customer)
gender_bar(customer, type_of_activity)
country_bar(customer, type_of_activity)
lazy_customer_plot(campaigns_kpi, type_of_activity)
campaigns_effectiveness(customer, type_of_activity)
sale_dynamics(products, campaigns_brut, type_of_activity)
tickets_internet(tickets, type_of_activity)
already_bought_online(tickets, type_of_activity)
box_plot_price_tickets(tickets, type_of_activity)
target_description(targets, type_of_activity)

0_5_Machine_Learning.py (new file, 87 additions)
View File

@ -0,0 +1,87 @@
import pandas as pd
import numpy as np
import os
import io
import s3fs
import re
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report, recall_score
from sklearn.utils import class_weight
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.calibration import calibration_curve
from sklearn.preprocessing import OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import StandardScaler, MaxAbsScaler, MinMaxScaler
from sklearn.metrics import make_scorer, f1_score, balanced_accuracy_score
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.metrics import roc_curve, auc, precision_recall_curve, average_precision_score
from sklearn.exceptions import ConvergenceWarning, DataConversionWarning
import pickle
import warnings
exec(open('utils_ml.py').read())
warnings.filterwarnings('ignore')
warnings.filterwarnings("ignore", category=ConvergenceWarning)
warnings.filterwarnings("ignore", category=DataConversionWarning)
# choose the type of companies for which you want to run the pipeline
type_of_activity = input('Choisissez le type de compagnie : sport ? musique ? musee ?')
# choose the type of model
type_of_model = input('Choisissez le type de model : basique ? premium ?')
# load train and test set
# Create filesystem object
S3_ENDPOINT_URL = "https://" + os.environ["AWS_S3_ENDPOINT"]
fs = s3fs.S3FileSystem(client_kwargs={'endpoint_url': S3_ENDPOINT_URL})
dataset_train, dataset_test = load_train_test(type_of_activity, type_of_model)
X_train, X_test, y_train, y_test = features_target_split(dataset_train, dataset_test)
print("Shape train : ", X_train.shape)
print("Shape test : ", X_test.shape)
# processing
weights = class_weight.compute_class_weight(class_weight = 'balanced', classes = np.unique(y_train['y_has_purchased']),
y = y_train['y_has_purchased'])
weight_dict = {np.unique(y_train['y_has_purchased'])[i]: weights[i] for i in range(len(np.unique(y_train['y_has_purchased'])))}
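# Editor's sketch (not in the original script), assuming the binary {0, 1} target: the
# 'balanced' weights satisfy weight_c = n_samples / (n_classes * n_count_c), so the rare
# positive class gets the larger weight (about {0.0: 0.55, 1.0: 5.69} in the exploration notebook).
_counts = y_train['y_has_purchased'].value_counts()
assert all(abs(weight_dict[c] - len(y_train) / (2 * _counts[c])) < 1e-6 for c in weight_dict)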
preproc = preprocess(type_of_model, type_of_activity)
# Object for storing results
model_result = pd.DataFrame(columns= ["Model", "Accuracy", "Recall", "F1_score", "AUC"])
# Naive Bayes
model_result = pipeline_naiveBayes_benchmark(X_train, y_train, X_test, y_test, model_result)
save_result_set_s3(model_result , "resultat", type_of_activity, type_of_model)
print("Naive Bayes : Done")
# Logistic Regression
model_result = pipeline_logreg_benchmark(X_train, y_train, X_test, y_test, model_result)
print("Logistic : Done")
model_result = pipeline_logreg_cv(X_train, y_train, X_test, y_test, model_result)
save_result_set_s3(model_result , "resultat", type_of_activity, type_of_model)
print("Logistic CV : Done")
# Random Forest
model_result = pipeline_randomF_benchmark(X_train, y_train, X_test, y_test, model_result)
save_result_set_s3(model_result , "resultat", type_of_activity, type_of_model)
print("Random Forest : Done")
model_result = pipeline_randomF_cv(X_train, y_train, X_test, y_test, model_result)
save_result_set_s3(model_result , "resultat", type_of_activity, type_of_model)
print("Random Forest CV: Done")
# Save result
save_result_set_s3(model_result , "resultat", type_of_activity, type_of_model)

0_6_Segmentation.py (new file, 40 additions)
View File

@ -0,0 +1,40 @@
import pandas as pd
import numpy as np
import os
import io
import s3fs
import re
import pickle
import warnings
exec(open('utils_segmentation.py').read())
warnings.filterwarnings('ignore')
# Create filesystem object
S3_ENDPOINT_URL = "https://" + os.environ["AWS_S3_ENDPOINT"]
fs = s3fs.S3FileSystem(client_kwargs={'endpoint_url': S3_ENDPOINT_URL})
# choose the type of companies for which you want to run the pipeline
type_of_activity = input('Choisissez le type de compagnie : sport ? musique ? musee ?')
# load test set
dataset_test = load_test_file(type_of_activity)
# Load Model
model = load_model(type_of_activity, 'LogisticRegression_Benchmark')
# Processing
X_test = dataset_test[['nb_tickets', 'nb_purchases', 'total_amount', 'nb_suppliers', 'vente_internet_max', 'purchase_date_min', 'purchase_date_max',
'time_between_purchase', 'nb_tickets_internet', 'is_email_true', 'opt_in', #'is_partner',
'gender_female', 'gender_male', 'gender_other', 'nb_campaigns', 'nb_campaigns_opened']]
y_test = dataset_test[['y_has_purchased']]
# Prediction
y_pred_prob = model.predict_proba(X_test)[:, 1]
# Add probability to dataset_test
dataset_test['Probability_to_buy'] = y_pred_prob
print('probability added to dataset_test')
print(dataset_test.head())
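# Editor's sketch (an assumption, not necessarily the original 0_6 logic): assign each
# customer to one of 4 propensity segments from the predicted probability.
dataset_test['segment'] = pd.cut(dataset_test['Probability_to_buy'], bins=[0, 0.25, 0.5, 0.75, 1], labels=[1, 2, 3, 4], include_lowest=True)
print(dataset_test['segment'].value_counts())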

View File

@ -74,7 +74,7 @@ def preprocessing_customerplus(directory_path):
cleaning_date(customerplus_copy, 'last_visiting_date')
# Variable selection
customerplus_copy.drop(['lastname', 'firstname', 'birthdate', 'language', 'email', 'civility', 'note', 'extra', 'reference', 'extra_field', 'need_reload'], axis = 1, inplace=True) # 'preferred_category', 'preferred_supplier', 'preferred_formula', 'mcp_contact_id', 'last_visiting_date', 'deleted_at', 'last_buying_date', 'max_price', 'ticket_sum', 'average_price', 'average_purchase_delay' , 'average_price_basket', 'average_ticket_basket', 'total_price', 'purchase_count', 'first_buying_date', 'fidelity'
customerplus_copy.rename(columns = {'id' : 'customer_id'}, inplace = True)
return customerplus_copy

View File

@ -13,14 +13,14 @@ def display_input_databases(directory_path, file_name, datetime_col = None):
df = pd.read_csv(file_in, sep=",", parse_dates = datetime_col, date_parser=custom_date_parser)
return df
def campaigns_kpi_function(campaigns_information = None, max_date = "2023-12-01"):
# Number of email campaigns
nb_campaigns = campaigns_information[['customer_id', 'campaign_name']].groupby('customer_id').count().reset_index()
nb_campaigns.rename(columns = {'campaign_name' : 'nb_campaigns'}, inplace = True)
# Average time to open (in hours)
campaigns_information['time_to_open'] = ((pd.to_datetime(campaigns_information['opened_at'], utc = True, format = 'ISO8601') - pd.to_datetime(campaigns_information['delivered_at'], utc = True, format = 'ISO8601')) / np.timedelta64(1, 'h'))
campaigns_information['time_to_open'] = campaigns_information['time_to_open'].fillna((pd.to_datetime(campaigns_information['delivered_at'], utc = True, format = 'ISO8601') - pd.to_datetime(max_date, utc = True, format = 'ISO8601')) / np.timedelta64(1, 'h'))
time_to_open = campaigns_information[['customer_id', 'time_to_open']].groupby('customer_id').mean().reset_index()
@ -44,7 +44,6 @@ def campaigns_kpi_function(campaigns_information = None, max_date = None):
return campaigns_reduced
def tickets_kpi_function(tickets_information = None):
tickets_information_copy = tickets_information.copy()
@ -100,6 +99,8 @@ def customerplus_kpi_function(customerplus_clean = None):
})
gender_dummies = pd.get_dummies(customerplus_clean["gender_label"], prefix='gender').astype(int)
customerplus_clean = pd.concat([customerplus_clean, gender_dummies], axis=1)
customerplus_clean.drop(columns = "gender", inplace = True)
# Age
customerplus_clean['categorie_age_0_10'] = ((customerplus_clean['age'] >= 0) & (customerplus_clean['age'] < 10)).astype(int)
@ -112,19 +113,53 @@ def customerplus_kpi_function(customerplus_clean = None):
customerplus_clean['categorie_age_70_80'] = ((customerplus_clean['age'] >= 70) & (customerplus_clean['age'] < 80)).astype(int)
customerplus_clean['categorie_age_plus_80'] = (customerplus_clean['age'] >= 80).astype(int)
customerplus_clean['categorie_age_inconnue'] = customerplus_clean['age'].apply(lambda x: 1 if pd.isna(x) else 0)
# customerplus_clean.drop(columns = "age", inplace = True)
# Mailing consent
customerplus_clean['opt_in'] = customerplus_clean['opt_in'].astype(int)
# Indicator for whether the individual lives in France
customerplus_clean["country_fr"] = customerplus_clean["country"].apply(lambda x : int(x=="fr") if pd.notna(x) else np.nan)
# customerplus_clean.drop(columns = "country", inplace = True)
customerplus_clean['is_profession_known'] = customerplus_clean['profession'].notna().astype(int)
# customerplus_clean.drop(columns = "profession", inplace = True)
customerplus_clean['is_zipcode_known'] = customerplus_clean['zipcode'].notna().astype(int)
# customerplus_clean.drop(columns = "zipcode", inplace = True)
# Dummy if the customer has a structure id (tags)
# customerplus_clean['has_tags'] = customerplus_clean['structure_id'].apply(lambda x: 1 if not pd.isna(x) else 0)
return customerplus_clean
def targets_KPI(df_target = None):
df_target['target_name'] = df_target['target_name'].fillna('').str.lower()
# Target name categories for museums
df_target['target_jeune'] = df_target['target_name'].str.contains('|'.join(['jeune', 'pass_culture', 'etudiant', '12-25 ans', 'student', 'jeunesse']), case=False).astype(int)
df_target['target_optin'] = df_target['target_name'].str.contains('|'.join(['optin' ,'opt-in']), case=False).astype(int)
df_target['target_optout'] = df_target['target_name'].str.contains('|'.join(['optout', 'unsubscribed']), case=False).astype(int)
df_target['target_scolaire'] = df_target['target_name'].str.contains('|'.join(['scolaire' , 'enseignant', 'chercheur', 'schulen', 'école']), case=False).astype(int)
df_target['target_entreprise'] = df_target['target_name'].str.contains('|'.join(['b2b', 'btob', 'cse']), case=False).astype(int)
df_target['target_famille'] = df_target['target_name'].str.contains('|'.join(['famille', 'enfants', 'family']), case=False).astype(int)
df_target['target_newsletter'] = df_target['target_name'].str.contains('|'.join(['nl', 'newsletter']), case=False).astype(int)
# Target name categories for sport companies
df_target['target_abonne'] = ((
df_target['target_name']
.str.contains('|'.join(['abo', 'adh']), case=False)
& ~df_target['target_name'].str.contains('|'.join(['hors abo', 'anciens abo']), case=False)
).astype(int))
df_target_categorie = df_target.groupby('customer_id')[['target_jeune', 'target_optin', 'target_optout', 'target_scolaire', 'target_entreprise', 'target_famille', 'target_newsletter', 'target_abonne']].max()
target_agg = df_target.groupby('customer_id').agg(
nb_targets=('target_name', 'nunique') # named aggregation: tuples specify the output column names
# all_targets=('target_name', concatenate_names),
# all_target_types=('target_type_name', concatenate_names)
).reset_index()
target_agg = pd.merge(target_agg, df_target_categorie, how='left', on='customer_id')
return target_agg

File diff suppressed because one or more lines are too long

View File

@ -0,0 +1,68 @@
import pandas as pd
import numpy as np
import os
import io
import s3fs
import re
import warnings
# Ignore warning
warnings.filterwarnings('ignore')
exec(open('../0_KPI_functions.py').read())
exec(open('plot.py').read())
# Create filesystem object
S3_ENDPOINT_URL = "https://" + os.environ["AWS_S3_ENDPOINT"]
fs = s3fs.S3FileSystem(client_kwargs={'endpoint_url': S3_ENDPOINT_URL})
companies = {'musee' : ['1', '2', '3', '4'], # , '101'
'sport': ['5'],
'musique' : ['10', '11', '12', '13', '14']}
type_of_activity = input('Choisissez le type de compagnie : sport ? musique ? musee ?')
list_of_comp = companies[type_of_activity]
# Load files
customer, campaigns_kpi, campaigns_brut, tickets, products = load_files(list_of_comp)
# Identify the anonymous customer of each company and remove them from our datasets
outlier_list = outlier_detection(tickets, list_of_comp)
# Identify valid customers (customers who bought tickets or received emails after the starting date)
customer_valid_list = valid_customer_detection(products, campaigns_brut)
databases = [customer, campaigns_kpi, campaigns_brut, tickets, products]
for i, dataset in enumerate(databases):
    dataset['customer_id'] = dataset['customer_id'].apply(lambda x: remove_elements(x, outlier_list)) # remove outliers
    databases[i] = dataset[dataset['customer_id'].isin(customer_valid_list)] # keep only valid customers
    #print(f'shape of {dataset} : ', dataset.shape)
# the loop variable is rebound on each iteration, so unpack the filtered frames back into their names
customer, campaigns_kpi, campaigns_brut, tickets, products = databases
# Identify customers who bought during the target period (y)
customer_target_period = identify_purchase_during_target_periode(products)
customer['has_purchased_target_period'] = np.where(customer['customer_id'].isin(customer_target_period), 1, 0)
# Generate graphs and automatically save them in the bucket
compute_nb_clients(customer, type_of_activity)
maximum_price_paid(customer, type_of_activity)
mailing_consent(customer, type_of_activity)
mailing_consent_by_target(customer)
gender_bar(customer, type_of_activity)
country_bar(customer, type_of_activity)
lazy_customer_plot(campaigns_kpi, type_of_activity)
#campaigns_effectiveness(customer, type_of_activity)
sale_dynamics(products, campaigns_brut, type_of_activity)
tickets_internet(tickets, type_of_activity)
box_plot_price_tickets(tickets, type_of_activity)

View File

@ -0,0 +1,328 @@
import pandas as pd
import os
import s3fs
import io
import warnings
from datetime import date, timedelta, datetime
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.dates as mdates
import seaborn as sns
def load_files(nb_compagnie):
customer = pd.DataFrame()
campaigns_brut = pd.DataFrame()
campaigns_kpi = pd.DataFrame()
products = pd.DataFrame()
tickets = pd.DataFrame()
# start of the loop generating aggregated datasets for the selected entertainment companies
for directory_path in nb_compagnie:
df_customerplus_clean_0 = display_databases(directory_path, file_name = "customerplus_cleaned")
df_campaigns_brut = display_databases(directory_path, file_name = "campaigns_information", datetime_col = ['opened_at', 'sent_at', 'campaign_sent_at'])
df_products_purchased_reduced = display_databases(directory_path, file_name = "products_purchased_reduced", datetime_col = ['purchase_date'])
df_target_information = display_databases(directory_path, file_name = "target_information")
df_campaigns_kpi = campaigns_kpi_function(campaigns_information = df_campaigns_brut)
df_tickets_kpi = tickets_kpi_function(tickets_information = df_products_purchased_reduced)
df_customerplus_clean = customerplus_kpi_function(customerplus_clean = df_customerplus_clean_0)
# create the number_company column, used to aggregate the results
df_tickets_kpi["number_company"]=int(directory_path)
df_campaigns_brut["number_company"]=int(directory_path)
df_campaigns_kpi["number_company"]=int(directory_path)
df_customerplus_clean["number_company"]=int(directory_path)
df_target_information["number_company"]=int(directory_path)
# Index processing: prefix customer ids with the company number
df_tickets_kpi["customer_id"]= directory_path + '_' + df_tickets_kpi['customer_id'].astype('str')
df_campaigns_brut["customer_id"]= directory_path + '_' + df_campaigns_brut['customer_id'].astype('str')
df_campaigns_kpi["customer_id"]= directory_path + '_' + df_campaigns_kpi['customer_id'].astype('str')
df_customerplus_clean["customer_id"]= directory_path + '_' + df_customerplus_clean['customer_id'].astype('str')
df_products_purchased_reduced["customer_id"]= directory_path + '_' + df_products_purchased_reduced['customer_id'].astype('str')
# Concatenation
customer = pd.concat([customer, df_customerplus_clean], ignore_index=True)
campaigns_kpi = pd.concat([campaigns_kpi, df_campaigns_kpi], ignore_index=True)
campaigns_brut = pd.concat([campaigns_brut, df_campaigns_brut], ignore_index=True)
tickets = pd.concat([tickets, df_tickets_kpi], ignore_index=True)
products = pd.concat([products, df_products_purchased_reduced], ignore_index=True)
return customer, campaigns_kpi, campaigns_brut, tickets, products
def save_file_s3(File_name, type_of_activity):
image_buffer = io.BytesIO()
plt.savefig(image_buffer, format='png')
image_buffer.seek(0)
FILE_PATH = f"projet-bdc2324-team1/stat_desc/{type_of_activity}/"
FILE_PATH_OUT_S3 = FILE_PATH + File_name + type_of_activity + '.png'
with fs.open(FILE_PATH_OUT_S3, 'wb') as s3_file:
s3_file.write(image_buffer.read())
plt.close()
def outlier_detection(tickets, company_list, show_diagram=False):
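# editor's note: for each company this flags the customer concentrating the largest share
# of total revenue, in practice the anonymous aggregate account that is removed later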
outlier_list = list()
for company in company_list:
total_amount_share = tickets[tickets['number_company']==int(company)].groupby('customer_id')['total_amount'].sum().reset_index()
total_amount_share['CA'] = total_amount_share['total_amount'].sum()
total_amount_share['share_total_amount'] = total_amount_share['total_amount']/total_amount_share['CA']
total_amount_share_index = total_amount_share.set_index('customer_id')
df_circulaire = total_amount_share_index['total_amount'].sort_values(axis = 0, ascending = False)
#print('df circulaire : ', df_circulaire.head())
top = df_circulaire[:1]
#print('top : ', top)
outlier_list.append(top.index[0])
rest = df_circulaire[1:]
rest_sum = rest.sum()
new_series = pd.concat([top, pd.Series([rest_sum], index=['Autre'])])
if show_diagram:
plt.figure(figsize=(3, 3))
plt.pie(new_series, labels=new_series.index, autopct='%1.1f%%', startangle=140, pctdistance=0.5)
plt.axis('equal')
plt.title(f'Répartition des montants totaux pour la compagnie {company}')
plt.show()
return outlier_list
def valid_customer_detection(products, campaigns_brut):
products_valid = products[products['purchase_date']>="2021-05-01"]
consumer_valid_product = products_valid['customer_id'].to_list()
campaigns_valid = campaigns_brut[campaigns_brut["sent_at"]>="2021-05-01"]
consumer_valid_campaigns = campaigns_valid['customer_id'].to_list()
consumer_valid = consumer_valid_product + consumer_valid_campaigns
return consumer_valid
def identify_purchase_during_target_periode(products):
products_target_period = products[(products['purchase_date']>="2022-11-01")
& (products['purchase_date']<="2023-11-01")]
customer_target_period = products_target_period['customer_id'].to_list()
return customer_target_period
def remove_elements(lst, elements_to_remove):
    # return an empty id for detected outliers so the corresponding rows can be filtered
    # out downstream (the original character-by-character join never matched the full ids)
    return '' if lst in elements_to_remove else lst
def compute_nb_clients(customer, type_of_activity):
company_nb_clients = customer[customer["purchase_count"]>0].groupby("number_company")["customer_id"].count().reset_index()
plt.bar(company_nb_clients["number_company"], company_nb_clients["customer_id"]/1000)
plt.xlabel('Company')
plt.ylabel("Number of clients (thousands)")
plt.title(f"Number of clients for {type_of_activity}")
plt.xticks(company_nb_clients["number_company"], ["{}".format(i) for i in company_nb_clients["number_company"]])
plt.show()
save_file_s3("nb_clients_", type_of_activity)
def maximum_price_paid(customer, type_of_activity):
company_max_price = customer.groupby("number_company")["max_price"].max().reset_index()
plt.bar(company_max_price["number_company"], company_max_price["max_price"])
plt.xlabel('Company')
plt.ylabel("Maximal price of a ticket Prix")
plt.title(f"Maximal price of a ticket for {type_of_activity}")
plt.xticks(company_max_price["number_company"], ["{}".format(i) for i in company_max_price["number_company"]])
plt.show()
save_file_s3("Maximal_price_", type_of_activity)
def mailing_consent(customer, type_of_activity):
mailing_consent = customer.groupby("number_company")["opt_in"].mean().reset_index()
plt.bar(mailing_consent["number_company"], mailing_consent["opt_in"])
plt.xlabel('Company')
plt.ylabel('Consent')
plt.title(f'Consent of mailing for {type_of_activity}')
plt.xticks(mailing_consent["number_company"], ["{}".format(i) for i in mailing_consent["number_company"]])
plt.show()
save_file_s3("mailing_consent_", type_of_activity)
def mailing_consent_by_target(customer):
df_graph = customer.groupby(["number_company", "has_purchased_target_period"])["opt_in"].mean().reset_index()
# Create the grouped barplot
fig, ax = plt.subplots(figsize=(10, 6))
categories = df_graph["number_company"].unique()
bar_width = 0.35
bar_positions = np.arange(len(categories))
# Group the data by label and create the grouped bars
for label in df_graph["has_purchased_target_period"].unique():
label_data = df_graph[df_graph['has_purchased_target_period'] == label]
values = [label_data[label_data['number_company'] == category]['opt_in'].values[0]*100 for category in categories]
label_printed = "purchased" if label else "no purchase"
ax.bar(bar_positions, values, bar_width, label=label_printed)
# Update the bar positions for the next group
bar_positions = [pos + bar_width for pos in bar_positions]
# Add the labels, the legend, etc.
ax.set_xlabel('Company')
ax.set_ylabel('Consent')
ax.set_title(f'Consent of mailing according to target for {type_of_activity}')
ax.set_xticks([pos + bar_width / 2 for pos in np.arange(len(categories))])
ax.set_xticklabels(categories)
ax.legend()
# Display the plot
plt.show()
save_file_s3("mailing_consent_target_", type_of_activity)
def gender_bar(customer, type_of_activity):
company_genders = customer.groupby("number_company")[["gender_male", "gender_female", "gender_other"]].mean().reset_index()
# Create the barplot
plt.bar(company_genders["number_company"], company_genders["gender_male"], label = "Homme")
plt.bar(company_genders["number_company"], company_genders["gender_female"],
bottom = company_genders["gender_male"], label = "Femme")
plt.bar(company_genders["number_company"], company_genders["gender_other"],
bottom = company_genders["gender_male"] + company_genders["gender_female"], label = "Inconnu")
plt.xlabel('Company')
plt.ylabel("Gender")
plt.title(f"Gender of Customer for {type_of_activity}")
plt.legend()
plt.xticks(company_genders["number_company"], ["{}".format(i) for i in company_genders["number_company"]])
plt.show()
save_file_s3("gender_bar_", type_of_activity)
def country_bar(customer, type_of_activity):
company_country_fr = customer.groupby("number_company")["country_fr"].mean().reset_index()
plt.bar(company_country_fr["number_company"], company_country_fr["country_fr"])
plt.xlabel('Company')
plt.ylabel("Share of French Customer")
plt.title(f"Share of French Customer for {type_of_activity}")
plt.xticks(company_country_fr["number_company"], ["{}".format(i) for i in company_country_fr["number_company"]])
plt.show()
save_file_s3("country_bar_", type_of_activity)
def lazy_customer_plot(campaigns_kpi, type_of_activity):
company_lazy_customers = campaigns_kpi.groupby("number_company")["nb_campaigns_opened"].mean().reset_index()
plt.bar(company_lazy_customers["number_company"], company_lazy_customers["nb_campaigns_opened"])
plt.xlabel('Company')
plt.ylabel("Share of Customers who did not open mail")
plt.title(f"Share of Customers who did not open mail for {type_of_activity}")
plt.xticks(company_lazy_customers["number_company"], ["{}".format(i) for i in company_lazy_customers["number_company"]])
plt.show()
save_file_s3("lazy_customer_", type_of_activity)
def campaigns_effectiveness(customer, type_of_activity):
campaigns_effectiveness = customer.groupby("number_company")["opt_in"].mean().reset_index()
plt.bar(campaigns_effectiveness["number_company"], campaigns_effectiveness["opt_in"])
plt.xlabel('Company')
plt.ylabel("Number of Customers (thousands)")
plt.title(f"Number of Customers of have bought or have received mails for {type_of_activity}")
plt.legend()
plt.xticks(campaigns_effectiveness["number_company"], ["{}".format(i) for i in campaigns_effectiveness["number_company"]])
plt.show()
save_file_s3("campaigns_effectiveness_", type_of_activity)
def sale_dynamics(products, campaigns_brut, type_of_activity):
purchase_min = products.groupby(['customer_id'])['purchase_date'].min().reset_index()
purchase_min.rename(columns = {'purchase_date' : 'first_purchase_event'}, inplace = True)
purchase_min['first_purchase_event'] = pd.to_datetime(purchase_min['first_purchase_event'])
purchase_min['first_purchase_month'] = pd.to_datetime(purchase_min['first_purchase_event'].dt.strftime('%Y-%m'))
# Month of the first email
first_mail_received = campaigns_brut.groupby('customer_id')['sent_at'].min().reset_index()
first_mail_received.rename(columns = {'sent_at' : 'first_email_reception'}, inplace = True)
first_mail_received['first_email_reception'] = pd.to_datetime(first_mail_received['first_email_reception'])
first_mail_received['first_email_month'] = pd.to_datetime(first_mail_received['first_email_reception'].dt.strftime('%Y-%m'))
# Merge
known_customer = pd.merge(purchase_min[['customer_id', 'first_purchase_month']],
first_mail_received[['customer_id', 'first_email_month']], on = 'customer_id', how = 'outer')
# Month from which the customer is considered known
known_customer['known_date'] = pd.to_datetime(known_customer[['first_email_month', 'first_purchase_month']].min(axis = 1), utc = True, format = 'ISO8601')
# Number of orders per month
purchases_count = pd.merge(products[['customer_id', 'purchase_id', 'purchase_date']].drop_duplicates(), known_customer[['customer_id', 'known_date']], on = ['customer_id'], how = 'inner')
purchases_count['is_customer_known'] = purchases_count['purchase_date'] > purchases_count['known_date'] + pd.DateOffset(months=1)
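# editor's note: a purchase counts as coming from a "known" customer when it occurs more
# than one month after the first recorded contact (first email or first purchase)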
purchases_count['purchase_date_month'] = pd.to_datetime(purchases_count['purchase_date'].dt.strftime('%Y-%m'))
purchases_count = purchases_count[purchases_count['customer_id'] != 1]
# Number of orders per month by customer type
nb_purchases_graph = purchases_count.groupby(['purchase_date_month', 'is_customer_known'])['purchase_id'].count().reset_index()
nb_purchases_graph.rename(columns = {'purchase_id' : 'nb_purchases'}, inplace = True)
nb_purchases_graph_2 = purchases_count.groupby(['purchase_date_month', 'is_customer_known'])['customer_id'].nunique().reset_index()
nb_purchases_graph_2.rename(columns = {'customer_id' : 'nb_new_customer'}, inplace = True)
# Plot of the number of orders
purchases_graph = nb_purchases_graph
purchases_graph_used = purchases_graph[purchases_graph["purchase_date_month"] >= datetime(2021,3,1)]
purchases_graph_used_0 = purchases_graph_used[purchases_graph_used["is_customer_known"]==False]
purchases_graph_used_1 = purchases_graph_used[purchases_graph_used["is_customer_known"]==True]
merged_data = pd.merge(purchases_graph_used_0, purchases_graph_used_1, on="purchase_date_month", suffixes=("_new", "_old"))
plt.bar(merged_data["purchase_date_month"], merged_data["nb_purchases_new"], width=12, label="Nouveau client")
plt.bar(merged_data["purchase_date_month"], merged_data["nb_purchases_old"],
bottom=merged_data["nb_purchases_new"], width=12, label="Ancien client")
# format the x-axis to display only abbreviated month/year labels
plt.gca().xaxis.set_major_formatter(mdates.DateFormatter('%b%y'))
plt.xlabel('Month')
plt.ylabel("Number of Sales")
plt.title(f"Number of Sales for {type_of_activity}")
plt.legend()
plt.show()
save_file_s3("sale_dynamics_", type_of_activity)
def tickets_internet(tickets, type_of_activity):
nb_tickets_internet = tickets.groupby("number_company")[["nb_tickets", "nb_tickets_internet"]].sum().reset_index()
nb_tickets_internet["Share_ticket_internet"] = nb_tickets_internet["nb_tickets_internet"]*100 / nb_tickets_internet["nb_tickets"]
plt.bar(nb_tickets_internet["number_company"], nb_tickets_internet["Share_ticket_internet"])
plt.xlabel('Company')
plt.ylabel("Share of Tickets Bought Online")
plt.title(f"Share of Tickets Bought Online for {type_of_activity}")
plt.xticks(nb_tickets_internet["number_company"], ["{}".format(i) for i in nb_tickets_internet["number_company"]])
plt.show()
save_file_s3("tickets_internet_", type_of_activity)
def box_plot_price_tickets(tickets, type_of_activity):
price_tickets = tickets[(tickets['total_amount'] > 0)]
sns.boxplot(data=price_tickets, y="total_amount", x="number_company", showfliers=False, showmeans=True)
plt.title(f"Box plot of price tickets for {type_of_activity}")
plt.xticks(price_tickets["number_company"], ["{}".format(i) for i in price_tickets["number_company"]])
plt.show()
save_file_s3("box_plot_price_tickets_", type_of_activity)

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

README.md (new file, 33 additions)
View File

@ -0,0 +1,33 @@
# Business data challenge 2023-2024 | ENSAE Paris
# Arenametrix: customer segmentation
## Team 1
* Antoine JOUBREL
* Alexis REVELLE
* Fanta RODRIGUE
* Thomas PIQUÉ
## Coaches
* Elia LAPENTA
* Michael VISSER
### Problem description
The goal of this project is to create segments of customers for 15 companies belonging to 3 different types of activity (sport, museums, and music).
### Our approach
We opted for a sector-based approach, which means that 3 segmentations were performed (one for each type of activity).
As the segments have to be linked to a probability of future purchase, we segment directly on the predicted probability of a purchase during the coming year. The first modeling step is a pipeline that fits 3 ML models (naive Bayes, random forest, and logistic regression) to predict whether a customer will purchase during the year. We then use the estimated purchase probability to split the customers into 4 segments; for each segment, we can estimate the potential number of tickets and revenue for the coming year, as in the sketch below.
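A minimal illustration of this segment-then-estimate step, assuming a scored table with columns `score`, `nb_tickets`, and `total_amount` (illustrative names, not necessarily the exact ones used in the repo):

```python
import pandas as pd

def summarize_segments(scored: pd.DataFrame, proba_col: str = "score") -> pd.DataFrame:
    scored = scored.copy()
    # 4 segments of increasing purchase propensity, one per probability quartile
    scored["segment"] = pd.qcut(scored[proba_col], q=4, labels=[1, 2, 3, 4])
    # probability-weighted expectations of tickets and revenue for the coming year
    scored["expected_tickets"] = scored[proba_col] * scored["nb_tickets"]
    scored["expected_revenue"] = scored[proba_col] * scored["total_amount"]
    return scored.groupby("segment")[["expected_tickets", "expected_revenue"]].sum()
```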
### How to run the code
- run 0_1_Input_cleaning.py to clean the raw data and generate dataframes that will be used to build datasets with insightful variables.
- run 0_2_Dataset_construction.py.
- run 0_3_General_modelization_dataset.py to generate test and train sets for the 3 types of activity.
- run 0_4_Generate_stat_desc.py to generate graphics describing the data.
- run 0_5_Machine_Learning.py: 3 ML models will be fitted on the data, and results will be exported for all 3 types of activity.
- run 0_6_Segmentation.py: the test set will be scored with the optimal parameters computed previously, which yields a propensity score (the probability of a future purchase). Segmentation is performed according to these scores. This script exports graphics describing the marketing personas associated with the segments as well as their business value.
- run 0_7_CA_segment.py: the scores will be adjusted to better fit the overall probability of a purchase. The adjusted score is used to estimate the number of tickets sold and the revenue generated during the coming year.

View File

@ -65,7 +65,7 @@
},
{
"cell_type": "code",
"execution_count": 9,
"execution_count": 3,
"id": "2f0d08c9-5b26-4eff-9c89-4a46f427dbf7",
"metadata": {},
"outputs": [],
@ -115,9 +115,9 @@
"name": "stderr",
"output_type": "stream",
"text": [
"/tmp/ipykernel_570/3642896088.py:7: DtypeWarning: Columns (38) have mixed types. Specify dtype option on import or set low_memory=False.\n",
"/tmp/ipykernel_426/3642896088.py:7: DtypeWarning: Columns (38) have mixed types. Specify dtype option on import or set low_memory=False.\n",
" dataset_train = pd.read_csv(file_in, sep=\",\")\n",
"/tmp/ipykernel_570/3642896088.py:11: DtypeWarning: Columns (38) have mixed types. Specify dtype option on import or set low_memory=False.\n",
"/tmp/ipykernel_426/3642896088.py:11: DtypeWarning: Columns (38) have mixed types. Specify dtype option on import or set low_memory=False.\n",
" dataset_test = pd.read_csv(file_in, sep=\",\")\n"
]
}
@ -228,7 +228,7 @@
},
{
"cell_type": "code",
"execution_count": 10,
"execution_count": 9,
"id": "6224fd31-c190-4168-b395-e0bf5806d79d",
"metadata": {},
"outputs": [
@ -238,7 +238,7 @@
"{0.0: 0.5481283836040216, 1.0: 5.694439980716696}"
]
},
"execution_count": 10,
"execution_count": 9,
"metadata": {},
"output_type": "execute_result"
}
@ -254,7 +254,7 @@
},
{
"cell_type": "code",
"execution_count": 58,
"execution_count": 10,
"id": "4680f202-979e-483f-89b8-9df877203bcf",
"metadata": {},
"outputs": [
@ -265,7 +265,7 @@
" 0.54812838])"
]
},
"execution_count": 58,
"execution_count": 10,
"metadata": {},
"output_type": "execute_result"
}
@ -282,7 +282,7 @@
},
{
"cell_type": "code",
"execution_count": 65,
"execution_count": 11,
"id": "5f747be4-e70b-491c-8f0a-46cb278a2dee",
"metadata": {},
"outputs": [
@ -311,7 +311,7 @@
},
{
"cell_type": "code",
"execution_count": 258,
"execution_count": 12,
"id": "ab25a901-28da-4504-a7d1-bf41fa5068bc",
"metadata": {},
"outputs": [
@ -650,7 +650,7 @@
"[354365 rows x 17 columns]"
]
},
"execution_count": 258,
"execution_count": 12,
"metadata": {},
"output_type": "execute_result"
}
@ -662,7 +662,7 @@
},
{
"cell_type": "code",
"execution_count": 259,
"execution_count": 13,
"id": "648fb542-0186-493d-b274-be2c26a11967",
"metadata": {},
"outputs": [],
@ -677,7 +677,7 @@
},
{
"cell_type": "code",
"execution_count": 260,
"execution_count": 14,
"id": "978b9ebc-aa97-41d7-a48f-d1f79c1ed482",
"metadata": {},
"outputs": [
@ -1016,7 +1016,7 @@
"[354365 rows x 17 columns]"
]
},
"execution_count": 260,
"execution_count": 14,
"metadata": {},
"output_type": "execute_result"
}
@ -1510,12 +1510,14 @@
"\n",
"- variables à retirer : fidelity (valeurs trop grandes dont l'exp -> +inf, autre problème : st basé sur des infos qu'on a pas sur la période étudiée mais slt sur période d'évaluation), time between purchase (revoir sa construction), gender_other (colinéarité avec les autres var de genre)\n",
"- ajouter un intercept\n",
"- pas besoin de standardiser pour le moment, mais à faire quand on passera au modèle LASSO "
"- pas besoin de standardiser pour le moment, mais à faire quand on passera au modèle LASSO\n",
"\n",
"#### A recopier dans la pipeline -> section 2 bis"
]
},
{
"cell_type": "code",
"execution_count": 266,
"execution_count": 15,
"id": "e6c8ccc7-6ab8-4e3c-af28-e71d17c07bcb",
"metadata": {},
"outputs": [
@ -1817,7 +1819,7 @@
"[354365 rows x 15 columns]"
]
},
"execution_count": 266,
"execution_count": 15,
"metadata": {},
"output_type": "execute_result"
}
@ -1831,7 +1833,7 @@
},
{
"cell_type": "code",
"execution_count": 267,
"execution_count": 16,
"id": "0e968aa1-fbec-47db-b570-4730ef7eebf2",
"metadata": {},
"outputs": [
@ -1847,8 +1849,8 @@
"Dep. Variable: y No. Observations: 354365\n",
"Model: Logit Df Residuals: 354350\n",
"Method: MLE Df Model: 14\n",
"Date: Fri, 15 Mar 2024 Pseudo R-squ.: 0.2112\n",
"Time: 10:07:29 Log-Likelihood: -83135.\n",
"Date: Thu, 21 Mar 2024 Pseudo R-squ.: 0.2112\n",
"Time: 07:57:46 Log-Likelihood: -83135.\n",
"converged: True LL-Null: -1.0540e+05\n",
"Covariance Type: nonrobust LLR p-value: 0.000\n",
"=======================================================================================\n",
@ -1887,7 +1889,7 @@
},
{
"cell_type": "code",
"execution_count": 268,
"execution_count": 17,
"id": "2475f2fe-3d1f-4845-9ede-0416dac83271",
"metadata": {},
"outputs": [],
@ -1908,7 +1910,7 @@
},
{
"cell_type": "code",
"execution_count": 269,
"execution_count": 18,
"id": "696fcc04-e5df-45dc-a1b9-57c30d4d671d",
"metadata": {},
"outputs": [
@ -2210,7 +2212,7 @@
"[354365 rows x 15 columns]"
]
},
"execution_count": 269,
"execution_count": 18,
"metadata": {},
"output_type": "execute_result"
}
@ -2221,7 +2223,7 @@
},
{
"cell_type": "code",
"execution_count": 289,
"execution_count": 19,
"id": "54421677-640f-4f37-9a0d-d9a2cc3572b0",
"metadata": {},
"outputs": [
@ -2237,8 +2239,8 @@
"Dep. Variable: y No. Observations: 354365\n",
"Model: Logit Df Residuals: 354350\n",
"Method: MLE Df Model: 14\n",
"Date: Fri, 15 Mar 2024 Pseudo R-squ.: 0.2112\n",
"Time: 10:26:14 Log-Likelihood: -83135.\n",
"Date: Thu, 21 Mar 2024 Pseudo R-squ.: 0.2112\n",
"Time: 07:58:13 Log-Likelihood: -83135.\n",
"converged: True LL-Null: -1.0540e+05\n",
"Covariance Type: nonrobust LLR p-value: 0.000\n",
"=======================================================================================\n",
@ -2276,12 +2278,226 @@
"print(result.summary())"
]
},
{
"cell_type": "code",
"execution_count": 48,
"id": "13cc3362-7bb2-46fa-8bd8-e5a8e53260b8",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Optimization terminated successfully (Exit mode 0)\n",
" Current function value: 0.23562928627877766\n",
" Iterations: 240\n",
" Function evaluations: 243\n",
" Gradient evaluations: 240\n",
"const 0.000000e+00\n",
"nb_tickets 2.477006e-01\n",
"nb_purchases 1.636902e-03\n",
"total_amount 8.839088e-04\n",
"nb_suppliers 1.906550e-65\n",
"vente_internet_max 0.000000e+00\n",
"purchase_date_min 0.000000e+00\n",
"purchase_date_max 0.000000e+00\n",
"nb_tickets_internet 7.232680e-112\n",
"is_email_true 8.202187e-08\n",
"opt_in 0.000000e+00\n",
"gender_female 1.624424e-170\n",
"gender_male 4.961315e-220\n",
"nb_campaigns 6.276733e-205\n",
"nb_campaigns_opened 2.228531e-176\n",
"dtype: float64\n",
" Logit Regression Results \n",
"==============================================================================\n",
"Dep. Variable: y No. Observations: 354365\n",
"Model: Logit Df Residuals: 354350\n",
"Method: MLE Df Model: 14\n",
"Date: Thu, 21 Mar 2024 Pseudo R-squ.: 0.2111\n",
"Time: 10:45:37 Log-Likelihood: -83152.\n",
"converged: True LL-Null: -1.0540e+05\n",
"Covariance Type: nonrobust LLR p-value: 0.000\n",
"=======================================================================================\n",
" coef std err z P>|z| [0.025 0.975]\n",
"---------------------------------------------------------------------------------------\n",
"const -3.1162 0.081 -38.383 0.000 -3.275 -2.957\n",
"nb_tickets -0.0136 0.012 -1.156 0.248 -0.037 0.009\n",
"nb_purchases -0.0385 0.012 -3.149 0.002 -0.063 -0.015\n",
"total_amount 0.0588 0.018 3.325 0.001 0.024 0.094\n",
"nb_suppliers 0.1638 0.010 17.085 0.000 0.145 0.183\n",
"vente_internet_max -0.8651 0.011 -82.182 0.000 -0.886 -0.844\n",
"purchase_date_min 0.5790 0.015 39.391 0.000 0.550 0.608\n",
"purchase_date_max -1.4088 0.016 -89.101 0.000 -1.440 -1.378\n",
"nb_tickets_internet 0.2857 0.013 22.475 0.000 0.261 0.311\n",
"is_email_true 0.4224 0.079 5.363 0.000 0.268 0.577\n",
"opt_in -1.9818 0.019 -106.856 0.000 -2.018 -1.945\n",
"gender_female 0.6553 0.024 27.835 0.000 0.609 0.701\n",
"gender_male 0.7578 0.024 31.663 0.000 0.711 0.805\n",
"nb_campaigns 0.2835 0.009 30.547 0.000 0.265 0.302\n",
"nb_campaigns_opened 0.2061 0.007 28.315 0.000 0.192 0.220\n",
"=======================================================================================\n"
]
}
],
"source": [
"# 2.bis on fait de même pour un modèle logit avec pénalité \n",
"# pas besoin de redefinir le modèle, il faut faire un fit_regularized\n",
"\n",
"# sans spécification, le alpha optimal est déterminé par cross validation\n",
"# remplacer alpha=32 par la valeur optimale trouvée par cross validation dans la pipeline avec .best_params\n",
"# attention, dans scikit learn, l'hyperparamètre est C = 1/alpha, pas oublier de prendre l'inverse de ce C optimal\n",
"\n",
"result = model_logit.fit_regularized(method='l1', alpha = 32)\n",
"\n",
"print(result.pvalues)\n",
"print(result.summary())"
]
},
{
"cell_type": "markdown",
"id": "8c3dec50-7b9d-40f6-83b6-6cae26962cf8",
"metadata": {},
"source": [
"### Other method : take into account the weigths ! Pb : with this method, no penalty allowed"
]
},
{
"cell_type": "code",
"execution_count": 247,
"id": "2e3ca381-54e3-445b-bb37-d7ce953cb856",
"metadata": {},
"outputs": [],
"source": [
"# define a function to generate summaries of logit model\n",
"\n",
"def model_logit(X, y, weight_dict, add_constant=False) :\n",
" # Generate sample weights based on class weights computed earlier\n",
" sample_weights = np.array([weight_dict[class_] for class_ in y])\n",
"\n",
" if add_constant :\n",
" X_const = sm.add_constant(X)\n",
" else :\n",
" X_const = X\n",
" \n",
" # Use GLM from statsmodels with Binomial family for logistic regression\n",
" model = sm.GLM(y, X_const, family=sm.families.Binomial(), freq_weights=sample_weights)\n",
" \n",
" # fit without penalty\n",
" result = model.fit()\n",
"\n",
" result_summary = result.summary()\n",
" \n",
" return result_summary"
]
},
{
"cell_type": "code",
"execution_count": 248,
"id": "4cd424a0-7c55-47ff-840e-1354e8dcf863",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
" Generalized Linear Model Regression Results \n",
"==============================================================================\n",
"Dep. Variable: y No. Observations: 354365\n",
"Model: GLM Df Residuals: 354350\n",
"Model Family: Binomial Df Model: 14\n",
"Link Function: Logit Scale: 1.0000\n",
"Method: IRLS Log-Likelihood: -1.8693e+05\n",
"Date: Thu, 21 Mar 2024 Deviance: 3.7387e+05\n",
"Time: 13:19:33 Pearson chi2: 1.97e+16\n",
"No. Iterations: 100 Pseudo R-squ. (CS): 0.2820\n",
"Covariance Type: nonrobust \n",
"=======================================================================================\n",
" coef std err z P>|z| [0.025 0.975]\n",
"---------------------------------------------------------------------------------------\n",
"const -1.3943 0.062 -22.456 0.000 -1.516 -1.273\n",
"nb_tickets -0.3312 0.016 -20.967 0.000 -0.362 -0.300\n",
"nb_purchases 0.9258 0.098 9.491 0.000 0.735 1.117\n",
"total_amount 0.8922 0.042 21.393 0.000 0.810 0.974\n",
"nb_suppliers 0.2238 0.007 32.137 0.000 0.210 0.237\n",
"vente_internet_max -0.7453 0.007 -100.473 0.000 -0.760 -0.731\n",
"purchase_date_min 0.7123 0.015 46.063 0.000 0.682 0.743\n",
"purchase_date_max -1.3328 0.017 -79.297 0.000 -1.366 -1.300\n",
"nb_tickets_internet 0.1784 0.011 16.366 0.000 0.157 0.200\n",
"is_email_true 0.8635 0.061 14.086 0.000 0.743 0.984\n",
"opt_in -1.7487 0.010 -174.737 0.000 -1.768 -1.729\n",
"gender_female 0.8084 0.013 60.803 0.000 0.782 0.835\n",
"gender_male 0.8731 0.014 64.332 0.000 0.846 0.900\n",
"nb_campaigns 0.1751 0.006 31.101 0.000 0.164 0.186\n",
"nb_campaigns_opened 0.2962 0.005 54.145 0.000 0.285 0.307\n",
"=======================================================================================\n"
]
}
],
"source": [
"# with the function\n",
"\n",
"# 1. logit with weights\n",
"results_logit_weight = model_logit(X,y,weight_dict=weight_dict)\n",
"print(results_logit_weight)"
]
},
{
"cell_type": "code",
"execution_count": 252,
"id": "84dd6242-a9c3-4dee-a58b-abc5f1c6f8fa",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
" Generalized Linear Model Regression Results \n",
"==============================================================================\n",
"Dep. Variable: y No. Observations: 354365\n",
"Model: GLM Df Residuals: 354350\n",
"Model Family: Binomial Df Model: 14\n",
"Link Function: Logit Scale: 1.0000\n",
"Method: IRLS Log-Likelihood: -83141.\n",
"Date: Thu, 21 Mar 2024 Deviance: 1.6628e+05\n",
"Time: 13:20:06 Pearson chi2: 4.52e+15\n",
"No. Iterations: 8 Pseudo R-squ. (CS): 0.1180\n",
"Covariance Type: nonrobust \n",
"=======================================================================================\n",
" coef std err z P>|z| [0.025 0.975]\n",
"---------------------------------------------------------------------------------------\n",
"const -3.6025 0.091 -39.755 0.000 -3.780 -3.425\n",
"nb_tickets -0.0230 0.010 -2.191 0.028 -0.044 -0.002\n",
"nb_purchases -0.0519 0.014 -3.609 0.000 -0.080 -0.024\n",
"total_amount 0.0799 0.021 3.841 0.000 0.039 0.121\n",
"nb_suppliers 0.1694 0.010 17.662 0.000 0.151 0.188\n",
"vente_internet_max -0.8764 0.011 -82.965 0.000 -0.897 -0.856\n",
"purchase_date_min 0.5881 0.015 39.936 0.000 0.559 0.617\n",
"purchase_date_max -1.4197 0.016 -89.592 0.000 -1.451 -1.389\n",
"nb_tickets_internet 0.2895 0.013 22.652 0.000 0.264 0.315\n",
"is_email_true 0.8651 0.088 9.797 0.000 0.692 1.038\n",
"opt_in -1.9976 0.019 -107.305 0.000 -2.034 -1.961\n",
"gender_female 0.7032 0.024 29.395 0.000 0.656 0.750\n",
"gender_male 0.8071 0.024 33.201 0.000 0.759 0.855\n",
"nb_campaigns 0.2850 0.009 30.633 0.000 0.267 0.303\n",
"nb_campaigns_opened 0.2061 0.007 28.245 0.000 0.192 0.220\n",
"=======================================================================================\n"
]
}
],
"source": [
"# 2. logit without weights\n",
"\n",
"results_logit = model_logit(X.drop(\"const\", axis=1),y,weight_dict={0:1, 1:1}, add_constant=True)\n",
"print(results_logit)"
]
},
{
"cell_type": "markdown",
"id": "36c5e770-72b3-4482-ad61-45b511a11f06",
"metadata": {},
"source": [
"## graphique LASSO - quelles variables sont impotantes dans le modèle ? "
"## graphique LASSO - quelles variables sont importantes dans le modèle ? "
]
},
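{
"cell_type": "markdown",
"id": "lasso-sketch-md",
"metadata": {},
"source": [
"A minimal sketch of the idea, assuming `X` and `y` are the design matrix and target built above: an L1-penalised logistic regression is refit over a grid of penalty strengths, and the variables whose coefficients are driven to zero first are the least important. The penalty grid is illustrative."
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "lasso-sketch-code",
"metadata": {},
"outputs": [],
"source": [
"# Hedged sketch: LASSO (L1) coefficient paths for logistic regression. Assumes X and y exist.\n",
"import numpy as np\n",
"import matplotlib.pyplot as plt\n",
"from sklearn.linear_model import LogisticRegression\n",
"from sklearn.preprocessing import StandardScaler\n",
"\n",
"X_scaled = StandardScaler().fit_transform(X)\n",
"Cs = np.logspace(-3, 1, 10)  # illustrative grid; smaller C = stronger penalty\n",
"coefs = []\n",
"for C in Cs:\n",
"    lasso_logit = LogisticRegression(penalty='l1', solver='saga', C=C, max_iter=5000)\n",
"    lasso_logit.fit(X_scaled, y)\n",
"    coefs.append(lasso_logit.coef_[0])\n",
"\n",
"plt.plot(np.log10(Cs), np.array(coefs))\n",
"plt.xlabel('log10(C)')\n",
"plt.ylabel('coefficient')\n",
"plt.title('LASSO coefficient paths')\n",
"plt.show()"
]
},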
{

410
utils_ml.py Normal file
View File

@ -0,0 +1,410 @@
import pandas as pd
import numpy as np
import os
import s3fs
import re
import io
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report, recall_score
from sklearn.utils import class_weight
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.calibration import calibration_curve
from sklearn.preprocessing import OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import StandardScaler, MaxAbsScaler, MinMaxScaler
from sklearn.metrics import make_scorer, f1_score, balanced_accuracy_score
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.metrics import roc_curve, auc, precision_recall_curve, average_precision_score
from sklearn.exceptions import ConvergenceWarning, DataConversionWarning
import pickle
import warnings
def load_train_test(type_of_activity, type_of_model):
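# Read the train/test sets for the given activity from S3 (assumes a shared fs
# filesystem object, as in the other scripts); for the 'premium' model, keep
# only a fixed subset of companies.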
BUCKET = f"projet-bdc2324-team1/Generalization_v2/{type_of_activity}"
File_path_train = BUCKET + "/Train_set.csv"
File_path_test = BUCKET + "/Test_set.csv"
with fs.open(File_path_train, mode="rb") as file_in:
dataset_train = pd.read_csv(file_in, sep=",")
# dataset_train['y_has_purchased'] = dataset_train['y_has_purchased'].fillna(0)
with fs.open(File_path_test, mode="rb") as file_in:
dataset_test = pd.read_csv(file_in, sep=",")
# dataset_test['y_has_purchased'] = dataset_test['y_has_purchased'].fillna(0)
if type_of_model=='premium':
dataset_train['company'] = dataset_train['customer_id'].apply(lambda x: x.split('_')[0])
dataset_test['company'] = dataset_test['customer_id'].apply(lambda x: x.split('_')[0])
dataset_train = dataset_train[dataset_train['company'].isin(['1', '3', '4', '5', '6', '7', '8', '10', '11', '13'])]
dataset_test = dataset_test[dataset_test['company'].isin(['1', '3', '4', '5', '6', '7', '8', '10', '11', '13'])]
return dataset_train, dataset_test
def save_file_s3(File_name, type_of_activity, type_of_model, model):
image_buffer = io.BytesIO()
plt.savefig(image_buffer, format='png')
image_buffer.seek(0)
FILE_PATH = f"projet-bdc2324-team1/{type_of_model}/{type_of_activity}/{model}/"
FILE_PATH_OUT_S3 = FILE_PATH + File_name + type_of_activity + '_' + model + '.png'
with fs.open(FILE_PATH_OUT_S3, 'wb') as s3_file:
s3_file.write(image_buffer.read())
plt.close()
def save_result_set_s3(result_set, File_name, type_of_activity, type_of_model, model=None, model_path=False):
if model_path:
FILE_PATH_OUT_S3 = f"projet-bdc2324-team1/{type_of_model}/{type_of_activity}/{model}/" + File_name + '.csv'
else:
FILE_PATH_OUT_S3 = f"projet-bdc2324-team1/{type_of_model}/{type_of_activity}/" + File_name + '.csv'
with fs.open(FILE_PATH_OUT_S3, 'w') as file_out:
result_set.to_csv(file_out, index = False)
def save_model_s3(File_name, type_of_activity, type_of_model, model, classifier):
model_bytes = pickle.dumps(classifier)
FILE_PATH_OUT_S3 = f"projet-bdc2324-team1/{type_of_model}/{type_of_activity}/{model}/" + File_name + '.pkl'
with fs.open(FILE_PATH_OUT_S3, 'wb') as f:
f.write(model_bytes)
def compute_recall(group):
return recall_score(group['y_has_purchased'], group['prediction'])
def compute_recall_companies(dataset_test, y_pred, type_of_activity, model):
test = dataset_test.copy()
test['prediction'] = y_pred
test['company'] = dataset_test['customer_id'].str.split('_', expand=True)[0]
recall_scores_by_company = test.groupby('company').apply(compute_recall).reset_index(name='recall_score')
save_result_set_s3(recall_scores_by_company, 'recall_scores_by_company', type_of_activity, type_of_model, model=model, model_path=True)
def features_target_split(dataset_train, dataset_test):
features_l = ['nb_campaigns', 'taux_ouverture_mail', 'prop_purchases_internet', 'nb_tickets', 'nb_purchases', 'total_amount', 'nb_suppliers', 'time_to_open',
'purchases_10_2021','purchases_10_2022', 'purchases_11_2021', 'purchases_12_2021','purchases_1_2022', 'purchases_2_2022', 'purchases_3_2022',
'purchases_4_2022', 'purchases_5_2021', 'purchases_5_2022', 'purchases_6_2021', 'purchases_6_2022', 'purchases_7_2021', 'purchases_7_2022', 'purchases_8_2021',
'purchases_8_2022','purchases_9_2021', 'purchases_9_2022', 'purchase_date_min', 'purchase_date_max', 'nb_targets', 'gender_female', 'gender_male',
'achat_internet', 'categorie_age_0_10', 'categorie_age_10_20', 'categorie_age_20_30','categorie_age_30_40',
'categorie_age_40_50', 'categorie_age_50_60', 'categorie_age_60_70', 'categorie_age_70_80', 'categorie_age_plus_80','categorie_age_inconnue',
'country_fr', 'is_profession_known', 'is_zipcode_known', 'opt_in', 'target_optin', 'target_newsletter', 'target_scolaire', 'target_entreprise', 'target_famille',
'target_jeune', 'target_abonne']
X_train = dataset_train[features_l]
y_train = dataset_train[['y_has_purchased']]
X_test = dataset_test[features_l]
y_test = dataset_test[['y_has_purchased']]
return X_train, X_test, y_train, y_test
def preprocess(type_of_model, type_of_activity):
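# Build the ColumnTransformer: numeric features are zero-imputed and standardised,
# binary features are imputed with the most frequent value; the premium model adds
# activity-specific target columns.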
numeric_features = ['nb_campaigns', 'taux_ouverture_mail', 'prop_purchases_internet', 'nb_tickets', 'nb_purchases', 'total_amount', 'nb_suppliers',
'purchases_10_2021','purchases_10_2022', 'purchases_11_2021', 'purchases_12_2021','purchases_1_2022', 'purchases_2_2022', 'purchases_3_2022',
'purchases_4_2022', 'purchases_5_2021', 'purchases_5_2022', 'purchases_6_2021', 'purchases_6_2022', 'purchases_7_2021', 'purchases_7_2022', 'purchases_8_2021',
'purchases_8_2022','purchases_9_2021', 'purchases_9_2022', 'purchase_date_min', 'purchase_date_max', 'nb_targets', 'time_to_open']
binary_features = ['gender_female', 'gender_male', 'achat_internet', 'categorie_age_0_10', 'categorie_age_10_20', 'categorie_age_20_30','categorie_age_30_40',
'categorie_age_40_50', 'categorie_age_50_60', 'categorie_age_60_70', 'categorie_age_70_80', 'categorie_age_plus_80','categorie_age_inconnue',
'country_fr', 'is_profession_known', 'is_zipcode_known', 'opt_in']
if type_of_activity=='musee':
numeric_features.remove('time_to_open')
if type_of_model=='premium':
if type_of_activity=='musique':
binary_features.extend(['target_optin', 'target_newsletter'])
elif type_of_activity=='sport':
binary_features.extend(['target_jeune', 'target_entreprise', 'target_abonne'])
else:
binary_features.extend([ 'target_scolaire', 'target_entreprise', 'target_famille', 'target_newsletter'])
numeric_transformer = Pipeline(steps=[
("imputer", SimpleImputer(strategy="constant", fill_value=0)),
("scaler", StandardScaler())
])
binary_transformer = Pipeline(steps=[
("imputer", SimpleImputer(strategy="most_frequent")),
])
preproc = ColumnTransformer(
transformers=[
("num", numeric_transformer, numeric_features),
("bin", binary_transformer, binary_features)
]
)
return preproc
def draw_confusion_matrix(y_test, y_pred, model):
conf_matrix = confusion_matrix(y_test, y_pred)
sns.heatmap(conf_matrix, annot=True, fmt='d', cmap='Blues', xticklabels=['Class 0', 'Class 1'], yticklabels=['Class 0', 'Class 1'])
plt.xlabel('Predicted')
plt.ylabel('Actual')
plt.title('Confusion Matrix')
plt.show()
save_file_s3("Confusion_matrix_", type_of_activity, type_of_model, model)
def draw_roc_curve(y_test, y_pred_prob, model):
# Compute the false positive rate (FPR) and true positive rate (TPR)
fpr, tpr, thresholds = roc_curve(y_test, y_pred_prob, pos_label=1)
# Compute the area under the ROC curve (AUC)
roc_auc = auc(fpr, tpr)
plt.figure(figsize = (14, 8))
plt.plot(fpr, tpr, label="ROC curve(area = %0.3f)" % roc_auc)
plt.plot([0, 1], [0, 1], color="red",label="Random Baseline", linestyle="--")
plt.grid(color='gray', linestyle='--', linewidth=0.5)
plt.xlabel("False Positive Rate")
plt.ylabel("True Positive Rate")
plt.title("ROC Curve", size=18)
plt.legend(loc="lower right")
plt.show()
save_file_s3("Roc_curve_", type_of_activity, type_of_model, model)
def draw_calibration_curve(y_test, y_pred_prob, model):
frac_pos, mean_pred = calibration_curve(y_test, y_pred_prob, n_bins=10)
# Plot the calibration curve
plt.plot(mean_pred, frac_pos, 's-', label=model)
plt.plot([0, 1], [0, 1], 'k--', label='Perfectly calibrated')
plt.xlabel('Mean predicted value')
plt.ylabel('Fraction of positive predictions')
plt.title("Calibration Curve")
plt.legend()
plt.show()
save_file_s3("Calib_curve_", type_of_activity, type_of_model, model)
def draw_features_importance(pipeline, model, randomF = False):
if randomF:
coefficients = pipeline.named_steps[model].feature_importances_
else:
coefficients = pipeline.named_steps[model].coef_[0]
feature_names = pipeline.named_steps['preprocessor'].get_feature_names_out()
# Plot feature importances
plt.figure(figsize=(12, 8))
plt.barh(feature_names, coefficients, color='skyblue')
plt.xlabel('Coefficient / importance')
plt.ylabel('Features')
plt.title("Features' Importance")
plt.grid(True)
plt.show()
save_file_s3("Features_", type_of_activity, type_of_model, model)
def draw_prob_distribution(y_pred_prob, model):
plt.figure(figsize=(10, 8))
plt.hist(y_pred_prob, bins=10, range=(0, 1), color='blue', alpha=0.7)
plt.xlim(0, 1)
plt.ylim(0, None)
plt.title('Histogram of predicted probabilities for class 1')
plt.xlabel('Probability')
plt.ylabel('Frequency')
plt.grid(True)
plt.show()
save_file_s3("prob_dist_", type_of_activity, type_of_model, model)
def draw_prob_distribution_companies(y_pred_prob, model):
test = dataset_test.copy()
test['probability to buy'] = y_pred_prob
test['company'] = test['customer_id'].str.split('_', expand=True)[0]
sns.histplot(data=test, x='probability to buy', hue='company', element='step',
stat='count', common_norm=False, bins=10, palette='Set1', alpha=1)
plt.xlim(0, 1)
plt.ylim(0, None)
plt.title('Histogram of probabilities for class 1 by company')
plt.xlabel('Probability')
plt.ylabel('Frequency')
plt.grid(True)
plt.show()
save_file_s3("prob_dist_companies_", type_of_activity, type_of_model, model)
def pipeline_logreg_benchmark(X_train, y_train, X_test, y_test, model_result):
pipeline = Pipeline(steps=[
('preprocessor', preproc),
('LogisticRegression_Benchmark', LogisticRegression(solver='saga', class_weight = weight_dict,
max_iter=5000, n_jobs=-1))
])
pipeline.fit(X_train, y_train)
y_pred = pipeline.predict(X_test)
y_pred_prob = pipeline.predict_proba(X_test)[:, 1]
fpr, tpr, thresholds = roc_curve(y_test, y_pred_prob, pos_label = 1)
model = "LogisticRegression_Benchmark"
result = pd.DataFrame({"Model" : [model],
"Accuracy" : [accuracy_score(y_test, y_pred)],
"Recall" : [recall_score(y_test, y_pred)],
"F1_score" : [f1_score(y_test, y_pred, average="macro")],
"AUC" : [auc(fpr, tpr)]}
)
model_result = pd.concat([model_result, result])
compute_recall_companies(dataset_test, y_pred, type_of_activity, model)
draw_confusion_matrix(y_test, y_pred, model)
draw_roc_curve(y_test, y_pred_prob, model)
draw_features_importance(pipeline, 'LogisticRegression_Benchmark')
draw_prob_distribution(y_pred_prob, model)
draw_prob_distribution_companies(y_pred_prob, model)
draw_calibration_curve(y_test, y_pred_prob, model)
save_model_s3('LogisticRegression_Benchmark', type_of_activity, type_of_model, model, pipeline)
return model_result
def pipeline_logreg_cv(X_train, y_train, X_test, y_test, model_result):
y_train = y_train['y_has_purchased']
param_grid = {'LogisticRegression_cv__C': np.logspace(-10, 6, 17, base=2),
'LogisticRegression_cv__penalty': ['l1', 'l2'],
'LogisticRegression_cv__class_weight': ['balanced', weight_dict]}
pipeline = Pipeline(steps=[
('preprocessor', preproc),
('LogisticRegression_cv', LogisticRegression(solver='saga', max_iter=5000))
])
grid_search = GridSearchCV(pipeline, param_grid, cv=3, scoring=make_scorer(recall_score), error_score='raise',
n_jobs=-1)
grid_search.fit(X_train, y_train)
y_pred = grid_search.predict(X_test)
y_pred_prob = grid_search.predict_proba(X_test)[:, 1]
best_pipeline = grid_search.best_estimator_
fpr, tpr, thresholds = roc_curve(y_test, y_pred_prob, pos_label = 1)
model = "LogisticRegression_cv"
result = pd.DataFrame({"Model" : [model],
"Accuracy" : [accuracy_score(y_test, y_pred)],
"Recall" : [recall_score(y_test, y_pred)],
"F1_score" : [f1_score(y_test, y_pred, average="macro")],
"AUC" : [auc(fpr, tpr)]}
)
model_result = pd.concat([model_result, result])
compute_recall_companies(dataset_test, y_pred, type_of_activity, model)
draw_confusion_matrix(y_test, y_pred, model)
draw_roc_curve(y_test, y_pred_prob, model)
draw_features_importance(best_pipeline, 'LogisticRegression_cv')
draw_prob_distribution(y_pred_prob, model)
draw_prob_distribution_companies(y_pred_prob, model)
draw_calibration_curve(y_test, y_pred_prob, model)
save_model_s3('LogisticRegression_cv', type_of_activity, type_of_model, model, grid_search)
return model_result
def pipeline_randomF_benchmark(X_train, y_train, X_test, y_test, model_result):
pipeline = Pipeline(steps=[
('preprocessor', preproc),
('randomF', RandomForestClassifier(class_weight = weight_dict,
n_jobs=-1))
])
pipeline.fit(X_train, y_train)
y_pred = pipeline.predict(X_test)
y_pred_prob = pipeline.predict_proba(X_test)[:, 1]
fpr, tpr, thresholds = roc_curve(y_test, y_pred_prob, pos_label = 1)
model = "randomF"
result = pd.DataFrame({"Model" : [model],
"Accuracy" : [accuracy_score(y_test, y_pred)],
"Recall" : [recall_score(y_test, y_pred)],
"F1_score" : [f1_score(y_test, y_pred, average="macro")],
"AUC" : [auc(fpr, tpr)]}
)
model_result = pd.concat([model_result, result])
compute_recall_companies(dataset_test, y_pred, type_of_activity, model)
draw_confusion_matrix(y_test, y_pred, model)
draw_roc_curve(y_test, y_pred_prob, model)
draw_features_importance(pipeline, 'randomF', randomF=True)
draw_prob_distribution(y_pred_prob, model)
draw_prob_distribution_companies(y_pred_prob, model)
draw_calibration_curve(y_test, y_pred_prob, model)
save_model_s3('randomF_Benchmark', type_of_activity, type_of_model, model, pipeline)
return model_result
def pipeline_randomF_cv(X_train, y_train, X_test, y_test, model_result):
y_train = y_train['y_has_purchased']
param_grid = {
'randomF_cv__n_estimators': [100, 300],
'randomF_cv__max_features': ['sqrt', 'log2'],
'randomF_cv__min_samples_split': [2, 10],
'randomF_cv__min_samples_leaf': [1, 4],
'randomF_cv__class_weight': [weight_dict]
}
pipeline = Pipeline(steps=[
('preprocessor', preproc),
('randomF_cv', RandomForestClassifier(n_jobs=-1))
])
grid_search = GridSearchCV(pipeline, param_grid, cv=3, scoring=make_scorer(recall_score), error_score='raise',
n_jobs=-1)
grid_search.fit(X_train, y_train)
y_pred = grid_search.predict(X_test)
y_pred_prob = grid_search.predict_proba(X_test)[:, 1]
best_pipeline = grid_search.best_estimator_
fpr, tpr, thresholds = roc_curve(y_test, y_pred_prob, pos_label = 1)
model = "randomF_cv"
result = pd.DataFrame({"Model" : [model],
"Accuracy" : [accuracy_score(y_test, y_pred)],
"Recall" : [recall_score(y_test, y_pred)],
"F1_score" : [f1_score(y_test, y_pred, average="macro")],
"AUC" : [auc(fpr, tpr)]}
)
model_result = pd.concat([model_result, result])
compute_recall_companies(dataset_test, y_pred, type_of_activity, model)
draw_confusion_matrix(y_test, y_pred, model)
draw_roc_curve(y_test, y_pred_prob, model)
draw_features_importance(best_pipeline, 'randomF_cv', randomF=True)
draw_prob_distribution(y_pred_prob, model)
draw_prob_distribution_companies(y_pred_prob, model)
draw_calibration_curve(y_test, y_pred_prob, model)
save_model_s3('randomF_cv', type_of_activity, type_of_model, model, grid_search)
return model_result
def pipeline_naiveBayes_benchmark(X_train, y_train, X_test, y_test, model_result):
unique_classes, counts = np.unique(y_train, return_counts=True)
class_priors = counts / counts.sum()
pipeline = Pipeline(steps=[
('preprocessor', preproc),
('Naive_Bayes', GaussianNB(priors=class_priors))
])
pipeline.fit(X_train, y_train)
y_pred = pipeline.predict(X_test)
y_pred_prob = pipeline.predict_proba(X_test)[:, 1]
fpr, tpr, thresholds = roc_curve(y_test, y_pred_prob, pos_label = 1)
model = "Naive_Bayes"
result = pd.DataFrame({"Model" : [model],
"Accuracy" : [accuracy_score(y_test, y_pred)],
"Recall" : [recall_score(y_test, y_pred)],
"F1_score" : [f1_score(y_test, y_pred, average="macro")],
"AUC" : [auc(fpr, tpr)]}
)
model_result = pd.concat([model_result, result])
compute_recall_companies(dataset_test, y_pred, type_of_activity, model)
draw_confusion_matrix(y_test, y_pred, model)
draw_roc_curve(y_test, y_pred_prob, model)
draw_prob_distribution(y_pred_prob, model)
draw_calibration_curve(y_test, y_pred_prob, model)
save_model_s3('Naive_Bayes_Benchmark', type_of_activity, type_of_model, model, pipeline)
return model_result
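# ---------------------------------------------------------------------------
# Hedged usage sketch (not part of the original module). The functions above
# rely on shared globals (fs, preproc, weight_dict, dataset_test, type_of_activity,
# type_of_model) that the calling script is expected to define; the values below
# are illustrative assumptions, not the project's actual configuration.
if __name__ == '__main__':
    S3_ENDPOINT_URL = "https://" + os.environ["AWS_S3_ENDPOINT"]
    fs = s3fs.S3FileSystem(client_kwargs={'endpoint_url': S3_ENDPOINT_URL})
    type_of_activity = 'sport'        # illustrative
    type_of_model = 'standard'        # illustrative
    dataset_train, dataset_test = load_train_test(type_of_activity, type_of_model)
    X_train, X_test, y_train, y_test = features_target_split(dataset_train, dataset_test)
    preproc = preprocess(type_of_model, type_of_activity)
    weight_dict = {0: 1.0, 1: 10.0}   # illustrative class weights
    model_result = pd.DataFrame(columns=["Model", "Accuracy", "Recall", "F1_score", "AUC"])
    model_result = pipeline_logreg_benchmark(X_train, y_train, X_test, y_test, model_result)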

27
utils_segmentation.py Normal file
View File

@ -0,0 +1,27 @@
import pandas as pd
import numpy as np
import os
import io
import s3fs
import re
import pickle
import warnings
def load_model(type_of_activity, model):
BUCKET = f"projet-bdc2324-team1/Output_model/{type_of_activity}/{model}/"
filename = model + '.pkl'
file_path = BUCKET + filename
with fs.open(file_path, mode="rb") as f:
model_bytes = f.read()
model = pickle.loads(model_bytes)
return model
def load_test_file(type_of_activity):
file_path_test = f"projet-bdc2324-team1/Generalization/{type_of_activity}/Test_set.csv"
with fs.open(file_path_test, mode="rb") as file_in:
dataset_test = pd.read_csv(file_in, sep=",")
return dataset_test
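# Hedged usage sketch (not part of the original module): load a saved classifier
# and score the matching test set. Assumes the fs S3 filesystem object is created
# as in the other scripts; 'sport' and 'randomF_cv' are illustrative arguments.
if __name__ == '__main__':
    fs = s3fs.S3FileSystem(client_kwargs={'endpoint_url': "https://" + os.environ["AWS_S3_ENDPOINT"]})
    classifier = load_model('sport', 'randomF_cv')
    dataset_test = load_test_file('sport')
    dataset_test['score'] = classifier.predict_proba(dataset_test)[:, 1]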

438
utils_stat_desc.py Normal file
View File

@ -0,0 +1,438 @@
import pandas as pd
import os
import s3fs
import io
import warnings
from datetime import date, timedelta, datetime
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.dates as mdates
import seaborn as sns
def load_files(nb_compagnie):
customer = pd.DataFrame()
campaigns_brut = pd.DataFrame()
campaigns_kpi = pd.DataFrame()
products = pd.DataFrame()
tickets = pd.DataFrame()
targets = pd.DataFrame()
# Loop over the companies to build aggregated datasets
for directory_path in nb_compagnie:
df_customerplus_clean_0 = display_input_databases(directory_path, file_name = "customerplus_cleaned")
df_campaigns_brut = display_input_databases(directory_path, file_name = "campaigns_information", datetime_col = ['opened_at', 'sent_at', 'campaign_sent_at'])
df_products_purchased_reduced = display_input_databases(directory_path, file_name = "products_purchased_reduced", datetime_col = ['purchase_date'])
df_target_information = display_input_databases(directory_path, file_name = "target_information")
df_campaigns_kpi = campaigns_kpi_function(campaigns_information = df_campaigns_brut, max_date=pd.Timestamp.now(tz='UTC'))
df_tickets_kpi = tickets_kpi_function(tickets_information = df_products_purchased_reduced)
df_customerplus_clean = customerplus_kpi_function(customerplus_clean = df_customerplus_clean_0)
df_target_KPI = targets_KPI(df_target = df_target_information)
# Merge target KPIs onto the customer base and fill missing values with 0
df_target_KPI = pd.merge(df_customerplus_clean_0[['customer_id']], df_target_KPI, how = 'left', on = 'customer_id')
targets_columns = list(df_target_KPI.columns)
targets_columns.remove('customer_id')
df_target_KPI[targets_columns] = df_target_KPI[targets_columns].fillna(0)
# Create the number_company column, used to aggregate results
df_tickets_kpi["number_company"]=int(directory_path)
df_campaigns_brut["number_company"]=int(directory_path)
df_campaigns_kpi["number_company"]=int(directory_path)
df_customerplus_clean["number_company"]=int(directory_path)
df_target_information["number_company"]=int(directory_path)
df_target_KPI["number_company"]=int(directory_path)
# Prefix customer_id with the company number
df_tickets_kpi["customer_id"]= directory_path + '_' + df_tickets_kpi['customer_id'].astype('str')
df_campaigns_brut["customer_id"]= directory_path + '_' + df_campaigns_brut['customer_id'].astype('str')
df_campaigns_kpi["customer_id"]= directory_path + '_' + df_campaigns_kpi['customer_id'].astype('str')
df_customerplus_clean["customer_id"]= directory_path + '_' + df_customerplus_clean['customer_id'].astype('str')
df_products_purchased_reduced["customer_id"]= directory_path + '_' + df_products_purchased_reduced['customer_id'].astype('str')
df_target_KPI["customer_id"]= directory_path + '_' + df_target_KPI['customer_id'].astype('str')
# Remove companies' outliers
df_tickets_kpi = remove_outlier_total_amount(df_tickets_kpi)
# Harmonize the set of customers across databases
customer_id = df_tickets_kpi['customer_id'].to_list()
df_campaigns_brut = df_campaigns_brut[df_campaigns_brut['customer_id'].isin(customer_id)]
df_campaigns_kpi = df_campaigns_kpi[df_campaigns_kpi['customer_id'].isin(customer_id)]
df_customerplus_clean = df_customerplus_clean[df_customerplus_clean['customer_id'].isin(customer_id)]
df_target_information = df_target_information[df_target_information['customer_id'].isin(customer_id)]
# Concatenation
customer = pd.concat([customer, df_customerplus_clean], ignore_index=True)
campaigns_kpi = pd.concat([campaigns_kpi, df_campaigns_kpi], ignore_index=True)
campaigns_brut = pd.concat([campaigns_brut, df_campaigns_brut], ignore_index=True)
tickets = pd.concat([tickets, df_tickets_kpi], ignore_index=True)
products = pd.concat([products, df_products_purchased_reduced], ignore_index=True)
targets = pd.concat([targets, df_target_KPI], ignore_index=True)
return customer, campaigns_kpi, campaigns_brut, tickets, products, targets
def remove_outlier_total_amount(tickets):
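# Drop customers whose total_amount lies above the upper IQR fence (Q3 + 1.5*IQR).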
Q1 = tickets['total_amount'].quantile(0.25)
Q3 = tickets['total_amount'].quantile(0.75)
IQR = Q3 - Q1
upper = Q3 + 1.5*IQR
outliers = tickets[tickets['total_amount'] > upper]['customer_id'].to_list()
tickets = tickets[~tickets['customer_id'].isin(outliers)]
return tickets
def save_file_s3(File_name, type_of_activity):
image_buffer = io.BytesIO()
plt.savefig(image_buffer, format='png')
image_buffer.seek(0)
FILE_PATH = f"projet-bdc2324-team1/stat_desc/{type_of_activity}/"
FILE_PATH_OUT_S3 = FILE_PATH + File_name + type_of_activity + '.png'
with fs.open(FILE_PATH_OUT_S3, 'wb') as s3_file:
s3_file.write(image_buffer.read())
plt.close()
def outlier_detection(tickets, company_list, show_diagram=False):
outlier_list = list()
for company in company_list:
total_amount_share = tickets[tickets['number_company']==int(company)].groupby('customer_id')['total_amount'].sum().reset_index()
total_amount_share['CA'] = total_amount_share['total_amount'].sum()
total_amount_share['share_total_amount'] = total_amount_share['total_amount']/total_amount_share['CA']
total_amount_share_index = total_amount_share.set_index('customer_id')
df_circulaire = total_amount_share_index['total_amount'].sort_values(axis = 0, ascending = False)
#print('df circulaire : ', df_circulaire.head())
top = df_circulaire[:1]
#print('top : ', top)
outlier_list.append(top.index[0])
rest = df_circulaire[1:]
rest_sum = rest.sum()
new_series = pd.concat([top, pd.Series([rest_sum], index=['Autre'])])
if show_diagram:
plt.figure(figsize=(3, 3))
plt.pie(new_series, labels=new_series.index, autopct='%1.1f%%', startangle=140, pctdistance=0.5)
plt.axis('equal')
plt.title(f'Breakdown of total amounts for company {company}')
plt.show()
return outlier_list
def valid_customer_detection(products, campaigns_brut):
products_valid = products[products['purchase_date']>="2021-05-01"]
consumer_valid_product = products_valid['customer_id'].to_list()
campaigns_valid = campaigns_brut[campaigns_brut["sent_at"]>="2021-05-01"]
consumer_valid_campaigns = campaigns_valid['customer_id'].to_list()
consumer_valid = consumer_valid_product + consumer_valid_campaigns
return consumer_valid
def identify_purchase_during_target_periode(products):
products_target_period = products[(products['purchase_date']>="2022-11-01")
& (products['purchase_date']<="2023-11-01")]
customer_target_period = products_target_period['customer_id'].to_list()
return customer_target_period
def remove_elements(lst, elements_to_remove):
return ''.join([x for x in lst if x not in elements_to_remove])
def compute_nb_clients(customer, type_of_activity):
company_nb_clients = customer[customer["purchase_count"]>0].groupby("number_company")["customer_id"].count().reset_index()
plt.bar(company_nb_clients["number_company"], company_nb_clients["customer_id"]/1000)
plt.xlabel('Company')
plt.ylabel("Number of clients (thousands)")
plt.title(f"Number of clients Across {type_of_activity} Companies")
plt.xticks(company_nb_clients["number_company"], ["{}".format(i) for i in company_nb_clients["number_company"]])
plt.show()
save_file_s3("nb_clients_", type_of_activity)
def maximum_price_paid(customer, type_of_activity):
company_max_price = customer.groupby("number_company")["max_price"].max().reset_index()
plt.bar(company_max_price["number_company"], company_max_price["max_price"])
plt.xlabel('Company Number')
plt.ylabel("Maximal price of a ticket Prix")
plt.title(f"Maximal price of a ticket Across {type_of_activity} Companies")
plt.xticks(company_max_price["number_company"], ["{}".format(i) for i in company_max_price["number_company"]])
plt.show()
save_file_s3("Maximal_price_", type_of_activity)
def target_proportion(customer, type_of_activity):
df_y = customer.groupby(["number_company"]).agg({"has_purchased_target_period" : 'sum',
'customer_id' : 'nunique'}).reset_index()
df_y['prop_has_purchased_target_period'] = (df_y["has_purchased_target_period"]/df_y['customer_id'])*100
plt.bar(df_y["number_company"], df_y["prop_has_purchased_target_period"])
plt.xlabel('Company Number')
plt.ylabel('Share (%)')
plt.title(f'Share of Customers who Bought during the Target Period Across {type_of_activity} Companies')
plt.xticks(df_y["number_company"], ["{}".format(i) for i in df_y["number_company"]])
plt.show()
save_file_s3("share_target_", type_of_activity)
def mailing_consent(customer, type_of_activity):
mailing_consent = customer.groupby("number_company")["opt_in"].mean().reset_index()
mailing_consent["opt_in"] *= 100
plt.bar(mailing_consent["number_company"], mailing_consent["opt_in"])
plt.xlabel('Company Number')
plt.ylabel('Mailing Consent (%)')
plt.title(f'Mailing Consent Across {type_of_activity} Companies')
plt.xticks(mailing_consent["number_company"], ["{}".format(i) for i in mailing_consent["number_company"]])
plt.show()
save_file_s3("mailing_consent_", type_of_activity)
def mailing_consent_by_target(customer, type_of_activity):
df_graph = customer.groupby(["number_company", "has_purchased_target_period"])["opt_in"].mean().reset_index()
# Build the grouped barplot
fig, ax = plt.subplots(figsize=(10, 6))
categories = df_graph["number_company"].unique()
bar_width = 0.35
bar_positions = np.arange(len(categories))
# Group the data by label and draw grouped bars
for label in df_graph["has_purchased_target_period"].unique():
label_data = df_graph[df_graph['has_purchased_target_period'] == label]
values = [label_data[label_data['number_company'] == category]['opt_in'].values[0]*100 for category in categories]
label_printed = "Purchase" if label else "No purchase"
ax.bar(bar_positions, values, bar_width, label=label_printed)
# Shift bar positions for the next group
bar_positions = [pos + bar_width for pos in bar_positions]
# Add axis labels, legend, etc.
ax.set_xlabel('Company Number')
ax.set_ylabel('Mailing Consent (%)')
ax.set_title(f'Mailing Consent by Target Across {type_of_activity} Companies')
ax.set_xticks([pos + bar_width / 2 for pos in np.arange(len(categories))])
ax.set_xticklabels(categories)
ax.legend()
# Display the plot
plt.show()
save_file_s3("mailing_consent_target_", type_of_activity)
def gender_bar(customer, type_of_activity):
company_genders = customer.groupby("number_company")[["gender_male", "gender_female", "gender_other"]].mean().reset_index()
company_genders["gender_male"] *= 100
company_genders["gender_female"] *= 100
company_genders["gender_other"] *= 100
# Build the barplot
plt.bar(company_genders["number_company"], company_genders["gender_male"], label = "Male")
plt.bar(company_genders["number_company"], company_genders["gender_female"],
bottom = company_genders["gender_male"], label = "Female")
plt.bar(company_genders["number_company"], company_genders["gender_other"],
bottom = company_genders["gender_male"] + company_genders["gender_female"], label = "Unknown")
plt.xlabel('Company Number')
plt.ylabel("Frequency (%)")
plt.title(f"Gender Distribution of Customers Across {type_of_activity} Companies")
plt.legend()
plt.xticks(company_genders["number_company"], ["{}".format(i) for i in company_genders["number_company"]])
plt.show()
save_file_s3("gender_bar_", type_of_activity)
def country_bar(customer, type_of_activity):
company_country_fr = customer.groupby("number_company")["country_fr"].mean().reset_index()
company_country_fr["country_fr"] *= 100
plt.bar(company_country_fr["number_company"], company_country_fr["country_fr"])
plt.xlabel('Company Number')
plt.ylabel("Share of French Customer (%)")
plt.title(f"Share of French Customer Across {type_of_activity} Companies")
plt.xticks(company_country_fr["number_company"], ["{}".format(i) for i in company_country_fr["number_company"]])
plt.show()
save_file_s3("country_bar_", type_of_activity)
def lazy_customer_plot(campaigns_kpi, type_of_activity):
company_lazy_customers = campaigns_kpi.groupby("number_company")["nb_campaigns_opened"].mean().reset_index()
plt.bar(company_lazy_customers["number_company"], company_lazy_customers["nb_campaigns_opened"])
plt.xlabel('Company Number')
plt.title(f"Share of Customers who did not Open Mail Across {type_of_activity} Companies")
plt.xticks(company_lazy_customers["number_company"], ["{}".format(i) for i in company_lazy_customers["number_company"]])
plt.show()
save_file_s3("lazy_customer_", type_of_activity)
def campaigns_effectiveness(customer, type_of_activity):
campaigns_effectiveness = customer.groupby(["number_company", "has_purchased_target_period"])["opt_in"].mean().reset_index()
fig, ax = plt.subplots(figsize=(10, 6))
categories = campaigns_effectiveness["number_company"].unique()
bar_width = 0.35
bar_positions = np.arange(len(categories))
# Group the data by label and draw grouped bars
for label in campaigns_effectiveness["has_purchased_target_period"].unique():
label_data = campaigns_effectiveness[campaigns_effectiveness['has_purchased_target_period'] == label]
values = [label_data[label_data['number_company'] == category]['opt_in'].values[0]*100 for category in categories]
label_printed = "Purchase" if label else "No purchase"
ax.bar(bar_positions, values, bar_width, label=label_printed)
# Shift bar positions for the next group
bar_positions = [pos + bar_width for pos in bar_positions]
# Add axis labels, legend, etc.
ax.set_xlabel('Company Number')
ax.set_ylabel('Share of Consent (%)')
ax.set_title(f"Proportion of customers who have given their consent to receive emails, by customer class ({type_of_activity} companies)")
ax.set_xticks([pos + bar_width / 2 for pos in np.arange(len(categories))])
ax.set_xticklabels(categories)
ax.legend()
plt.show()
save_file_s3("campaigns_effectiveness_", type_of_activity)
def sale_dynamics(products, campaigns_brut, type_of_activity):
purchase_min = products.groupby(['customer_id'])['purchase_date'].min().reset_index()
purchase_min.rename(columns = {'purchase_date' : 'first_purchase_event'}, inplace = True)
purchase_min['first_purchase_event'] = pd.to_datetime(purchase_min['first_purchase_event'])
purchase_min['first_purchase_month'] = pd.to_datetime(purchase_min['first_purchase_event'].dt.strftime('%Y-%m'))
# Month of the first email received
first_mail_received = campaigns_brut.groupby('customer_id')['sent_at'].min().reset_index()
first_mail_received.rename(columns = {'sent_at' : 'first_email_reception'}, inplace = True)
first_mail_received['first_email_reception'] = pd.to_datetime(first_mail_received['first_email_reception'])
first_mail_received['first_email_month'] = pd.to_datetime(first_mail_received['first_email_reception'].dt.strftime('%Y-%m'))
# Merge
known_customer = pd.merge(purchase_min[['customer_id', 'first_purchase_month']],
first_mail_received[['customer_id', 'first_email_month']], on = 'customer_id', how = 'outer')
# Month from which the customer is considered known
known_customer['known_date'] = pd.to_datetime(known_customer[['first_email_month', 'first_purchase_month']].min(axis = 1), utc = True, format = 'ISO8601')
# Number of purchases per month
purchases_count = pd.merge(products[['customer_id', 'purchase_id', 'purchase_date']].drop_duplicates(), known_customer[['customer_id', 'known_date']], on = ['customer_id'], how = 'inner')
purchases_count['is_customer_known'] = purchases_count['purchase_date'] > purchases_count['known_date'] + pd.DateOffset(months=1)
purchases_count['purchase_date_month'] = pd.to_datetime(purchases_count['purchase_date'].dt.strftime('%Y-%m'))
purchases_count = purchases_count[purchases_count['customer_id'] != 1]
# Number of purchases per month by customer type
nb_purchases_graph = purchases_count.groupby(['purchase_date_month', 'is_customer_known'])['purchase_id'].count().reset_index()
nb_purchases_graph.rename(columns = {'purchase_id' : 'nb_purchases'}, inplace = True)
nb_purchases_graph_2 = purchases_count.groupby(['purchase_date_month', 'is_customer_known'])['customer_id'].nunique().reset_index()
nb_purchases_graph_2.rename(columns = {'customer_id' : 'nb_new_customer'}, inplace = True)
# Plot the number of purchases
purchases_graph = nb_purchases_graph
purchases_graph_used = purchases_graph[purchases_graph["purchase_date_month"] >= datetime(2021,3,1)]
purchases_graph_used_0 = purchases_graph_used[purchases_graph_used["is_customer_known"]==False]
purchases_graph_used_1 = purchases_graph_used[purchases_graph_used["is_customer_known"]==True]
merged_data = pd.merge(purchases_graph_used_0, purchases_graph_used_1, on="purchase_date_month", suffixes=("_new", "_old"))
plt.bar(merged_data["purchase_date_month"], merged_data["nb_purchases_new"], width=12, label="New Customers")
plt.bar(merged_data["purchase_date_month"], merged_data["nb_purchases_old"],
bottom=merged_data["nb_purchases_new"], width=12, label="Existing Customers")
# Format the x-axis to show abbreviated month-year labels only
plt.gca().xaxis.set_major_formatter(mdates.DateFormatter('%b%y'))
plt.xlabel('Month')
plt.ylabel("Number of Sales")
plt.title(f"Number of Sales Across {type_of_activity} Companies")
plt.legend()
plt.show()
save_file_s3("sale_dynamics_", type_of_activity)
def tickets_internet(tickets, type_of_activity):
nb_tickets_internet = tickets.groupby("number_company")['prop_purchases_internet'].mean().reset_index()
nb_tickets_internet['prop_purchases_internet'] *= 100
plt.bar(nb_tickets_internet["number_company"], nb_tickets_internet["prop_purchases_internet"])
plt.xlabel('Company Number')
plt.ylabel("Share of Purchases Bought Online (%)")
plt.title(f"Share of Online Purchases Across {type_of_activity} Companies")
plt.xticks(nb_tickets_internet["number_company"], ["{}".format(i) for i in nb_tickets_internet["number_company"]])
plt.show()
save_file_s3("tickets_internet_", type_of_activity)
def already_bought_online(tickets, type_of_activity):
nb_consumers_online = (tickets.groupby("number_company").agg({'achat_internet' : 'sum',
'customer_id' : 'nunique'}
).reset_index())
nb_consumers_online["Share_consumers_internet"] = (nb_consumers_online["achat_internet"]/ nb_consumers_online["customer_id"])*100
plt.bar(nb_consumers_online["number_company"], nb_consumers_online["Share_consumers_internet"])
plt.xlabel('Company Number')
plt.ylabel("Share of Customer who Bought Online at least once (%)")
plt.title(f"Share of Customer who Bought Online at least once Across {type_of_activity} Companies")
plt.xticks(nb_consumers_online["number_company"], ["{}".format(i) for i in nb_consumers_online["number_company"]])
plt.show()
save_file_s3("First_buy_internet_", type_of_activity)
def box_plot_price_tickets(tickets, type_of_activity):
price_tickets = tickets[(tickets['total_amount'] > 0)]
sns.boxplot(data=price_tickets, y="total_amount", x="number_company", showfliers=False, showmeans=True)
plt.title(f"Box plot of price tickets Across {type_of_activity} Companies")
plt.show()
save_file_s3("box_plot_price_tickets_", type_of_activity)
def target_description(targets, type_of_activity):
describe_target = targets.groupby('number_company').agg(
prop_target_jeune=('target_jeune', lambda x: (x.sum() / x.count())*100),
prop_target_scolaire=('target_scolaire', lambda x: (x.sum() / x.count())*100),
prop_target_entreprise=('target_entreprise', lambda x: (x.sum() / x.count())*100),
prop_target_famille=('target_famille', lambda x: (x.sum() / x.count())*100),
prop_target_optin=('target_optin', lambda x: (x.sum() / x.count())*100),
prop_target_optout=('target_optout', lambda x: (x.sum() / x.count())*100),
prop_target_newsletter=('target_newsletter', lambda x: (x.sum() / x.count())*100),
prop_target_abonne=('target_abonne', lambda x: (x.sum() / x.count())*100))
plot = describe_target.plot.bar()
# Adding a title
plot.set_title(f"Distribution of Targets by Category for {type_of_activity} companies")
# Adding labels for x and y axes
plot.set_xlabel("Company Number")
plot.set_ylabel("Target Proportion")
plot.set_xticklabels(plot.get_xticklabels(), rotation=0, horizontalalignment='center')
# Adding a legend
plot.legend(["Youth", "School", "Enterprise", "Family", "Optin", "Optout", "Newsletter", "Subscriber"], title="Target Category")
save_file_s3("target_category_proportion_", type_of_activity)