Merge pull request 'generalization' (#9) from generalization into main
Reviewed-on: #9
commit a0256c551b
70  0_4_Generate_stat_desc.py  Normal file
@@ -0,0 +1,70 @@
import pandas as pd
import numpy as np
import os
import io
import s3fs
import re
import warnings

# Ignore warnings
warnings.filterwarnings('ignore')

exec(open('0_KPI_functions.py').read())
exec(open('utils_stat_desc.py').read())

# Create filesystem object
S3_ENDPOINT_URL = "https://" + os.environ["AWS_S3_ENDPOINT"]
fs = s3fs.S3FileSystem(client_kwargs={'endpoint_url': S3_ENDPOINT_URL})

companies = {'musee': ['1', '2', '3', '4'],  # , '101'
             'sport': ['5', '6', '7', '8', '9'],
             'musique': ['10', '11', '12', '13', '14']}

type_of_activity = input('Choisissez le type de compagnie : sport ? musique ? musee ?')
list_of_comp = companies[type_of_activity]

# Load files
customer, campaigns_kpi, campaigns_brut, tickets, products = load_files(list_of_comp)

# Identify anonymous customers for each company and remove them from our datasets
outlier_list = outlier_detection(tickets, list_of_comp)

# Identify valid customers (customers who bought tickets or received mails after the starting date)
customer_valid_list = valid_customer_detection(products, campaigns_brut)

databases = [customer, campaigns_kpi, campaigns_brut, tickets, products]

for dataset in databases:
    dataset['customer_id'] = dataset['customer_id'].apply(lambda x: remove_elements(x, outlier_list))  # remove outliers
    dataset = dataset[dataset['customer_id'].isin(customer_valid_list)]  # keep only valid customers
    # print(f'shape of {dataset} : ', dataset.shape)

# Identify customers who bought during the target period
customer_target_period = identify_purchase_during_target_periode(products)
customer['has_purchased_target_period'] = np.where(customer['customer_id'].isin(customer_target_period), 1, 0)

# Generate graphs and automatically save them in the bucket
compute_nb_clients(customer, type_of_activity)

maximum_price_paid(customer, type_of_activity)

mailing_consent(customer, type_of_activity)

mailing_consent_by_target(customer)

gender_bar(customer, type_of_activity)

country_bar(customer, type_of_activity)

lazy_customer_plot(campaigns_kpi, type_of_activity)

campaigns_effectiveness(customer, type_of_activity)

sale_dynamics(products, campaigns_brut, type_of_activity)

tickets_internet(tickets, type_of_activity)

already_bought_online(tickets, type_of_activity)

box_plot_price_tickets(tickets, type_of_activity)
103  0_5_Machine_Learning.py  Normal file
@@ -0,0 +1,103 @@
import pandas as pd
import numpy as np
import os
import io
import s3fs
import re
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report, recall_score
from sklearn.utils import class_weight
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.calibration import calibration_curve
from sklearn.preprocessing import OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import StandardScaler, MaxAbsScaler, MinMaxScaler
from sklearn.metrics import make_scorer, f1_score, balanced_accuracy_score
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.metrics import roc_curve, auc, precision_recall_curve, average_precision_score
from sklearn.exceptions import ConvergenceWarning, DataConversionWarning
import pickle
import warnings


exec(open('utils_ml.py').read())

warnings.filterwarnings('ignore')
warnings.filterwarnings("ignore", category=ConvergenceWarning)
warnings.filterwarnings("ignore", category=DataConversionWarning)

# Choose the type of companies for which you want to run the pipeline
type_of_activity = input('Choisissez le type de compagnie : sport ? musique ? musee ?')

# Load train and test sets
# Create filesystem object
S3_ENDPOINT_URL = "https://" + os.environ["AWS_S3_ENDPOINT"]
fs = s3fs.S3FileSystem(client_kwargs={'endpoint_url': S3_ENDPOINT_URL})

dataset_train, dataset_test = load_train_test(type_of_activity)

X_train, X_test, y_train, y_test = features_target_split(dataset_train, dataset_test)

print("Shape train : ", X_train.shape)
print("Shape test : ", X_test.shape)

# Processing
weights = class_weight.compute_class_weight(class_weight='balanced', classes=np.unique(y_train['y_has_purchased']),
                                            y=y_train['y_has_purchased'])

weight_dict = {np.unique(y_train['y_has_purchased'])[i]: weights[i] for i in range(len(np.unique(y_train['y_has_purchased'])))}


numeric_features = ['nb_tickets', 'nb_purchases', 'total_amount', 'nb_suppliers', 'vente_internet_max', 'purchase_date_min', 'purchase_date_max',
                    'time_between_purchase', 'nb_tickets_internet', 'is_email_true', 'opt_in',  # 'is_partner',
                    'gender_female', 'gender_male', 'gender_other', 'nb_campaigns', 'nb_campaigns_opened']

numeric_transformer = Pipeline(steps=[
    # ("imputer", SimpleImputer(strategy="mean")),
    ("scaler", StandardScaler())
])

categorical_features = ['opt_in']

# Transformer for the categorical features
categorical_transformer = Pipeline(steps=[
    # ("imputer", SimpleImputer(strategy="most_frequent")),  # Impute missing values with the most frequent
    ("onehot", OneHotEncoder(handle_unknown='ignore', sparse_output=False))
])

preproc = ColumnTransformer(
    transformers=[
        ("num", numeric_transformer, numeric_features),
        ("cat", categorical_transformer, categorical_features)
    ]
)

# Object for storing results
model_result = pd.DataFrame(columns=["Model", "Accuracy", "Recall", "F1_score", "AUC"])

# Naive Bayes
model_result = pipeline_naiveBayes_benchmark(X_train, y_train, X_test, y_test, model_result)
print("Naive Bayes : Done")

# Logistic Regression
model_result = pipeline_logreg_benchmark(X_train, y_train, X_test, y_test, model_result)
print("Logistic : Done")
"""
model_result = pipeline_logreg_cv(X_train, y_train, X_test, y_test, model_result)
print("Logistic CV : Done")

# Random Forest
model_result = pipeline_randomF_benchmark(X_train, y_train, X_test, y_test, model_result)
print("Random Forest : Done")
model_result = pipeline_randomF_cv(X_train, y_train, X_test, y_test, model_result)
print("Random Forest CV: Done")
"""
# Save result
save_result_set_s3(model_result, "resultat", type_of_activity)
40  0_6_Segmentation.py  Normal file
@@ -0,0 +1,40 @@
import pandas as pd
import numpy as np
import os
import io
import s3fs
import re
import pickle
import warnings


exec(open('utils_segmentation.py').read())
warnings.filterwarnings('ignore')

# Create filesystem object
S3_ENDPOINT_URL = "https://" + os.environ["AWS_S3_ENDPOINT"]
fs = s3fs.S3FileSystem(client_kwargs={'endpoint_url': S3_ENDPOINT_URL})

# Choose the type of companies for which you want to run the pipeline
type_of_activity = input('Choisissez le type de compagnie : sport ? musique ? musee ?')

# Load test set
dataset_test = load_test_file(type_of_activity)

# Load model
model = load_model(type_of_activity, 'LogisticRegression_Benchmark')

# Processing
X_test = dataset_test[['nb_tickets', 'nb_purchases', 'total_amount', 'nb_suppliers', 'vente_internet_max', 'purchase_date_min', 'purchase_date_max',
                       'time_between_purchase', 'nb_tickets_internet', 'is_email_true', 'opt_in',  # 'is_partner',
                       'gender_female', 'gender_male', 'gender_other', 'nb_campaigns', 'nb_campaigns_opened']]

y_test = dataset_test[['y_has_purchased']]

# Prediction
y_pred_prob = model.predict_proba(X_test)[:, 1]

# Add probability to dataset_test
dataset_test['Probability_to_buy'] = y_pred_prob
print('probability added to dataset_test')
print(dataset_test.head())
148  Descriptive_statistics/debug.ipynb  Normal file
File diff suppressed because one or more lines are too long
68  Descriptive_statistics/generate_stat_desc.py  Normal file
@@ -0,0 +1,68 @@
import pandas as pd
import numpy as np
import os
import io
import s3fs
import re
import warnings

# Ignore warnings
warnings.filterwarnings('ignore')

exec(open('../0_KPI_functions.py').read())
exec(open('plot.py').read())

# Create filesystem object
S3_ENDPOINT_URL = "https://" + os.environ["AWS_S3_ENDPOINT"]
fs = s3fs.S3FileSystem(client_kwargs={'endpoint_url': S3_ENDPOINT_URL})

companies = {'musee': ['1', '2', '3', '4'],  # , '101'
             'sport': ['5'],
             'musique': ['10', '11', '12', '13', '14']}

type_of_activity = input('Choisissez le type de compagnie : sport ? musique ? musee ?')
list_of_comp = companies[type_of_activity]

# Load files
customer, campaigns_kpi, campaigns_brut, tickets, products = load_files(list_of_comp)

# Identify anonymous customers for each company and remove them from our datasets
outlier_list = outlier_detection(tickets, list_of_comp)

# Identify valid customers (customers who bought tickets or received mails after the starting date)
customer_valid_list = valid_customer_detection(products, campaigns_brut)

databases = [customer, campaigns_kpi, campaigns_brut, tickets, products]

for dataset in databases:
    dataset['customer_id'] = dataset['customer_id'].apply(lambda x: remove_elements(x, outlier_list))  # remove outliers
    dataset = dataset[dataset['customer_id'].isin(customer_valid_list)]  # keep only valid customers
    # print(f'shape of {dataset} : ', dataset.shape)

# Identify customers who bought during the target period
customer_target_period = identify_purchase_during_target_periode(products)
customer['has_purchased_target_period'] = np.where(customer['customer_id'].isin(customer_target_period), 1, 0)

# Generate graphs and automatically save them in the bucket
compute_nb_clients(customer, type_of_activity)

maximum_price_paid(customer, type_of_activity)

mailing_consent(customer, type_of_activity)

mailing_consent_by_target(customer)

gender_bar(customer, type_of_activity)

country_bar(customer, type_of_activity)

lazy_customer_plot(campaigns_kpi, type_of_activity)

#campaigns_effectiveness(customer, type_of_activity)

sale_dynamics(products, campaigns_brut, type_of_activity)

tickets_internet(tickets, type_of_activity)

box_plot_price_tickets(tickets, type_of_activity)
328  Descriptive_statistics/plot.py  Normal file
@@ -0,0 +1,328 @@
import pandas as pd
import os
import s3fs
import io
import warnings
from datetime import date, timedelta, datetime
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.dates as mdates
import seaborn as sns


def load_files(nb_compagnie):
    customer = pd.DataFrame()
    campaigns_brut = pd.DataFrame()
    campaigns_kpi = pd.DataFrame()
    products = pd.DataFrame()
    tickets = pd.DataFrame()

    # Loop that builds the aggregated datasets for the entertainment companies
    for directory_path in nb_compagnie:
        df_customerplus_clean_0 = display_databases(directory_path, file_name = "customerplus_cleaned")
        df_campaigns_brut = display_databases(directory_path, file_name = "campaigns_information", datetime_col = ['opened_at', 'sent_at', 'campaign_sent_at'])
        df_products_purchased_reduced = display_databases(directory_path, file_name = "products_purchased_reduced", datetime_col = ['purchase_date'])
        df_target_information = display_databases(directory_path, file_name = "target_information")

        df_campaigns_kpi = campaigns_kpi_function(campaigns_information = df_campaigns_brut)
        df_tickets_kpi = tickets_kpi_function(tickets_information = df_products_purchased_reduced)
        df_customerplus_clean = customerplus_kpi_function(customerplus_clean = df_customerplus_clean_0)

        # Create the number_company column, used to aggregate the results
        df_tickets_kpi["number_company"] = int(directory_path)
        df_campaigns_brut["number_company"] = int(directory_path)
        df_campaigns_kpi["number_company"] = int(directory_path)
        df_customerplus_clean["number_company"] = int(directory_path)
        df_target_information["number_company"] = int(directory_path)

        # Index handling: prefix customer ids with the company number
        df_tickets_kpi["customer_id"] = directory_path + '_' + df_tickets_kpi['customer_id'].astype('str')
        df_campaigns_brut["customer_id"] = directory_path + '_' + df_campaigns_brut['customer_id'].astype('str')
        df_campaigns_kpi["customer_id"] = directory_path + '_' + df_campaigns_kpi['customer_id'].astype('str')
        df_customerplus_clean["customer_id"] = directory_path + '_' + df_customerplus_clean['customer_id'].astype('str')
        df_products_purchased_reduced["customer_id"] = directory_path + '_' + df_products_purchased_reduced['customer_id'].astype('str')

        # Concatenation
        customer = pd.concat([customer, df_customerplus_clean], ignore_index=True)
        campaigns_kpi = pd.concat([campaigns_kpi, df_campaigns_kpi], ignore_index=True)
        campaigns_brut = pd.concat([campaigns_brut, df_campaigns_brut], ignore_index=True)
        tickets = pd.concat([tickets, df_tickets_kpi], ignore_index=True)
        products = pd.concat([products, df_products_purchased_reduced], ignore_index=True)

    return customer, campaigns_kpi, campaigns_brut, tickets, products


def save_file_s3(File_name, type_of_activity):
    image_buffer = io.BytesIO()
    plt.savefig(image_buffer, format='png')
    image_buffer.seek(0)
    FILE_PATH = f"projet-bdc2324-team1/stat_desc/{type_of_activity}/"
    FILE_PATH_OUT_S3 = FILE_PATH + File_name + type_of_activity + '.png'
    with fs.open(FILE_PATH_OUT_S3, 'wb') as s3_file:
        s3_file.write(image_buffer.read())
    plt.close()


def outlier_detection(tickets, company_list, show_diagram=False):

    outlier_list = list()

    for company in company_list:
        total_amount_share = tickets[tickets['number_company']==int(company)].groupby('customer_id')['total_amount'].sum().reset_index()
        total_amount_share['CA'] = total_amount_share['total_amount'].sum()
        total_amount_share['share_total_amount'] = total_amount_share['total_amount']/total_amount_share['CA']

        total_amount_share_index = total_amount_share.set_index('customer_id')
        df_circulaire = total_amount_share_index['total_amount'].sort_values(axis = 0, ascending = False)
        # print('df circulaire : ', df_circulaire.head())
        top = df_circulaire[:1]
        # print('top : ', top)
        outlier_list.append(top.index[0])
        rest = df_circulaire[1:]

        rest_sum = rest.sum()

        new_series = pd.concat([top, pd.Series([rest_sum], index=['Autre'])])

        if show_diagram:
            plt.figure(figsize=(3, 3))
            plt.pie(new_series, labels=new_series.index, autopct='%1.1f%%', startangle=140, pctdistance=0.5)
            plt.axis('equal')
            plt.title(f'Répartition des montants totaux pour la compagnie {company}')
            plt.show()
    return outlier_list


def valid_customer_detection(products, campaigns_brut):
    products_valid = products[products['purchase_date'] >= "2021-05-01"]
    consumer_valid_product = products_valid['customer_id'].to_list()

    campaigns_valid = campaigns_brut[campaigns_brut["sent_at"] >= "2021-05-01"]
    consumer_valid_campaigns = campaigns_valid['customer_id'].to_list()

    consumer_valid = consumer_valid_product + consumer_valid_campaigns
    return consumer_valid


def identify_purchase_during_target_periode(products):
    products_target_period = products[(products['purchase_date'] >= "2022-11-01")
                                      & (products['purchase_date'] <= "2023-11-01")]
    customer_target_period = products_target_period['customer_id'].to_list()
    return customer_target_period


def remove_elements(lst, elements_to_remove):
    return ''.join([x for x in lst if x not in elements_to_remove])


def compute_nb_clients(customer, type_of_activity):
    company_nb_clients = customer[customer["purchase_count"]>0].groupby("number_company")["customer_id"].count().reset_index()
    plt.bar(company_nb_clients["number_company"], company_nb_clients["customer_id"]/1000)

    plt.xlabel('Company')
    plt.ylabel("Number of clients (thousands)")
    plt.title(f"Number of clients for {type_of_activity}")
    plt.xticks(company_nb_clients["number_company"], ["{}".format(i) for i in company_nb_clients["number_company"]])
    plt.show()
    save_file_s3("nb_clients_", type_of_activity)


def maximum_price_paid(customer, type_of_activity):
    company_max_price = customer.groupby("number_company")["max_price"].max().reset_index()
    plt.bar(company_max_price["number_company"], company_max_price["max_price"])

    plt.xlabel('Company')
    plt.ylabel("Maximal price of a ticket")
    plt.title(f"Maximal price of a ticket for {type_of_activity}")
    plt.xticks(company_max_price["number_company"], ["{}".format(i) for i in company_max_price["number_company"]])
    plt.show()
    save_file_s3("Maximal_price_", type_of_activity)


def mailing_consent(customer, type_of_activity):
    mailing_consent = customer.groupby("number_company")["opt_in"].mean().reset_index()

    plt.bar(mailing_consent["number_company"], mailing_consent["opt_in"])

    plt.xlabel('Company')
    plt.ylabel('Consent')
    plt.title(f'Consent of mailing for {type_of_activity}')
    plt.xticks(mailing_consent["number_company"], ["{}".format(i) for i in mailing_consent["number_company"]])
    plt.show()
    save_file_s3("mailing_consent_", type_of_activity)


def mailing_consent_by_target(customer):
    df_graph = customer.groupby(["number_company", "has_purchased_target_period"])["opt_in"].mean().reset_index()
    # Build the grouped barplot
    fig, ax = plt.subplots(figsize=(10, 6))

    categories = df_graph["number_company"].unique()
    bar_width = 0.35
    bar_positions = np.arange(len(categories))

    # Group the data by label and draw the grouped bars
    for label in df_graph["has_purchased_target_period"].unique():
        label_data = df_graph[df_graph['has_purchased_target_period'] == label]
        values = [label_data[label_data['number_company'] == category]['opt_in'].values[0]*100 for category in categories]

        label_printed = "purchased" if label else "no purchase"
        ax.bar(bar_positions, values, bar_width, label=label_printed)

        # Shift the bar positions for the next group
        bar_positions = [pos + bar_width for pos in bar_positions]

    # Add labels, legend, etc.
    ax.set_xlabel('Company')
    ax.set_ylabel('Consent')
    ax.set_title(f'Consent of mailing according to target for {type_of_activity}')
    ax.set_xticks([pos + bar_width / 2 for pos in np.arange(len(categories))])
    ax.set_xticklabels(categories)
    ax.legend()

    # Display the plot
    plt.show()
    save_file_s3("mailing_consent_target_", type_of_activity)


def gender_bar(customer, type_of_activity):
    company_genders = customer.groupby("number_company")[["gender_male", "gender_female", "gender_other"]].mean().reset_index()

    # Stacked barplot
    plt.bar(company_genders["number_company"], company_genders["gender_male"], label = "Homme")
    plt.bar(company_genders["number_company"], company_genders["gender_female"],
            bottom = company_genders["gender_male"], label = "Femme")
    plt.bar(company_genders["number_company"], company_genders["gender_other"],
            bottom = company_genders["gender_male"] + company_genders["gender_female"], label = "Inconnu")

    plt.xlabel('Company')
    plt.ylabel("Gender")
    plt.title(f"Gender of Customer for {type_of_activity}")
    plt.legend()
    plt.xticks(company_genders["number_company"], ["{}".format(i) for i in company_genders["number_company"]])
    plt.show()
    save_file_s3("gender_bar_", type_of_activity)


def country_bar(customer, type_of_activity):
    company_country_fr = customer.groupby("number_company")["country_fr"].mean().reset_index()
    plt.bar(company_country_fr["number_company"], company_country_fr["country_fr"])

    plt.xlabel('Company')
    plt.ylabel("Share of French Customers")
    plt.title(f"Share of French Customers for {type_of_activity}")
    plt.xticks(company_country_fr["number_company"], ["{}".format(i) for i in company_country_fr["number_company"]])
    plt.show()
    save_file_s3("country_bar_", type_of_activity)


def lazy_customer_plot(campaigns_kpi, type_of_activity):
    company_lazy_customers = campaigns_kpi.groupby("number_company")["nb_campaigns_opened"].mean().reset_index()
    plt.bar(company_lazy_customers["number_company"], company_lazy_customers["nb_campaigns_opened"])

    plt.xlabel('Company')
    plt.ylabel("Share of Customers who did not open mail")
    plt.title(f"Share of Customers who did not open mail for {type_of_activity}")
    plt.xticks(company_lazy_customers["number_company"], ["{}".format(i) for i in company_lazy_customers["number_company"]])
    plt.show()
    save_file_s3("lazy_customer_", type_of_activity)


def campaigns_effectiveness(customer, type_of_activity):

    campaigns_effectiveness = customer.groupby("number_company")["opt_in"].mean().reset_index()

    plt.bar(campaigns_effectiveness["number_company"], campaigns_effectiveness["opt_in"])

    plt.xlabel('Company')
    plt.ylabel("Number of Customers (thousands)")
    plt.title(f"Number of Customers who have bought or have received mails for {type_of_activity}")
    plt.legend()
    plt.xticks(campaigns_effectiveness["number_company"], ["{}".format(i) for i in campaigns_effectiveness["number_company"]])
    plt.show()
    save_file_s3("campaigns_effectiveness_", type_of_activity)


def sale_dynamics(products, campaigns_brut, type_of_activity):
    purchase_min = products.groupby(['customer_id'])['purchase_date'].min().reset_index()
    purchase_min.rename(columns = {'purchase_date' : 'first_purchase_event'}, inplace = True)
    purchase_min['first_purchase_event'] = pd.to_datetime(purchase_min['first_purchase_event'])
    purchase_min['first_purchase_month'] = pd.to_datetime(purchase_min['first_purchase_event'].dt.strftime('%Y-%m'))

    # Month of the first email received
    first_mail_received = campaigns_brut.groupby('customer_id')['sent_at'].min().reset_index()
    first_mail_received.rename(columns = {'sent_at' : 'first_email_reception'}, inplace = True)
    first_mail_received['first_email_reception'] = pd.to_datetime(first_mail_received['first_email_reception'])
    first_mail_received['first_email_month'] = pd.to_datetime(first_mail_received['first_email_reception'].dt.strftime('%Y-%m'))

    # Merge
    known_customer = pd.merge(purchase_min[['customer_id', 'first_purchase_month']],
                              first_mail_received[['customer_id', 'first_email_month']], on = 'customer_id', how = 'outer')

    # Month from which the customer is considered as known
    known_customer['known_date'] = pd.to_datetime(known_customer[['first_email_month', 'first_purchase_month']].min(axis = 1), utc = True, format = 'ISO8601')

    # Number of orders per month
    purchases_count = pd.merge(products[['customer_id', 'purchase_id', 'purchase_date']].drop_duplicates(), known_customer[['customer_id', 'known_date']], on = ['customer_id'], how = 'inner')
    purchases_count['is_customer_known'] = purchases_count['purchase_date'] > purchases_count['known_date'] + pd.DateOffset(months=1)
    purchases_count['purchase_date_month'] = pd.to_datetime(purchases_count['purchase_date'].dt.strftime('%Y-%m'))
    purchases_count = purchases_count[purchases_count['customer_id'] != 1]

    # Number of orders per month by customer type
    nb_purchases_graph = purchases_count.groupby(['purchase_date_month', 'is_customer_known'])['purchase_id'].count().reset_index()
    nb_purchases_graph.rename(columns = {'purchase_id' : 'nb_purchases'}, inplace = True)

    nb_purchases_graph_2 = purchases_count.groupby(['purchase_date_month', 'is_customer_known'])['customer_id'].nunique().reset_index()
    nb_purchases_graph_2.rename(columns = {'customer_id' : 'nb_new_customer'}, inplace = True)

    # Graph of the number of orders
    purchases_graph = nb_purchases_graph

    purchases_graph_used = purchases_graph[purchases_graph["purchase_date_month"] >= datetime(2021,3,1)]
    purchases_graph_used_0 = purchases_graph_used[purchases_graph_used["is_customer_known"]==False]
    purchases_graph_used_1 = purchases_graph_used[purchases_graph_used["is_customer_known"]==True]

    merged_data = pd.merge(purchases_graph_used_0, purchases_graph_used_1, on="purchase_date_month", suffixes=("_new", "_old"))

    plt.bar(merged_data["purchase_date_month"], merged_data["nb_purchases_new"], width=12, label="Nouveau client")
    plt.bar(merged_data["purchase_date_month"], merged_data["nb_purchases_old"],
            bottom=merged_data["nb_purchases_new"], width=12, label="Ancien client")

    # Show only month and year on the x axis
    plt.gca().xaxis.set_major_formatter(mdates.DateFormatter('%b%y'))

    plt.xlabel('Month')
    plt.ylabel("Number of Sales")
    plt.title(f"Number of Sales for {type_of_activity}")
    plt.legend()
    plt.show()
    save_file_s3("sale_dynamics_", type_of_activity)


def tickets_internet(tickets, type_of_activity):
    nb_tickets_internet = tickets.groupby("number_company")[["nb_tickets", "nb_tickets_internet"]].sum().reset_index()
    nb_tickets_internet["Share_ticket_internet"] = nb_tickets_internet["nb_tickets_internet"]*100 / nb_tickets_internet["nb_tickets"]

    plt.bar(nb_tickets_internet["number_company"], nb_tickets_internet["Share_ticket_internet"])

    plt.xlabel('Company')
    plt.ylabel("Share of Tickets Bought Online")
    plt.title(f"Share of Tickets Bought Online for {type_of_activity}")
    plt.xticks(nb_tickets_internet["number_company"], ["{}".format(i) for i in nb_tickets_internet["number_company"]])
    plt.show()
    save_file_s3("tickets_internet_", type_of_activity)


def box_plot_price_tickets(tickets, type_of_activity):
    price_tickets = tickets[(tickets['total_amount'] > 0)]
    sns.boxplot(data=price_tickets, y="total_amount", x="number_company", showfliers=False, showmeans=True)
    plt.title(f"Box plot of price tickets for {type_of_activity}")
    plt.xticks(price_tickets["number_company"], ["{}".format(i) for i in price_tickets["number_company"]])
    plt.show()
    save_file_s3("box_plot_price_tickets_", type_of_activity)
8499  Notebook_AR.ipynb
File diff suppressed because one or more lines are too long
358  utils_ml.py  Normal file
@@ -0,0 +1,358 @@
import pandas as pd
import numpy as np
import os
import s3fs
import re
import io
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report, recall_score
from sklearn.utils import class_weight
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.calibration import calibration_curve
from sklearn.preprocessing import OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import StandardScaler, MaxAbsScaler, MinMaxScaler
from sklearn.metrics import make_scorer, f1_score, balanced_accuracy_score
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.metrics import roc_curve, auc, precision_recall_curve, average_precision_score
from sklearn.exceptions import ConvergenceWarning, DataConversionWarning

import pickle
import warnings


def load_train_test(type_of_activity):
    BUCKET = f"projet-bdc2324-team1/Generalization/{type_of_activity}"
    File_path_train = BUCKET + "/Train_set.csv"
    File_path_test = BUCKET + "/Test_set.csv"

    with fs.open(File_path_train, mode="rb") as file_in:
        dataset_train = pd.read_csv(file_in, sep=",")
        # dataset_train['y_has_purchased'] = dataset_train['y_has_purchased'].fillna(0)

    with fs.open(File_path_test, mode="rb") as file_in:
        dataset_test = pd.read_csv(file_in, sep=",")
        # dataset_test['y_has_purchased'] = dataset_test['y_has_purchased'].fillna(0)

    return dataset_train, dataset_test


def save_file_s3(File_name, type_of_activity, model):
    image_buffer = io.BytesIO()
    plt.savefig(image_buffer, format='png')
    image_buffer.seek(0)
    FILE_PATH = f"projet-bdc2324-team1/Output_model/{type_of_activity}/{model}/"
    FILE_PATH_OUT_S3 = FILE_PATH + File_name + type_of_activity + '_' + model + '.png'
    with fs.open(FILE_PATH_OUT_S3, 'wb') as s3_file:
        s3_file.write(image_buffer.read())
    plt.close()


def save_result_set_s3(result_set, File_name, type_of_activity, model=None, model_path=False):
    if model_path:
        FILE_PATH_OUT_S3 = f"projet-bdc2324-team1/Output_model/{type_of_activity}/{model}/" + File_name + '.csv'
    else:
        FILE_PATH_OUT_S3 = f"projet-bdc2324-team1/Output_model/{type_of_activity}/" + File_name + '.csv'
    with fs.open(FILE_PATH_OUT_S3, 'w') as file_out:
        result_set.to_csv(file_out, index = False)


def save_model_s3(File_name, type_of_activity, model, classifier):
    model_bytes = pickle.dumps(classifier)
    FILE_PATH_OUT_S3 = f"projet-bdc2324-team1/Output_model/{type_of_activity}/{model}/" + File_name + '.pkl'
    with fs.open(FILE_PATH_OUT_S3, 'wb') as f:
        f.write(model_bytes)


def compute_recall(group):
    return recall_score(group['y_has_purchased'], group['prediction'])


def compute_recall_companies(dataset_test, y_pred, type_of_activity, model):
    test = dataset_test.copy()
    test['prediction'] = y_pred
    test['company'] = dataset_test['customer_id'].str.split('_', expand=True)[0]
    recall_scores_by_company = test.groupby('company').apply(compute_recall).reset_index(name='recall_score')
    save_result_set_s3(recall_scores_by_company, 'recall_scores_by_company', type_of_activity, model=model, model_path=True)


def features_target_split(dataset_train, dataset_test):
    features_l = ['nb_tickets', 'nb_purchases', 'total_amount', 'nb_suppliers', 'vente_internet_max', 'purchase_date_min', 'purchase_date_max',
                  'time_between_purchase', 'nb_tickets_internet', 'is_email_true', 'opt_in',  # 'is_partner',
                  'gender_female', 'gender_male', 'gender_other', 'nb_campaigns', 'nb_campaigns_opened']
    X_train = dataset_train[features_l]
    y_train = dataset_train[['y_has_purchased']]

    X_test = dataset_test[features_l]
    y_test = dataset_test[['y_has_purchased']]
    return X_train, X_test, y_train, y_test


def draw_confusion_matrix(y_test, y_pred, model):
    conf_matrix = confusion_matrix(y_test, y_pred)
    sns.heatmap(conf_matrix, annot=True, fmt='d', cmap='Blues', xticklabels=['Class 0', 'Class 1'], yticklabels=['Class 0', 'Class 1'])
    plt.xlabel('Predicted')
    plt.ylabel('Actual')
    plt.title('Confusion Matrix')
    plt.show()
    save_file_s3("Confusion_matrix_", type_of_activity, model)


def draw_roc_curve(X_test, y_pred_prob, model):
    # Compute the false positive rate (FPR) and true positive rate (TPR)
    # Note: y_test and type_of_activity are resolved from the calling script's scope (this module is loaded via exec)
    fpr, tpr, thresholds = roc_curve(y_test, y_pred_prob, pos_label=1)

    # Compute the area under the ROC curve (AUC)
    roc_auc = auc(fpr, tpr)

    plt.figure(figsize = (14, 8))
    plt.plot(fpr, tpr, label="ROC curve(area = %0.3f)" % roc_auc)
    plt.plot([0, 1], [0, 1], color="red", label="Random Baseline", linestyle="--")
    plt.grid(color='gray', linestyle='--', linewidth=0.5)
    plt.xlabel("False Positive Rate")
    plt.ylabel("True Positive Rate")
    plt.title("ROC Curve", size=18)
    plt.legend(loc="lower right")
    plt.show()
    save_file_s3("Roc_curve_", type_of_activity, model)


def draw_calibration_curve(X_test, y_pred_prob, model):
    frac_pos, mean_pred = calibration_curve(y_test, y_pred_prob, n_bins=10)

    # Plot the calibration curve
    plt.plot(mean_pred, frac_pos, 's-', label=model)
    plt.plot([0, 1], [0, 1], 'k--', label='Perfectly calibrated')
    plt.xlabel('Mean predicted value')
    plt.ylabel('Fraction of positive predictions')
    plt.title("Calibration Curve")
    plt.legend()
    plt.show()
    save_file_s3("Calib_curve_", type_of_activity, model)


def draw_features_importance(pipeline, model, randomF = False):
    if randomF:
        coefficients = pipeline.named_steps[model].feature_importances_
    else:
        coefficients = pipeline.named_steps[model].coef_[0]

    feature_names = pipeline.named_steps['preprocessor'].get_feature_names_out()
    # Plot feature importance
    plt.figure(figsize=(10, 6))
    plt.barh(feature_names, coefficients, color='skyblue')
    plt.xlabel("Features' Importance")
    plt.ylabel('Caractéristiques')
    plt.title("Features' Importance")
    plt.grid(True)
    plt.show()
    save_file_s3("Features_", type_of_activity, model)


def draw_prob_distribution(y_pred_prob, model):
    plt.figure(figsize=(8, 6))
    plt.hist(y_pred_prob, bins=10, range=(0, 1), color='blue', alpha=0.7)

    plt.xlim(0, 1)
    plt.ylim(0, None)

    plt.title('Histogramme des probabilités pour la classe 1')
    plt.xlabel('Probability')
    plt.ylabel('Frequency')
    plt.grid(True)
    plt.show()
    save_file_s3("prob_dist_", type_of_activity, model)


def draw_prob_distribution_companies(y_pred_prob, model):
    # Note: dataset_test comes from the calling script's scope (this module is loaded via exec)
    test = dataset_test.copy()
    test['probability to buy'] = y_pred_prob
    test['company'] = test['customer_id'].str.split('_', expand=True)[0]
    sns.histplot(data=test, x='probability to buy', hue='company', element='step',
                 stat='count', common_norm=False, bins=10, palette='Set1', alpha=1)
    plt.xlim(0, 1)
    plt.ylim(0, None)
    plt.title('Histogram of probabilities for class 1 by company')
    plt.xlabel('Probability')
    plt.ylabel('Frequency')
    plt.grid(True)
    plt.show()
    save_file_s3("prob_dist_companies_", type_of_activity, model)


def pipeline_logreg_benchmark(X_train, y_train, X_test, y_test, model_result):
    pipeline = Pipeline(steps=[
        ('preprocessor', preproc),
        ('LogisticRegression_Benchmark', LogisticRegression(solver='saga', class_weight = weight_dict,
                                                            max_iter=5000, n_jobs=-1))
    ])
    pipeline.fit(X_train, y_train)

    y_pred = pipeline.predict(X_test)
    y_pred_prob = pipeline.predict_proba(X_test)[:, 1]

    fpr, tpr, thresholds = roc_curve(y_test, y_pred_prob, pos_label = 1)
    model = "LogisticRegression_Benchmark"
    result = pd.DataFrame({"Model" : [model],
                           "Accuracy" : [accuracy_score(y_test, y_pred)],
                           "Recall" : [recall_score(y_test, y_pred)],
                           "F1_score" : [f1_score(y_test, y_pred, average="macro")],
                           "AUC" : [auc(fpr, tpr)]}
                          )
    model_result = pd.concat([model_result, result])
    # compute_recall_companies(dataset_test, y_pred, type_of_activity, model)

    draw_confusion_matrix(y_test, y_pred, model)
    draw_roc_curve(X_test, y_pred_prob, model)
    draw_features_importance(pipeline, 'LogisticRegression_Benchmark')
    draw_prob_distribution(y_pred_prob, model)
    draw_prob_distribution_companies(y_pred_prob, model)
    draw_calibration_curve(X_test, y_pred_prob, model)
    save_model_s3('LogisticRegression_Benchmark', type_of_activity, model, pipeline)
    return model_result


def pipeline_logreg_cv(X_train, y_train, X_test, y_test, model_result):
    y_train = y_train['y_has_purchased']
    param_grid = {'LogisticRegression_cv__C': np.logspace(-10, 6, 17, base=2),
                  'LogisticRegression_cv__penalty': ['l1', 'l2'],
                  'LogisticRegression_cv__class_weight': ['balanced', weight_dict]}
    pipeline = Pipeline(steps=[
        ('preprocessor', preproc),
        ('LogisticRegression_cv', LogisticRegression(solver='saga', max_iter=5000))
    ])
    grid_search = GridSearchCV(pipeline, param_grid, cv=3, scoring=make_scorer(recall_score), error_score='raise',
                               n_jobs=-1)

    grid_search.fit(X_train, y_train)
    y_pred = grid_search.predict(X_test)
    y_pred_prob = grid_search.predict_proba(X_test)[:, 1]
    best_pipeline = grid_search.best_estimator_
    fpr, tpr, thresholds = roc_curve(y_test, y_pred_prob, pos_label = 1)
    model = "LogisticRegression_cv"
    result = pd.DataFrame({"Model" : [model],
                           "Accuracy" : [accuracy_score(y_test, y_pred)],
                           "Recall" : [recall_score(y_test, y_pred)],
                           "F1_score" : [f1_score(y_test, y_pred, average="macro")],
                           "AUC" : [auc(fpr, tpr)]}
                          )
    model_result = pd.concat([model_result, result])
    # compute_recall_companies(dataset_test, y_pred, type_of_activity, model)

    draw_confusion_matrix(y_test, y_pred, model)
    draw_roc_curve(X_test, y_pred_prob, model)
    draw_features_importance(best_pipeline, 'LogisticRegression_cv')
    draw_prob_distribution(y_pred_prob, model)
    draw_prob_distribution_companies(y_pred_prob, model)
    draw_calibration_curve(X_test, y_pred_prob, model)
    save_model_s3('LogisticRegression_cv', type_of_activity, model, grid_search)
    return model_result


def pipeline_randomF_benchmark(X_train, y_train, X_test, y_test, model_result):
    pipeline = Pipeline(steps=[
        ('preprocessor', preproc),
        ('randomF', RandomForestClassifier(class_weight = weight_dict,
                                           n_jobs=-1))
    ])
    pipeline.fit(X_train, y_train)

    y_pred = pipeline.predict(X_test)
    y_pred_prob = pipeline.predict_proba(X_test)[:, 1]

    fpr, tpr, thresholds = roc_curve(y_test, y_pred_prob, pos_label = 1)
    model = "randomF"
    result = pd.DataFrame({"Model" : [model],
                           "Accuracy" : [accuracy_score(y_test, y_pred)],
                           "Recall" : [recall_score(y_test, y_pred)],
                           "F1_score" : [f1_score(y_test, y_pred, average="macro")],
                           "AUC" : [auc(fpr, tpr)]}
                          )
    model_result = pd.concat([model_result, result])
    # compute_recall_companies(dataset_test, y_pred, type_of_activity, model)

    draw_confusion_matrix(y_test, y_pred, model)
    draw_roc_curve(X_test, y_pred_prob, model)
    draw_features_importance(pipeline, 'randomF', randomF=True)
    draw_prob_distribution(y_pred_prob, model)
    draw_prob_distribution_companies(y_pred_prob, model)
    draw_calibration_curve(X_test, y_pred_prob, model)
    save_model_s3('randomF_Benchmark', type_of_activity, model, pipeline)
    return model_result


def pipeline_randomF_cv(X_train, y_train, X_test, y_test, model_result):
    y_train = y_train['y_has_purchased']
    param_grid = {
        'randomF_cv__n_estimators': [100, 300],
        'randomF_cv__max_features': ['sqrt', 'log2'],
        'randomF_cv__min_samples_split': [2, 10],
        'randomF_cv__min_samples_leaf': [1, 4],
        'randomF_cv__class_weight': [weight_dict]
    }
    pipeline = Pipeline(steps=[
        ('preprocessor', preproc),
        ('randomF_cv', RandomForestClassifier(n_jobs=-1))
    ])
    grid_search = GridSearchCV(pipeline, param_grid, cv=3, scoring=make_scorer(recall_score), error_score='raise',
                               n_jobs=-1)

    grid_search.fit(X_train, y_train)
    y_pred = grid_search.predict(X_test)
    y_pred_prob = grid_search.predict_proba(X_test)[:, 1]
    best_pipeline = grid_search.best_estimator_
    fpr, tpr, thresholds = roc_curve(y_test, y_pred_prob, pos_label = 1)
    model = "randomF_cv"
    result = pd.DataFrame({"Model" : [model],
                           "Accuracy" : [accuracy_score(y_test, y_pred)],
                           "Recall" : [recall_score(y_test, y_pred)],
                           "F1_score" : [f1_score(y_test, y_pred, average="macro")],
                           "AUC" : [auc(fpr, tpr)]}
                          )
    model_result = pd.concat([model_result, result])
    # compute_recall_companies(dataset_test, y_pred, type_of_activity, model)

    draw_confusion_matrix(y_test, y_pred, model)
    draw_roc_curve(X_test, y_pred_prob, model)
    draw_features_importance(best_pipeline, 'randomF_cv', randomF=True)
    draw_prob_distribution(y_pred_prob, model)
    draw_prob_distribution_companies(y_pred_prob, model)
    draw_calibration_curve(X_test, y_pred_prob, model)
    save_model_s3('randomF_cv', type_of_activity, model, grid_search)
    return model_result


def pipeline_naiveBayes_benchmark(X_train, y_train, X_test, y_test, model_result):
    unique_classes, counts = np.unique(y_train, return_counts=True)
    class_priors = counts / counts.sum()
    pipeline = Pipeline(steps=[
        ('preprocessor', preproc),
        ('Naive_Bayes', GaussianNB(priors=class_priors))
    ])
    pipeline.fit(X_train, y_train)

    y_pred = pipeline.predict(X_test)
    y_pred_prob = pipeline.predict_proba(X_test)[:, 1]

    fpr, tpr, thresholds = roc_curve(y_test, y_pred_prob, pos_label = 1)
    model = "Naive_Bayes"
    result = pd.DataFrame({"Model" : [model],
                           "Accuracy" : [accuracy_score(y_test, y_pred)],
                           "Recall" : [recall_score(y_test, y_pred)],
                           "F1_score" : [f1_score(y_test, y_pred, average="macro")],
                           "AUC" : [auc(fpr, tpr)]}
                          )
    model_result = pd.concat([model_result, result])
    draw_confusion_matrix(y_test, y_pred, model)
    draw_roc_curve(X_test, y_pred_prob, model)
    draw_prob_distribution(y_pred_prob, model)
    draw_calibration_curve(X_test, y_pred_prob, model)
    save_model_s3('Naive_Bayes_Benchmark', type_of_activity, model, pipeline)
    return model_result
27  utils_segmentation.py  Normal file
@@ -0,0 +1,27 @@
import pandas as pd
import numpy as np
import os
import io
import s3fs
import re
import pickle
import warnings


def load_model(type_of_activity, model):
    BUCKET = f"projet-bdc2324-team1/Output_model/{type_of_activity}/{model}/"
    filename = model + '.pkl'
    file_path = BUCKET + filename
    with fs.open(file_path, mode="rb") as f:
        model_bytes = f.read()

    model = pickle.loads(model_bytes)
    return model


def load_test_file(type_of_activity):
    file_path_test = f"projet-bdc2324-team1/Generalization/{type_of_activity}/Test_set.csv"
    with fs.open(file_path_test, mode="rb") as file_in:
        dataset_test = pd.read_csv(file_in, sep=",")
    return dataset_test
358  utils_stat_desc.py  Normal file
@@ -0,0 +1,358 @@
|
||||||
|
import pandas as pd
|
||||||
|
import os
|
||||||
|
import s3fs
|
||||||
|
import io
|
||||||
|
import warnings
|
||||||
|
from datetime import date, timedelta, datetime
|
||||||
|
import numpy as np
|
||||||
|
import matplotlib.pyplot as plt
|
||||||
|
import matplotlib.dates as mdates
|
||||||
|
import seaborn as sns
|
||||||
|
|
||||||
|
|
||||||
|
def load_files(nb_compagnie):
|
||||||
|
customer = pd.DataFrame()
|
||||||
|
campaigns_brut = pd.DataFrame()
|
||||||
|
campaigns_kpi = pd.DataFrame()
|
||||||
|
products = pd.DataFrame()
|
||||||
|
tickets = pd.DataFrame()
|
||||||
|
|
||||||
|
# début de la boucle permettant de générer des datasets agrégés pour les 5 compagnies de spectacle
|
||||||
|
for directory_path in nb_compagnie:
|
||||||
|
df_customerplus_clean_0 = display_input_databases(directory_path, file_name = "customerplus_cleaned")
|
||||||
|
df_campaigns_brut = display_input_databases(directory_path, file_name = "campaigns_information", datetime_col = ['opened_at', 'sent_at', 'campaign_sent_at'])
|
||||||
|
df_products_purchased_reduced = display_input_databases(directory_path, file_name = "products_purchased_reduced", datetime_col = ['purchase_date'])
|
||||||
|
df_target_information = display_input_databases(directory_path, file_name = "target_information")
|
||||||
|
|
||||||
|
df_campaigns_kpi = campaigns_kpi_function(campaigns_information = df_campaigns_brut, max_date=pd.Timestamp.now(tz='UTC'))
|
||||||
|
df_tickets_kpi = tickets_kpi_function(tickets_information = df_products_purchased_reduced)
|
||||||
|
df_customerplus_clean = customerplus_kpi_function(customerplus_clean = df_customerplus_clean_0)
|
||||||
|
|
||||||
|
|
||||||
|
# creation de la colonne Number compagnie, qui permettra d'agréger les résultats
|
||||||
|
df_tickets_kpi["number_company"]=int(directory_path)
|
||||||
|
df_campaigns_brut["number_company"]=int(directory_path)
|
||||||
|
df_campaigns_kpi["number_company"]=int(directory_path)
|
||||||
|
df_customerplus_clean["number_company"]=int(directory_path)
|
||||||
|
df_target_information["number_company"]=int(directory_path)
|
||||||
|
|
||||||
|
# Traitement des index
|
||||||
|
df_tickets_kpi["customer_id"]= directory_path + '_' + df_tickets_kpi['customer_id'].astype('str')
|
||||||
|
df_campaigns_brut["customer_id"]= directory_path + '_' + df_campaigns_brut['customer_id'].astype('str')
|
||||||
|
df_campaigns_kpi["customer_id"]= directory_path + '_' + df_campaigns_kpi['customer_id'].astype('str')
|
||||||
|
df_customerplus_clean["customer_id"]= directory_path + '_' + df_customerplus_clean['customer_id'].astype('str')
|
||||||
|
df_products_purchased_reduced["customer_id"]= directory_path + '_' + df_products_purchased_reduced['customer_id'].astype('str')
|
||||||
|
|
||||||
|
# Concaténation
|
||||||
|
customer = pd.concat([customer, df_customerplus_clean], ignore_index=True)
|
||||||
|
campaigns_kpi = pd.concat([campaigns_kpi, df_campaigns_kpi], ignore_index=True)
|
||||||
|
campaigns_brut = pd.concat([campaigns_brut, df_campaigns_brut], ignore_index=True)
|
||||||
|
tickets = pd.concat([tickets, df_tickets_kpi], ignore_index=True)
|
||||||
|
products = pd.concat([products, df_products_purchased_reduced], ignore_index=True)
|
||||||
|
|
||||||
|
return customer, campaigns_kpi, campaigns_brut, tickets, products
|
||||||
|
|
||||||
|
|
||||||
|
def save_file_s3(File_name, type_of_activity):
|
||||||
|
image_buffer = io.BytesIO()
|
||||||
|
plt.savefig(image_buffer, format='png')
|
||||||
|
image_buffer.seek(0)
|
||||||
|
FILE_PATH = f"projet-bdc2324-team1/stat_desc/{type_of_activity}/"
|
||||||
|
FILE_PATH_OUT_S3 = FILE_PATH + File_name + type_of_activity + '.png'
|
||||||
|
with fs.open(FILE_PATH_OUT_S3, 'wb') as s3_file:
|
||||||
|
s3_file.write(image_buffer.read())
|
||||||
|
plt.close()
|
||||||
|
|
||||||
|
|
||||||
|
def outlier_detection(tickets, company_list, show_diagram=False):
|
||||||
|
|
||||||
|
outlier_list = list()
|
||||||
|
|
||||||
|
for company in company_list:
|
||||||
|
total_amount_share = tickets[tickets['number_company']==int(company)].groupby('customer_id')['total_amount'].sum().reset_index()
|
||||||
|
total_amount_share['CA'] = total_amount_share['total_amount'].sum()
|
||||||
|
total_amount_share['share_total_amount'] = total_amount_share['total_amount']/total_amount_share['CA']
|
||||||
|
|
||||||
|
total_amount_share_index = total_amount_share.set_index('customer_id')
|
||||||
|
df_circulaire = total_amount_share_index['total_amount'].sort_values(axis = 0, ascending = False)
|
||||||
|
#print('df circulaire : ', df_circulaire.head())
|
||||||
|
top = df_circulaire[:1]
|
||||||
|
#print('top : ', top)
|
||||||
|
outlier_list.append(top.index[0])
|
||||||
|
rest = df_circulaire[1:]
|
||||||
|
|
||||||
|
rest_sum = rest.sum()
|
||||||
|
|
||||||
|
new_series = pd.concat([top, pd.Series([rest_sum], index=['Autre'])])
|
||||||
|
|
||||||
|
if show_diagram:
|
||||||
|
plt.figure(figsize=(3, 3))
|
||||||
|
plt.pie(new_series, labels=new_series.index, autopct='%1.1f%%', startangle=140, pctdistance=0.5)
|
||||||
|
plt.axis('equal')
|
||||||
|
plt.title(f'Répartition des montants totaux pour la compagnie {company}')
|
||||||
|
plt.show()
|
||||||
|
return outlier_list
|
||||||
|
|
||||||
|
|
||||||
|
def valid_customer_detection(products, campaigns_brut):
|
||||||
|
products_valid = products[products['purchase_date']>="2021-05-01"]
|
||||||
|
consumer_valid_product = products_valid['customer_id'].to_list()
|
||||||
|
|
||||||
|
campaigns_valid = campaigns_brut[campaigns_brut["sent_at"]>="2021-05-01"]
|
||||||
|
consumer_valid_campaigns = campaigns_valid['customer_id'].to_list()
|
||||||
|
|
||||||
|
consumer_valid = consumer_valid_product + consumer_valid_campaigns
|
||||||
|
return consumer_valid
|
||||||
|
|
||||||
|
|
||||||
|
def identify_purchase_during_target_periode(products):
|
||||||
|
products_target_period = products[(products['purchase_date']>="2022-11-01")
|
||||||
|
& (products['purchase_date']<="2023-11-01")]
|
||||||
|
customer_target_period = products_target_period['customer_id'].to_list()
|
||||||
|
return customer_target_period
|
||||||
|
|
||||||
|
|
||||||
|
def remove_elements(lst, elements_to_remove):
    # Character-wise filter: keeps every character of `lst` that is not listed in
    # `elements_to_remove`
    return ''.join([x for x in lst if x not in elements_to_remove])


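# Illustration of the character-wise behaviour with made-up values, e.g.
#   remove_elements('1_42', ['4'])     -> '1_2'
#   remove_elements('1_42', ['1_42'])  -> '1_42'  (no single character equals '1_42')

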
def compute_nb_clients(customer, type_of_activity):
    company_nb_clients = customer[customer["purchase_count"] > 0].groupby("number_company")["customer_id"].count().reset_index()
    plt.bar(company_nb_clients["number_company"], company_nb_clients["customer_id"]/1000)

    plt.xlabel('Company')
    plt.ylabel("Number of clients (thousands)")
    plt.title(f"Number of clients for {type_of_activity}")
    plt.xticks(company_nb_clients["number_company"], ["{}".format(i) for i in company_nb_clients["number_company"]])
    plt.show()
    save_file_s3("nb_clients_", type_of_activity)


def maximum_price_paid(customer, type_of_activity):
    company_max_price = customer.groupby("number_company")["max_price"].max().reset_index()
    plt.bar(company_max_price["number_company"], company_max_price["max_price"])

    plt.xlabel('Company')
    plt.ylabel("Maximal price of a ticket")
    plt.title(f"Maximal price of a ticket for {type_of_activity}")
    plt.xticks(company_max_price["number_company"], ["{}".format(i) for i in company_max_price["number_company"]])
    plt.show()
    save_file_s3("Maximal_price_", type_of_activity)


def mailing_consent(customer, type_of_activity):
    mailing_consent = customer.groupby("number_company")["opt_in"].mean().reset_index()

    plt.bar(mailing_consent["number_company"], mailing_consent["opt_in"])

    plt.xlabel('Company')
    plt.ylabel('Consent')
    plt.title(f'Mailing consent for {type_of_activity}')
    plt.xticks(mailing_consent["number_company"], ["{}".format(i) for i in mailing_consent["number_company"]])
    plt.show()
    save_file_s3("mailing_consent_", type_of_activity)


def mailing_consent_by_target(customer):
    df_graph = customer.groupby(["number_company", "has_purchased_target_period"])["opt_in"].mean().reset_index()

    # Grouped barplot
    fig, ax = plt.subplots(figsize=(10, 6))

    categories = df_graph["number_company"].unique()
    bar_width = 0.35
    bar_positions = np.arange(len(categories))

    # Group the data by label and draw the grouped bars
    for label in df_graph["has_purchased_target_period"].unique():
        label_data = df_graph[df_graph['has_purchased_target_period'] == label]
        values = [label_data[label_data['number_company'] == category]['opt_in'].values[0]*100 for category in categories]

        label_printed = "purchased" if label else "no purchase"
        ax.bar(bar_positions, values, bar_width, label=label_printed)

        # Shift the bar positions for the next group
        bar_positions = [pos + bar_width for pos in bar_positions]

    # Labels, legend, etc.
    # `type_of_activity` is expected to be defined in the calling scope
    ax.set_xlabel('Company')
    ax.set_ylabel('Consent')
    ax.set_title(f'Mailing consent by target for {type_of_activity}')
    ax.set_xticks([pos + bar_width / 2 for pos in np.arange(len(categories))])
    ax.set_xticklabels(categories)
    ax.legend()

    # Display the plot
    plt.show()
    save_file_s3("mailing_consent_target_", type_of_activity)


def gender_bar(customer, type_of_activity):
    company_genders = customer.groupby("number_company")[["gender_male", "gender_female", "gender_other"]].mean().reset_index()

    # Stacked barplot
    plt.bar(company_genders["number_company"], company_genders["gender_male"], label="Male")
    plt.bar(company_genders["number_company"], company_genders["gender_female"],
            bottom=company_genders["gender_male"], label="Female")
    plt.bar(company_genders["number_company"], company_genders["gender_other"],
            bottom=company_genders["gender_male"] + company_genders["gender_female"], label="Unknown")

    plt.xlabel('Company')
    plt.ylabel("Gender")
    plt.title(f"Gender of Customer for {type_of_activity}")
    plt.legend()
    plt.xticks(company_genders["number_company"], ["{}".format(i) for i in company_genders["number_company"]])
    plt.show()
    save_file_s3("gender_bar_", type_of_activity)


def country_bar(customer, type_of_activity):
    company_country_fr = customer.groupby("number_company")["country_fr"].mean().reset_index()
    plt.bar(company_country_fr["number_company"], company_country_fr["country_fr"])

    plt.xlabel('Company')
    plt.ylabel("Share of French Customer")
    plt.title(f"Share of French Customer for {type_of_activity}")
    plt.xticks(company_country_fr["number_company"], ["{}".format(i) for i in company_country_fr["number_company"]])
    plt.show()
    save_file_s3("country_bar_", type_of_activity)


def lazy_customer_plot(campaigns_kpi, type_of_activity):
    company_lazy_customers = campaigns_kpi.groupby("number_company")["nb_campaigns_opened"].mean().reset_index()
    plt.bar(company_lazy_customers["number_company"], company_lazy_customers["nb_campaigns_opened"])

    plt.xlabel('Company')
    plt.ylabel("Average number of campaigns opened")
    plt.title(f"Average number of campaigns opened for {type_of_activity}")
    plt.xticks(company_lazy_customers["number_company"], ["{}".format(i) for i in company_lazy_customers["number_company"]])
    plt.show()
    save_file_s3("lazy_customer_", type_of_activity)


def campaigns_effectiveness(customer, type_of_activity):

    campaigns_effectiveness = customer.groupby(["number_company", "has_purchased_target_period"])["opt_in"].mean().reset_index()

    fig, ax = plt.subplots(figsize=(10, 6))

    categories = campaigns_effectiveness["number_company"].unique()
    bar_width = 0.35
    bar_positions = np.arange(len(categories))

    # Group the data by label and draw the grouped bars
    for label in campaigns_effectiveness["has_purchased_target_period"].unique():
        label_data = campaigns_effectiveness[campaigns_effectiveness['has_purchased_target_period'] == label]
        values = [label_data[label_data['number_company'] == category]['opt_in'].values[0]*100 for category in categories]

        label_printed = "purchased" if label else "no purchase"
        ax.bar(bar_positions, values, bar_width, label=label_printed)

        # Shift the bar positions for the next group
        bar_positions = [pos + bar_width for pos in bar_positions]

    # Labels, legend, etc.
    ax.set_xlabel('Company')
    ax.set_ylabel('Consent')
    ax.set_title(f"Share of customers who consented to receive mails for {type_of_activity}, depending on target")
    ax.set_xticks([pos + bar_width / 2 for pos in np.arange(len(categories))])
    ax.set_xticklabels(categories)
    ax.legend()
    plt.show()
    save_file_s3("campaigns_effectiveness_", type_of_activity)


def sale_dynamics(products, campaigns_brut, type_of_activity):
    purchase_min = products.groupby(['customer_id'])['purchase_date'].min().reset_index()
    purchase_min.rename(columns={'purchase_date': 'first_purchase_event'}, inplace=True)
    purchase_min['first_purchase_event'] = pd.to_datetime(purchase_min['first_purchase_event'])
    purchase_min['first_purchase_month'] = pd.to_datetime(purchase_min['first_purchase_event'].dt.strftime('%Y-%m'))

    # Month of the first mail received
    first_mail_received = campaigns_brut.groupby('customer_id')['sent_at'].min().reset_index()
    first_mail_received.rename(columns={'sent_at': 'first_email_reception'}, inplace=True)
    first_mail_received['first_email_reception'] = pd.to_datetime(first_mail_received['first_email_reception'])
    first_mail_received['first_email_month'] = pd.to_datetime(first_mail_received['first_email_reception'].dt.strftime('%Y-%m'))

    # Merge
    known_customer = pd.merge(purchase_min[['customer_id', 'first_purchase_month']],
                              first_mail_received[['customer_id', 'first_email_month']], on='customer_id', how='outer')

    # Month from which the customer is considered known
    known_customer['known_date'] = pd.to_datetime(known_customer[['first_email_month', 'first_purchase_month']].min(axis=1), utc=True, format='ISO8601')

    # Number of orders per month
    purchases_count = pd.merge(products[['customer_id', 'purchase_id', 'purchase_date']].drop_duplicates(), known_customer[['customer_id', 'known_date']], on=['customer_id'], how='inner')
    purchases_count['is_customer_known'] = purchases_count['purchase_date'] > purchases_count['known_date'] + pd.DateOffset(months=1)
    purchases_count['purchase_date_month'] = pd.to_datetime(purchases_count['purchase_date'].dt.strftime('%Y-%m'))
    purchases_count = purchases_count[purchases_count['customer_id'] != 1]

    # Number of orders per month and per type of customer
    nb_purchases_graph = purchases_count.groupby(['purchase_date_month', 'is_customer_known'])['purchase_id'].count().reset_index()
    nb_purchases_graph.rename(columns={'purchase_id': 'nb_purchases'}, inplace=True)

    nb_purchases_graph_2 = purchases_count.groupby(['purchase_date_month', 'is_customer_known'])['customer_id'].nunique().reset_index()
    nb_purchases_graph_2.rename(columns={'customer_id': 'nb_new_customer'}, inplace=True)

    # Graph of the number of orders
    purchases_graph = nb_purchases_graph

    purchases_graph_used = purchases_graph[purchases_graph["purchase_date_month"] >= datetime(2021, 3, 1)]
    purchases_graph_used_0 = purchases_graph_used[purchases_graph_used["is_customer_known"] == False]
    purchases_graph_used_1 = purchases_graph_used[purchases_graph_used["is_customer_known"] == True]

    merged_data = pd.merge(purchases_graph_used_0, purchases_graph_used_1, on="purchase_date_month", suffixes=("_new", "_old"))

    plt.bar(merged_data["purchase_date_month"], merged_data["nb_purchases_new"], width=12, label="New customers")
    plt.bar(merged_data["purchase_date_month"], merged_data["nb_purchases_old"],
            bottom=merged_data["nb_purchases_new"], width=12, label="Existing customers")

    # Format the x-axis to show only month and year
    plt.gca().xaxis.set_major_formatter(mdates.DateFormatter('%b%y'))

    plt.xlabel('Month')
    plt.ylabel("Number of Sales")
    plt.title(f"Number of Sales for {type_of_activity}")
    plt.legend()
    plt.show()
    save_file_s3("sale_dynamics_", type_of_activity)


def tickets_internet(tickets, type_of_activity):
    nb_tickets_internet = tickets.groupby("number_company")['prop_purchases_internet'].mean().reset_index()

    plt.bar(nb_tickets_internet["number_company"], nb_tickets_internet["prop_purchases_internet"])

    plt.xlabel('Company')
    plt.ylabel("Share of Purchases Bought Online")
    plt.title(f"Share of Purchases Bought Online for {type_of_activity}")
    plt.xticks(nb_tickets_internet["number_company"], ["{}".format(i) for i in nb_tickets_internet["number_company"]])
    plt.show()
    save_file_s3("tickets_internet_", type_of_activity)


def already_bought_online(tickets, type_of_activity):
    nb_consumers_online = (tickets.groupby("number_company").agg({'achat_internet': 'sum',
                                                                  'customer_id': 'nunique'}).reset_index())
    nb_consumers_online["Share_consumers_internet"] = nb_consumers_online["achat_internet"] / nb_consumers_online["customer_id"]

    plt.bar(nb_consumers_online["number_company"], nb_consumers_online["Share_consumers_internet"])

    plt.xlabel('Company')
    plt.ylabel("Share of Customer who Bought Online at least once")
    plt.title(f"Share of Customer who Bought Online at least once for {type_of_activity}")
    plt.xticks(nb_consumers_online["number_company"], ["{}".format(i) for i in nb_consumers_online["number_company"]])
    plt.show()
    save_file_s3("First_buy_internet_", type_of_activity)


def box_plot_price_tickets(tickets, type_of_activity):
    price_tickets = tickets[(tickets['total_amount'] > 0)]
    sns.boxplot(data=price_tickets, y="total_amount", x="number_company", showfliers=False, showmeans=True)
    plt.title(f"Box plot of price tickets for {type_of_activity}")
    plt.show()
    save_file_s3("box_plot_price_tickets_", type_of_activity)