From df4c28bdd8214129608f94c1da97b4e57ec0874f Mon Sep 17 00:00:00 2001 From: arevelle-ensae Date: Thu, 4 Apr 2024 08:39:43 +0000 Subject: [PATCH] add function description --- utils_ml.py | 17 ++++++++++++++++- utils_stat_desc.py | 24 +++++++++++++++++++----- 2 files changed, 35 insertions(+), 6 deletions(-) diff --git a/utils_ml.py b/utils_ml.py index 4dbdbc9..b276600 100644 --- a/utils_ml.py +++ b/utils_ml.py @@ -49,6 +49,9 @@ def load_train_test(type_of_activity, type_of_model): def save_file_s3(File_name, type_of_activity, type_of_model, model): + """ + save plot into s3 storage + """ image_buffer = io.BytesIO() plt.savefig(image_buffer, format='png') image_buffer.seek(0) @@ -60,6 +63,9 @@ def save_file_s3(File_name, type_of_activity, type_of_model, model): def save_result_set_s3(result_set, File_name, type_of_activity, type_of_model, model=None, model_path=False): + """ + save result into s3 storage + """ if model_path: FILE_PATH_OUT_S3 = f"projet-bdc2324-team1/2_Output/2_1_Modeling_results/{type_of_model}/{type_of_activity}/{model}/" + File_name + '.csv' else: @@ -69,6 +75,9 @@ def save_result_set_s3(result_set, File_name, type_of_activity, type_of_model, m def save_model_s3(File_name, type_of_activity, type_of_model, model, classifier): + """ + save model into pickle file + """ model_bytes = pickle.dumps(classifier) FILE_PATH_OUT_S3 = f"projet-bdc2324-team1/2_Output/2_1_Modeling_results/{type_of_model}/{type_of_activity}/{model}/" + File_name + '.pkl' with fs.open(FILE_PATH_OUT_S3, 'wb') as f: @@ -88,6 +97,9 @@ def compute_recall_companies(dataset_test, y_pred, type_of_activity, model): def features_target_split(dataset_train, dataset_test): + """ + return train and test set + """ features_l = ['nb_campaigns', 'taux_ouverture_mail', 'prop_purchases_internet', 'nb_tickets', 'nb_purchases', 'total_amount', 'nb_suppliers', 'time_to_open', 'purchases_10_2021','purchases_10_2022', 'purchases_11_2021', 'purchases_12_2021','purchases_1_2022', 'purchases_2_2022', 
'purchases_3_2022', 'purchases_4_2022', 'purchases_5_2021', 'purchases_5_2022', 'purchases_6_2021', 'purchases_6_2022', 'purchases_7_2021', 'purchases_7_2022', 'purchases_8_2021', @@ -105,7 +117,10 @@ def features_target_split(dataset_train, dataset_test): def preprocess(type_of_model, type_of_activity): - + """ + preprocess variables before running machine learning pipeline + """ + numeric_features = ['nb_campaigns', 'taux_ouverture_mail', 'prop_purchases_internet', 'nb_tickets', 'nb_purchases', 'total_amount', 'nb_suppliers', 'purchases_10_2021','purchases_10_2022', 'purchases_11_2021', 'purchases_12_2021','purchases_1_2022', 'purchases_2_2022', 'purchases_3_2022', 'purchases_4_2022', 'purchases_5_2021', 'purchases_5_2022', 'purchases_6_2021', 'purchases_6_2022', 'purchases_7_2021', 'purchases_7_2022', 'purchases_8_2021', diff --git a/utils_stat_desc.py b/utils_stat_desc.py index 2bd64c0..90bfcf8 100644 --- a/utils_stat_desc.py +++ b/utils_stat_desc.py @@ -1,6 +1,10 @@ def load_files(nb_compagnie): + """ + load and preprocess dataframes + """ + customer = pd.DataFrame() campaigns_brut = pd.DataFrame() campaigns_kpi = pd.DataFrame() @@ -8,7 +12,6 @@ def load_files(nb_compagnie): tickets = pd.DataFrame() targets = pd.DataFrame() - # début de la boucle permettant de générer des datasets agrégés pour les 5 compagnies de spectacle for directory_path in nb_compagnie: df_customerplus_clean_0 = display_input_databases(directory_path, file_name = "customerplus_cleaned") df_campaigns_brut = display_input_databases(directory_path, file_name = "campaigns_information", datetime_col = ['opened_at', 'sent_at', 'campaign_sent_at']) @@ -26,7 +29,7 @@ def load_files(nb_compagnie): targets_columns.remove('customer_id') df_target_KPI[targets_columns] = df_target_KPI[targets_columns].fillna(0) - # creation de la colonne Number compagnie, qui permettra d'agréger les résultats + # Create company identifier df_tickets_kpi["number_company"]=int(directory_path) 
df_campaigns_brut["number_company"]=int(directory_path) df_campaigns_kpi["number_company"]=int(directory_path) @@ -34,7 +37,7 @@ def load_files(nb_compagnie): df_target_information["number_company"]=int(directory_path) df_target_KPI["number_company"]=int(directory_path) - # Traitement des index + # Clean index df_tickets_kpi["customer_id"]= directory_path + '_' + df_tickets_kpi['customer_id'].astype('str') df_campaigns_brut["customer_id"]= directory_path + '_' + df_campaigns_brut['customer_id'].astype('str') df_campaigns_kpi["customer_id"]= directory_path + '_' + df_campaigns_kpi['customer_id'].astype('str') @@ -51,7 +54,7 @@ def load_files(nb_compagnie): df_target_KPI["customer_id"]= directory_path + '_' + df_target_KPI['customer_id'].astype('str') - # Concaténation + # Concatenation customer = pd.concat([customer, df_customerplus_clean], ignore_index=True) campaigns_kpi = pd.concat([campaigns_kpi, df_campaigns_kpi], ignore_index=True) campaigns_brut = pd.concat([campaigns_brut, df_campaigns_brut], ignore_index=True) @@ -73,6 +76,9 @@ def remove_outlier_total_amount(tickets): def save_file_s3(File_name, type_of_activity): + """ + save plots into s3 storage + """ image_buffer = io.BytesIO() plt.savefig(image_buffer, format='png', pad_inches=1, bbox_inches="tight", dpi = 150) image_buffer.seek(0) @@ -84,7 +90,9 @@ def save_file_s3(File_name, type_of_activity): def outlier_detection(tickets, company_list, show_diagram=False): - + """ + detect anonymous customers + """ outlier_list = list() for company in company_list: @@ -114,6 +122,9 @@ def outlier_detection(tickets, company_list, show_diagram=False): def valid_customer_detection(products, campaigns_brut): + """ + identify customers that are in our time perimeter + """ products_valid = products[products['purchase_date']>="2021-05-01"] consumer_valid_product = products_valid['customer_id'].to_list() @@ -125,6 +136,9 @@ def valid_customer_detection(products, campaigns_brut): def
identify_purchase_during_target_periode(products): + """ + identify customers who purchased tickets during the target period + """ products_target_period = products[(products['purchase_date']>="2022-11-01") & (products['purchase_date']<="2023-11-01")] customer_target_period = products_target_period['customer_id'].to_list()