From 9763dfe7f9b34eaff0c75f8d7ed72f6960bdd6cf Mon Sep 17 00:00:00 2001 From: arevelle-ensae Date: Thu, 21 Mar 2024 07:10:10 +0000 Subject: [PATCH 1/7] add result by companies --- 0_5_Machine_Learning.py | 4 ++-- utils_ml.py | 14 ++++++++------ 2 files changed, 10 insertions(+), 8 deletions(-) diff --git a/0_5_Machine_Learning.py b/0_5_Machine_Learning.py index 6c9ca62..f6c162b 100644 --- a/0_5_Machine_Learning.py +++ b/0_5_Machine_Learning.py @@ -89,7 +89,7 @@ print("Naive Bayes : Done") # Logistic Regression model_result = pipeline_logreg_benchmark(X_train, y_train, X_test, y_test, model_result) print("Logistic : Done") -""" + model_result = pipeline_logreg_cv(X_train, y_train, X_test, y_test, model_result) print("Logistic CV : Done") @@ -98,6 +98,6 @@ model_result = pipeline_randomF_benchmark(X_train, y_train, X_test, y_test, mode print("Random Forest : Done") model_result = pipeline_randomF_cv(X_train, y_train, X_test, y_test, model_result) print("Random Forest CV: Done") -""" + # Save result save_result_set_s3(model_result , "resultat", type_of_activity) \ No newline at end of file diff --git a/utils_ml.py b/utils_ml.py index e801964..17ac85a 100644 --- a/utils_ml.py +++ b/utils_ml.py @@ -78,7 +78,7 @@ def compute_recall_companies(dataset_test, y_pred, type_of_activity, model): test = dataset_test.copy() test['prediction'] = y_pred test['company'] = dataset_test['customer_id'].str.split('_', expand=True)[0] - recall_scores_by_company = dataset_test.groupby('company').apply(compute_recall).reset_index(name='recall_score') + recall_scores_by_company = test.groupby('company').apply(compute_recall).reset_index(name='recall_score') save_result_set_s3(recall_scores_by_company, 'recall_scores_by_company', type_of_activity, model=model, model_path=True) @@ -207,7 +207,7 @@ def pipeline_logreg_benchmark(X_train, y_train, X_test, y_test, model_result): "AUC" : [auc(fpr, tpr)]} ) model_result = pd.concat([model_result, result]) - #compute_recall_companies(dataset_test, y_pred, type_of_activity, model) + compute_recall_companies(dataset_test, y_pred, type_of_activity, model) draw_confusion_matrix(y_test, y_pred, model) draw_roc_curve(X_test, y_pred_prob, model) @@ -244,7 +244,7 @@ def pipeline_logreg_cv(X_train, y_train, X_test, y_test, model_result): "AUC" : [auc(fpr, tpr)]} ) model_result = pd.concat([model_result, result]) - #compute_recall_companies(dataset_test, y_pred, type_of_activity, model) + compute_recall_companies(dataset_test, y_pred, type_of_activity, model) draw_confusion_matrix(y_test, y_pred, model) draw_roc_curve(X_test, y_pred_prob, model) @@ -276,7 +276,7 @@ def pipeline_randomF_benchmark(X_train, y_train, X_test, y_test, model_result): "AUC" : [auc(fpr, tpr)]} ) model_result = pd.concat([model_result, result]) - #compute_recall_companies(dataset_test, y_pred, type_of_activity, model) + compute_recall_companies(dataset_test, y_pred, type_of_activity, model) draw_confusion_matrix(y_test, y_pred, model) draw_roc_curve(X_test, y_pred_prob, model) @@ -317,7 +317,7 @@ def pipeline_randomF_cv(X_train, y_train, X_test, y_test, model_result): "AUC" : [auc(fpr, tpr)]} ) model_result = pd.concat([model_result, result]) - #compute_recall_companies(dataset_test, y_pred, type_of_activity, model) + compute_recall_companies(dataset_test, y_pred, type_of_activity, model) draw_confusion_matrix(y_test, y_pred, model) draw_roc_curve(X_test, y_pred_prob, model) @@ -325,7 +325,7 @@ def pipeline_randomF_cv(X_train, y_train, X_test, y_test, model_result): draw_prob_distribution(y_pred_prob, model) draw_prob_distribution_companies(y_pred_prob, model) draw_calibration_curve(X_test, y_pred_prob, model) - save_model_s3('randomF_cv', type_of_activity, model, gridsearch) + save_model_s3('randomF_cv', type_of_activity, model, grid_search) return model_result @@ -350,6 +350,8 @@ def pipeline_naiveBayes_benchmark(X_train, y_train, X_test, y_test, model_result "AUC" : [auc(fpr, tpr)]} ) model_result = pd.concat([model_result, result]) + compute_recall_companies(dataset_test, y_pred, type_of_activity, model) + draw_confusion_matrix(y_test, y_pred, model) draw_roc_curve(X_test, y_pred_prob, model) draw_prob_distribution(y_pred_prob, model) From 089a8fd3d6b8d5f641cb988cec87acf4e7296a10 Mon Sep 17 00:00:00 2001 From: arevelle-ensae Date: Thu, 21 Mar 2024 08:16:29 +0000 Subject: [PATCH 2/7] fix labels --- utils_stat_desc.py | 88 ++++++++++++++++++++++++---------------------- 1 file changed, 46 insertions(+), 42 deletions(-) diff --git a/utils_stat_desc.py b/utils_stat_desc.py index 7eedd9c..6372c63 100644 --- a/utils_stat_desc.py +++ b/utils_stat_desc.py @@ -122,7 +122,7 @@ def compute_nb_clients(customer, type_of_activity): plt.xlabel('Company') plt.ylabel("Number of clients (thousands)") - plt.title(f"Number of clients for {type_of_activity}") + plt.title(f"Number of clients Across {type_of_activity} Companies") plt.xticks(company_nb_clients["number_company"], ["{}".format(i) for i in company_nb_clients["number_company"]]) plt.show() save_file_s3("nb_clients_", type_of_activity) @@ -132,9 +132,9 @@ def maximum_price_paid(customer, type_of_activity): company_max_price = customer.groupby("number_company")["max_price"].max().reset_index() plt.bar(company_max_price["number_company"], company_max_price["max_price"]) - plt.xlabel('Company') + plt.xlabel('Company Number') plt.ylabel("Maximal price of a ticket Prix") - plt.title(f"Maximal price of a ticket for {type_of_activity}") + plt.title(f"Maximal price of a ticket Across {type_of_activity} Companies") plt.xticks(company_max_price["number_company"], ["{}".format(i) for i in company_max_price["number_company"]]) plt.show() save_file_s3("Maximal_price_", type_of_activity) @@ -142,12 +142,12 @@ def maximum_price_paid(customer, type_of_activity): def mailing_consent(customer, type_of_activity): mailing_consent = customer.groupby("number_company")["opt_in"].mean().reset_index() - + mailing_consent["opt_in"] *= 100 plt.bar(mailing_consent["number_company"], mailing_consent["opt_in"]) - plt.xlabel('Company') - plt.ylabel('Consent') - plt.title(f'Consent of mailing for {type_of_activity}') + plt.xlabel('Company Number') + plt.ylabel('Mailing Consent (%)') + plt.title(f'Consent of mailing Across {type_of_activity} Companies') plt.xticks(mailing_consent["number_company"], ["{}".format(i) for i in mailing_consent["number_company"]]) plt.show() save_file_s3("mailing_consent_", type_of_activity) @@ -167,16 +167,16 @@ def mailing_consent_by_target(customer): label_data = df_graph[df_graph['has_purchased_target_period'] == label] values = [label_data[label_data['number_company'] == category]['opt_in'].values[0]*100 for category in categories] - label_printed = "purchased" if label else "no purchase" + label_printed = "Purchase" if label else "No purchase" ax.bar(bar_positions, values, bar_width, label=label_printed) # Mise à jour des positions des barres pour le prochain groupe bar_positions = [pos + bar_width for pos in bar_positions] # Ajout des étiquettes, de la légende, etc. - ax.set_xlabel('Company') - ax.set_ylabel('Consent') - ax.set_title(f'Consent of mailing according to target for {type_of_activity}') + ax.set_xlabel('Company Number') + ax.set_ylabel('Mailing Consent (%)') + ax.set_title(f'Consent of mailing according to target Across {type_of_activity} Companies') ax.set_xticks([pos + bar_width / 2 for pos in np.arange(len(categories))]) ax.set_xticklabels(categories) ax.legend() @@ -189,16 +189,20 @@ def mailing_consent_by_target(customer): def gender_bar(customer, type_of_activity): company_genders = customer.groupby("number_company")[["gender_male", "gender_female", "gender_other"]].mean().reset_index() - # Création du barplot - plt.bar(company_genders["number_company"], company_genders["gender_male"], label = "Homme") - plt.bar(company_genders["number_company"], company_genders["gender_female"], - bottom = company_genders["gender_male"], label = "Femme") - plt.bar(company_genders["number_company"], company_genders["gender_other"], - bottom = company_genders["gender_male"] + company_genders["gender_female"], label = "Inconnu") + company_genders["gender_male"] *= 100 + company_genders["gender_female"] *= 100 + company_genders["gender_other"] *= 100 - plt.xlabel('Company') - plt.ylabel("Gender") - plt.title(f"Gender of Customer for {type_of_activity}") + # Création du barplot + plt.bar(company_genders["number_company"], company_genders["gender_male"], label = "Male") + plt.bar(company_genders["number_company"], company_genders["gender_female"], + bottom = company_genders["gender_male"], label = "Female") + plt.bar(company_genders["number_company"], company_genders["gender_other"], + bottom = company_genders["gender_male"] + company_genders["gender_female"], label = "Unknown") + + plt.xlabel('Company Number') + plt.ylabel("Frequency (%)") + plt.title(f"Gender Distribution of Customers Across {type_of_activity} Companies") plt.legend() plt.xticks(company_genders["number_company"], ["{}".format(i) for i in company_genders["number_company"]]) plt.show() @@ -207,11 +211,12 @@ def gender_bar(customer, type_of_activity): def country_bar(customer, type_of_activity): company_country_fr = customer.groupby("number_company")["country_fr"].mean().reset_index() + company_country_fr["country_fr"] *= 100 plt.bar(company_country_fr["number_company"], company_country_fr["country_fr"]) - plt.xlabel('Company') - plt.ylabel("Share of French Customer") - plt.title(f"Share of French Customer for {type_of_activity}") + plt.xlabel('Company Number') + plt.ylabel("Share of French Customer (%)") + plt.title(f"Share of French Customer Across {type_of_activity} Companies") plt.xticks(company_country_fr["number_company"], ["{}".format(i) for i in company_country_fr["number_company"]]) plt.show() save_file_s3("country_bar_", type_of_activity) @@ -221,9 +226,8 @@ def lazy_customer_plot(campaigns_kpi, type_of_activity): company_lazy_customers = campaigns_kpi.groupby("number_company")["nb_campaigns_opened"].mean().reset_index() plt.bar(company_lazy_customers["number_company"], company_lazy_customers["nb_campaigns_opened"]) - plt.xlabel('Company') - plt.ylabel("Share of Customers who did not open mail") - plt.title(f"Share of Customers who did not open mail for {type_of_activity}") + plt.xlabel('Company Number') + plt.title(f"Share of Customers who did not Open Mail Across {type_of_activity} Companies") plt.xticks(company_lazy_customers["number_company"], ["{}".format(i) for i in company_lazy_customers["number_company"]]) plt.show() save_file_s3("lazy_customer_", type_of_activity) @@ -244,16 +248,16 @@ def campaigns_effectiveness(customer, type_of_activity): label_data = campaigns_effectiveness[campaigns_effectiveness['has_purchased_target_period'] == label] values = [label_data[label_data['number_company'] == category]['opt_in'].values[0]*100 for category in categories] - label_printed = "purchased" if label else "no purchase" + label_printed = "Purchase" if label else "No purchase" ax.bar(bar_positions, values, bar_width, label=label_printed) # Mise à jour des positions des barres pour le prochain groupe bar_positions = [pos + bar_width for pos in bar_positions] # Ajout des étiquettes, de la légende, etc. - ax.set_xlabel('Company') - ax.set_ylabel('Consent') - ax.set_title(f"Number of Customers who have consent to received mails for {type_of_activity} dependy on target") + ax.set_xlabel('Company Number') + ax.set_ylabel('Share of Consent (%)') + ax.set_title(f"Proportion of customers who have given their consent to receive emails, by customer class ({type_of_activity} companies)") ax.set_xticks([pos + bar_width / 2 for pos in np.arange(len(categories))]) ax.set_xticklabels(categories) ax.legend() @@ -304,9 +308,9 @@ def sale_dynamics(products, campaigns_brut, type_of_activity): merged_data = pd.merge(purchases_graph_used_0, purchases_graph_used_1, on="purchase_date_month", suffixes=("_new", "_old")) - plt.bar(merged_data["purchase_date_month"], merged_data["nb_purchases_new"], width=12, label="Nouveau client") + plt.bar(merged_data["purchase_date_month"], merged_data["nb_purchases_new"], width=12, label="New Customers") plt.bar(merged_data["purchase_date_month"], merged_data["nb_purchases_old"], - bottom=merged_data["nb_purchases_new"], width=12, label="Ancien client") + bottom=merged_data["nb_purchases_new"], width=12, label="Existing Customers") # commande pr afficher slt @@ -314,7 +318,7 @@ def sale_dynamics(products, campaigns_brut, type_of_activity): plt.xlabel('Month') plt.ylabel("Number of Sales") - plt.title(f"Number of Sales for {type_of_activity}") + plt.title(f"Number of Sales Across {type_of_activity} Companies") plt.legend() plt.show() save_file_s3("sale_dynamics_", type_of_activity) @@ -322,12 +326,12 @@ def sale_dynamics(products, campaigns_brut, type_of_activity): def tickets_internet(tickets, type_of_activity): nb_tickets_internet = tickets.groupby("number_company")['prop_purchases_internet'].mean().reset_index() - + nb_tickets_internet['prop_purchases_internet'] *=100 plt.bar(nb_tickets_internet["number_company"], nb_tickets_internet["prop_purchases_internet"]) - plt.xlabel('Company') - plt.ylabel("Share of Purchases Bought Online") - plt.title(f"Share of Purchases Bought Online for {type_of_activity}") + plt.xlabel('Company Number') + plt.ylabel("Share of Purchases Bought Online (%)") + plt.title(f"Share of Online Purchases Across {type_of_activity} Companies") plt.xticks(nb_tickets_internet["number_company"], ["{}".format(i) for i in nb_tickets_internet["number_company"]]) plt.show() save_file_s3("tickets_internet_", type_of_activity) @@ -337,13 +341,13 @@ def already_bought_online(tickets, type_of_activity): nb_consumers_online = (tickets.groupby("number_company").agg({'achat_internet' : 'sum', 'customer_id' : 'nunique'} ).reset_index()) - nb_consumers_online["Share_consumers_internet"] = nb_consumers_online["achat_internet"]/ nb_consumers_online["customer_id"] + nb_consumers_online["Share_consumers_internet"] = (nb_consumers_online["achat_internet"]/ nb_consumers_online["customer_id"])*100 plt.bar(nb_consumers_online["number_company"], nb_consumers_online["Share_consumers_internet"]) - plt.xlabel('Company') - plt.ylabel("Share of Customer who Bought Online at least once") - plt.title(f"Share of Customer who Bought Online at least once for {type_of_activity}") + plt.xlabel('Company Number') + plt.ylabel("Share of Customer who Bought Online at least once (%)") + plt.title(f"Share of Customer who Bought Online at least once Across {type_of_activity} Companies") plt.xticks(nb_consumers_online["number_company"], ["{}".format(i) for i in nb_consumers_online["number_company"]]) plt.show() save_file_s3("First_buy_internet_", type_of_activity) @@ -352,7 +356,7 @@ def already_bought_online(tickets, type_of_activity): def box_plot_price_tickets(tickets, type_of_activity): price_tickets = tickets[(tickets['total_amount'] > 0)] sns.boxplot(data=price_tickets, y="total_amount", x="number_company", showfliers=False, showmeans=True) - plt.title(f"Box plot of price tickets for {type_of_activity}") + plt.title(f"Box plot of price tickets Across {type_of_activity} Companies") plt.show() save_file_s3("box_plot_price_tickets_", type_of_activity) From 52fd738fe512c4c1f25ca81619d59c8c86886a0b Mon Sep 17 00:00:00 2001 From: arevelle-ensae Date: Thu, 21 Mar 2024 10:47:40 +0000 Subject: [PATCH 3/7] fix errors --- 0_4_Generate_stat_desc.py | 4 +++- 0_5_Machine_Learning.py | 8 ++++---- utils_stat_desc.py | 32 +++++++++++++++++++++++++++++++- 3 files changed, 38 insertions(+), 6 deletions(-) diff --git a/0_4_Generate_stat_desc.py b/0_4_Generate_stat_desc.py index c0821e0..160e568 100644 --- a/0_4_Generate_stat_desc.py +++ b/0_4_Generate_stat_desc.py @@ -47,7 +47,9 @@ customer['has_purchased_target_period'] = np.where(customer['customer_id'].isin( # Generate graph and automatically saved them in the bucket compute_nb_clients(customer, type_of_activity) -maximum_price_paid(customer, type_of_activity) +#maximum_price_paid(customer, type_of_activity) + +target_proportion(customer, type_of_activity) mailing_consent(customer, type_of_activity) diff --git a/0_5_Machine_Learning.py b/0_5_Machine_Learning.py index f6c162b..4e43afd 100644 --- a/0_5_Machine_Learning.py +++ b/0_5_Machine_Learning.py @@ -55,16 +55,16 @@ weights = class_weight.compute_class_weight(class_weight = 'balanced', classes = weight_dict = {np.unique(y_train['y_has_purchased'])[i]: weights[i] for i in range(len(np.unique(y_train['y_has_purchased'])))} -numeric_features = ['nb_tickets', 'nb_purchases', 'total_amount', 'nb_suppliers', 'vente_internet_max', 'purchase_date_min', 'purchase_date_max', - 'time_between_purchase', 'nb_tickets_internet', 'is_email_true', 'opt_in', #'is_partner', - 'gender_female', 'gender_male', 'gender_other', 'nb_campaigns', 'nb_campaigns_opened'] +numeric_features = ['nb_tickets', 'nb_purchases', 'total_amount', 'nb_suppliers', 'vente_internet_max', + 'purchase_date_min', 'purchase_date_max', 'time_between_purchase', 'nb_tickets_internet', + 'nb_campaigns', 'nb_campaigns_opened'] numeric_transformer = Pipeline(steps=[ #("imputer", SimpleImputer(strategy="mean")), ("scaler", StandardScaler()) ]) -categorical_features = ['opt_in'] +categorical_features = ['opt_in', 'gender_male', 'gender_female'] # Transformer for the categorical features categorical_transformer = Pipeline(steps=[ diff --git a/utils_stat_desc.py b/utils_stat_desc.py index 6372c63..469569a 100644 --- a/utils_stat_desc.py +++ b/utils_stat_desc.py @@ -42,7 +42,14 @@ def load_files(nb_compagnie): df_campaigns_kpi["customer_id"]= directory_path + '_' + df_campaigns_kpi['customer_id'].astype('str') df_customerplus_clean["customer_id"]= directory_path + '_' + df_customerplus_clean['customer_id'].astype('str') df_products_purchased_reduced["customer_id"]= directory_path + '_' + df_products_purchased_reduced['customer_id'].astype('str') - + + # Remove companies' outliers + df_tickets_kpi = remove_outlier_total_amount(df_tickets_kpi) + # harmonize set of customers across databases + customer_id = df_tickets_kpi['customer_id'].to_list() + for dataset in [df_campaigns_brut, df_campaigns_kpi, df_customerplus_clean, df_target_information]: + dataset = dataset[dataset['customer_id'].isin(customer_id)] + # Concaténation customer = pd.concat([customer, df_customerplus_clean], ignore_index=True) campaigns_kpi = pd.concat([campaigns_kpi, df_campaigns_kpi], ignore_index=True) @@ -53,6 +60,16 @@ def load_files(nb_compagnie): return customer, campaigns_kpi, campaigns_brut, tickets, products +def remove_outlier_total_amount(tickets): + Q1 = tickets['total_amount'].quantile(0.25) + Q3 = tickets['total_amount'].quantile(0.75) + IQR = Q3 - Q1 + upper = Q3 +1.5*IQR + outliers = tickets[tickets['total_amount'] > upper]['customer_id'].to_list() + tickets = tickets[~tickets['customer_id'].isin(outliers)] + return tickets + + def save_file_s3(File_name, type_of_activity): image_buffer = io.BytesIO() plt.savefig(image_buffer, format='png') @@ -140,6 +157,19 @@ def maximum_price_paid(customer, type_of_activity): save_file_s3("Maximal_price_", type_of_activity) +def target_proportion(customer, type_of_activity): + df_y = customer.groupby(["number_company"]).agg({"has_purchased_target_period" : 'sum', + 'customer_id' : 'nunique'}).reset_index() + df_y['prop_has_purchased_target_period'] = (df_y["has_purchased_target_period"]/df_y['customer_id'])*100 + plt.bar(df_y["number_company"], df_y["prop_has_purchased_target_period"]) + plt.xlabel('Company Number') + plt.ylabel('Share (%)') + plt.title(f'Share of Customers who Bought during the Target Period Across {type_of_activity} Companies') + plt.xticks(df_y["number_company"], ["{}".format(i) for i in df_y["number_company"]]) + plt.show() + save_file_s3("share_target_", type_of_activity) + + def mailing_consent(customer, type_of_activity): mailing_consent = customer.groupby("number_company")["opt_in"].mean().reset_index() mailing_consent["opt_in"] *= 100 From 133eb83e840c6258cddba4cbc877423700a24029 Mon Sep 17 00:00:00 2001 From: arevelle-ensae Date: Wed, 27 Mar 2024 14:08:40 +0000 Subject: [PATCH 4/7] add path premium --- 0_5_Machine_Learning.py | 31 +++------------- utils_ml.py | 82 ++++++++++++++++++++++++++++++----------- 2 files changed, 65 insertions(+), 48 deletions(-) diff --git a/0_5_Machine_Learning.py b/0_5_Machine_Learning.py index 4e43afd..1700766 100644 --- a/0_5_Machine_Learning.py +++ b/0_5_Machine_Learning.py @@ -34,13 +34,15 @@ warnings.filterwarnings("ignore", category=DataConversionWarning) # choose the type of companies for which you want to run the pipeline type_of_activity = input('Choisissez le type de compagnie : sport ? musique ? musee ?') +# choose the type of model +type_of_model = input('Choisissez le type de model : basique ? premium ?') # load train and test set # Create filesystem object S3_ENDPOINT_URL = "https://" + os.environ["AWS_S3_ENDPOINT"] fs = s3fs.S3FileSystem(client_kwargs={'endpoint_url': S3_ENDPOINT_URL}) -dataset_train, dataset_test = load_train_test(type_of_activity ) +dataset_train, dataset_test = load_train_test(type_of_activity) X_train, X_test, y_train, y_test = features_target_split(dataset_train, dataset_test) @@ -54,30 +56,7 @@ weights = class_weight.compute_class_weight(class_weight = 'balanced', classes = weight_dict = {np.unique(y_train['y_has_purchased'])[i]: weights[i] for i in range(len(np.unique(y_train['y_has_purchased'])))} - -numeric_features = ['nb_tickets', 'nb_purchases', 'total_amount', 'nb_suppliers', 'vente_internet_max', - 'purchase_date_min', 'purchase_date_max', 'time_between_purchase', 'nb_tickets_internet', - 'nb_campaigns', 'nb_campaigns_opened'] - -numeric_transformer = Pipeline(steps=[ - #("imputer", SimpleImputer(strategy="mean")), - ("scaler", StandardScaler()) -]) - -categorical_features = ['opt_in', 'gender_male', 'gender_female'] - -# Transformer for the categorical features -categorical_transformer = Pipeline(steps=[ - #("imputer", SimpleImputer(strategy="most_frequent")), # Impute missing values with the most frequent - ("onehot", OneHotEncoder(handle_unknown='ignore', sparse_output=False)) -]) - -preproc = ColumnTransformer( - transformers=[ - ("num", numeric_transformer, numeric_features), - ("cat", categorical_transformer, categorical_features) - ] -) +preproc = preprocess(type_of_model) # Object for storing results model_result = pd.DataFrame(columns= ["Model", "Accuracy", "Recall", "F1_score", "AUC"]) @@ -100,4 +79,4 @@ model_result = pipeline_randomF_cv(X_train, y_train, X_test, y_test, model_resul print("Random Forest CV: Done") # Save result -save_result_set_s3(model_result , "resultat", type_of_activity) \ No newline at end of file +save_result_set_s3(model_result , "resultat", type_of_activity, type_of_model) \ No newline at end of file diff --git a/utils_ml.py b/utils_ml.py index 17ac85a..767f7db 100644 --- a/utils_ml.py +++ b/utils_ml.py @@ -43,29 +43,29 @@ def load_train_test(type_of_activity): return dataset_train, dataset_test -def save_file_s3(File_name, type_of_activity, model): +def save_file_s3(File_name, type_of_activity, type_of_model, model): image_buffer = io.BytesIO() plt.savefig(image_buffer, format='png') image_buffer.seek(0) - FILE_PATH = f"projet-bdc2324-team1/Output_model/{type_of_activity}/{model}/" + FILE_PATH = f"projet-bdc2324-team1/{type_of_model}/{type_of_activity}/{model}/" FILE_PATH_OUT_S3 = FILE_PATH + File_name + type_of_activity + '_' + model + '.png' with fs.open(FILE_PATH_OUT_S3, 'wb') as s3_file: s3_file.write(image_buffer.read()) plt.close() -def save_result_set_s3(result_set, File_name, type_of_activity, model=None, model_path=False): +def save_result_set_s3(result_set, File_name, type_of_activity, type_of_model, model=None, model_path=False): if model_path: - FILE_PATH_OUT_S3 = f"projet-bdc2324-team1/Output_model/{type_of_activity}/{model}/" + File_name + '.csv' + FILE_PATH_OUT_S3 = f"projet-bdc2324-team1/{type_of_model}/{type_of_activity}/{model}/" + File_name + '.csv' else: - FILE_PATH_OUT_S3 = f"projet-bdc2324-team1/Output_model/{type_of_activity}/" + File_name + '.csv' + FILE_PATH_OUT_S3 = f"projet-bdc2324-team1/{type_of_model}/{type_of_activity}/" + File_name + '.csv' with fs.open(FILE_PATH_OUT_S3, 'w') as file_out: result_set.to_csv(file_out, index = False) -def save_model_s3(File_name, type_of_activity, model, classifier): +def save_model_s3(File_name, type_of_activity, type_of_model, model, classifier): model_bytes = pickle.dumps(classifier) - FILE_PATH_OUT_S3 = f"projet-bdc2324-team1/Output_model/{type_of_activity}/{model}/" + File_name + '.pkl' + FILE_PATH_OUT_S3 = f"projet-bdc2324-team1/{type_of_model}/{type_of_activity}/{model}/" + File_name + '.pkl' with fs.open(FILE_PATH_OUT_S3, 'wb') as f: f.write(model_bytes) @@ -79,13 +79,13 @@ def compute_recall_companies(dataset_test, y_pred, type_of_activity, model): test['prediction'] = y_pred test['company'] = dataset_test['customer_id'].str.split('_', expand=True)[0] recall_scores_by_company = test.groupby('company').apply(compute_recall).reset_index(name='recall_score') - save_result_set_s3(recall_scores_by_company, 'recall_scores_by_company', type_of_activity, model=model, model_path=True) + save_result_set_s3(recall_scores_by_company, 'recall_scores_by_company', type_of_activity, type_of_model, model=model, model_path=True) def features_target_split(dataset_train, dataset_test): features_l = ['nb_tickets', 'nb_purchases', 'total_amount', 'nb_suppliers', 'vente_internet_max', 'purchase_date_min', 'purchase_date_max', 'time_between_purchase', 'nb_tickets_internet', 'is_email_true', 'opt_in', #'is_partner', - 'gender_female', 'gender_male', 'gender_other', 'nb_campaigns', 'nb_campaigns_opened'] + 'gender_female', 'gender_male', 'gender_other', 'nb_campaigns', 'nb_campaigns_opened', 'country_fr'] X_train = dataset_train[features_l] y_train = dataset_train[['y_has_purchased']] @@ -94,6 +94,44 @@ def features_target_split(dataset_train, dataset_test): return X_train, X_test, y_train, y_test +def preprocess(type_of_model): + if type_of_model=='premium': + numeric_features = ['nb_tickets', 'nb_purchases', 'total_amount', 'nb_suppliers', 'vente_internet_max', + 'purchase_date_min', 'purchase_date_max', 'time_between_purchase', 'nb_tickets_internet', + 'nb_campaigns', 'nb_campaigns_opened'] + + binary_features = ['gender_female', 'gender_male', 'gender_other', 'country_fr'] + categorical_features = ['opt_in'] + + else: + numeric_features = ['nb_tickets', 'nb_purchases', 'total_amount', 'nb_suppliers', 'vente_internet_max', + 'purchase_date_min', 'purchase_date_max', 'time_between_purchase', 'nb_tickets_internet', + 'nb_campaigns', 'nb_campaigns_opened'] + + binary_features = ['gender_female', 'gender_male', 'gender_other', 'country_fr'] + categorical_features = ['opt_in'] + + numeric_transformer = Pipeline(steps=[ + ("scaler", StandardScaler()) + ]) + categorical_features = ['opt_in'] + categorical_transformer = Pipeline(steps=[ + ("onehot", OneHotEncoder(handle_unknown='ignore', sparse_output=False)) + ]) + + binary_transformer = Pipeline(steps=[ + ("imputer", SimpleImputer(strategy="most_frequent")), + ]) + preproc = ColumnTransformer( + transformers=[ + ("num", numeric_transformer, numeric_features), + ("cat", categorical_transformer, categorical_features), + ("bin", binary_transformer, binary_features) + ] + ) + return preproc + + def draw_confusion_matrix(y_test, y_pred, model): conf_matrix = confusion_matrix(y_test, y_pred) sns.heatmap(conf_matrix, annot=True, fmt='d', cmap='Blues', xticklabels=['Class 0', 'Class 1'], yticklabels=['Class 0', 'Class 1']) @@ -101,7 +139,7 @@ def draw_confusion_matrix(y_test, y_pred, model): plt.ylabel('Actual') plt.title('Confusion Matrix') plt.show() - save_file_s3("Confusion_matrix_", type_of_activity, model) + save_file_s3("Confusion_matrix_", type_of_activity, type_of_model, model) def draw_roc_curve(X_test, y_pred_prob, model): @@ -120,7 +158,7 @@ def draw_roc_curve(X_test, y_pred_prob, model): plt.title("ROC Curve", size=18) plt.legend(loc="lower right") plt.show() - save_file_s3("Roc_curve_", type_of_activity, model) + save_file_s3("Roc_curve_", type_of_activity, type_of_model, model) def draw_calibration_curve(X_test, y_pred_prob, model): @@ -134,7 +172,7 @@ def draw_calibration_curve(X_test, y_pred_prob, model): plt.title("Calibration Curve") plt.legend() plt.show() - save_file_s3("Calib_curve_", type_of_activity, model) + save_file_s3("Calib_curve_", type_of_activity, type_of_model, model) def draw_features_importance(pipeline, model, randomF = False): @@ -145,18 +183,18 @@ def draw_features_importance(pipeline, model, randomF = False): feature_names = pipeline.named_steps['preprocessor'].get_feature_names_out() # Tracer l'importance des caractéristiques - plt.figure(figsize=(10, 6)) + plt.figure(figsize=(12, 8)) plt.barh(feature_names, coefficients, color='skyblue') plt.xlabel("Features' Importance") plt.ylabel('Caractéristiques') plt.title("Features' Importance") plt.grid(True) plt.show() - save_file_s3("Features_", type_of_activity, model) + save_file_s3("Features_", type_of_activity, type_of_model, model) def draw_prob_distribution(y_pred_prob, model): - plt.figure(figsize=(8, 6)) + plt.figure(figsize=(10, 8)) plt.hist(y_pred_prob, bins=10, range=(0, 1), color='blue', alpha=0.7) plt.xlim(0, 1) @@ -167,7 +205,7 @@ def draw_prob_distribution(y_pred_prob, model): plt.ylabel('Frequency') plt.grid(True) plt.show() - save_file_s3("prob_dist_", type_of_activity, model) + save_file_s3("prob_dist_", type_of_activity, type_of_model, model) def draw_prob_distribution_companies(y_pred_prob, model): @@ -183,7 +221,7 @@ def draw_prob_distribution_companies(y_pred_prob, model): plt.ylabel('Frequency') plt.grid(True) plt.show() - save_file_s3("prob_dist_companies_", type_of_activity, model) + save_file_s3("prob_dist_companies_", type_of_activity, type_of_model, model) @@ -215,7 +253,7 @@ def pipeline_logreg_benchmark(X_train, y_train, X_test, y_test, model_result): draw_prob_distribution(y_pred_prob, model) draw_prob_distribution_companies(y_pred_prob, model) draw_calibration_curve(X_test, y_pred_prob, model) - save_model_s3('LogisticRegression_Benchmark', type_of_activity, model, pipeline) + save_model_s3('LogisticRegression_Benchmark', type_of_activity, type_of_model, model, pipeline) return model_result @@ -252,7 +290,7 @@ def pipeline_logreg_cv(X_train, y_train, X_test, y_test, model_result): draw_prob_distribution(y_pred_prob, model) draw_prob_distribution_companies(y_pred_prob, model) draw_calibration_curve(X_test, y_pred_prob, model) - save_model_s3('LogisticRegression_cv', type_of_activity, model, grid_search) + save_model_s3('LogisticRegression_cv', type_of_activity, type_of_model, model, grid_search) return model_result @@ -284,7 +322,7 @@ def pipeline_randomF_benchmark(X_train, y_train, X_test, y_test, model_result): draw_prob_distribution(y_pred_prob, model) draw_prob_distribution_companies(y_pred_prob, model) draw_calibration_curve(X_test, y_pred_prob, model) - save_model_s3('randomF_Benchmark', type_of_activity, model, pipeline) + save_model_s3('randomF_Benchmark', type_of_activity, type_of_model, model, pipeline) return model_result @@ -325,7 +363,7 @@ def pipeline_randomF_cv(X_train, y_train, X_test, y_test, model_result): draw_prob_distribution(y_pred_prob, model) draw_prob_distribution_companies(y_pred_prob, model) draw_calibration_curve(X_test, y_pred_prob, model) - save_model_s3('randomF_cv', type_of_activity, model, grid_search) + save_model_s3('randomF_cv', type_of_activity, type_of_model, model, grid_search) return model_result @@ -356,5 +394,5 @@ def pipeline_naiveBayes_benchmark(X_train, y_train, X_test, y_test, model_result draw_roc_curve(X_test, y_pred_prob, model) draw_prob_distribution(y_pred_prob, model) draw_calibration_curve(X_test, y_pred_prob, model) - save_model_s3('Naive_Bayes_Benchmark', type_of_activity, model, pipeline) + save_model_s3('Naive_Bayes_Benchmark', type_of_activity, type_of_model, model, pipeline) return model_result \ No newline at end of file From adc62dd0560ab5fec694d5d8b4754f28cda5b0d0 Mon Sep 17 00:00:00 2001 From: arevelle-ensae Date: Thu, 28 Mar 2024 07:37:10 +0000 Subject: [PATCH 5/7] save at different steps --- 0_5_Machine_Learning.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/0_5_Machine_Learning.py b/0_5_Machine_Learning.py index 1700766..79ee8c1 100644 --- a/0_5_Machine_Learning.py +++ b/0_5_Machine_Learning.py @@ -63,6 +63,7 @@ model_result = pd.DataFrame(columns= ["Model", "Accuracy", "Recall", "F1_score", # Naive Bayes model_result = pipeline_naiveBayes_benchmark(X_train, y_train, X_test, y_test, model_result) +save_result_set_s3(model_result , "resultat", type_of_activity, type_of_model) print("Naive Bayes : Done") # Logistic Regression @@ -70,12 +71,16 @@ model_result = pipeline_logreg_benchmark(X_train, y_train, X_test, y_test, model print("Logistic : Done") model_result = pipeline_logreg_cv(X_train, y_train, X_test, y_test, model_result) +save_result_set_s3(model_result , "resultat", type_of_activity, type_of_model) print("Logistic CV : Done") # Random Forest model_result = pipeline_randomF_benchmark(X_train, y_train, X_test, y_test, model_result) +save_result_set_s3(model_result , "resultat", type_of_activity, type_of_model) print("Random Forest : Done") + model_result = pipeline_randomF_cv(X_train, y_train, X_test, y_test, model_result) +save_result_set_s3(model_result , "resultat", type_of_activity, type_of_model) print("Random Forest CV: Done") # Save result From ebdbacbe34c793aa1f97d97f57a36cc05416fdf2 Mon Sep 17 00:00:00 2001 From: arevelle-ensae Date: Thu, 28 Mar 2024 07:56:36 +0000 Subject: [PATCH 6/7] fix features --- utils_ml.py | 44 ++++++++++++++++++++------------------------ 1 file changed, 20 insertions(+), 24 deletions(-) diff --git a/utils_ml.py b/utils_ml.py index 767f7db..e8598d3 100644 --- a/utils_ml.py +++ b/utils_ml.py @@ -28,7 +28,7 @@ import warnings def load_train_test(type_of_activity): - BUCKET = f"projet-bdc2324-team1/Generalization/{type_of_activity}" + BUCKET = f"projet-bdc2324-team1/Generalization_v2/{type_of_activity}" File_path_train = BUCKET + "/Train_set.csv" File_path_test = BUCKET + "/Test_set.csv" @@ -83,9 +83,7 @@ def compute_recall_companies(dataset_test, y_pred, type_of_activity, model): def features_target_split(dataset_train, dataset_test): - features_l = ['nb_tickets', 'nb_purchases', 'total_amount', 'nb_suppliers', 'vente_internet_max', 'purchase_date_min', 'purchase_date_max', - 'time_between_purchase', 'nb_tickets_internet', 'is_email_true', 'opt_in', #'is_partner', - 'gender_female', 'gender_male', 'gender_other', 'nb_campaigns', 'nb_campaigns_opened', 'country_fr'] + features_l = ['] X_train = dataset_train[features_l] y_train = dataset_train[['y_has_purchased']] @@ -94,30 +92,29 @@ def features_target_split(dataset_train, dataset_test): return X_train, X_test, y_train, y_test -def preprocess(type_of_model): +def preprocess(type_of_model, type_of_activity): + + numeric_features = ['nb_campaigns', 'taux_ouverture_mail', 'prop_purchases_internet', 'nb_tickets', 'nb_purchases', 'total_amount', 'nb_suppliers', + 'purchases_10_2021','purchases_10_2022', 'purchases_11_2021', 'purchases_12_2021','purchases_1_2022', 'purchases_2_2022', 'purchases_3_2022', + 'purchases_4_2022', 'purchases_5_2021', 'purchases_5_2022', 'purchases_6_2021', 'purchases_6_2022', 'purchases_7_2021', 'purchases_7_2022', 'purchases_8_2021', + 'purchases_8_2022','purchases_9_2021', 'purchases_9_2022', 'purchase_date_min', 'purchase_date_max', 'nb_targets'] + + binary_features = ['gender_female', 'gender_male', 'country_fr', 'achat_internet', 'categorie_age_0_10', 'categorie_age_10_20', 'categorie_age_20_30','categorie_age_30_40', + 'categorie_age_40_50', 'categorie_age_50_60', 'categorie_age_60_70', 'categorie_age_70_80', 'categorie_age_plus_80','categorie_age_inconnue', + 'country_fr', 'is_profession_known', 'is_zipcode_known', 'opt_in'] + if type_of_model=='premium': - numeric_features = ['nb_tickets', 'nb_purchases', 'total_amount', 'nb_suppliers', 'vente_internet_max', - 'purchase_date_min', 'purchase_date_max', 'time_between_purchase', 'nb_tickets_internet', - 'nb_campaigns', 'nb_campaigns_opened'] + if type_of_activity=='musique': + binary_features.extend(['target_optin', 'target_newsletter']) + elif type_of_activity=='sport': + binary_features.extend(['target_jeune', 'target_entreprise', 'target_abonne']) + else: + binary_features.extend([ 'target_scolaire', 'target_entreprise', 'target_famille', 'target_newsletter']) + - binary_features = ['gender_female', 'gender_male', 'gender_other', 'country_fr'] - categorical_features = ['opt_in'] - - else: - numeric_features = ['nb_tickets', 'nb_purchases', 'total_amount', 'nb_suppliers', 'vente_internet_max', - 'purchase_date_min', 'purchase_date_max', 'time_between_purchase', 'nb_tickets_internet', - 'nb_campaigns', 'nb_campaigns_opened'] - - binary_features = ['gender_female', 'gender_male', 'gender_other', 'country_fr'] - categorical_features = ['opt_in'] - numeric_transformer = Pipeline(steps=[ ("scaler", StandardScaler()) ]) - categorical_features = ['opt_in'] - categorical_transformer = Pipeline(steps=[ - ("onehot", OneHotEncoder(handle_unknown='ignore', sparse_output=False)) - ]) binary_transformer = Pipeline(steps=[ ("imputer", SimpleImputer(strategy="most_frequent")), @@ -125,7 +122,6 @@ def preprocess(type_of_model): preproc = ColumnTransformer( transformers=[ ("num", numeric_transformer, numeric_features), - ("cat", categorical_transformer, categorical_features), ("bin", binary_transformer, binary_features) ] ) From 122c4c1f825548d56c5b355d998b441de904faef Mon Sep 17 00:00:00 2001 From: arevelle-ensae Date: Thu, 28 Mar 2024 08:35:02 +0000 Subject: [PATCH 7/7] fix features --- 0_5_Machine_Learning.py | 2 +- utils_ml.py | 17 ++++++++++++++--- 2 files changed, 15 insertions(+), 4 deletions(-) diff --git a/0_5_Machine_Learning.py b/0_5_Machine_Learning.py index 79ee8c1..b893aed 100644 --- a/0_5_Machine_Learning.py +++ b/0_5_Machine_Learning.py @@ -56,7 +56,7 @@ weights = class_weight.compute_class_weight(class_weight = 'balanced', classes = weight_dict = {np.unique(y_train['y_has_purchased'])[i]: weights[i] for i in range(len(np.unique(y_train['y_has_purchased'])))} -preproc = preprocess(type_of_model) +preproc = preprocess(type_of_model, type_of_activity) # Object for storing results model_result = pd.DataFrame(columns= ["Model", "Accuracy", "Recall", "F1_score", "AUC"]) diff --git a/utils_ml.py b/utils_ml.py index e8598d3..1955ef9 100644 --- a/utils_ml.py +++ b/utils_ml.py @@ -83,7 +83,14 @@ def compute_recall_companies(dataset_test, y_pred, type_of_activity, model): def features_target_split(dataset_train, dataset_test): - features_l = ['] + features_l = ['nb_campaigns', 'taux_ouverture_mail', 'prop_purchases_internet', 'nb_tickets', 'nb_purchases', 'total_amount', 'nb_suppliers', 'time_to_open', + 'purchases_10_2021','purchases_10_2022', 'purchases_11_2021', 'purchases_12_2021','purchases_1_2022', 'purchases_2_2022', 'purchases_3_2022', + 'purchases_4_2022', 'purchases_5_2021', 'purchases_5_2022', 'purchases_6_2021', 'purchases_6_2022', 'purchases_7_2021', 'purchases_7_2022', 'purchases_8_2021', + 'purchases_8_2022','purchases_9_2021', 'purchases_9_2022', 'purchase_date_min', 'purchase_date_max', 'nb_targets', 'gender_female', 'gender_male', + 'achat_internet', 'categorie_age_0_10', 'categorie_age_10_20', 'categorie_age_20_30','categorie_age_30_40', + 'categorie_age_40_50', 'categorie_age_50_60', 'categorie_age_60_70', 'categorie_age_70_80', 'categorie_age_plus_80','categorie_age_inconnue', + 'country_fr', 'is_profession_known', 'is_zipcode_known', 'opt_in', 'target_optin', 'target_newsletter', 'target_scolaire', 'target_entreprise', 'target_famille', + 'target_jeune', 'target_abonne'] X_train = dataset_train[features_l] y_train = dataset_train[['y_has_purchased']] @@ -97,12 +104,15 @@ def preprocess(type_of_model, type_of_activity): numeric_features = ['nb_campaigns', 'taux_ouverture_mail', 'prop_purchases_internet', 'nb_tickets', 'nb_purchases', 'total_amount', 'nb_suppliers', 'purchases_10_2021','purchases_10_2022', 'purchases_11_2021', 'purchases_12_2021','purchases_1_2022', 'purchases_2_2022', 'purchases_3_2022', 'purchases_4_2022', 'purchases_5_2021', 'purchases_5_2022', 'purchases_6_2021', 'purchases_6_2022', 'purchases_7_2021', 'purchases_7_2022', 'purchases_8_2021', - 'purchases_8_2022','purchases_9_2021', 'purchases_9_2022', 'purchase_date_min', 'purchase_date_max', 'nb_targets'] + 'purchases_8_2022','purchases_9_2021', 'purchases_9_2022', 'purchase_date_min', 'purchase_date_max', 'nb_targets', 'time_to_open'] - binary_features = ['gender_female', 'gender_male', 'country_fr', 'achat_internet', 'categorie_age_0_10', 'categorie_age_10_20', 'categorie_age_20_30','categorie_age_30_40', + binary_features = ['gender_female', 'gender_male', 'achat_internet', 'categorie_age_0_10', 'categorie_age_10_20', 'categorie_age_20_30','categorie_age_30_40', 'categorie_age_40_50', 'categorie_age_50_60', 'categorie_age_60_70', 'categorie_age_70_80', 'categorie_age_plus_80','categorie_age_inconnue', 'country_fr', 'is_profession_known', 'is_zipcode_known', 'opt_in'] + if type_of_activity=='musee': + binary_features.pop('time_to_open') + if type_of_model=='premium': if type_of_activity=='musique': binary_features.extend(['target_optin', 'target_newsletter']) @@ -113,6 +123,7 @@ def preprocess(type_of_model, type_of_activity): numeric_transformer = Pipeline(steps=[ + ("imputer", SimpleImputer(strategy="constant", fill_value=0)), ("scaler", StandardScaler()) ])