From 9e5e364aa374f5838791dec671c4fd8a44227b3f Mon Sep 17 00:00:00 2001
From: arevelle-ensae <alexis.revelle@ensae.fr>
Date: Mon, 18 Mar 2024 19:38:01 +0000
Subject: [PATCH] Add progress prints, fix histplot data source, disable per-company recall

---
 0_5_Machine_Learning.py |  5 +++++
 utils_ml.py             | 10 +++++-----
 2 files changed, 10 insertions(+), 5 deletions(-)

diff --git a/0_5_Machine_Learning.py b/0_5_Machine_Learning.py
index f0828c4..b78cf2a 100644
--- a/0_5_Machine_Learning.py
+++ b/0_5_Machine_Learning.py
@@ -84,14 +84,19 @@ model_result = pd.DataFrame(columns= ["Model", "Accuracy", "Recall", "F1_score",
 
 # Naive Bayes
 model_result = pipeline_naiveBayes_benchmark(X_train, y_train, X_test, y_test, model_result)
+print("Naive Bayes : Done")
 
 # Logistic Regression
 model_result = pipeline_logreg_benchmark(X_train, y_train, X_test, y_test, model_result)
+print("Logistic : Done")
 model_result = pipeline_logreg_cv(X_train, y_train, X_test, y_test, model_result)
+print("Logistic CV : Done")
 
 # Random Forest
 model_result = pipeline_randomF_benchmark(X_train, y_train, X_test, y_test, model_result)
+print("Random Forest : Done")
 model_result = pipeline_randomF_cv(X_train, y_train, X_test, y_test, model_result)
+print("Random Forest CV: Done")
 
 # Save result
 save_result_set_s3(model_result , "resultat", type_of_activity)
\ No newline at end of file
diff --git a/utils_ml.py b/utils_ml.py
index 6504c7a..d1b0aa6 100644
--- a/utils_ml.py
+++ b/utils_ml.py
@@ -167,7 +167,7 @@ def draw_prob_distribution_companies(y_pred_prob, model):
     test = dataset_test.copy()
     test['probability to buy'] = y_pred_prob
     test['company'] = test['customer_id'].str.split('_', expand=True)[0]
-    sns.histplot(data=dataset_test, x='probability to buy', hue='company', element='step',
+    sns.histplot(data=test, x='probability to buy', hue='company', element='step',
              stat='count', common_norm=False, bins=10, palette='Set1', alpha=1)
     plt.xlim(0, 1)
     plt.ylim(0, None)
@@ -200,7 +200,7 @@ def pipeline_logreg_benchmark(X_train, y_train, X_test, y_test, model_result):
                        "AUC" : [auc(fpr, tpr)]}
                        )
     model_result = pd.concat([model_result, result])
-    compute_recall_companies(dataset_test, y_pred, model)
+    #compute_recall_companies(dataset_test, y_pred, type_of_activity, model)
     
     draw_confusion_matrix(y_test, y_pred, model)
     draw_roc_curve(X_test, y_pred_prob, model)
@@ -236,7 +236,7 @@ def pipeline_logreg_cv(X_train, y_train, X_test, y_test, model_result):
                        "AUC" : [auc(fpr, tpr)]}
                        )
     model_result = pd.concat([model_result, result])
-    compute_recall_companies(dataset_test, y_pred, type_of_activity, model)
+    #compute_recall_companies(dataset_test, y_pred, type_of_activity, model)
     
     draw_confusion_matrix(y_test, y_pred, model)
     draw_roc_curve(X_test, y_pred_prob, model)
@@ -267,7 +267,7 @@ def pipeline_randomF_benchmark(X_train, y_train, X_test, y_test, model_result):
                        "AUC" : [auc(fpr, tpr)]}
                        )
     model_result = pd.concat([model_result, result])
-    compute_recall_companies(dataset_test, y_pred, type_of_activity, model)
+    #compute_recall_companies(dataset_test, y_pred, type_of_activity, model)
     
     draw_confusion_matrix(y_test, y_pred, model)
     draw_roc_curve(X_test, y_pred_prob, model)
@@ -308,7 +308,7 @@ def pipeline_randomF_cv(X_train, y_train, X_test, y_test, model_result):
                        "AUC" : [auc(fpr, tpr)]}
                        )
     model_result = pd.concat([model_result, result])
-    compute_recall_companies(dataset_test, y_pred, type_of_activity, model)
+    #compute_recall_companies(dataset_test, y_pred, type_of_activity, model)
     
     draw_confusion_matrix(y_test, y_pred, model)
     draw_roc_curve(X_test, y_pred_prob, model)