2024-03-28 09:40:04 +01:00
2 changed files with 10 additions and 8 deletions
--- a/0_5_Machine_Learning.py
+++ b/0_5_Machine_Learning.py
@@ -89,7 +89,7 @@ print("Naive Bayes : Done")
 # Logistic Regression
 model_result = pipeline_logreg_benchmark(X_train, y_train, X_test, y_test, model_result)
 print("Logistic : Done")
-"""
+
 model_result = pipeline_logreg_cv(X_train, y_train, X_test, y_test, model_result)
 print("Logistic CV : Done")
@@ -98,6 +98,6 @@ model_result = pipeline_randomF_benchmark(X_train, y_train, X_test, y_test, mode
 print("Random Forest : Done")
 model_result = pipeline_randomF_cv(X_train, y_train, X_test, y_test, model_result)
 print("Random Forest CV: Done")
-"""
+
 # Save result
 save_result_set_s3(model_result , "resultat", type_of_activity)
--- a/utils_ml.py
+++ b/utils_ml.py
@@ -78,7 +78,7 @@ def compute_recall_companies(dataset_test, y_pred, type_of_activity, model):
    test = dataset_test.copy()
    test['prediction'] = y_pred
    test['company'] = dataset_test['customer_id'].str.split('_', expand=True)[0]
-    recall_scores_by_company = dataset_test.groupby('company').apply(compute_recall).reset_index(name='recall_score')
+    recall_scores_by_company = test.groupby('company').apply(compute_recall).reset_index(name='recall_score')
    save_result_set_s3(recall_scores_by_company, 'recall_scores_by_company', type_of_activity, model=model, model_path=True)
@@ -207,7 +207,7 @@ def pipeline_logreg_benchmark(X_train, y_train, X_test, y_test, model_result):
                       "AUC" : [auc(fpr, tpr)]}
                       )
    model_result = pd.concat([model_result, result])
-    #compute_recall_companies(dataset_test, y_pred, type_of_activity, model)
+    compute_recall_companies(dataset_test, y_pred, type_of_activity, model)
    draw_confusion_matrix(y_test, y_pred, model)
    draw_roc_curve(X_test, y_pred_prob, model)
@@ -244,7 +244,7 @@ def pipeline_logreg_cv(X_train, y_train, X_test, y_test, model_result):
                       "AUC" : [auc(fpr, tpr)]}
                       )
    model_result = pd.concat([model_result, result])
-    #compute_recall_companies(dataset_test, y_pred, type_of_activity, model)
+    compute_recall_companies(dataset_test, y_pred, type_of_activity, model)
    draw_confusion_matrix(y_test, y_pred, model)
    draw_roc_curve(X_test, y_pred_prob, model)
@@ -276,7 +276,7 @@ def pipeline_randomF_benchmark(X_train, y_train, X_test, y_test, model_result):
                       "AUC" : [auc(fpr, tpr)]}
                       )
    model_result = pd.concat([model_result, result])
-    #compute_recall_companies(dataset_test, y_pred, type_of_activity, model)
+    compute_recall_companies(dataset_test, y_pred, type_of_activity, model)
    draw_confusion_matrix(y_test, y_pred, model)
    draw_roc_curve(X_test, y_pred_prob, model)
@@ -317,7 +317,7 @@ def pipeline_randomF_cv(X_train, y_train, X_test, y_test, model_result):
                       "AUC" : [auc(fpr, tpr)]}
                       )
    model_result = pd.concat([model_result, result])
-    #compute_recall_companies(dataset_test, y_pred, type_of_activity, model)
+    compute_recall_companies(dataset_test, y_pred, type_of_activity, model)
    draw_confusion_matrix(y_test, y_pred, model)
    draw_roc_curve(X_test, y_pred_prob, model)
@@ -325,7 +325,7 @@ def pipeline_randomF_cv(X_train, y_train, X_test, y_test, model_result):
    draw_prob_distribution(y_pred_prob, model)
    draw_prob_distribution_companies(y_pred_prob, model)
    draw_calibration_curve(X_test, y_pred_prob, model)
-    save_model_s3('randomF_cv', type_of_activity, model, gridsearch)
+    save_model_s3('randomF_cv', type_of_activity, model, grid_search)
    return model_result
@@ -350,6 +350,8 @@ def pipeline_naiveBayes_benchmark(X_train, y_train, X_test, y_test, model_result
                       "AUC" : [auc(fpr, tpr)]}
                       )
    model_result = pd.concat([model_result, result])
    compute_recall_companies(dataset_test, y_pred, type_of_activity, model)
    draw_confusion_matrix(y_test, y_pred, model)
    draw_roc_curve(X_test, y_pred_prob, model)
    draw_prob_distribution(y_pred_prob, model)