diff --git a/0_5_Machine_Learning.py b/0_5_Machine_Learning.py index 6c9ca62..f6c162b 100644 --- a/0_5_Machine_Learning.py +++ b/0_5_Machine_Learning.py @@ -89,7 +89,7 @@ print("Naive Bayes : Done") # Logistic Regression model_result = pipeline_logreg_benchmark(X_train, y_train, X_test, y_test, model_result) print("Logistic : Done") -""" + model_result = pipeline_logreg_cv(X_train, y_train, X_test, y_test, model_result) print("Logistic CV : Done") @@ -98,6 +98,6 @@ model_result = pipeline_randomF_benchmark(X_train, y_train, X_test, y_test, mode print("Random Forest : Done") model_result = pipeline_randomF_cv(X_train, y_train, X_test, y_test, model_result) print("Random Forest CV: Done") -""" + # Save result save_result_set_s3(model_result , "resultat", type_of_activity) \ No newline at end of file diff --git a/utils_ml.py b/utils_ml.py index e801964..17ac85a 100644 --- a/utils_ml.py +++ b/utils_ml.py @@ -78,7 +78,7 @@ def compute_recall_companies(dataset_test, y_pred, type_of_activity, model): test = dataset_test.copy() test['prediction'] = y_pred test['company'] = dataset_test['customer_id'].str.split('_', expand=True)[0] - recall_scores_by_company = dataset_test.groupby('company').apply(compute_recall).reset_index(name='recall_score') + recall_scores_by_company = test.groupby('company').apply(compute_recall).reset_index(name='recall_score') save_result_set_s3(recall_scores_by_company, 'recall_scores_by_company', type_of_activity, model=model, model_path=True) @@ -207,7 +207,7 @@ def pipeline_logreg_benchmark(X_train, y_train, X_test, y_test, model_result): "AUC" : [auc(fpr, tpr)]} ) model_result = pd.concat([model_result, result]) - #compute_recall_companies(dataset_test, y_pred, type_of_activity, model) + compute_recall_companies(dataset_test, y_pred, type_of_activity, model) draw_confusion_matrix(y_test, y_pred, model) draw_roc_curve(X_test, y_pred_prob, model) @@ -244,7 +244,7 @@ def pipeline_logreg_cv(X_train, y_train, X_test, y_test, model_result): "AUC" : [auc(fpr, tpr)]} ) model_result = pd.concat([model_result, result]) - #compute_recall_companies(dataset_test, y_pred, type_of_activity, model) + compute_recall_companies(dataset_test, y_pred, type_of_activity, model) draw_confusion_matrix(y_test, y_pred, model) draw_roc_curve(X_test, y_pred_prob, model) @@ -276,7 +276,7 @@ def pipeline_randomF_benchmark(X_train, y_train, X_test, y_test, model_result): "AUC" : [auc(fpr, tpr)]} ) model_result = pd.concat([model_result, result]) - #compute_recall_companies(dataset_test, y_pred, type_of_activity, model) + compute_recall_companies(dataset_test, y_pred, type_of_activity, model) draw_confusion_matrix(y_test, y_pred, model) draw_roc_curve(X_test, y_pred_prob, model) @@ -317,7 +317,7 @@ def pipeline_randomF_cv(X_train, y_train, X_test, y_test, model_result): "AUC" : [auc(fpr, tpr)]} ) model_result = pd.concat([model_result, result]) - #compute_recall_companies(dataset_test, y_pred, type_of_activity, model) + compute_recall_companies(dataset_test, y_pred, type_of_activity, model) draw_confusion_matrix(y_test, y_pred, model) draw_roc_curve(X_test, y_pred_prob, model) @@ -325,7 +325,7 @@ def pipeline_randomF_cv(X_train, y_train, X_test, y_test, model_result): draw_prob_distribution(y_pred_prob, model) draw_prob_distribution_companies(y_pred_prob, model) draw_calibration_curve(X_test, y_pred_prob, model) - save_model_s3('randomF_cv', type_of_activity, model, gridsearch) + save_model_s3('randomF_cv', type_of_activity, model, grid_search) return model_result @@ -350,6 +350,8 @@ def pipeline_naiveBayes_benchmark(X_train, y_train, X_test, y_test, model_result "AUC" : [auc(fpr, tpr)]} ) model_result = pd.concat([model_result, result]) + compute_recall_companies(dataset_test, y_pred, type_of_activity, model) + draw_confusion_matrix(y_test, y_pred, model) draw_roc_curve(X_test, y_pred_prob, model) draw_prob_distribution(y_pred_prob, model)