From 9e5e364aa374f5838791dec671c4fd8a44227b3f Mon Sep 17 00:00:00 2001 From: arevelle-ensae Date: Mon, 18 Mar 2024 19:38:01 +0000 Subject: [PATCH] add steps --- 0_5_Machine_Learning.py | 5 +++++ utils_ml.py | 10 +++++----- 2 files changed, 10 insertions(+), 5 deletions(-) diff --git a/0_5_Machine_Learning.py b/0_5_Machine_Learning.py index f0828c4..b78cf2a 100644 --- a/0_5_Machine_Learning.py +++ b/0_5_Machine_Learning.py @@ -84,14 +84,19 @@ model_result = pd.DataFrame(columns= ["Model", "Accuracy", "Recall", "F1_score", # Naive Bayes model_result = pipeline_naiveBayes_benchmark(X_train, y_train, X_test, y_test, model_result) +print("Naive Bayes : Done") # Logistic Regression model_result = pipeline_logreg_benchmark(X_train, y_train, X_test, y_test, model_result) +print("Logistic : Done") model_result = pipeline_logreg_cv(X_train, y_train, X_test, y_test, model_result) +print("Logistic CV : Done") # Random Forest model_result = pipeline_randomF_benchmark(X_train, y_train, X_test, y_test, model_result) +print("Random Forest : Done") model_result = pipeline_randomF_cv(X_train, y_train, X_test, y_test, model_result) +print("Random Forest CV: Done") # Save result save_result_set_s3(model_result , "resultat", type_of_activity) \ No newline at end of file diff --git a/utils_ml.py b/utils_ml.py index 6504c7a..d1b0aa6 100644 --- a/utils_ml.py +++ b/utils_ml.py @@ -167,7 +167,7 @@ def draw_prob_distribution_companies(y_pred_prob, model): test = dataset_test.copy() test['probability to buy'] = y_pred_prob test['company'] = test['customer_id'].str.split('_', expand=True)[0] - sns.histplot(data=dataset_test, x='probability to buy', hue='company', element='step', + sns.histplot(data=test, x='probability to buy', hue='company', element='step', stat='count', common_norm=False, bins=10, palette='Set1', alpha=1) plt.xlim(0, 1) plt.ylim(0, None) @@ -200,7 +200,7 @@ def pipeline_logreg_benchmark(X_train, y_train, X_test, y_test, model_result): "AUC" : [auc(fpr, tpr)]} ) model_result = pd.concat([model_result, result]) - compute_recall_companies(dataset_test, y_pred, model) + #compute_recall_companies(dataset_test, y_pred, type_of_activity, model) draw_confusion_matrix(y_test, y_pred, model) draw_roc_curve(X_test, y_pred_prob, model) @@ -236,7 +236,7 @@ def pipeline_logreg_cv(X_train, y_train, X_test, y_test, model_result): "AUC" : [auc(fpr, tpr)]} ) model_result = pd.concat([model_result, result]) - compute_recall_companies(dataset_test, y_pred, type_of_activity, model) + #compute_recall_companies(dataset_test, y_pred, type_of_activity, model) draw_confusion_matrix(y_test, y_pred, model) draw_roc_curve(X_test, y_pred_prob, model) @@ -267,7 +267,7 @@ def pipeline_randomF_benchmark(X_train, y_train, X_test, y_test, model_result): "AUC" : [auc(fpr, tpr)]} ) model_result = pd.concat([model_result, result]) - compute_recall_companies(dataset_test, y_pred, type_of_activity, model) + #compute_recall_companies(dataset_test, y_pred, type_of_activity, model) draw_confusion_matrix(y_test, y_pred, model) draw_roc_curve(X_test, y_pred_prob, model) @@ -308,7 +308,7 @@ def pipeline_randomF_cv(X_train, y_train, X_test, y_test, model_result): "AUC" : [auc(fpr, tpr)]} ) model_result = pd.concat([model_result, result]) - compute_recall_companies(dataset_test, y_pred, type_of_activity, model) + #compute_recall_companies(dataset_test, y_pred, type_of_activity, model) draw_confusion_matrix(y_test, y_pred, model) draw_roc_curve(X_test, y_pred_prob, model)