diff --git a/0_5_Machine_Learning.py b/0_5_Machine_Learning.py
index b78cf2a..6c9ca62 100644
--- a/0_5_Machine_Learning.py
+++ b/0_5_Machine_Learning.py
@@ -89,6 +89,7 @@ print("Naive Bayes : Done")
 
 # Logistic Regression
 model_result = pipeline_logreg_benchmark(X_train, y_train, X_test, y_test, model_result)
 print("Logistic : Done")
+"""
 model_result = pipeline_logreg_cv(X_train, y_train, X_test, y_test, model_result)
 print("Logistic CV : Done")
@@ -97,6 +98,6 @@ model_result = pipeline_randomF_benchmark(X_train, y_train, X_test, y_test, mode
 print("Random Forest : Done")
 model_result = pipeline_randomF_cv(X_train, y_train, X_test, y_test, model_result)
 print("Random Forest CV: Done")
-
+"""
 # Save result
 save_result_set_s3(model_result , "resultat", type_of_activity)
\ No newline at end of file
diff --git a/utils_ml.py b/utils_ml.py
index 51fbb4e..8fb74d5 100644
--- a/utils_ml.py
+++ b/utils_ml.py
@@ -56,13 +56,20 @@ def save_file_s3(File_name, type_of_activity, model):
 
 def save_result_set_s3(result_set, File_name, type_of_activity, model=None, model_path=False):
     if model_path:
-        FILE_PATH_OUT_S3 = f"projet-bdc2324-team1/Output_model/{type_of_activity}/{model}" + File_name + '.csv'
+        FILE_PATH_OUT_S3 = f"projet-bdc2324-team1/Output_model/{type_of_activity}/{model}/" + File_name + '.csv'
     else:
         FILE_PATH_OUT_S3 = f"projet-bdc2324-team1/Output_model/{type_of_activity}/" + File_name + '.csv'
     with fs.open(FILE_PATH_OUT_S3, 'w') as file_out:
         result_set.to_csv(file_out, index = False)
 
 
+def save_model_s3(File_name, type_of_activity, model, classifier):
+    model_bytes = pickle.dumps(classifier)
+    FILE_PATH_OUT_S3 = f"projet-bdc2324-team1/Output_model/{type_of_activity}/{model}/" + File_name + '.pkl'
+    with fs.open(FILE_PATH_OUT_S3, 'wb') as f:
+        f.write(model_bytes)
+
+
 def compute_recall(group):
     return recall_score(group['y_has_purchased'], group['prediction'])
 
@@ -208,6 +215,7 @@ def pipeline_logreg_benchmark(X_train, y_train, X_test, y_test, model_result):
     draw_prob_distribution(y_pred_prob, model)
     draw_prob_distribution_companies(y_pred_prob, model)
     draw_calibration_curve(X_test, y_pred_prob, model)
+    save_model_s3('LogisticRegression_Benchmark', type_of_activity, model, pipeline)
     return model_result
 
 
@@ -244,6 +252,7 @@ def pipeline_logreg_cv(X_train, y_train, X_test, y_test, model_result):
     draw_prob_distribution(y_pred_prob, model)
     draw_prob_distribution_companies(y_pred_prob, model)
     draw_calibration_curve(X_test, y_pred_prob, model)
+    save_model_s3('LogisticRegression_cv', type_of_activity, model, grid_search)
     return model_result
 
 
@@ -275,6 +284,7 @@ def pipeline_randomF_benchmark(X_train, y_train, X_test, y_test, model_result):
     draw_prob_distribution(y_pred_prob, model)
     draw_prob_distribution_companies(y_pred_prob, model)
     draw_calibration_curve(X_test, y_pred_prob, model)
+    save_model_s3('randomF_Benchmark', type_of_activity, model, pipeline)
     return model_result
 
 
@@ -315,6 +325,7 @@ def pipeline_randomF_cv(X_train, y_train, X_test, y_test, model_result):
     draw_prob_distribution(y_pred_prob, model)
     draw_prob_distribution_companies(y_pred_prob, model)
     draw_calibration_curve(X_test, y_pred_prob, model)
+    save_model_s3('randomF_cv', type_of_activity, model, gridsearch)
     return model_result
 
 
@@ -343,4 +354,5 @@ def pipeline_naiveBayes_benchmark(X_train, y_train, X_test, y_test, model_result
     draw_roc_curve(X_test, y_pred_prob, model)
     draw_prob_distribution(y_pred_prob, model)
     draw_calibration_curve(X_test, y_pred_prob, model)
+    save_model_s3('Naive_Bayes_Benchmark', type_of_activity, model, pipeline)
     return model_result
\ No newline at end of file
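For reference, here is a minimal sketch of how a classifier persisted by the new `save_model_s3` helper could be reloaded from S3. It assumes the project's `fs` handle is an s3fs filesystem (recreated here with `s3fs.S3FileSystem()` so the snippet is self-contained) and reuses the bucket layout from the diff; the `load_model_s3` name and the example argument values are hypothetical, not part of the change.

```python
import pickle

import s3fs

# Assumption: the project's `fs` object is an s3fs filesystem configured elsewhere;
# recreated here so the sketch is self-contained (credentials come from the environment).
fs = s3fs.S3FileSystem()


def load_model_s3(file_name, type_of_activity, model):
    """Read back a classifier pickled by save_model_s3, using the same S3 path layout."""
    file_path = f"projet-bdc2324-team1/Output_model/{type_of_activity}/{model}/" + file_name + '.pkl'
    with fs.open(file_path, 'rb') as f:
        return pickle.loads(f.read())


# Hypothetical usage: reload the benchmark logistic regression and score new data.
# clf = load_model_s3('LogisticRegression_Benchmark', type_of_activity, 'LogisticRegression_Benchmark')
# y_pred_prob = clf.predict_proba(X_test)[:, 1]
```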