diff --git a/utils_ml.py b/utils_ml.py
new file mode 100644
index 0000000..6504c7a
--- /dev/null
+++ b/utils_ml.py
@@ -0,0 +1,347 @@
+import os
+import io
+import re
+import pickle
+import warnings
+
+import numpy as np
+import pandas as pd
+import s3fs
+import seaborn as sns
+import matplotlib.pyplot as plt
+
+from sklearn.calibration import calibration_curve
+from sklearn.compose import ColumnTransformer
+from sklearn.ensemble import RandomForestClassifier
+from sklearn.exceptions import ConvergenceWarning, DataConversionWarning
+from sklearn.impute import SimpleImputer
+from sklearn.linear_model import LogisticRegression
+from sklearn.metrics import (accuracy_score, confusion_matrix, classification_report, recall_score,
+                             make_scorer, f1_score, balanced_accuracy_score,
+                             roc_curve, auc, precision_recall_curve, average_precision_score)
+from sklearn.model_selection import GridSearchCV
+from sklearn.naive_bayes import GaussianNB
+from sklearn.neighbors import KNeighborsClassifier
+from sklearn.pipeline import Pipeline
+from sklearn.preprocessing import OneHotEncoder, StandardScaler, MaxAbsScaler, MinMaxScaler
+from sklearn.utils import class_weight
+
+# NOTE: the functions below rely on a few module-level objects that the calling script
+# is expected to provide: `fs` (an s3fs.S3FileSystem), `preproc` (a ColumnTransformer
+# used as the 'preprocessor' step), `weight_dict` (a class-weight mapping),
+# `dataset_test` and `type_of_activity`.
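+
+# A minimal sketch of how `fs` could be created when it is not injected by the caller;
+# the endpoint environment variable name is an assumption, not a value from this patch.
+if 'AWS_S3_ENDPOINT' in os.environ:
+    fs = s3fs.S3FileSystem(client_kwargs={'endpoint_url': 'https://' + os.environ['AWS_S3_ENDPOINT']})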
+
+
+def load_train_test(type_of_activity):
+    # Read the train / test sets for the given activity from S3.
+    BUCKET = f"projet-bdc2324-team1/Generalization/{type_of_activity}"
+    File_path_train = BUCKET + "/Train_set.csv"
+    File_path_test = BUCKET + "/Test_set.csv"
+
+    with fs.open(File_path_train, mode="rb") as file_in:
+        dataset_train = pd.read_csv(file_in, sep=",")
+        # dataset_train['y_has_purchased'] = dataset_train['y_has_purchased'].fillna(0)
+
+    with fs.open(File_path_test, mode="rb") as file_in:
+        dataset_test = pd.read_csv(file_in, sep=",")
+        # dataset_test['y_has_purchased'] = dataset_test['y_has_purchased'].fillna(0)
+
+    return dataset_train, dataset_test
+
+
+def save_file_s3(File_name, type_of_activity, model):
+    # Save the current matplotlib figure to S3 as a PNG.
+    image_buffer = io.BytesIO()
+    plt.savefig(image_buffer, format='png')
+    image_buffer.seek(0)
+    FILE_PATH = f"projet-bdc2324-team1/Output_model/{type_of_activity}/{model}/"
+    FILE_PATH_OUT_S3 = FILE_PATH + File_name + type_of_activity + '_' + model + '.png'
+    with fs.open(FILE_PATH_OUT_S3, 'wb') as s3_file:
+        s3_file.write(image_buffer.read())
+    plt.close()
+
+
+def save_result_set_s3(result_set, File_name, type_of_activity, model=None, model_path=False):
+    # Save a result DataFrame to S3 as a CSV, optionally inside the model's folder.
+    if model_path:
+        FILE_PATH_OUT_S3 = f"projet-bdc2324-team1/Output_model/{type_of_activity}/{model}/" + File_name + '.csv'
+    else:
+        FILE_PATH_OUT_S3 = f"projet-bdc2324-team1/Output_model/{type_of_activity}/" + File_name + '.csv'
+    with fs.open(FILE_PATH_OUT_S3, 'w') as file_out:
+        result_set.to_csv(file_out, index=False)
+
+
+def compute_recall(group):
+    return recall_score(group['y_has_purchased'], group['prediction'])
+
+
+def compute_recall_companies(dataset_test, y_pred, type_of_activity, model):
+    # Recall computed separately for each company (prefix of customer_id).
+    test = dataset_test.copy()
+    test['prediction'] = y_pred
+    test['company'] = test['customer_id'].str.split('_', expand=True)[0]
+    recall_scores_by_company = test.groupby('company').apply(compute_recall).reset_index(name='recall_score')
+    save_result_set_s3(recall_scores_by_company, 'recall_scores_by_company', type_of_activity, model=model, model_path=True)
+
+
+def features_target_split(dataset_train, dataset_test):
+    features_l = ['nb_tickets', 'nb_purchases', 'total_amount', 'nb_suppliers', 'vente_internet_max',
+                  'purchase_date_min', 'purchase_date_max', 'time_between_purchase', 'nb_tickets_internet',
+                  'fidelity', 'is_email_true', 'opt_in',  # 'is_partner',
+                  'gender_female', 'gender_male', 'gender_other', 'nb_campaigns', 'nb_campaigns_opened']
+    X_train = dataset_train[features_l]
+    y_train = dataset_train[['y_has_purchased']]
+
+    X_test = dataset_test[features_l]
+    y_test = dataset_test[['y_has_purchased']]
+    return X_train, X_test, y_train, y_test
+
+
+def draw_confusion_matrix(y_test, y_pred, model):
+    conf_matrix = confusion_matrix(y_test, y_pred)
+    sns.heatmap(conf_matrix, annot=True, fmt='d', cmap='Blues', xticklabels=['Class 0', 'Class 1'], yticklabels=['Class 0', 'Class 1'])
+    plt.xlabel('Predicted')
+    plt.ylabel('Actual')
+    plt.title('Confusion Matrix')
+    plt.show()
+    save_file_s3("Confusion_matrix_", type_of_activity, model)
+
+
+def draw_roc_curve(y_test, y_pred_prob, model):
+    # False positive rate (FPR) and true positive rate (TPR)
+    fpr, tpr, thresholds = roc_curve(y_test, y_pred_prob, pos_label=1)
+
+    # Area under the ROC curve (AUC)
+    roc_auc = auc(fpr, tpr)
+
+    plt.figure(figsize=(14, 8))
+    plt.plot(fpr, tpr, label="ROC curve (area = %0.3f)" % roc_auc)
+    plt.plot([0, 1], [0, 1], color="red", label="Random Baseline", linestyle="--")
+    plt.grid(color='gray', linestyle='--', linewidth=0.5)
+    plt.xlabel("False Positive Rate")
+    plt.ylabel("True Positive Rate")
+    plt.title("ROC Curve", size=18)
+    plt.legend(loc="lower right")
+    plt.show()
+    save_file_s3("Roc_curve_", type_of_activity, model)
+
+
+def draw_calibration_curve(y_test, y_pred_prob, model):
+    frac_pos, mean_pred = calibration_curve(y_test, y_pred_prob, n_bins=10)
+
+    # Plot the calibration curve
+    plt.plot(mean_pred, frac_pos, 's-', label=model)
+    plt.plot([0, 1], [0, 1], 'k--', label='Perfectly calibrated')
+    plt.xlabel('Mean predicted value')
+    plt.ylabel('Fraction of positive predictions')
+    plt.title("Calibration Curve")
+    plt.legend()
+    plt.show()
+    save_file_s3("Calib_curve_", type_of_activity, model)
+
+
+def draw_features_importance(pipeline, model, randomF=False):
+    # Random forests expose feature_importances_; linear models expose coef_.
+    if randomF:
+        coefficients = pipeline.named_steps[model].feature_importances_
+    else:
+        coefficients = pipeline.named_steps[model].coef_[0]
+
+    feature_names = pipeline.named_steps['preprocessor'].get_feature_names_out()
+    # Plot feature importance
+    plt.figure(figsize=(10, 6))
+    plt.barh(feature_names, coefficients, color='skyblue')
+    plt.xlabel("Features' Importance")
+    plt.ylabel('Features')
+    plt.title("Features' Importance")
+    plt.grid(True)
+    plt.show()
+    save_file_s3("Features_", type_of_activity, model)
+
+
+def draw_prob_distribution(y_pred_prob, model):
+    plt.figure(figsize=(8, 6))
+    plt.hist(y_pred_prob, bins=10, range=(0, 1), color='blue', alpha=0.7)
+
+    plt.xlim(0, 1)
+    plt.ylim(0, None)
+
+    plt.title('Histogram of predicted probabilities for class 1')
+    plt.xlabel('Probability')
+    plt.ylabel('Frequency')
+    plt.grid(True)
+    plt.show()
+    save_file_s3("prob_dist_", type_of_activity, model)
+
+
+def draw_prob_distribution_companies(y_pred_prob, model):
+    test = dataset_test.copy()
+    test['probability to buy'] = y_pred_prob
+    test['company'] = test['customer_id'].str.split('_', expand=True)[0]
+    sns.histplot(data=test, x='probability to buy', hue='company', element='step',
+                 stat='count', common_norm=False, bins=10, palette='Set1', alpha=1)
+    plt.xlim(0, 1)
+    plt.ylim(0, None)
+    plt.title('Histogram of probabilities for class 1 by company')
+    plt.xlabel('Probability')
+    plt.ylabel('Frequency')
+    plt.grid(True)
+    plt.show()
+    save_file_s3("prob_dist_companies_", type_of_activity, model)
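+
+
+# The pipeline functions below reference two module-level objects, `preproc` and
+# `weight_dict`, that this file never defines. The two helpers below are a minimal
+# sketch of how they could be built; the imputation/scaling choices and the
+# numeric/categorical split are assumptions, not values taken from this patch.
+def build_preprocessor(numeric_features, categorical_features):
+    # Impute then scale numeric columns; impute then one-hot encode categorical ones.
+    numeric_transformer = Pipeline(steps=[
+        ('imputer', SimpleImputer(strategy='median')),
+        ('scaler', StandardScaler())
+    ])
+    categorical_transformer = Pipeline(steps=[
+        ('imputer', SimpleImputer(strategy='most_frequent')),
+        ('onehot', OneHotEncoder(handle_unknown='ignore'))
+    ])
+    return ColumnTransformer(transformers=[
+        ('num', numeric_transformer, numeric_features),
+        ('cat', categorical_transformer, categorical_features)
+    ])
+
+
+def build_weight_dict(y_train):
+    # Balanced class weights computed from the training target, returned as a dict.
+    classes = np.unique(y_train)
+    weights = class_weight.compute_class_weight('balanced', classes=classes, y=y_train)
+    return dict(zip(classes, weights))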
+
+
+def pipeline_logreg_benchmark(X_train, y_train, X_test, y_test, model_result):
+    # Keep the target as a 1-D series to avoid a DataConversionWarning.
+    y_train = y_train['y_has_purchased']
+    pipeline = Pipeline(steps=[
+        ('preprocessor', preproc),
+        ('LogisticRegression_Benchmark', LogisticRegression(solver='saga', class_weight=weight_dict,
+                                                            max_iter=5000, n_jobs=-1))
+    ])
+    pipeline.fit(X_train, y_train)
+
+    y_pred = pipeline.predict(X_test)
+    y_pred_prob = pipeline.predict_proba(X_test)[:, 1]
+
+    fpr, tpr, thresholds = roc_curve(y_test, y_pred_prob, pos_label=1)
+    model = "LogisticRegression_Benchmark"
+    result = pd.DataFrame({"Model": [model],
+                           "Accuracy": [accuracy_score(y_test, y_pred)],
+                           "Recall": [recall_score(y_test, y_pred)],
+                           "F1_score": [f1_score(y_test, y_pred, average="macro")],
+                           "AUC": [auc(fpr, tpr)]})
+    model_result = pd.concat([model_result, result])
+    compute_recall_companies(dataset_test, y_pred, type_of_activity, model)
+
+    draw_confusion_matrix(y_test, y_pred, model)
+    draw_roc_curve(y_test, y_pred_prob, model)
+    draw_features_importance(pipeline, 'LogisticRegression_Benchmark')
+    draw_prob_distribution(y_pred_prob, model)
+    draw_prob_distribution_companies(y_pred_prob, model)
+    draw_calibration_curve(y_test, y_pred_prob, model)
+    return model_result
+
+
+def pipeline_logreg_cv(X_train, y_train, X_test, y_test, model_result):
+    y_train = y_train['y_has_purchased']
+    param_grid = {'LogisticRegression_cv__C': np.logspace(-10, 6, 17, base=2),
+                  'LogisticRegression_cv__penalty': ['l1', 'l2'],
+                  'LogisticRegression_cv__class_weight': ['balanced', weight_dict]}
+    pipeline = Pipeline(steps=[
+        ('preprocessor', preproc),
+        ('LogisticRegression_cv', LogisticRegression(solver='saga', max_iter=5000))
+    ])
+    grid_search = GridSearchCV(pipeline, param_grid, cv=3, scoring=make_scorer(recall_score),
+                               error_score='raise', n_jobs=-1)
+
+    grid_search.fit(X_train, y_train)
+    y_pred = grid_search.predict(X_test)
+    y_pred_prob = grid_search.predict_proba(X_test)[:, 1]
+    best_pipeline = grid_search.best_estimator_
+    fpr, tpr, thresholds = roc_curve(y_test, y_pred_prob, pos_label=1)
+    model = "LogisticRegression_cv"
+    result = pd.DataFrame({"Model": [model],
+                           "Accuracy": [accuracy_score(y_test, y_pred)],
+                           "Recall": [recall_score(y_test, y_pred)],
+                           "F1_score": [f1_score(y_test, y_pred, average="macro")],
+                           "AUC": [auc(fpr, tpr)]})
+    model_result = pd.concat([model_result, result])
+    compute_recall_companies(dataset_test, y_pred, type_of_activity, model)
+
+    draw_confusion_matrix(y_test, y_pred, model)
+    draw_roc_curve(y_test, y_pred_prob, model)
+    draw_features_importance(best_pipeline, 'LogisticRegression_cv')
+    draw_prob_distribution(y_pred_prob, model)
+    draw_prob_distribution_companies(y_pred_prob, model)
+    draw_calibration_curve(y_test, y_pred_prob, model)
+    return model_result
+
+
+def pipeline_randomF_benchmark(X_train, y_train, X_test, y_test, model_result):
+    y_train = y_train['y_has_purchased']
+    pipeline = Pipeline(steps=[
+        ('preprocessor', preproc),
+        ('randomF', RandomForestClassifier(class_weight=weight_dict, n_jobs=-1))
+    ])
+    pipeline.fit(X_train, y_train)
+
+    y_pred = pipeline.predict(X_test)
+    y_pred_prob = pipeline.predict_proba(X_test)[:, 1]
+
+    fpr, tpr, thresholds = roc_curve(y_test, y_pred_prob, pos_label=1)
+    model = "randomF"
+    result = pd.DataFrame({"Model": [model],
+                           "Accuracy": [accuracy_score(y_test, y_pred)],
+                           "Recall": [recall_score(y_test, y_pred)],
+                           "F1_score": [f1_score(y_test, y_pred, average="macro")],
+                           "AUC": [auc(fpr, tpr)]})
+    model_result = pd.concat([model_result, result])
+    compute_recall_companies(dataset_test, y_pred, type_of_activity, model)
+
+    draw_confusion_matrix(y_test, y_pred, model)
+    draw_roc_curve(y_test, y_pred_prob, model)
+    draw_features_importance(pipeline, 'randomF', randomF=True)
+    draw_prob_distribution(y_pred_prob, model)
+    draw_prob_distribution_companies(y_pred_prob, model)
+    draw_calibration_curve(y_test, y_pred_prob, model)
+    return model_result
+
+
+def pipeline_randomF_cv(X_train, y_train, X_test, y_test, model_result):
+    y_train = y_train['y_has_purchased']
+    param_grid = {
+        'randomF_cv__n_estimators': [100, 200, 300],
+        'randomF_cv__max_features': ['sqrt', 'log2', None],
+        'randomF_cv__min_samples_split': [2, 5, 10],
+        'randomF_cv__min_samples_leaf': [1, 2, 4],
+        'randomF_cv__bootstrap': [True, False],
+        'randomF_cv__class_weight': [None, weight_dict]
+    }
+    pipeline = Pipeline(steps=[
+        ('preprocessor', preproc),
+        ('randomF_cv', RandomForestClassifier(n_jobs=-1))
+    ])
+    grid_search = GridSearchCV(pipeline, param_grid, cv=3, scoring=make_scorer(recall_score),
+                               error_score='raise', n_jobs=-1)
+
+    grid_search.fit(X_train, y_train)
+    y_pred = grid_search.predict(X_test)
+    y_pred_prob = grid_search.predict_proba(X_test)[:, 1]
+    best_pipeline = grid_search.best_estimator_
+    fpr, tpr, thresholds = roc_curve(y_test, y_pred_prob, pos_label=1)
+    model = "randomF_cv"
+    result = pd.DataFrame({"Model": [model],
+                           "Accuracy": [accuracy_score(y_test, y_pred)],
+                           "Recall": [recall_score(y_test, y_pred)],
+                           "F1_score": [f1_score(y_test, y_pred, average="macro")],
+                           "AUC": [auc(fpr, tpr)]})
+    model_result = pd.concat([model_result, result])
+    compute_recall_companies(dataset_test, y_pred, type_of_activity, model)
+
+    draw_confusion_matrix(y_test, y_pred, model)
+    draw_roc_curve(y_test, y_pred_prob, model)
+    draw_features_importance(best_pipeline, 'randomF_cv', randomF=True)
+    draw_prob_distribution(y_pred_prob, model)
+    draw_prob_distribution_companies(y_pred_prob, model)
+    draw_calibration_curve(y_test, y_pred_prob, model)
+    return model_result
+
+
+def pipeline_naiveBayes_benchmark(X_train, y_train, X_test, y_test, model_result):
+    y_train = y_train['y_has_purchased']
+    # Use the empirical class frequencies of the training target as priors.
+    unique_classes, counts = np.unique(y_train, return_counts=True)
+    class_priors = counts / counts.sum()
+    pipeline = Pipeline(steps=[
+        ('preprocessor', preproc),
+        ('Naive_Bayes', GaussianNB(priors=class_priors))
+    ])
+    pipeline.fit(X_train, y_train)
+
+    y_pred = pipeline.predict(X_test)
+    y_pred_prob = pipeline.predict_proba(X_test)[:, 1]
+
+    fpr, tpr, thresholds = roc_curve(y_test, y_pred_prob, pos_label=1)
+    model = "Naive_Bayes"
+    result = pd.DataFrame({"Model": [model],
+                           "Accuracy": [accuracy_score(y_test, y_pred)],
+                           "Recall": [recall_score(y_test, y_pred)],
+                           "F1_score": [f1_score(y_test, y_pred, average="macro")],
+                           "AUC": [auc(fpr, tpr)]})
+    model_result = pd.concat([model_result, result])
+    draw_confusion_matrix(y_test, y_pred, model)
+    draw_roc_curve(y_test, y_pred_prob, model)
+    draw_prob_distribution(y_pred_prob, model)
+    draw_calibration_curve(y_test, y_pred_prob, model)
+    return model_result
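+
+
+if __name__ == '__main__':
+    # Minimal usage sketch. The activity name, the output file name and the choice of
+    # treating every feature as numeric are placeholders/assumptions, not values taken
+    # from this patch; `fs` must already exist (see the note at the top of the file).
+    type_of_activity = 'sport'
+    dataset_train, dataset_test = load_train_test(type_of_activity)
+    X_train, X_test, y_train, y_test = features_target_split(dataset_train, dataset_test)
+    preproc = build_preprocessor(numeric_features=list(X_train.columns), categorical_features=[])
+    weight_dict = build_weight_dict(y_train['y_has_purchased'])
+    model_result = pd.DataFrame(columns=["Model", "Accuracy", "Recall", "F1_score", "AUC"])
+    model_result = pipeline_logreg_benchmark(X_train, y_train, X_test, y_test, model_result)
+    model_result = pipeline_randomF_benchmark(X_train, y_train, X_test, y_test, model_result)
+    save_result_set_s3(model_result, "results_benchmark", type_of_activity)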