import pandas as pd
import numpy as np
import os
import s3fs
import re
import io
import pickle
import warnings

from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.calibration import calibration_curve
from sklearn.preprocessing import OneHotEncoder, StandardScaler, MaxAbsScaler, MinMaxScaler
from sklearn.impute import SimpleImputer
from sklearn.model_selection import GridSearchCV
from sklearn.utils import class_weight
from sklearn.metrics import (accuracy_score, confusion_matrix, classification_report, recall_score,
                             make_scorer, f1_score, balanced_accuracy_score,
                             roc_curve, auc, precision_recall_curve, average_precision_score)
from sklearn.exceptions import ConvergenceWarning, DataConversionWarning

import seaborn as sns
import matplotlib.pyplot as plt

# NOTE (assumption): `fs` is used by every S3 helper below but was not defined in the original
# snippet; it is assumed to be an s3fs filesystem whose credentials/endpoint come from the environment.
fs = s3fs.S3FileSystem()

# NOTE: several functions below read module-level names that must be set by the calling script
# before use: `type_of_activity`, `type_of_model`, `weight_dict`, `preproc` and `dataset_test`.


def load_train_test(type_of_activity):
    """Load the train and test sets for a given activity type from S3."""
    BUCKET = f"projet-bdc2324-team1/Generalization/{type_of_activity}"
    File_path_train = BUCKET + "/Train_set.csv"
    File_path_test = BUCKET + "/Test_set.csv"

    with fs.open(File_path_train, mode="rb") as file_in:
        dataset_train = pd.read_csv(file_in, sep=",")
        # dataset_train['y_has_purchased'] = dataset_train['y_has_purchased'].fillna(0)

    with fs.open(File_path_test, mode="rb") as file_in:
        dataset_test = pd.read_csv(file_in, sep=",")
        # dataset_test['y_has_purchased'] = dataset_test['y_has_purchased'].fillna(0)

    return dataset_train, dataset_test


def save_file_s3(File_name, type_of_activity, type_of_model, model):
    """Save the current matplotlib figure to S3 as a PNG."""
    image_buffer = io.BytesIO()
    plt.savefig(image_buffer, format='png')
    image_buffer.seek(0)
    FILE_PATH = f"projet-bdc2324-team1/{type_of_model}/{type_of_activity}/{model}/"
    FILE_PATH_OUT_S3 = FILE_PATH + File_name + type_of_activity + '_' + model + '.png'
    with fs.open(FILE_PATH_OUT_S3, 'wb') as s3_file:
        s3_file.write(image_buffer.read())
    plt.close()


def save_result_set_s3(result_set, File_name, type_of_activity, type_of_model, model=None, model_path=False):
    """Save a result DataFrame to S3 as a CSV, optionally inside a model-specific folder."""
    if model_path:
        FILE_PATH_OUT_S3 = f"projet-bdc2324-team1/{type_of_model}/{type_of_activity}/{model}/" + File_name + '.csv'
    else:
        FILE_PATH_OUT_S3 = f"projet-bdc2324-team1/{type_of_model}/{type_of_activity}/" + File_name + '.csv'
    with fs.open(FILE_PATH_OUT_S3, 'w') as file_out:
        result_set.to_csv(file_out, index=False)


def save_model_s3(File_name, type_of_activity, type_of_model, model, classifier):
    """Pickle a fitted classifier or pipeline and save it to S3."""
    model_bytes = pickle.dumps(classifier)
    FILE_PATH_OUT_S3 = f"projet-bdc2324-team1/{type_of_model}/{type_of_activity}/{model}/" + File_name + '.pkl'
    with fs.open(FILE_PATH_OUT_S3, 'wb') as f:
        f.write(model_bytes)


def compute_recall(group):
    return recall_score(group['y_has_purchased'], group['prediction'])


def compute_recall_companies(dataset_test, y_pred, type_of_activity, model):
    """Compute the recall score per company and save the result to S3."""
    test = dataset_test.copy()
    test['prediction'] = y_pred
    test['company'] = dataset_test['customer_id'].str.split('_', expand=True)[0]
    recall_scores_by_company = test.groupby('company').apply(compute_recall).reset_index(name='recall_score')
    save_result_set_s3(recall_scores_by_company, 'recall_scores_by_company', type_of_activity, type_of_model,
                       model=model, model_path=True)
def features_target_split(dataset_train, dataset_test):
    """Split the train and test sets into features (X) and target (y)."""
    features_l = ['nb_tickets', 'nb_purchases', 'total_amount', 'nb_suppliers', 'vente_internet_max',
                  'purchase_date_min', 'purchase_date_max', 'time_between_purchase', 'nb_tickets_internet',
                  'is_email_true', 'opt_in',  # 'is_partner',
                  'gender_female', 'gender_male', 'gender_other',
                  'nb_campaigns', 'nb_campaigns_opened', 'country_fr']

    X_train = dataset_train[features_l]
    y_train = dataset_train[['y_has_purchased']]

    X_test = dataset_test[features_l]
    y_test = dataset_test[['y_has_purchased']]
    return X_train, X_test, y_train, y_test


def preprocess(type_of_model):
    """Build the ColumnTransformer used as the first step of every pipeline.

    NOTE: the 'premium' branch currently uses the same feature lists as the default branch.
    """
    if type_of_model == 'premium':
        numeric_features = ['nb_tickets', 'nb_purchases', 'total_amount', 'nb_suppliers', 'vente_internet_max',
                            'purchase_date_min', 'purchase_date_max', 'time_between_purchase',
                            'nb_tickets_internet', 'nb_campaigns', 'nb_campaigns_opened']
        binary_features = ['gender_female', 'gender_male', 'gender_other', 'country_fr']
        categorical_features = ['opt_in']
    else:
        numeric_features = ['nb_tickets', 'nb_purchases', 'total_amount', 'nb_suppliers', 'vente_internet_max',
                            'purchase_date_min', 'purchase_date_max', 'time_between_purchase',
                            'nb_tickets_internet', 'nb_campaigns', 'nb_campaigns_opened']
        binary_features = ['gender_female', 'gender_male', 'gender_other', 'country_fr']
        categorical_features = ['opt_in']

    numeric_transformer = Pipeline(steps=[
        ("scaler", StandardScaler())
    ])

    categorical_transformer = Pipeline(steps=[
        ("onehot", OneHotEncoder(handle_unknown='ignore', sparse_output=False))
    ])

    binary_transformer = Pipeline(steps=[
        ("imputer", SimpleImputer(strategy="most_frequent")),
    ])

    preproc = ColumnTransformer(
        transformers=[
            ("num", numeric_transformer, numeric_features),
            ("cat", categorical_transformer, categorical_features),
            ("bin", binary_transformer, binary_features)
        ]
    )
    return preproc


def draw_confusion_matrix(y_test, y_pred, model):
    conf_matrix = confusion_matrix(y_test, y_pred)
    sns.heatmap(conf_matrix, annot=True, fmt='d', cmap='Blues',
                xticklabels=['Class 0', 'Class 1'], yticklabels=['Class 0', 'Class 1'])
    plt.xlabel('Predicted')
    plt.ylabel('Actual')
    plt.title('Confusion Matrix')
    plt.show()
    save_file_s3("Confusion_matrix_", type_of_activity, type_of_model, model)


def draw_roc_curve(y_test, y_pred_prob, model):
    # NOTE: takes the true labels `y_test` directly (the original signature passed X_test
    # but relied on an undefined `y_test`).
    # Compute false positive rates (FPR) and true positive rates (TPR)
    fpr, tpr, thresholds = roc_curve(y_test, y_pred_prob, pos_label=1)

    # Area under the ROC curve (AUC)
    roc_auc = auc(fpr, tpr)

    plt.figure(figsize=(14, 8))
    plt.plot(fpr, tpr, label="ROC curve (area = %0.3f)" % roc_auc)
    plt.plot([0, 1], [0, 1], color="red", label="Random Baseline", linestyle="--")
    plt.grid(color='gray', linestyle='--', linewidth=0.5)
    plt.xlabel("False Positive Rate")
    plt.ylabel("True Positive Rate")
    plt.title("ROC Curve", size=18)
    plt.legend(loc="lower right")
    plt.show()
    save_file_s3("Roc_curve_", type_of_activity, type_of_model, model)


def draw_calibration_curve(y_test, y_pred_prob, model):
    frac_pos, mean_pred = calibration_curve(y_test, y_pred_prob, n_bins=10)

    # Plot the calibration curve
    plt.plot(mean_pred, frac_pos, 's-', label=model)
    plt.plot([0, 1], [0, 1], 'k--', label='Perfectly calibrated')
    plt.xlabel('Mean predicted value')
    plt.ylabel('Fraction of positive predictions')
    plt.title("Calibration Curve")
    plt.legend()
    plt.show()
    save_file_s3("Calib_curve_", type_of_activity, type_of_model, model)
def draw_features_importance(pipeline, model, randomF=False):
    # Random forests expose feature_importances_; linear models expose coefficients.
    if randomF:
        coefficients = pipeline.named_steps[model].feature_importances_
    else:
        coefficients = pipeline.named_steps[model].coef_[0]

    feature_names = pipeline.named_steps['preprocessor'].get_feature_names_out()

    # Plot the feature importances
    plt.figure(figsize=(12, 8))
    plt.barh(feature_names, coefficients, color='skyblue')
    plt.xlabel("Features' Importance")
    plt.ylabel('Features')
    plt.title("Features' Importance")
    plt.grid(True)
    plt.show()
    save_file_s3("Features_", type_of_activity, type_of_model, model)


def draw_prob_distribution(y_pred_prob, model):
    plt.figure(figsize=(10, 8))
    plt.hist(y_pred_prob, bins=10, range=(0, 1), color='blue', alpha=0.7)
    plt.xlim(0, 1)
    plt.ylim(0, None)
    plt.title('Histogram of predicted probabilities for class 1')
    plt.xlabel('Probability')
    plt.ylabel('Frequency')
    plt.grid(True)
    plt.show()
    save_file_s3("prob_dist_", type_of_activity, type_of_model, model)


def draw_prob_distribution_companies(y_pred_prob, model):
    test = dataset_test.copy()
    test['probability to buy'] = y_pred_prob
    test['company'] = test['customer_id'].str.split('_', expand=True)[0]
    sns.histplot(data=test, x='probability to buy', hue='company', element='step',
                 stat='count', common_norm=False, bins=10, palette='Set1', alpha=1)
    plt.xlim(0, 1)
    plt.ylim(0, None)
    plt.title('Histogram of probabilities for class 1 by company')
    plt.xlabel('Probability')
    plt.ylabel('Frequency')
    plt.grid(True)
    plt.show()
    save_file_s3("prob_dist_companies_", type_of_activity, type_of_model, model)


def pipeline_logreg_benchmark(X_train, y_train, X_test, y_test, model_result):
    """Benchmark logistic regression (class-weighted, no hyperparameter search)."""
    pipeline = Pipeline(steps=[
        ('preprocessor', preproc),
        ('LogisticRegression_Benchmark', LogisticRegression(solver='saga', class_weight=weight_dict,
                                                            max_iter=5000, n_jobs=-1))
    ])
    pipeline.fit(X_train, y_train)

    y_pred = pipeline.predict(X_test)
    y_pred_prob = pipeline.predict_proba(X_test)[:, 1]

    fpr, tpr, thresholds = roc_curve(y_test, y_pred_prob, pos_label=1)
    model = "LogisticRegression_Benchmark"
    result = pd.DataFrame({"Model": [model],
                           "Accuracy": [accuracy_score(y_test, y_pred)],
                           "Recall": [recall_score(y_test, y_pred)],
                           "F1_score": [f1_score(y_test, y_pred, average="macro")],
                           "AUC": [auc(fpr, tpr)]})
    model_result = pd.concat([model_result, result])

    compute_recall_companies(dataset_test, y_pred, type_of_activity, model)

    draw_confusion_matrix(y_test, y_pred, model)
    draw_roc_curve(y_test, y_pred_prob, model)
    draw_features_importance(pipeline, 'LogisticRegression_Benchmark')
    draw_prob_distribution(y_pred_prob, model)
    draw_prob_distribution_companies(y_pred_prob, model)
    draw_calibration_curve(y_test, y_pred_prob, model)
    save_model_s3('LogisticRegression_Benchmark', type_of_activity, type_of_model, model, pipeline)
    return model_result
def pipeline_logreg_cv(X_train, y_train, X_test, y_test, model_result):
    """Logistic regression tuned by grid search (3-fold CV, recall as the scoring metric)."""
    y_train = y_train['y_has_purchased']
    param_grid = {'LogisticRegression_cv__C': np.logspace(-10, 6, 17, base=2),
                  'LogisticRegression_cv__penalty': ['l1', 'l2'],
                  'LogisticRegression_cv__class_weight': ['balanced', weight_dict]}
    pipeline = Pipeline(steps=[
        ('preprocessor', preproc),
        ('LogisticRegression_cv', LogisticRegression(solver='saga', max_iter=5000))
    ])
    grid_search = GridSearchCV(pipeline, param_grid, cv=3, scoring=make_scorer(recall_score),
                               error_score='raise', n_jobs=-1)
    grid_search.fit(X_train, y_train)

    y_pred = grid_search.predict(X_test)
    y_pred_prob = grid_search.predict_proba(X_test)[:, 1]
    best_pipeline = grid_search.best_estimator_

    fpr, tpr, thresholds = roc_curve(y_test, y_pred_prob, pos_label=1)
    model = "LogisticRegression_cv"
    result = pd.DataFrame({"Model": [model],
                           "Accuracy": [accuracy_score(y_test, y_pred)],
                           "Recall": [recall_score(y_test, y_pred)],
                           "F1_score": [f1_score(y_test, y_pred, average="macro")],
                           "AUC": [auc(fpr, tpr)]})
    model_result = pd.concat([model_result, result])

    compute_recall_companies(dataset_test, y_pred, type_of_activity, model)

    draw_confusion_matrix(y_test, y_pred, model)
    draw_roc_curve(y_test, y_pred_prob, model)
    draw_features_importance(best_pipeline, 'LogisticRegression_cv')
    draw_prob_distribution(y_pred_prob, model)
    draw_prob_distribution_companies(y_pred_prob, model)
    draw_calibration_curve(y_test, y_pred_prob, model)
    save_model_s3('LogisticRegression_cv', type_of_activity, type_of_model, model, grid_search)
    return model_result


def pipeline_randomF_benchmark(X_train, y_train, X_test, y_test, model_result):
    """Benchmark random forest (class-weighted, default hyperparameters)."""
    pipeline = Pipeline(steps=[
        ('preprocessor', preproc),
        ('randomF', RandomForestClassifier(class_weight=weight_dict, n_jobs=-1))
    ])
    pipeline.fit(X_train, y_train)

    y_pred = pipeline.predict(X_test)
    y_pred_prob = pipeline.predict_proba(X_test)[:, 1]

    fpr, tpr, thresholds = roc_curve(y_test, y_pred_prob, pos_label=1)
    model = "randomF"
    result = pd.DataFrame({"Model": [model],
                           "Accuracy": [accuracy_score(y_test, y_pred)],
                           "Recall": [recall_score(y_test, y_pred)],
                           "F1_score": [f1_score(y_test, y_pred, average="macro")],
                           "AUC": [auc(fpr, tpr)]})
    model_result = pd.concat([model_result, result])

    compute_recall_companies(dataset_test, y_pred, type_of_activity, model)

    draw_confusion_matrix(y_test, y_pred, model)
    draw_roc_curve(y_test, y_pred_prob, model)
    draw_features_importance(pipeline, 'randomF', randomF=True)
    draw_prob_distribution(y_pred_prob, model)
    draw_prob_distribution_companies(y_pred_prob, model)
    draw_calibration_curve(y_test, y_pred_prob, model)
    save_model_s3('randomF_Benchmark', type_of_activity, type_of_model, model, pipeline)
    return model_result


def pipeline_randomF_cv(X_train, y_train, X_test, y_test, model_result):
    """Random forest tuned by grid search (3-fold CV, recall as the scoring metric)."""
    y_train = y_train['y_has_purchased']
    param_grid = {
        'randomF_cv__n_estimators': [100, 300],
        'randomF_cv__max_features': ['sqrt', 'log2'],
        'randomF_cv__min_samples_split': [2, 10],
        'randomF_cv__min_samples_leaf': [1, 4],
        'randomF_cv__class_weight': [weight_dict]
    }
    pipeline = Pipeline(steps=[
        ('preprocessor', preproc),
        ('randomF_cv', RandomForestClassifier(n_jobs=-1))
    ])
    grid_search = GridSearchCV(pipeline, param_grid, cv=3, scoring=make_scorer(recall_score),
                               error_score='raise', n_jobs=-1)
    grid_search.fit(X_train, y_train)

    y_pred = grid_search.predict(X_test)
    y_pred_prob = grid_search.predict_proba(X_test)[:, 1]
    best_pipeline = grid_search.best_estimator_

    fpr, tpr, thresholds = roc_curve(y_test, y_pred_prob, pos_label=1)
    model = "randomF_cv"
    result = pd.DataFrame({"Model": [model],
                           "Accuracy": [accuracy_score(y_test, y_pred)],
                           "Recall": [recall_score(y_test, y_pred)],
                           "F1_score": [f1_score(y_test, y_pred, average="macro")],
                           "AUC": [auc(fpr, tpr)]})
    model_result = pd.concat([model_result, result])

    compute_recall_companies(dataset_test, y_pred, type_of_activity, model)

    draw_confusion_matrix(y_test, y_pred, model)
    draw_roc_curve(y_test, y_pred_prob, model)
    draw_features_importance(best_pipeline, 'randomF_cv', randomF=True)
    draw_prob_distribution(y_pred_prob, model)
    draw_prob_distribution_companies(y_pred_prob, model)
    draw_calibration_curve(y_test, y_pred_prob, model)
    save_model_s3('randomF_cv', type_of_activity, type_of_model, model, grid_search)
    return model_result
def pipeline_naiveBayes_benchmark(X_train, y_train, X_test, y_test, model_result):
    """Benchmark Gaussian naive Bayes with empirical class priors."""
    unique_classes, counts = np.unique(y_train, return_counts=True)
    class_priors = counts / counts.sum()
    pipeline = Pipeline(steps=[
        ('preprocessor', preproc),
        ('Naive_Bayes', GaussianNB(priors=class_priors))
    ])
    pipeline.fit(X_train, y_train)

    y_pred = pipeline.predict(X_test)
    y_pred_prob = pipeline.predict_proba(X_test)[:, 1]

    fpr, tpr, thresholds = roc_curve(y_test, y_pred_prob, pos_label=1)
    model = "Naive_Bayes"
    result = pd.DataFrame({"Model": [model],
                           "Accuracy": [accuracy_score(y_test, y_pred)],
                           "Recall": [recall_score(y_test, y_pred)],
                           "F1_score": [f1_score(y_test, y_pred, average="macro")],
                           "AUC": [auc(fpr, tpr)]})
    model_result = pd.concat([model_result, result])

    compute_recall_companies(dataset_test, y_pred, type_of_activity, model)

    draw_confusion_matrix(y_test, y_pred, model)
    draw_roc_curve(y_test, y_pred_prob, model)
    draw_prob_distribution(y_pred_prob, model)
    draw_calibration_curve(y_test, y_pred_prob, model)
    save_model_s3('Naive_Bayes_Benchmark', type_of_activity, type_of_model, model, pipeline)
    return model_result
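

# --- Usage sketch (assumption) ---------------------------------------------------------------
# The original snippet does not show how these helpers are wired together; the driver below is a
# minimal, hypothetical example only. It sets the module-level names the pipeline functions read
# (`type_of_activity`, `type_of_model`, `weight_dict`, `preproc`, `dataset_test`); the activity
# name, the "resultat" file name and the balanced `weight_dict` computation are all assumptions.
if __name__ == "__main__":
    type_of_activity = "sport"   # hypothetical activity segment
    type_of_model = "standard"   # anything other than 'premium' uses the default feature lists

    dataset_train, dataset_test = load_train_test(type_of_activity)
    X_train, X_test, y_train, y_test = features_target_split(dataset_train, dataset_test)

    # Assumed class-weighting scheme: inverse-frequency ("balanced") weights on the binary target.
    y_flat = y_train['y_has_purchased']
    classes = np.unique(y_flat)
    weights = class_weight.compute_class_weight(class_weight='balanced', classes=classes, y=y_flat)
    weight_dict = dict(zip(classes, weights))

    preproc = preprocess(type_of_model)

    # Accumulate one metrics row per model, then save the comparison table to S3.
    model_result = pd.DataFrame(columns=["Model", "Accuracy", "Recall", "F1_score", "AUC"])
    model_result = pipeline_logreg_benchmark(X_train, y_train, X_test, y_test, model_result)
    model_result = pipeline_randomF_benchmark(X_train, y_train, X_test, y_test, model_result)
    model_result = pipeline_naiveBayes_benchmark(X_train, y_train, X_test, y_test, model_result)
    save_result_set_s3(model_result, "resultat", type_of_activity, type_of_model)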