import pandas as pd
import numpy as np
import os
import s3fs
import re
import io
import pickle
import warnings

import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.calibration import calibration_curve
from sklearn.preprocessing import OneHotEncoder, StandardScaler, MaxAbsScaler, MinMaxScaler
from sklearn.impute import SimpleImputer
from sklearn.model_selection import GridSearchCV
from sklearn.utils import class_weight
from sklearn.metrics import (accuracy_score, confusion_matrix, classification_report, recall_score,
                             make_scorer, f1_score, balanced_accuracy_score,
                             roc_curve, auc, precision_recall_curve, average_precision_score)
from sklearn.exceptions import ConvergenceWarning, DataConversionWarning
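
# NOTE (editor's assumption, not defined in this section): the helpers below rely on a few
# module-/script-level globals that the calling code is expected to provide:
#   fs               - an s3fs.S3FileSystem instance used for every S3 read/write,
#                      e.g. fs = s3fs.S3FileSystem()  # endpoint/credential kwargs as needed
#   preproc          - the ColumnTransformer used as the 'preprocessor' step of each Pipeline
#   weight_dict      - class-weight dictionary for the imbalanced y_has_purchased target
#   type_of_activity - activity label used to build the S3 output paths
#   dataset_test     - test DataFrame reused by draw_prob_distribution_companies
# Convergence/data-conversion warnings could be silenced with, for example:
#   warnings.filterwarnings("ignore", category=ConvergenceWarning)
#   warnings.filterwarnings("ignore", category=DataConversionWarning)
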
def load_train_test(type_of_activity):
    """Load the train and test sets of the given activity type from S3."""
    BUCKET = f"projet-bdc2324-team1/Generalization/{type_of_activity}"
    File_path_train = BUCKET + "/Train_set.csv"
    File_path_test = BUCKET + "/Test_set.csv"

    with fs.open(File_path_train, mode="rb") as file_in:
        dataset_train = pd.read_csv(file_in, sep=",")
        # dataset_train['y_has_purchased'] = dataset_train['y_has_purchased'].fillna(0)

    with fs.open(File_path_test, mode="rb") as file_in:
        dataset_test = pd.read_csv(file_in, sep=",")
        # dataset_test['y_has_purchased'] = dataset_test['y_has_purchased'].fillna(0)

    return dataset_train, dataset_test

def save_file_s3(File_name, type_of_activity, model):
    """Save the current matplotlib figure as a PNG on S3, then close it."""
    image_buffer = io.BytesIO()
    plt.savefig(image_buffer, format='png')
    image_buffer.seek(0)
    FILE_PATH = f"projet-bdc2324-team1/Output_model/{type_of_activity}/{model}/"
    FILE_PATH_OUT_S3 = FILE_PATH + File_name + type_of_activity + '_' + model + '.png'
    with fs.open(FILE_PATH_OUT_S3, 'wb') as s3_file:
        s3_file.write(image_buffer.read())
    plt.close()

def save_result_set_s3(result_set, File_name, type_of_activity, model=None, model_path=False):
    """Save a result DataFrame as a CSV on S3, under the model folder when model_path is True."""
    if model_path:
        FILE_PATH_OUT_S3 = f"projet-bdc2324-team1/Output_model/{type_of_activity}/{model}/" + File_name + '.csv'
    else:
        FILE_PATH_OUT_S3 = f"projet-bdc2324-team1/Output_model/{type_of_activity}/" + File_name + '.csv'
    with fs.open(FILE_PATH_OUT_S3, 'w') as file_out:
        result_set.to_csv(file_out, index=False)

def save_model_s3(File_name, type_of_activity, model, classifier):
    """Pickle a fitted classifier (or grid search) and save it on S3."""
    model_bytes = pickle.dumps(classifier)
    FILE_PATH_OUT_S3 = f"projet-bdc2324-team1/Output_model/{type_of_activity}/{model}/" + File_name + '.pkl'
    with fs.open(FILE_PATH_OUT_S3, 'wb') as f:
        f.write(model_bytes)

def compute_recall(group):
    return recall_score(group['y_has_purchased'], group['prediction'])

def compute_recall_companies(dataset_test, y_pred, type_of_activity, model):
    """Compute the recall score per company and save the result set on S3."""
    test = dataset_test.copy()
    test['prediction'] = y_pred
    test['company'] = test['customer_id'].str.split('_', expand=True)[0]
    recall_scores_by_company = test.groupby('company').apply(compute_recall).reset_index(name='recall_score')
    save_result_set_s3(recall_scores_by_company, 'recall_scores_by_company', type_of_activity, model=model, model_path=True)

def features_target_split(dataset_train, dataset_test):
    """Split the train and test sets into feature matrices and the y_has_purchased target."""
    features_l = ['nb_tickets', 'nb_purchases', 'total_amount', 'nb_suppliers', 'vente_internet_max',
                  'purchase_date_min', 'purchase_date_max', 'time_between_purchase', 'nb_tickets_internet',
                  'fidelity', 'is_email_true', 'opt_in',  # 'is_partner',
                  'gender_female', 'gender_male', 'gender_other', 'nb_campaigns', 'nb_campaigns_opened']
    X_train = dataset_train[features_l]
    y_train = dataset_train[['y_has_purchased']]

    X_test = dataset_test[features_l]
    y_test = dataset_test[['y_has_purchased']]
    return X_train, X_test, y_train, y_test
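
# The pipelines below expect a global `preproc` ColumnTransformer, which is not defined in
# this section. The sketch below is only an editor's illustration of what it could look like,
# built from the transformers imported above; the function name and the numeric/binary column
# split are assumptions based on features_target_split, not the project's actual preprocessor.
def build_example_preprocessor():
    numeric_features = ['nb_tickets', 'nb_purchases', 'total_amount', 'nb_suppliers',
                        'purchase_date_min', 'purchase_date_max', 'time_between_purchase',
                        'nb_tickets_internet', 'fidelity', 'nb_campaigns', 'nb_campaigns_opened']
    binary_features = ['vente_internet_max', 'is_email_true', 'opt_in',
                       'gender_female', 'gender_male', 'gender_other']

    numeric_transformer = Pipeline(steps=[
        ('imputer', SimpleImputer(strategy='median')),   # fill missing numeric values
        ('scaler', StandardScaler())                     # zero mean / unit variance
    ])
    binary_transformer = Pipeline(steps=[
        ('imputer', SimpleImputer(strategy='most_frequent'))
    ])
    return ColumnTransformer(transformers=[
        ('num', numeric_transformer, numeric_features),
        ('bin', binary_transformer, binary_features)
    ])
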
def draw_confusion_matrix(y_test, y_pred, model):
    conf_matrix = confusion_matrix(y_test, y_pred)
    sns.heatmap(conf_matrix, annot=True, fmt='d', cmap='Blues',
                xticklabels=['Class 0', 'Class 1'], yticklabels=['Class 0', 'Class 1'])
    plt.xlabel('Predicted')
    plt.ylabel('Actual')
    plt.title('Confusion Matrix')
    plt.show()
    save_file_s3("Confusion_matrix_", type_of_activity, model)

def draw_roc_curve(y_test, y_pred_prob, model):
    # Compute the false positive rate (FPR) and true positive rate (TPR)
    fpr, tpr, thresholds = roc_curve(y_test, y_pred_prob, pos_label=1)

    # Compute the area under the ROC curve (AUC)
    roc_auc = auc(fpr, tpr)

    plt.figure(figsize=(14, 8))
    plt.plot(fpr, tpr, label="ROC curve (area = %0.3f)" % roc_auc)
    plt.plot([0, 1], [0, 1], color="red", label="Random Baseline", linestyle="--")
    plt.grid(color='gray', linestyle='--', linewidth=0.5)
    plt.xlabel("False Positive Rate")
    plt.ylabel("True Positive Rate")
    plt.title("ROC Curve", size=18)
    plt.legend(loc="lower right")
    plt.show()
    save_file_s3("Roc_curve_", type_of_activity, model)

def draw_calibration_curve(y_test, y_pred_prob, model):
    frac_pos, mean_pred = calibration_curve(y_test, y_pred_prob, n_bins=10)

    # Plot the calibration curve
    plt.plot(mean_pred, frac_pos, 's-', label=model)
    plt.plot([0, 1], [0, 1], 'k--', label='Perfectly calibrated')
    plt.xlabel('Mean predicted value')
    plt.ylabel('Fraction of positive predictions')
    plt.title("Calibration Curve")
    plt.legend()
    plt.show()
    save_file_s3("Calib_curve_", type_of_activity, model)

def draw_features_importance(pipeline, model, randomF=False):
    if randomF:
        coefficients = pipeline.named_steps[model].feature_importances_
    else:
        coefficients = pipeline.named_steps[model].coef_[0]

    feature_names = pipeline.named_steps['preprocessor'].get_feature_names_out()
    # Plot feature importance
    plt.figure(figsize=(10, 6))
    plt.barh(feature_names, coefficients, color='skyblue')
    plt.xlabel("Features' Importance")
    plt.ylabel('Features')
    plt.title("Features' Importance")
    plt.grid(True)
    plt.show()
    save_file_s3("Features_", type_of_activity, model)

def draw_prob_distribution(y_pred_prob, model):
    plt.figure(figsize=(8, 6))
    plt.hist(y_pred_prob, bins=10, range=(0, 1), color='blue', alpha=0.7)

    plt.xlim(0, 1)
    plt.ylim(0, None)

    plt.title('Histogram of probabilities for class 1')
    plt.xlabel('Probability')
    plt.ylabel('Frequency')
    plt.grid(True)
    plt.show()
    save_file_s3("prob_dist_", type_of_activity, model)

def draw_prob_distribution_companies(y_pred_prob, model):
    test = dataset_test.copy()
    test['probability to buy'] = y_pred_prob
    test['company'] = test['customer_id'].str.split('_', expand=True)[0]
    sns.histplot(data=test, x='probability to buy', hue='company', element='step',
                 stat='count', common_norm=False, bins=10, palette='Set1', alpha=1)
    plt.xlim(0, 1)
    plt.ylim(0, None)
    plt.title('Histogram of probabilities for class 1 by company')
    plt.xlabel('Probability')
    plt.ylabel('Frequency')
    plt.grid(True)
    plt.show()
    save_file_s3("prob_dist_companies_", type_of_activity, model)
def pipeline_logreg_benchmark(X_train, y_train, X_test, y_test, model_result):
    """Fit a benchmark logistic regression, log its metrics and plots, and save the model on S3."""
    pipeline = Pipeline(steps=[
        ('preprocessor', preproc),
        ('LogisticRegression_Benchmark', LogisticRegression(solver='saga', class_weight=weight_dict,
                                                            max_iter=5000, n_jobs=-1))
    ])
    pipeline.fit(X_train, y_train)

    y_pred = pipeline.predict(X_test)
    y_pred_prob = pipeline.predict_proba(X_test)[:, 1]

    fpr, tpr, thresholds = roc_curve(y_test, y_pred_prob, pos_label=1)
    model = "LogisticRegression_Benchmark"
    result = pd.DataFrame({"Model": [model],
                           "Accuracy": [accuracy_score(y_test, y_pred)],
                           "Recall": [recall_score(y_test, y_pred)],
                           "F1_score": [f1_score(y_test, y_pred, average="macro")],
                           "AUC": [auc(fpr, tpr)]})
    model_result = pd.concat([model_result, result])
    # compute_recall_companies(dataset_test, y_pred, type_of_activity, model)

    draw_confusion_matrix(y_test, y_pred, model)
    draw_roc_curve(y_test, y_pred_prob, model)
    draw_features_importance(pipeline, 'LogisticRegression_Benchmark')
    draw_prob_distribution(y_pred_prob, model)
    draw_prob_distribution_companies(y_pred_prob, model)
    draw_calibration_curve(y_test, y_pred_prob, model)
    save_model_s3('LogisticRegression_Benchmark', type_of_activity, model, pipeline)
    return model_result

def pipeline_logreg_cv(X_train, y_train, X_test, y_test, model_result):
    """Tune a logistic regression with a recall-scored grid search, log metrics and plots, and save it on S3."""
    y_train = y_train['y_has_purchased']
    param_grid = {'LogisticRegression_cv__C': np.logspace(-10, 6, 17, base=2),
                  'LogisticRegression_cv__penalty': ['l1', 'l2'],
                  'LogisticRegression_cv__class_weight': ['balanced', weight_dict]}
    pipeline = Pipeline(steps=[
        ('preprocessor', preproc),
        ('LogisticRegression_cv', LogisticRegression(solver='saga', max_iter=5000))
    ])
    grid_search = GridSearchCV(pipeline, param_grid, cv=3, scoring=make_scorer(recall_score),
                               error_score='raise', n_jobs=-1)

    grid_search.fit(X_train, y_train)
    y_pred = grid_search.predict(X_test)
    y_pred_prob = grid_search.predict_proba(X_test)[:, 1]
    best_pipeline = grid_search.best_estimator_
    fpr, tpr, thresholds = roc_curve(y_test, y_pred_prob, pos_label=1)
    model = "LogisticRegression_cv"
    result = pd.DataFrame({"Model": [model],
                           "Accuracy": [accuracy_score(y_test, y_pred)],
                           "Recall": [recall_score(y_test, y_pred)],
                           "F1_score": [f1_score(y_test, y_pred, average="macro")],
                           "AUC": [auc(fpr, tpr)]})
    model_result = pd.concat([model_result, result])
    # compute_recall_companies(dataset_test, y_pred, type_of_activity, model)

    draw_confusion_matrix(y_test, y_pred, model)
    draw_roc_curve(y_test, y_pred_prob, model)
    draw_features_importance(best_pipeline, 'LogisticRegression_cv')
    draw_prob_distribution(y_pred_prob, model)
    draw_prob_distribution_companies(y_pred_prob, model)
    draw_calibration_curve(y_test, y_pred_prob, model)
    save_model_s3('LogisticRegression_cv', type_of_activity, model, grid_search)
    return model_result

def pipeline_randomF_benchmark(X_train, y_train, X_test, y_test, model_result):
    """Fit a benchmark random forest, log its metrics and plots, and save the model on S3."""
    pipeline = Pipeline(steps=[
        ('preprocessor', preproc),
        ('randomF', RandomForestClassifier(class_weight=weight_dict, n_jobs=-1))
    ])
    pipeline.fit(X_train, y_train)

    y_pred = pipeline.predict(X_test)
    y_pred_prob = pipeline.predict_proba(X_test)[:, 1]

    fpr, tpr, thresholds = roc_curve(y_test, y_pred_prob, pos_label=1)
    model = "randomF"
    result = pd.DataFrame({"Model": [model],
                           "Accuracy": [accuracy_score(y_test, y_pred)],
                           "Recall": [recall_score(y_test, y_pred)],
                           "F1_score": [f1_score(y_test, y_pred, average="macro")],
                           "AUC": [auc(fpr, tpr)]})
    model_result = pd.concat([model_result, result])
    # compute_recall_companies(dataset_test, y_pred, type_of_activity, model)

    draw_confusion_matrix(y_test, y_pred, model)
    draw_roc_curve(y_test, y_pred_prob, model)
    draw_features_importance(pipeline, 'randomF', randomF=True)
    draw_prob_distribution(y_pred_prob, model)
    draw_prob_distribution_companies(y_pred_prob, model)
    draw_calibration_curve(y_test, y_pred_prob, model)
    save_model_s3('randomF_Benchmark', type_of_activity, model, pipeline)
    return model_result

def pipeline_randomF_cv(X_train, y_train, X_test, y_test, model_result):
    """Tune a random forest with a recall-scored grid search, log metrics and plots, and save it on S3."""
    y_train = y_train['y_has_purchased']
    param_grid = {
        'randomF_cv__n_estimators': [100, 300],
        'randomF_cv__max_features': ['sqrt', 'log2'],
        'randomF_cv__min_samples_split': [2, 10],
        'randomF_cv__min_samples_leaf': [1, 4],
        'randomF_cv__class_weight': [weight_dict]
    }
    pipeline = Pipeline(steps=[
        ('preprocessor', preproc),
        ('randomF_cv', RandomForestClassifier(n_jobs=-1))
    ])
    grid_search = GridSearchCV(pipeline, param_grid, cv=3, scoring=make_scorer(recall_score),
                               error_score='raise', n_jobs=-1)

    grid_search.fit(X_train, y_train)
    y_pred = grid_search.predict(X_test)
    y_pred_prob = grid_search.predict_proba(X_test)[:, 1]
    best_pipeline = grid_search.best_estimator_
    fpr, tpr, thresholds = roc_curve(y_test, y_pred_prob, pos_label=1)
    model = "randomF_cv"
    result = pd.DataFrame({"Model": [model],
                           "Accuracy": [accuracy_score(y_test, y_pred)],
                           "Recall": [recall_score(y_test, y_pred)],
                           "F1_score": [f1_score(y_test, y_pred, average="macro")],
                           "AUC": [auc(fpr, tpr)]})
    model_result = pd.concat([model_result, result])
    # compute_recall_companies(dataset_test, y_pred, type_of_activity, model)

    draw_confusion_matrix(y_test, y_pred, model)
    draw_roc_curve(y_test, y_pred_prob, model)
    draw_features_importance(best_pipeline, 'randomF_cv', randomF=True)
    draw_prob_distribution(y_pred_prob, model)
    draw_prob_distribution_companies(y_pred_prob, model)
    draw_calibration_curve(y_test, y_pred_prob, model)
    save_model_s3('randomF_cv', type_of_activity, model, grid_search)
    return model_result

def pipeline_naiveBayes_benchmark(X_train, y_train, X_test, y_test, model_result):
    """Fit a Gaussian naive Bayes benchmark with empirical class priors, log metrics and plots, and save it on S3."""
    unique_classes, counts = np.unique(y_train, return_counts=True)
    class_priors = counts / counts.sum()
    pipeline = Pipeline(steps=[
        ('preprocessor', preproc),
        ('Naive_Bayes', GaussianNB(priors=class_priors))
    ])
    pipeline.fit(X_train, y_train)

    y_pred = pipeline.predict(X_test)
    y_pred_prob = pipeline.predict_proba(X_test)[:, 1]

    fpr, tpr, thresholds = roc_curve(y_test, y_pred_prob, pos_label=1)
    model = "Naive_Bayes"
    result = pd.DataFrame({"Model": [model],
                           "Accuracy": [accuracy_score(y_test, y_pred)],
                           "Recall": [recall_score(y_test, y_pred)],
                           "F1_score": [f1_score(y_test, y_pred, average="macro")],
                           "AUC": [auc(fpr, tpr)]})
    model_result = pd.concat([model_result, result])
    draw_confusion_matrix(y_test, y_pred, model)
    draw_roc_curve(y_test, y_pred_prob, model)
    draw_prob_distribution(y_pred_prob, model)
    draw_calibration_curve(y_test, y_pred_prob, model)
    save_model_s3('Naive_Bayes_Benchmark', type_of_activity, model, pipeline)
    return model_result
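
# Illustrative usage only (editor's sketch, not part of the original section): a calling
# script would typically chain the helpers above roughly as follows, assuming `fs`,
# `preproc` and `weight_dict` have been defined as described at the top of the file:
#
#   dataset_train, dataset_test = load_train_test(type_of_activity)
#   X_train, X_test, y_train, y_test = features_target_split(dataset_train, dataset_test)
#   model_result = pd.DataFrame(columns=["Model", "Accuracy", "Recall", "F1_score", "AUC"])
#   model_result = pipeline_logreg_benchmark(X_train, y_train, X_test, y_test, model_result)
#   model_result = pipeline_randomF_cv(X_train, y_train, X_test, y_test, model_result)
#   save_result_set_s3(model_result, "results", type_of_activity)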