import pandas as pd
import numpy as np
import os
import s3fs
import re
import io
import pickle
import warnings

import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.calibration import calibration_curve
from sklearn.preprocessing import OneHotEncoder, StandardScaler, MaxAbsScaler, MinMaxScaler
from sklearn.impute import SimpleImputer
from sklearn.model_selection import GridSearchCV
from sklearn.utils import class_weight
from sklearn.metrics import (accuracy_score, confusion_matrix, classification_report, recall_score,
                             make_scorer, f1_score, balanced_accuracy_score,
                             roc_curve, auc, precision_recall_curve, average_precision_score)
from sklearn.exceptions import ConvergenceWarning, DataConversionWarning
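
# NOTE (editor's assumption, not defined in this section): the helpers below rely on a few
# module-/script-level globals that the calling code is expected to provide:
#   fs               - an s3fs.S3FileSystem instance used for every S3 read/write,
#                      e.g. fs = s3fs.S3FileSystem()  # endpoint/credential kwargs as needed
#   preproc          - the ColumnTransformer used as the 'preprocessor' step of each Pipeline
#   weight_dict      - class-weight dictionary for the imbalanced y_has_purchased target
#   type_of_activity - activity label used to build the S3 output paths
#   dataset_test     - test DataFrame reused by draw_prob_distribution_companies
# Convergence/data-conversion warnings could be silenced with, for example:
#   warnings.filterwarnings("ignore", category=ConvergenceWarning)
#   warnings.filterwarnings("ignore", category=DataConversionWarning)
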
def load_train_test(type_of_activity):
    """Load the train and test sets of the given activity type from S3."""
    BUCKET = f"projet-bdc2324-team1/Generalization/{type_of_activity}"
    File_path_train = BUCKET + "/Train_set.csv"
    File_path_test = BUCKET + "/Test_set.csv"

    with fs.open(File_path_train, mode="rb") as file_in:
        dataset_train = pd.read_csv(file_in, sep=",")
        # dataset_train['y_has_purchased'] = dataset_train['y_has_purchased'].fillna(0)

    with fs.open(File_path_test, mode="rb") as file_in:
        dataset_test = pd.read_csv(file_in, sep=",")
        # dataset_test['y_has_purchased'] = dataset_test['y_has_purchased'].fillna(0)

    return dataset_train, dataset_test

def save_file_s3(File_name, type_of_activity, model):
    """Save the current matplotlib figure as a PNG on S3, then close it."""
    image_buffer = io.BytesIO()
    plt.savefig(image_buffer, format='png')
    image_buffer.seek(0)
    FILE_PATH = f"projet-bdc2324-team1/Output_model/{type_of_activity}/{model}/"
    FILE_PATH_OUT_S3 = FILE_PATH + File_name + type_of_activity + '_' + model + '.png'
    with fs.open(FILE_PATH_OUT_S3, 'wb') as s3_file:
        s3_file.write(image_buffer.read())
    plt.close()

def save_result_set_s3(result_set, File_name, type_of_activity, model=None, model_path=False):
    """Save a result DataFrame as a CSV on S3, under the model folder when model_path is True."""
    if model_path:
        FILE_PATH_OUT_S3 = f"projet-bdc2324-team1/Output_model/{type_of_activity}/{model}/" + File_name + '.csv'
    else:
        FILE_PATH_OUT_S3 = f"projet-bdc2324-team1/Output_model/{type_of_activity}/" + File_name + '.csv'
    with fs.open(FILE_PATH_OUT_S3, 'w') as file_out:
        result_set.to_csv(file_out, index=False)

def save_model_s3(File_name, type_of_activity, model, classifier):
    """Pickle a fitted classifier (or grid search) and save it on S3."""
    model_bytes = pickle.dumps(classifier)
    FILE_PATH_OUT_S3 = f"projet-bdc2324-team1/Output_model/{type_of_activity}/{model}/" + File_name + '.pkl'
    with fs.open(FILE_PATH_OUT_S3, 'wb') as f:
        f.write(model_bytes)

def compute_recall(group):
    return recall_score(group['y_has_purchased'], group['prediction'])

def compute_recall_companies(dataset_test, y_pred, type_of_activity, model):
    """Compute the recall score per company and save the result set on S3."""
    test = dataset_test.copy()
    test['prediction'] = y_pred
    test['company'] = test['customer_id'].str.split('_', expand=True)[0]
    recall_scores_by_company = test.groupby('company').apply(compute_recall).reset_index(name='recall_score')
    save_result_set_s3(recall_scores_by_company, 'recall_scores_by_company', type_of_activity, model=model, model_path=True)

def features_target_split(dataset_train, dataset_test):
    """Split the train and test sets into feature matrices and the y_has_purchased target."""
    features_l = ['nb_tickets', 'nb_purchases', 'total_amount', 'nb_suppliers', 'vente_internet_max',
                  'purchase_date_min', 'purchase_date_max', 'time_between_purchase', 'nb_tickets_internet',
                  'fidelity', 'is_email_true', 'opt_in',  # 'is_partner',
                  'gender_female', 'gender_male', 'gender_other', 'nb_campaigns', 'nb_campaigns_opened']
    X_train = dataset_train[features_l]
    y_train = dataset_train[['y_has_purchased']]

    X_test = dataset_test[features_l]
    y_test = dataset_test[['y_has_purchased']]
    return X_train, X_test, y_train, y_test
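
# The pipelines below expect a global `preproc` ColumnTransformer, which is not defined in
# this section. The sketch below is only an editor's illustration of what it could look like,
# built from the transformers imported above; the function name and the numeric/binary column
# split are assumptions based on features_target_split, not the project's actual preprocessor.
def build_example_preprocessor():
    numeric_features = ['nb_tickets', 'nb_purchases', 'total_amount', 'nb_suppliers',
                        'purchase_date_min', 'purchase_date_max', 'time_between_purchase',
                        'nb_tickets_internet', 'fidelity', 'nb_campaigns', 'nb_campaigns_opened']
    binary_features = ['vente_internet_max', 'is_email_true', 'opt_in',
                       'gender_female', 'gender_male', 'gender_other']

    numeric_transformer = Pipeline(steps=[
        ('imputer', SimpleImputer(strategy='median')),   # fill missing numeric values
        ('scaler', StandardScaler())                     # zero mean / unit variance
    ])
    binary_transformer = Pipeline(steps=[
        ('imputer', SimpleImputer(strategy='most_frequent'))
    ])
    return ColumnTransformer(transformers=[
        ('num', numeric_transformer, numeric_features),
        ('bin', binary_transformer, binary_features)
    ])
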
def draw_confusion_matrix(y_test, y_pred, model):
    conf_matrix = confusion_matrix(y_test, y_pred)
    sns.heatmap(conf_matrix, annot=True, fmt='d', cmap='Blues',
                xticklabels=['Class 0', 'Class 1'], yticklabels=['Class 0', 'Class 1'])
    plt.xlabel('Predicted')
    plt.ylabel('Actual')
    plt.title('Confusion Matrix')
    plt.show()
    save_file_s3("Confusion_matrix_", type_of_activity, model)

def draw_roc_curve(y_test, y_pred_prob, model):
    # Compute the false positive rate (FPR) and true positive rate (TPR)
    fpr, tpr, thresholds = roc_curve(y_test, y_pred_prob, pos_label=1)

    # Compute the area under the ROC curve (AUC)
    roc_auc = auc(fpr, tpr)

    plt.figure(figsize=(14, 8))
    plt.plot(fpr, tpr, label="ROC curve (area = %0.3f)" % roc_auc)
    plt.plot([0, 1], [0, 1], color="red", label="Random Baseline", linestyle="--")
    plt.grid(color='gray', linestyle='--', linewidth=0.5)
    plt.xlabel("False Positive Rate")
    plt.ylabel("True Positive Rate")
    plt.title("ROC Curve", size=18)
    plt.legend(loc="lower right")
    plt.show()
    save_file_s3("Roc_curve_", type_of_activity, model)

def draw_calibration_curve(y_test, y_pred_prob, model):
    frac_pos, mean_pred = calibration_curve(y_test, y_pred_prob, n_bins=10)

    # Plot the calibration curve
    plt.plot(mean_pred, frac_pos, 's-', label=model)
    plt.plot([0, 1], [0, 1], 'k--', label='Perfectly calibrated')
    plt.xlabel('Mean predicted value')
    plt.ylabel('Fraction of positive predictions')
    plt.title("Calibration Curve")
    plt.legend()
    plt.show()
    save_file_s3("Calib_curve_", type_of_activity, model)

def draw_features_importance(pipeline, model, randomF=False):
    if randomF:
        coefficients = pipeline.named_steps[model].feature_importances_
    else:
        coefficients = pipeline.named_steps[model].coef_[0]

    feature_names = pipeline.named_steps['preprocessor'].get_feature_names_out()
    # Plot feature importance
    plt.figure(figsize=(10, 6))
    plt.barh(feature_names, coefficients, color='skyblue')
    plt.xlabel("Features' Importance")
    plt.ylabel('Features')
    plt.title("Features' Importance")
    plt.grid(True)
    plt.show()
    save_file_s3("Features_", type_of_activity, model)

def draw_prob_distribution(y_pred_prob, model):
    plt.figure(figsize=(8, 6))
    plt.hist(y_pred_prob, bins=10, range=(0, 1), color='blue', alpha=0.7)

    plt.xlim(0, 1)
    plt.ylim(0, None)

    plt.title('Histogram of probabilities for class 1')
    plt.xlabel('Probability')
    plt.ylabel('Frequency')
    plt.grid(True)
    plt.show()
    save_file_s3("prob_dist_", type_of_activity, model)

def draw_prob_distribution_companies(y_pred_prob, model):
    test = dataset_test.copy()
    test['probability to buy'] = y_pred_prob
    test['company'] = test['customer_id'].str.split('_', expand=True)[0]
    sns.histplot(data=test, x='probability to buy', hue='company', element='step',
                 stat='count', common_norm=False, bins=10, palette='Set1', alpha=1)
    plt.xlim(0, 1)
    plt.ylim(0, None)
    plt.title('Histogram of probabilities for class 1 by company')
    plt.xlabel('Probability')
    plt.ylabel('Frequency')
    plt.grid(True)
    plt.show()
    save_file_s3("prob_dist_companies_", type_of_activity, model)
def pipeline_logreg_benchmark(X_train, y_train, X_test, y_test, model_result):
    """Fit a benchmark logistic regression, log its metrics and plots, and save the model on S3."""
    pipeline = Pipeline(steps=[
        ('preprocessor', preproc),
        ('LogisticRegression_Benchmark', LogisticRegression(solver='saga', class_weight=weight_dict,
                                                            max_iter=5000, n_jobs=-1))
    ])
    pipeline.fit(X_train, y_train)

    y_pred = pipeline.predict(X_test)
    y_pred_prob = pipeline.predict_proba(X_test)[:, 1]

    fpr, tpr, thresholds = roc_curve(y_test, y_pred_prob, pos_label=1)
    model = "LogisticRegression_Benchmark"
    result = pd.DataFrame({"Model": [model],
                           "Accuracy": [accuracy_score(y_test, y_pred)],
                           "Recall": [recall_score(y_test, y_pred)],
                           "F1_score": [f1_score(y_test, y_pred, average="macro")],
                           "AUC": [auc(fpr, tpr)]})
    model_result = pd.concat([model_result, result])
    # compute_recall_companies(dataset_test, y_pred, type_of_activity, model)

    draw_confusion_matrix(y_test, y_pred, model)
    draw_roc_curve(y_test, y_pred_prob, model)
    draw_features_importance(pipeline, 'LogisticRegression_Benchmark')
    draw_prob_distribution(y_pred_prob, model)
    draw_prob_distribution_companies(y_pred_prob, model)
    draw_calibration_curve(y_test, y_pred_prob, model)
    save_model_s3('LogisticRegression_Benchmark', type_of_activity, model, pipeline)
    return model_result

def pipeline_logreg_cv(X_train, y_train, X_test, y_test, model_result):
    """Tune a logistic regression with a recall-scored grid search, log metrics and plots, and save it on S3."""
    y_train = y_train['y_has_purchased']
    param_grid = {'LogisticRegression_cv__C': np.logspace(-10, 6, 17, base=2),
                  'LogisticRegression_cv__penalty': ['l1', 'l2'],
                  'LogisticRegression_cv__class_weight': ['balanced', weight_dict]}
    pipeline = Pipeline(steps=[
        ('preprocessor', preproc),
        ('LogisticRegression_cv', LogisticRegression(solver='saga', max_iter=5000))
    ])
    grid_search = GridSearchCV(pipeline, param_grid, cv=3, scoring=make_scorer(recall_score),
                               error_score='raise', n_jobs=-1)

    grid_search.fit(X_train, y_train)
    y_pred = grid_search.predict(X_test)
    y_pred_prob = grid_search.predict_proba(X_test)[:, 1]
    best_pipeline = grid_search.best_estimator_
    fpr, tpr, thresholds = roc_curve(y_test, y_pred_prob, pos_label=1)
    model = "LogisticRegression_cv"
    result = pd.DataFrame({"Model": [model],
                           "Accuracy": [accuracy_score(y_test, y_pred)],
                           "Recall": [recall_score(y_test, y_pred)],
                           "F1_score": [f1_score(y_test, y_pred, average="macro")],
                           "AUC": [auc(fpr, tpr)]})
    model_result = pd.concat([model_result, result])
    # compute_recall_companies(dataset_test, y_pred, type_of_activity, model)

    draw_confusion_matrix(y_test, y_pred, model)
    draw_roc_curve(y_test, y_pred_prob, model)
    draw_features_importance(best_pipeline, 'LogisticRegression_cv')
    draw_prob_distribution(y_pred_prob, model)
    draw_prob_distribution_companies(y_pred_prob, model)
    draw_calibration_curve(y_test, y_pred_prob, model)
    save_model_s3('LogisticRegression_cv', type_of_activity, model, grid_search)
    return model_result

def pipeline_randomF_benchmark(X_train, y_train, X_test, y_test, model_result):
    """Fit a benchmark random forest, log its metrics and plots, and save the model on S3."""
    pipeline = Pipeline(steps=[
        ('preprocessor', preproc),
        ('randomF', RandomForestClassifier(class_weight=weight_dict, n_jobs=-1))
    ])
    pipeline.fit(X_train, y_train)

    y_pred = pipeline.predict(X_test)
    y_pred_prob = pipeline.predict_proba(X_test)[:, 1]

    fpr, tpr, thresholds = roc_curve(y_test, y_pred_prob, pos_label=1)
    model = "randomF"
    result = pd.DataFrame({"Model": [model],
                           "Accuracy": [accuracy_score(y_test, y_pred)],
                           "Recall": [recall_score(y_test, y_pred)],
                           "F1_score": [f1_score(y_test, y_pred, average="macro")],
                           "AUC": [auc(fpr, tpr)]})
    model_result = pd.concat([model_result, result])
    # compute_recall_companies(dataset_test, y_pred, type_of_activity, model)

    draw_confusion_matrix(y_test, y_pred, model)
    draw_roc_curve(y_test, y_pred_prob, model)
    draw_features_importance(pipeline, 'randomF', randomF=True)
    draw_prob_distribution(y_pred_prob, model)
    draw_prob_distribution_companies(y_pred_prob, model)
    draw_calibration_curve(y_test, y_pred_prob, model)
    save_model_s3('randomF_Benchmark', type_of_activity, model, pipeline)
    return model_result

def pipeline_randomF_cv(X_train, y_train, X_test, y_test, model_result):
    """Tune a random forest with a recall-scored grid search, log metrics and plots, and save it on S3."""
    y_train = y_train['y_has_purchased']
    param_grid = {
        'randomF_cv__n_estimators': [100, 300],
        'randomF_cv__max_features': ['sqrt', 'log2'],
        'randomF_cv__min_samples_split': [2, 10],
        'randomF_cv__min_samples_leaf': [1, 4],
        'randomF_cv__class_weight': [weight_dict]
    }
    pipeline = Pipeline(steps=[
        ('preprocessor', preproc),
        ('randomF_cv', RandomForestClassifier(n_jobs=-1))
    ])
    grid_search = GridSearchCV(pipeline, param_grid, cv=3, scoring=make_scorer(recall_score),
                               error_score='raise', n_jobs=-1)

    grid_search.fit(X_train, y_train)
    y_pred = grid_search.predict(X_test)
    y_pred_prob = grid_search.predict_proba(X_test)[:, 1]
    best_pipeline = grid_search.best_estimator_
    fpr, tpr, thresholds = roc_curve(y_test, y_pred_prob, pos_label=1)
    model = "randomF_cv"
    result = pd.DataFrame({"Model": [model],
                           "Accuracy": [accuracy_score(y_test, y_pred)],
                           "Recall": [recall_score(y_test, y_pred)],
                           "F1_score": [f1_score(y_test, y_pred, average="macro")],
                           "AUC": [auc(fpr, tpr)]})
    model_result = pd.concat([model_result, result])
    # compute_recall_companies(dataset_test, y_pred, type_of_activity, model)

    draw_confusion_matrix(y_test, y_pred, model)
    draw_roc_curve(y_test, y_pred_prob, model)
    draw_features_importance(best_pipeline, 'randomF_cv', randomF=True)
    draw_prob_distribution(y_pred_prob, model)
    draw_prob_distribution_companies(y_pred_prob, model)
    draw_calibration_curve(y_test, y_pred_prob, model)
    save_model_s3('randomF_cv', type_of_activity, model, grid_search)
    return model_result

def pipeline_naiveBayes_benchmark(X_train, y_train, X_test, y_test, model_result):
    """Fit a Gaussian naive Bayes benchmark with empirical class priors, log metrics and plots, and save it on S3."""
    unique_classes, counts = np.unique(y_train, return_counts=True)
    class_priors = counts / counts.sum()
    pipeline = Pipeline(steps=[
        ('preprocessor', preproc),
        ('Naive_Bayes', GaussianNB(priors=class_priors))
    ])
    pipeline.fit(X_train, y_train)

    y_pred = pipeline.predict(X_test)
    y_pred_prob = pipeline.predict_proba(X_test)[:, 1]

    fpr, tpr, thresholds = roc_curve(y_test, y_pred_prob, pos_label=1)
    model = "Naive_Bayes"
    result = pd.DataFrame({"Model": [model],
                           "Accuracy": [accuracy_score(y_test, y_pred)],
                           "Recall": [recall_score(y_test, y_pred)],
                           "F1_score": [f1_score(y_test, y_pred, average="macro")],
                           "AUC": [auc(fpr, tpr)]})
    model_result = pd.concat([model_result, result])
    draw_confusion_matrix(y_test, y_pred, model)
    draw_roc_curve(y_test, y_pred_prob, model)
    draw_prob_distribution(y_pred_prob, model)
    draw_calibration_curve(y_test, y_pred_prob, model)
    save_model_s3('Naive_Bayes_Benchmark', type_of_activity, model, pipeline)
    return model_result
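
# Illustrative usage only (editor's sketch, not part of the original section): a calling
# script would typically chain the helpers above roughly as follows, assuming `fs`,
# `preproc` and `weight_dict` have been defined as described at the top of the file:
#
#   dataset_train, dataset_test = load_train_test(type_of_activity)
#   X_train, X_test, y_train, y_test = features_target_split(dataset_train, dataset_test)
#   model_result = pd.DataFrame(columns=["Model", "Accuracy", "Recall", "F1_score", "AUC"])
#   model_result = pipeline_logreg_benchmark(X_train, y_train, X_test, y_test, model_result)
#   model_result = pipeline_randomF_cv(X_train, y_train, X_test, y_test, model_result)
#   save_result_set_s3(model_result, "results", type_of_activity)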