# BDC-team-1/5_Modelling.py

import pandas as pd
import numpy as np
import os
import io
import s3fs
import re
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report, recall_score
from sklearn.utils import class_weight
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.calibration import calibration_curve
from sklearn.preprocessing import OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import StandardScaler, MaxAbsScaler, MinMaxScaler
from sklearn.metrics import make_scorer, f1_score, balanced_accuracy_score
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.metrics import roc_curve, auc, precision_recall_curve, average_precision_score
from sklearn.exceptions import ConvergenceWarning, DataConversionWarning
import pickle
import warnings


# Execute utils_ml.py inside this module's namespace.  It is exec'd rather
# than imported so that every name it defines becomes a global of this script
# (the helper functions called below are not defined in this file and
# presumably come from there -- confirm against utils_ml.py).
# NOTE(review): exec runs whatever the file contains; never point this at
# content that is not under the project's control.
with open('utils_ml.py', encoding='utf-8') as _utils_file:
    # The context manager closes the handle even if reading fails.
    _utils_source = _utils_file.read()
# Compiling with the real filename makes tracebacks reference utils_ml.py
# instead of "<string>".
exec(compile(_utils_source, 'utils_ml.py', 'exec'))

# Silence all warnings for the rest of the run.  The blanket filter already
# covers the two category-specific filters that follow; they are kept so the
# intent stays explicit if the blanket filter is ever removed.
warnings.filterwarnings('ignore')
warnings.filterwarnings("ignore", category=ConvergenceWarning)
warnings.filterwarnings("ignore", category=DataConversionWarning)

# Choose the type of companies for which the pipeline is run.
# Surrounding whitespace is stripped so that a stray space typed at the
# prompt does not end up in the value handed to the loading helpers.
type_of_activity = input('Choisissez le type de compagnie : sport ? musique ? musee ?').strip()
# Choose the type of model (same whitespace handling as above).
type_of_model = input('Choisissez le type de model : standard ? premium ?').strip()

# Load the train and test sets.
# The S3 endpoint host is read from the AWS_S3_ENDPOINT environment variable
# (a KeyError is raised if it is not set) and used to build the filesystem
# object.
_s3_endpoint_host = os.environ["AWS_S3_ENDPOINT"]
S3_ENDPOINT_URL = "https://" + _s3_endpoint_host
_s3_client_options = {'endpoint_url': S3_ENDPOINT_URL}
fs = s3fs.S3FileSystem(client_kwargs=_s3_client_options)

# Fetch both datasets for the chosen activity / model type.
dataset_train, dataset_test = load_train_test(
    type_of_activity,
    type_of_model,
)

# Separate features from targets for both datasets.
X_train, X_test, y_train, y_test = features_target_split(
    dataset_train,
    dataset_test,
)

# Report the dimensions of the two feature sets.
for _caption, _features in (("Shape train : ", X_train), ("Shape test : ", X_test)):
    print(_caption, _features.shape)

# processing

# Compute 'balanced' class weights for the purchase target.  The distinct
# classes are extracted once and reused, instead of calling np.unique again
# for every entry of the mapping.
_purchase_target = y_train['y_has_purchased']
_purchase_classes = np.unique(_purchase_target)
weights = class_weight.compute_class_weight(
    class_weight='balanced',
    classes=_purchase_classes,
    y=_purchase_target,
)

# compute_class_weight returns one weight per entry of `classes`, in the same
# order, so zipping pairs each class label with its weight.
weight_dict = dict(zip(_purchase_classes, weights))

# NOTE(review): the argument order here is (model, activity), the reverse of
# the load_train_test call earlier in this script -- confirm it matches the
# signature of preprocess.
preproc = preprocess(type_of_model, type_of_activity)

# Object for storing results: one row per evaluated model.
model_result = pd.DataFrame(columns=["Model", "Accuracy", "Recall", "F1_score", "AUC"])

# Benchmarks in execution order.  Each entry pairs a pipeline helper with the
# progress message printed once that helper has finished.  Every helper is
# called with the train / test splits plus the running results table, and its
# return value replaces that table.
_benchmark_steps = (
    # Naive Bayes
    (pipeline_naiveBayes_benchmark, "Naive Bayes : Done"),
    # Logistic Regression
    (pipeline_logreg_benchmark, "Logistic : Done"),
    (pipeline_logreg_cv, "Logistic CV : Done"),
    # Random Forest
    (pipeline_randomF_benchmark, "Random Forest : Done"),
    (pipeline_randomF_cv, "Random Forest CV: Done"),
)

for _run_pipeline, _done_message in _benchmark_steps:
    model_result = _run_pipeline(X_train, y_train, X_test, y_test, model_result)
    # Checkpoint after every model so that a failure in a later step does not
    # lose the results gathered so far.  The save after the last step doubles
    # as the final save of the complete table.
    save_result_set_s3(model_result, "resultat", type_of_activity, type_of_model)
    print(_done_message)