import pandas as pd
import numpy as np
import os
import io
import s3fs
import re
import pickle
import warnings

from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.calibration import calibration_curve
from sklearn.preprocessing import OneHotEncoder, StandardScaler, MaxAbsScaler, MinMaxScaler
from sklearn.impute import SimpleImputer
from sklearn.model_selection import GridSearchCV
from sklearn.utils import class_weight
from sklearn.metrics import (accuracy_score, confusion_matrix, classification_report,
                             recall_score, make_scorer, f1_score, balanced_accuracy_score,
                             roc_curve, auc, precision_recall_curve, average_precision_score)
from sklearn.exceptions import ConvergenceWarning, DataConversionWarning

import seaborn as sns
import matplotlib.pyplot as plt

# Load the shared helpers (load_train_test, features_target_split, the
# pipeline_*_benchmark/_cv functions and save_result_set_s3) into this namespace
exec(open('utils_ml.py').read())

warnings.filterwarnings('ignore')
warnings.filterwarnings("ignore", category=ConvergenceWarning)
warnings.filterwarnings("ignore", category=DataConversionWarning)

# Choose the type of companies for which you want to run the pipeline
type_of_activity = input('Choose the company type: sport? musique? musee? ')

# Create the filesystem object used to load the train and test sets from S3
S3_ENDPOINT_URL = "https://" + os.environ["AWS_S3_ENDPOINT"]
fs = s3fs.S3FileSystem(client_kwargs={'endpoint_url': S3_ENDPOINT_URL})

dataset_train, dataset_test = load_train_test(type_of_activity)
X_train, X_test, y_train, y_test = features_target_split(dataset_train, dataset_test)
print("Shape train: ", X_train.shape)
print("Shape test: ", X_test.shape)

# Processing: compute balanced class weights to compensate for class imbalance
weights = class_weight.compute_class_weight(class_weight='balanced',
                                            classes=np.unique(y_train['y_has_purchased']),
                                            y=y_train['y_has_purchased'])
weight_dict = {np.unique(y_train['y_has_purchased'])[i]: weights[i]
               for i in range(len(np.unique(y_train['y_has_purchased'])))}

numeric_features = ['nb_tickets', 'nb_purchases', 'total_amount', 'nb_suppliers',
                    'vente_internet_max', 'purchase_date_min', 'purchase_date_max',
                    'time_between_purchase', 'nb_tickets_internet', 'nb_campaigns',
                    'nb_campaigns_opened']

# Transformer for the numeric features
numeric_transformer = Pipeline(steps=[
    # ("imputer", SimpleImputer(strategy="mean")),
    ("scaler", StandardScaler())
])

categorical_features = ['opt_in', 'gender_male', 'gender_female']

# Transformer for the categorical features
categorical_transformer = Pipeline(steps=[
    # ("imputer", SimpleImputer(strategy="most_frequent")),  # Impute missing values with the most frequent
    ("onehot", OneHotEncoder(handle_unknown='ignore', sparse_output=False))
])

preproc = ColumnTransformer(
    transformers=[
        ("num", numeric_transformer, numeric_features),
        ("cat", categorical_transformer, categorical_features)
    ]
)

# Object for storing results
model_result = pd.DataFrame(columns=["Model", "Accuracy", "Recall", "F1_score", "AUC"])

# Naive Bayes
model_result = pipeline_naiveBayes_benchmark(X_train, y_train, X_test, y_test, model_result)
print("Naive Bayes: Done")

# Logistic Regression
model_result = pipeline_logreg_benchmark(X_train, y_train, X_test, y_test, model_result)
print("Logistic: Done")

model_result = pipeline_logreg_cv(X_train, y_train, X_test, y_test, model_result)
print("Logistic CV: Done")

# Random Forest
model_result = pipeline_randomF_benchmark(X_train, y_train, X_test, y_test, model_result)
print("Random Forest: Done")

model_result = pipeline_randomF_cv(X_train, y_train, X_test, y_test, model_result)
print("Random Forest CV: Done")

# Save results
save_result_set_s3(model_result, "resultat", type_of_activity)
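
# --------------------------------------------------------------------------
# Illustrative sketch (not part of the original pipeline): the pipeline_*
# helpers called above are defined in utils_ml.py. Assuming each one fits a
# model wrapped in the shared `preproc` transformer, scores it on the test
# set, and appends a row to `model_result`, a minimal version could look
# like the function below; the actual implementation in utils_ml.py may
# differ. The name and the max_iter value are hypothetical.
def pipeline_logreg_benchmark_sketch(X_train, y_train, X_test, y_test, model_result):
    pipe = Pipeline(steps=[
        ("preprocessor", preproc),
        ("classifier", LogisticRegression(class_weight=weight_dict, max_iter=5000)),
    ])
    pipe.fit(X_train, y_train["y_has_purchased"])
    y_pred = pipe.predict(X_test)
    y_prob = pipe.predict_proba(X_test)[:, 1]
    fpr, tpr, _ = roc_curve(y_test["y_has_purchased"], y_prob)
    row = {
        "Model": "Logistic Regression (sketch)",
        "Accuracy": accuracy_score(y_test["y_has_purchased"], y_pred),
        "Recall": recall_score(y_test["y_has_purchased"], y_pred),
        "F1_score": f1_score(y_test["y_has_purchased"], y_pred),
        "AUC": auc(fpr, tpr),
    }
    return pd.concat([model_result, pd.DataFrame([row])], ignore_index=True)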