"""Benchmark several classifiers for one type of company and store the scores.

Script flow (top to bottom):
  1. load the helper functions from ``utils_ml.py`` into this namespace;
  2. ask on stdin which type of company to process;
  3. load the train/test sets and split them into features and target;
  4. build class weights and a preprocessing ``ColumnTransformer``;
  5. run the Naive Bayes, logistic-regression and random-forest pipelines,
     each of which returns the updated ``model_result`` table;
  6. hand the table to ``save_result_set_s3``.

Provenance: file ``0_5_Machine_Learning.py`` created by commit
969cb8ec43bb0119e4b59082b05c49d30d8c8bed (arevelle-ensae,
Mon, 18 Mar 2024 16:23:52 +0000, "add machine learning automatisation").

NOTE(review): everything deliberately stays at module level. The helpers are
exec'd into this module's globals, and names such as ``fs``, ``preproc`` and
``weight_dict`` are never read below, so they are presumably consumed by those
helpers as globals -- confirm before wrapping this script in a ``main()``.
"""

import io
import os
import pickle
import re
import warnings

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import s3fs
import seaborn as sns
from sklearn.calibration import calibration_curve
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestClassifier
from sklearn.exceptions import ConvergenceWarning, DataConversionWarning
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import (
    accuracy_score,
    auc,
    average_precision_score,
    balanced_accuracy_score,
    classification_report,
    confusion_matrix,
    f1_score,
    make_scorer,
    precision_recall_curve,
    recall_score,
    roc_curve,
)
from sklearn.model_selection import GridSearchCV
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import (
    MaxAbsScaler,
    MinMaxScaler,
    OneHotEncoder,
    StandardScaler,
)
from sklearn.utils import class_weight

# Load the helper functions (load_train_test, features_target_split,
# pipeline_*_benchmark / pipeline_*_cv, save_result_set_s3) into this
# namespace. The file is read inside a ``with`` block so the handle is closed,
# and compiled with its real filename so tracebacks point at utils_ml.py.
# NOTE(review): exec runs whatever utils_ml.py contains; only use with a
# trusted, version-controlled copy of that file.
with open('utils_ml.py', encoding='utf-8') as utils_file:
    exec(compile(utils_file.read(), 'utils_ml.py', 'exec'))

# NOTE(review): the first filter already silences every warning, which makes
# the two category-specific filters below redundant; kept as in the original.
warnings.filterwarnings('ignore')
warnings.filterwarnings("ignore", category=ConvergenceWarning)
warnings.filterwarnings("ignore", category=DataConversionWarning)

# Choose the type of companies for which to run the pipeline.
# Surrounding whitespace typed by the user is dropped before the value is used.
type_of_activity = input('Choisissez le type de compagnie : sport ? musique ? musee ?').strip()

# Create the S3 filesystem object; raises KeyError if AWS_S3_ENDPOINT is unset.
S3_ENDPOINT_URL = "https://" + os.environ["AWS_S3_ENDPOINT"]
fs = s3fs.S3FileSystem(client_kwargs={'endpoint_url': S3_ENDPOINT_URL})

# Load the train and test sets, then separate features from the target.
dataset_train, dataset_test = load_train_test(type_of_activity)

X_train, X_test, y_train, y_test = features_target_split(dataset_train, dataset_test)

print("Shape train : ", X_train.shape)
print("Shape test : ", X_test.shape)

# Processing

# Balanced class weights for the target column, keyed by class label.
# The distinct labels are computed once and reused for both calls.
target_classes = np.unique(y_train['y_has_purchased'])
weights = class_weight.compute_class_weight(
    class_weight='balanced',
    classes=target_classes,
    y=y_train['y_has_purchased'],
)
weight_dict = dict(zip(target_classes, weights))

# Columns that go through the numeric transformer.
# ('is_partner' was commented out of this list in the original.)
numeric_features = [
    'nb_tickets', 'nb_purchases', 'total_amount', 'nb_suppliers',
    'vente_internet_max', 'purchase_date_min', 'purchase_date_max',
    'time_between_purchase', 'nb_tickets_internet', 'is_email_true',
    'opt_in',  # 'is_partner',
    'gender_female', 'gender_male', 'gender_other',
    'nb_campaigns', 'nb_campaigns_opened',
]

# Numeric columns are standardised; mean imputation is currently disabled.
numeric_transformer = Pipeline(steps=[
    # ("imputer", SimpleImputer(strategy="mean")),
    ("scaler", StandardScaler()),
])

# NOTE(review): 'opt_in' is also listed in numeric_features, so the
# ColumnTransformer below both scales and one-hot encodes it -- confirm that
# this duplication is intended.
categorical_features = ['opt_in']

# Categorical columns are one-hot encoded; categories unseen during fit are
# ignored at transform time. Most-frequent imputation is currently disabled.
categorical_transformer = Pipeline(steps=[
    # ("imputer", SimpleImputer(strategy="most_frequent")),
    ("onehot", OneHotEncoder(handle_unknown='ignore', sparse_output=False)),
])

preproc = ColumnTransformer(
    transformers=[
        ("num", numeric_transformer, numeric_features),
        ("cat", categorical_transformer, categorical_features),
    ]
)

# Table collecting the results; each pipeline call returns the updated table.
model_result = pd.DataFrame(columns=["Model", "Accuracy", "Recall", "F1_score", "AUC"])

# Naive Bayes
model_result = pipeline_naiveBayes_benchmark(X_train, y_train, X_test, y_test, model_result)

# Logistic Regression
model_result = pipeline_logreg_benchmark(X_train, y_train, X_test, y_test, model_result)
model_result = pipeline_logreg_cv(X_train, y_train, X_test, y_test, model_result)

# Random Forest
model_result = pipeline_randomF_benchmark(X_train, y_train, X_test, y_test, model_result)
model_result = pipeline_randomF_cv(X_train, y_train, X_test, y_test, model_result)

# Save result
save_result_set_s3(model_result, "resultat", type_of_activity)