From 969cb8ec43bb0119e4b59082b05c49d30d8c8bed Mon Sep 17 00:00:00 2001
From: arevelle-ensae <alexis.revelle@ensae.fr>
Date: Mon, 18 Mar 2024 16:23:52 +0000
Subject: [PATCH] add machine learning automatisation

---
 0_5_Machine_Learning.py | 97 +++++++++++++++++++++++++++++++++++++++++
 1 file changed, 97 insertions(+)
 create mode 100644 0_5_Machine_Learning.py

diff --git a/0_5_Machine_Learning.py b/0_5_Machine_Learning.py
new file mode 100644
index 0000000..f0828c4
--- /dev/null
+++ b/0_5_Machine_Learning.py
@@ -0,0 +1,97 @@
+import pandas as pd
+import numpy as np
+import os
+import io
+import s3fs
+import re
+from sklearn.linear_model import LogisticRegression
+from sklearn.ensemble import RandomForestClassifier
+from sklearn.metrics import accuracy_score, confusion_matrix, classification_report, recall_score
+from sklearn.utils import class_weight
+from sklearn.neighbors import KNeighborsClassifier
+from sklearn.naive_bayes import GaussianNB
+from sklearn.pipeline import Pipeline
+from sklearn.compose import ColumnTransformer
+from sklearn.calibration import calibration_curve
+from sklearn.preprocessing import OneHotEncoder
+from sklearn.impute import SimpleImputer
+from sklearn.model_selection import GridSearchCV
+from sklearn.preprocessing import StandardScaler, MaxAbsScaler, MinMaxScaler
+from sklearn.metrics import make_scorer, f1_score, balanced_accuracy_score
+import seaborn as sns
+import matplotlib.pyplot as plt
+from sklearn.metrics import roc_curve, auc, precision_recall_curve, average_precision_score
+from sklearn.exceptions import ConvergenceWarning, DataConversionWarning
+import pickle
+import warnings
+
+with open('utils_ml.py', encoding='utf-8') as utils_source:  # context manager guarantees the file handle is closed
+    exec(utils_source.read())  # NOTE(review): exec (not import) runs utils_ml.py in this module's namespace; the helpers called below presumably come from it and may rely on globals defined here — confirm before switching to an import
+
+warnings.filterwarnings('ignore')  # blanket filter; it already covers the two categories listed below
+warnings.filterwarnings("ignore", category=ConvergenceWarning)
+warnings.filterwarnings("ignore", category=DataConversionWarning)
+
+# Ask the user which type of company the pipeline should be run for
+type_of_activity = input('Choisissez le type de compagnie : sport ? musique ? musee ?')
+
+# Create the S3 filesystem object (endpoint host read from AWS_S3_ENDPOINT; KeyError if unset),
+# then load the train and test sets for the chosen type of activity
+S3_ENDPOINT_URL = "https://" + os.environ["AWS_S3_ENDPOINT"]
+fs = s3fs.S3FileSystem(client_kwargs={'endpoint_url': S3_ENDPOINT_URL})  # not referenced again in this file; presumably read by the exec'd helpers — TODO confirm
+
+dataset_train, dataset_test = load_train_test(type_of_activity )
+
+X_train, X_test, y_train, y_test = features_target_split(dataset_train, dataset_test)
+
+print("Shape train : ", X_train.shape)
+print("Shape test : ", X_test.shape)
+
+# processing
+# Balanced class weights keyed by label; the label set is computed once and reused.
+purchase_classes = np.unique(y_train['y_has_purchased'])
+weights = class_weight.compute_class_weight(class_weight='balanced', classes=purchase_classes,
+                                            y=y_train['y_has_purchased'])
+weight_dict = dict(zip(purchase_classes, weights))
+
+# Columns scaled by numeric_transformer below ('is_partner' is commented out of the list)
+numeric_features = ['nb_tickets', 'nb_purchases', 'total_amount', 'nb_suppliers', 'vente_internet_max', 'purchase_date_min', 'purchase_date_max', 
+            'time_between_purchase', 'nb_tickets_internet', 'is_email_true', 'opt_in', #'is_partner',
+            'gender_female', 'gender_male', 'gender_other', 'nb_campaigns', 'nb_campaigns_opened']
+
+numeric_transformer = Pipeline(steps=[
+    #("imputer", SimpleImputer(strategy="mean")),  # mean imputation step, currently disabled
+    ("scaler", StandardScaler()) 
+])
+
+categorical_features = ['opt_in']  # NOTE(review): 'opt_in' is also in numeric_features, so it gets both scaled and one-hot encoded — confirm this is intended
+
+# Transformer for the categorical features: dense one-hot encoding; categories not seen during fit are ignored
+categorical_transformer = Pipeline(steps=[
+    #("imputer", SimpleImputer(strategy="most_frequent")),  # Impute missing values with the most frequent (currently disabled)
+    ("onehot", OneHotEncoder(handle_unknown='ignore', sparse_output=False))
+])
+
+preproc = ColumnTransformer(  # not referenced again in this file; presumably used by the exec'd pipeline helpers — TODO confirm
+    transformers=[
+        ("num", numeric_transformer, numeric_features),
+        ("cat", categorical_transformer, categorical_features)
+    ]
+)
+
+# Results table; every pipeline call below receives it and returns the updated version
+model_result = pd.DataFrame(columns=["Model", "Accuracy", "Recall", "F1_score", "AUC"])
+
+# Pipelines are run in the listed order, all on the same train/test split
+benchmark_pipelines = (
+    pipeline_naiveBayes_benchmark,  # Naive Bayes
+    pipeline_logreg_benchmark,      # Logistic Regression
+    pipeline_logreg_cv,
+    pipeline_randomF_benchmark,     # Random Forest
+    pipeline_randomF_cv,
+)
+for run_pipeline in benchmark_pipelines:
+    model_result = run_pipeline(X_train, y_train, X_test, y_test, model_result)
+
+# Save result
+save_result_set_s3(model_result, "resultat", type_of_activity)
\ No newline at end of file