# Listing header: 103 lines, 3.9 KiB, Python
import pandas as pd
|
|
import numpy as np
|
|
import os
|
|
import io
|
|
import s3fs
|
|
import re
|
|
from sklearn.linear_model import LogisticRegression
|
|
from sklearn.ensemble import RandomForestClassifier
|
|
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report, recall_score
|
|
from sklearn.utils import class_weight
|
|
from sklearn.neighbors import KNeighborsClassifier
|
|
from sklearn.naive_bayes import GaussianNB
|
|
from sklearn.pipeline import Pipeline
|
|
from sklearn.compose import ColumnTransformer
|
|
from sklearn.calibration import calibration_curve
|
|
from sklearn.preprocessing import OneHotEncoder
|
|
from sklearn.impute import SimpleImputer
|
|
from sklearn.model_selection import GridSearchCV
|
|
from sklearn.preprocessing import StandardScaler, MaxAbsScaler, MinMaxScaler
|
|
from sklearn.metrics import make_scorer, f1_score, balanced_accuracy_score
|
|
import seaborn as sns
|
|
import matplotlib.pyplot as plt
|
|
from sklearn.metrics import roc_curve, auc, precision_recall_curve, average_precision_score
|
|
from sklearn.exceptions import ConvergenceWarning, DataConversionWarning
|
|
import pickle
|
|
import warnings
|
|
|
|
|
|
# Execute utils_ml.py in this module's global namespace, so every name it
# defines becomes a global of this script.
# NOTE(review): the helpers called further down (load_train_test,
# features_target_split, pipeline_*_benchmark, save_result_set_s3) are not
# defined or imported in this file -- presumably they come from utils_ml.py.
# NOTE(review): exec() runs whatever the file contains; keep utils_ml.py trusted.
with open('utils_ml.py', encoding='utf-8') as utils_file:
    # The context manager closes the handle even if reading fails.
    utils_source = utils_file.read()

# Compiling with the real filename makes tracebacks point at utils_ml.py
# instead of "<string>".
exec(compile(utils_source, 'utils_ml.py', 'exec'))

# Silence all warnings. The two category-specific filters below are already
# covered by the blanket filter; they are kept so the intent stays explicit
# if the blanket filter is ever removed.
warnings.filterwarnings('ignore')
warnings.filterwarnings("ignore", category=ConvergenceWarning)
warnings.filterwarnings("ignore", category=DataConversionWarning)

# Ask on stdin which type of company the pipeline should be run for.
type_of_activity = input(
    'Choisissez le type de compagnie : sport ? musique ? musee ?'
)

# Load train and test set.
# Build the S3 filesystem object; the endpoint host is read from the
# AWS_S3_ENDPOINT environment variable (a KeyError is raised if it is unset).
# NOTE(review): `fs` is not passed to the helpers below -- presumably they
# read it as a global; confirm in utils_ml.py.
S3_ENDPOINT_URL = f"https://{os.environ['AWS_S3_ENDPOINT']}"
fs = s3fs.S3FileSystem(
    client_kwargs={'endpoint_url': S3_ENDPOINT_URL},
)

dataset_train, dataset_test = load_train_test(type_of_activity)

X_train, X_test, y_train, y_test = features_target_split(
    dataset_train,
    dataset_test,
)

# Quick sanity check of the feature matrices' dimensions.
print("Shape train : ", X_train.shape)
print("Shape test : ", X_test.shape)

# processing

# Distinct labels of the target column, computed once and reused below.
target_classes = np.unique(y_train['y_has_purchased'])

# 'balanced' class weights for the training target; weights[i] is the weight
# of target_classes[i].
weights = class_weight.compute_class_weight(
    class_weight='balanced',
    classes=target_classes,
    y=y_train['y_has_purchased'],
)

# Map each class label to its weight.
weight_dict = dict(zip(target_classes, weights))

# Columns routed to the numeric branch of the preprocessor, in this order.
# NOTE(review): 'opt_in' is listed here and also in categorical_features, so
# it is handed to both branches of the ColumnTransformer -- confirm intended.
numeric_features = [
    'nb_tickets',
    'nb_purchases',
    'total_amount',
    'nb_suppliers',
    'vente_internet_max',
    'purchase_date_min',
    'purchase_date_max',
    'time_between_purchase',
    'nb_tickets_internet',
    'is_email_true',
    'opt_in',
    # 'is_partner',  (currently disabled)
    'gender_female',
    'gender_male',
    'gender_other',
    'nb_campaigns',
    'nb_campaigns_opened',
]

# Numeric branch: scaling only; the mean-imputation step is currently disabled.
numeric_transformer = Pipeline(
    steps=[
        # ("imputer", SimpleImputer(strategy="mean")),
        ("scaler", StandardScaler()),
    ]
)

# Columns routed to the categorical branch of the preprocessor.
categorical_features = ['opt_in']

# Transformer for the categorical features: one-hot encoding with dense
# output; categories unseen during fit are ignored at transform time.
# The most-frequent imputation step is currently disabled.
categorical_transformer = Pipeline(
    steps=[
        # ("imputer", SimpleImputer(strategy="most_frequent")),
        (
            "onehot",
            OneHotEncoder(handle_unknown='ignore', sparse_output=False),
        ),
    ]
)

# Combined preprocessor: the "num" branch handles numeric_features and the
# "cat" branch handles categorical_features.
# NOTE(review): `preproc` is not used again in this script -- presumably the
# pipeline_* helpers read it as a global; confirm in utils_ml.py.
preproc = ColumnTransformer(
    transformers=[
        ("num", numeric_transformer, numeric_features),
        ("cat", categorical_transformer, categorical_features),
    ]
)

# Empty table that collects one set of metrics per model. Each helper below
# receives the current table and its return value replaces it.
model_result = pd.DataFrame(
    columns=["Model", "Accuracy", "Recall", "F1_score", "AUC"],
)

# Naive Bayes
model_result = pipeline_naiveBayes_benchmark(
    X_train, y_train, X_test, y_test, model_result
)
print("Naive Bayes : Done")

# Logistic Regression
model_result = pipeline_logreg_benchmark(
    X_train, y_train, X_test, y_test, model_result
)
print("Logistic : Done")

# The remaining runs are disabled: the triple-quoted string below is a bare
# expression statement, so nothing inside it is executed.
"""
model_result = pipeline_logreg_cv(X_train, y_train, X_test, y_test, model_result)
print("Logistic CV : Done")

# Random Forest
model_result = pipeline_randomF_benchmark(X_train, y_train, X_test, y_test, model_result)
print("Random Forest : Done")
model_result = pipeline_randomF_cv(X_train, y_train, X_test, y_test, model_result)
print("Random Forest CV: Done")
"""

# Save result: hand the metrics table to the storage helper together with the
# "resultat" label and the chosen activity type.
# NOTE(review): judging by its name the helper writes to S3 -- confirm in
# utils_ml.py.
save_result_set_s3(model_result, "resultat", type_of_activity)