In [1]:
import pandas as pd
import numpy as np
import os
import s3fs
import re
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report, recall_score
from sklearn.utils import class_weight
from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import StandardScaler, MaxAbsScaler, MinMaxScaler
from sklearn.metrics import make_scorer, f1_score, balanced_accuracy_score
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.metrics import roc_curve, auc, precision_recall_curve, average_precision_score
from sklearn.exceptions import ConvergenceWarning, DataConversionWarning

import statsmodels.api as sm

import pickle
import warnings

In [2]:
# Create filesystem object
S3_ENDPOINT_URL = "https://" + os.environ["AWS_S3_ENDPOINT"]
fs = s3fs.S3FileSystem(client_kwargs={'endpoint_url': S3_ENDPOINT_URL})

In [5]:
def load_train_test():
    BUCKET = "projet-bdc2324-team1/1_Temp/1_0_Modelling_Datasets/musee"
    File_path_train = BUCKET + "/Train_set.csv"
    File_path_test = BUCKET + "/Test_set.csv"
    
    with fs.open( File_path_train, mode="rb") as file_in:
        dataset_train = pd.read_csv(file_in, sep=",")
        # dataset_train['y_has_purchased'] = dataset_train['y_has_purchased'].fillna(0)

    with fs.open(File_path_test, mode="rb") as file_in:
        dataset_test = pd.read_csv(file_in, sep=",")
        # dataset_test['y_has_purchased'] = dataset_test['y_has_purchased'].fillna(0)
    
    return dataset_train, dataset_test


def features_target_split(dataset_train, dataset_test):
    features_l = ['nb_campaigns', 'taux_ouverture_mail', 'prop_purchases_internet', 'nb_tickets', 'nb_purchases', 'total_amount', 'nb_suppliers', 'time_to_open',
                           'purchases_10_2021','purchases_10_2022', 'purchases_11_2021', 'purchases_12_2021','purchases_1_2022', 'purchases_2_2022', 'purchases_3_2022',
                            'purchases_4_2022', 'purchases_5_2021', 'purchases_5_2022', 'purchases_6_2021', 'purchases_6_2022', 'purchases_7_2021', 'purchases_7_2022', 'purchases_8_2021',
                            'purchases_8_2022','purchases_9_2021', 'purchases_9_2022', 'purchase_date_min', 'purchase_date_max', 'nb_targets', 'gender_female', 'gender_male',
                  'achat_internet', 'categorie_age_0_10', 'categorie_age_10_20', 'categorie_age_20_30','categorie_age_30_40',
                           'categorie_age_40_50', 'categorie_age_50_60', 'categorie_age_60_70', 'categorie_age_70_80', 'categorie_age_plus_80','categorie_age_inconnue',
                           'country_fr', 'is_profession_known', 'is_zipcode_known', 'opt_in', 'target_optin', 'target_newsletter', 'target_scolaire', 'target_entreprise', 'target_famille',
                 'target_jeune', 'target_abonne']
    X_train = dataset_train[features_l]
    y_train = dataset_train[['y_has_purchased']]

    X_test = dataset_test[features_l]
    y_test = dataset_test[['y_has_purchased']]
    return X_train, X_test, y_train, y_test

In [6]:
dataset_train, dataset_test = load_train_test()

  dataset_train = pd.read_csv(file_in, sep=",")
  dataset_test = pd.read_csv(file_in, sep=",")


In [7]:
X_train, X_test, y_train, y_test = features_target_split(dataset_train, dataset_test)

In [22]:
X_train.isna().sum()

const                      0
nb_campaigns               0
taux_ouverture_mail        0
prop_purchases_internet    0
nb_tickets                 0
nb_purchases               0
total_amount               0
nb_suppliers               0
time_to_open               0
purchases_10_2021          0
purchases_10_2022          0
purchases_11_2021          0
purchases_12_2021          0
purchases_1_2022           0
purchases_2_2022           0
purchases_3_2022           0
purchases_4_2022           0
purchases_5_2021           0
purchases_5_2022           0
purchases_6_2021           0
purchases_6_2022           0
purchases_7_2021           0
purchases_7_2022           0
purchases_8_2021           0
purchases_8_2022           0
purchases_9_2021           0
purchases_9_2022           0
purchase_date_min          0
purchase_date_max          0
nb_targets                 0
gender_female              0
gender_male                0
achat_internet             0
categorie_age_0_10         0
categorie_age_

In [17]:
most_frequent_value = X_train['country_fr'].mode()[0]
most_frequent_value

1.0

In [21]:
X_train['country_fr'] = X_train['country_fr'].fillna(most_frequent_value)
X_train['time_to_open'] = X_train['time_to_open'].fillna(0)

In [8]:
weights = class_weight.compute_class_weight(class_weight = 'balanced', classes = np.unique(y_train['y_has_purchased']),
                                            y = y_train['y_has_purchased'])

weight_dict = {np.unique(y_train['y_has_purchased'])[i]: weights[i] for i in range(len(np.unique(y_train['y_has_purchased'])))}

In [9]:

class_counts = np.bincount(y_train['y_has_purchased'])
class_weights = len(y_train['y_has_purchased']) / (2 * class_counts)

weights = class_weights[y_train['y_has_purchased'].values.astype(int)]
weights

array([0.52239696, 0.52239696, 0.52239696, ..., 0.52239696, 0.52239696,
       0.52239696])

In [13]:
X_train = sm.add_constant(X_train)

In [26]:
numeric_features = ['nb_campaigns', 'taux_ouverture_mail', 'prop_purchases_internet', 'nb_tickets', 'nb_purchases', 'total_amount', 'nb_suppliers',
                           'purchases_10_2021','purchases_10_2022', 'purchases_11_2021', 'purchases_12_2021','purchases_1_2022', 'purchases_2_2022', 'purchases_3_2022',
                            'purchases_4_2022', 'purchases_5_2021', 'purchases_5_2022', 'purchases_6_2021', 'purchases_6_2022', 'purchases_7_2021', 'purchases_7_2022', 'purchases_8_2021',
                            'purchases_8_2022','purchases_9_2021', 'purchases_9_2022', 'purchase_date_min', 'purchase_date_max', 'nb_targets', 'time_to_open']

In [27]:
scaler = StandardScaler()

X_train_scaled_columns = scaler.fit_transform(X_train[numeric_features])

X_train_scaled = X_train.copy()  #
X_train_scaled[numeric_features] = X_train_scaled_columns

In [28]:
model_logit = sm.Logit(y_train, X_train_scaled)

result = model_logit.fit(weights=weights)

print(result.summary())



Optimization terminated successfully.
         Current function value: 0.136180
         Iterations 9
                           Logit Regression Results                           
Dep. Variable:        y_has_purchased   No. Observations:               434278
Model:                          Logit   Df Residuals:                   434226
Method:                           MLE   Df Model:                           51
Date:                Thu, 04 Apr 2024   Pseudo R-squ.:                  0.2305
Time:                        06:09:09   Log-Likelihood:                -59140.
converged:                       True   LL-Null:                       -76855.
Covariance Type:            nonrobust   LLR p-value:                     0.000
                              coef    std err          z      P>|z|      [0.025      0.975]
-------------------------------------------------------------------------------------------
const                      -4.0679   1.65e+06  -2.46e-06      1.000   -3.24e+06   