# Baseline logit on spectacle companies with statmodels

In [1]:
import pandas as pd
import numpy as np
import os
import s3fs
import re
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report, recall_score
from sklearn.utils import class_weight
from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import StandardScaler, MaxAbsScaler, MinMaxScaler
from sklearn.metrics import make_scorer, f1_score, balanced_accuracy_score
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.metrics import roc_curve, auc, precision_recall_curve, average_precision_score
from sklearn.exceptions import ConvergenceWarning, DataConversionWarning

import statsmodels.api as sm

import pickle
import warnings

In [2]:
# Create filesystem object
S3_ENDPOINT_URL = "https://" + os.environ["AWS_S3_ENDPOINT"]
fs = s3fs.S3FileSystem(client_kwargs={'endpoint_url': S3_ENDPOINT_URL})

In [9]:
def load_train_test():
    BUCKET = "projet-bdc2324-team1/Generalization/musique"
    File_path_train = BUCKET + "/Train_set.csv"
    File_path_test = BUCKET + "/Test_set.csv"
    
    with fs.open( File_path_train, mode="rb") as file_in:
        dataset_train = pd.read_csv(file_in, sep=",")
        # dataset_train['y_has_purchased'] = dataset_train['y_has_purchased'].fillna(0)

    with fs.open(File_path_test, mode="rb") as file_in:
        dataset_test = pd.read_csv(file_in, sep=",")
        # dataset_test['y_has_purchased'] = dataset_test['y_has_purchased'].fillna(0)
    
    return dataset_train, dataset_test

In [4]:
def features_target_split(dataset_train, dataset_test):
    features_l = ['nb_tickets', 'nb_purchases', 'total_amount', 'nb_suppliers', 'vente_internet_max', 'purchase_date_min', 'purchase_date_max', 
            'time_between_purchase', 'nb_tickets_internet', 'fidelity',  'is_email_true', 'opt_in', #'is_partner',
            'gender_female', 'gender_male', 'gender_other', 'nb_campaigns', 'nb_campaigns_opened']
    X_train = dataset_train[features_l]
    y_train = dataset_train[['y_has_purchased']]

    X_test = dataset_test[features_l]
    y_test = dataset_test[['y_has_purchased']]
    return X_train, X_test, y_train, y_test

In [5]:
dataset_train, dataset_test = load_train_test()

  dataset_train = pd.read_csv(file_in, sep=",")
  dataset_test = pd.read_csv(file_in, sep=",")


In [6]:
dataset_train.isna().sum()

customer_id                    0
nb_tickets                     0
nb_purchases                   0
total_amount                   0
nb_suppliers                   0
vente_internet_max             0
purchase_date_min              0
purchase_date_max              0
time_between_purchase          0
nb_tickets_internet            0
street_id                      0
structure_id              327067
mcp_contact_id            135224
fidelity                       0
tenant_id                      0
is_partner                     0
deleted_at                354365
gender                         0
is_email_true                  0
opt_in                         0
last_buying_date          119201
max_price                 119201
ticket_sum                     0
average_price             115193
average_purchase_delay    119203
average_price_basket      119203
average_ticket_basket     119203
total_price                 4008
purchase_count                 0
first_buying_date         119201
country   

In [7]:
X_train, X_test, y_train, y_test = features_target_split(dataset_train, dataset_test)

In [8]:
print("Shape train : ", X_train.shape)
print("Shape test : ", X_test.shape)

Shape train :  (354365, 17)
Shape test :  (151874, 17)


In [10]:
# Compute Weights
weights = class_weight.compute_class_weight(class_weight = 'balanced', classes = np.unique(y_train['y_has_purchased']),
                                            y = y_train['y_has_purchased'])

weight_dict = {np.unique(y_train['y_has_purchased'])[i]: weights[i] for i in range(len(np.unique(y_train['y_has_purchased'])))}
weight_dict

{0.0: 0.5481283836040216, 1.0: 5.694439980716696}

In [58]:
# Calcul des poids inverses à la fréquence des classes
class_counts = np.bincount(y_train['y_has_purchased'])
class_weights = len(y_train['y_has_purchased']) / (2 * class_counts)

# Sélection des poids correspondants à chaque observation
weights = class_weights[y_train['y_has_purchased'].values.astype(int)]
weights

array([0.54812838, 0.54812838, 0.54812838, ..., 5.69443998, 0.54812838,
       0.54812838])

In [65]:
print(2 * weights * class_counts[y_train['y_has_purchased'].values.astype(int)])
print(len(y_train['y_has_purchased']))

[354365. 354365. 354365. ... 354365. 354365. 354365.]
354365


In [124]:
# model logit
X = X_train.astype(int)
# X = sm.add_constant(X.drop("gender_other", axis=1))
y = y_train['y_has_purchased'].values

# print(X,y)

In [125]:
X

Unnamed: 0,nb_tickets,nb_purchases,total_amount,nb_suppliers,vente_internet_max,purchase_date_min,purchase_date_max,time_between_purchase,nb_tickets_internet,fidelity,is_email_true,opt_in,gender_female,gender_male,gender_other,nb_campaigns,nb_campaigns_opened
0,0,0,0,0,0,550,550,-1,0,1,1,1,1,0,0,13,4
1,0,0,0,0,0,550,550,-1,0,0,1,1,0,0,1,10,9
2,0,0,0,0,0,550,550,-1,0,1,1,1,0,1,0,14,0
3,0,0,0,0,0,550,550,-1,0,0,1,0,0,0,1,9,0
4,0,0,0,0,0,550,550,-1,0,0,1,0,0,0,1,4,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
354360,0,0,0,0,0,550,550,-1,0,0,1,0,0,0,1,7,0
354361,0,0,0,0,0,550,550,-1,0,0,1,1,0,1,0,11,2
354362,2,2,50,1,0,91,91,0,0,4,1,0,1,0,0,6,6
354363,1,1,55,1,0,52,52,0,0,1,1,1,0,1,0,3,0


In [138]:
X_train[["purchase_date_min", "purchase_date_max"]].describe()

Unnamed: 0,purchase_date_min,purchase_date_max
count,354365.0,354365.0
mean,406.981861,396.551502
std,189.343612,195.881681
min,0.00964,0.0
25%,188.475293,153.457966
50%,550.0,550.0
75%,550.0,550.0
max,550.0,550.0


In [143]:
(X_train["purchase_date_min"] - X_train["purchase_date_max"]).describe()

count    354365.000000
mean         10.430360
std          56.442718
min           0.000000
25%           0.000000
50%           0.000000
75%           0.000000
max         547.443350
dtype: float64

In [145]:
X_train[X_train["time_between_purchase"]==-1]

Unnamed: 0,nb_tickets,nb_purchases,total_amount,nb_suppliers,vente_internet_max,purchase_date_min,purchase_date_max,time_between_purchase,nb_tickets_internet,fidelity,is_email_true,opt_in,gender_female,gender_male,gender_other,nb_campaigns,nb_campaigns_opened
0,0.0,0.0,0.0,0.0,0.0,550.0,550.0,-1.0,0.0,1,True,True,1,0,0,13.0,4.0
1,0.0,0.0,0.0,0.0,0.0,550.0,550.0,-1.0,0.0,0,True,True,0,0,1,10.0,9.0
2,0.0,0.0,0.0,0.0,0.0,550.0,550.0,-1.0,0.0,1,True,True,0,1,0,14.0,0.0
3,0.0,0.0,0.0,0.0,0.0,550.0,550.0,-1.0,0.0,0,True,False,0,0,1,9.0,0.0
4,0.0,0.0,0.0,0.0,0.0,550.0,550.0,-1.0,0.0,0,True,False,0,0,1,4.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
354358,0.0,0.0,0.0,0.0,0.0,550.0,550.0,-1.0,0.0,0,True,False,1,0,0,1.0,0.0
354359,0.0,0.0,0.0,0.0,0.0,550.0,550.0,-1.0,0.0,0,True,True,0,1,0,12.0,2.0
354360,0.0,0.0,0.0,0.0,0.0,550.0,550.0,-1.0,0.0,0,True,False,0,0,1,7.0,0.0
354361,0.0,0.0,0.0,0.0,0.0,550.0,550.0,-1.0,0.0,0,True,True,0,1,0,11.0,2.0


In [126]:
# Colonnes à standardiser

"""
var_num = ['nb_tickets', 'nb_purchases', "total_amount", "nb_suppliers", "vente_internet_max",
                          "purchase_date_min", "purchase_date_max", "time_between_purchase", "nb_tickets_internet",
                          "fidelity", "nb_campaigns", "nb_campaigns_opened"]
                          """

numeric_features = ['nb_tickets', 'nb_purchases', 'total_amount', 'nb_suppliers', 'vente_internet_max', 'purchase_date_min', 'purchase_date_max', 
            'time_between_purchase', 'nb_tickets_internet', 'fidelity',  'is_email_true', 'opt_in', #'is_partner',
            'gender_female', 'gender_male', 'gender_other', 'nb_campaigns', 'nb_campaigns_opened']

# Standardisation des colonnes sélectionnées
scaler = StandardScaler()
X[var_num] = scaler.fit_transform(X[var_num])
X[numeric_features] = scaler.fit_transform(X[numeric_features])



In [128]:
X

Unnamed: 0,nb_tickets,nb_purchases,total_amount,nb_suppliers,vente_internet_max,purchase_date_min,purchase_date_max,time_between_purchase,nb_tickets_internet,fidelity,is_email_true,opt_in,gender_female,gender_male,gender_other,nb_campaigns,nb_campaigns_opened
0,-0.024425,-0.050722,-0.048383,-0.768294,-0.599511,0.755994,0.783940,-0.192978,-0.264693,-0.004316,0.058193,1.151186,1.071079,-0.775306,-0.434568,0.607945,0.522567
1,-0.024425,-0.050722,-0.048383,-0.768294,-0.599511,0.755994,0.783940,-0.192978,-0.264693,-0.029071,0.058193,1.151186,-0.933638,-0.775306,2.301137,0.306155,1.701843
2,-0.024425,-0.050722,-0.048383,-0.768294,-0.599511,0.755994,0.783940,-0.192978,-0.264693,-0.004316,0.058193,1.151186,-0.933638,1.289813,-0.434568,0.708542,-0.420854
3,-0.024425,-0.050722,-0.048383,-0.768294,-0.599511,0.755994,0.783940,-0.192978,-0.264693,-0.029071,0.058193,-0.868669,-0.933638,-0.775306,2.301137,0.205558,-0.420854
4,-0.024425,-0.050722,-0.048383,-0.768294,-0.599511,0.755994,0.783940,-0.192978,-0.264693,-0.029071,0.058193,-0.868669,-0.933638,-0.775306,2.301137,-0.297426,-0.420854
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
354360,-0.024425,-0.050722,-0.048383,-0.768294,-0.599511,0.755994,0.783940,-0.192978,-0.264693,-0.029071,0.058193,-0.868669,-0.933638,-0.775306,2.301137,0.004365,-0.420854
354361,-0.024425,-0.050722,-0.048383,-0.768294,-0.599511,0.755994,0.783940,-0.192978,-0.264693,-0.029071,0.058193,1.151186,-0.933638,1.289813,-0.434568,0.406752,0.050856
354362,-0.000838,0.092966,-0.009150,1.219633,-0.599511,-1.665887,-1.557073,-0.175269,-0.264693,0.069949,0.058193,-0.868669,1.071079,-0.775306,-0.434568,-0.096232,0.994277
354363,-0.012631,0.021122,-0.005227,1.219633,-0.599511,-1.871668,-1.755983,-0.175269,-0.264693,-0.004316,0.058193,1.151186,-0.933638,1.289813,-0.434568,-0.398023,-0.420854


In [122]:
X

Unnamed: 0,const,nb_tickets,nb_purchases,total_amount,nb_suppliers,vente_internet_max,purchase_date_min,purchase_date_max,time_between_purchase,nb_tickets_internet,fidelity,is_email_true,opt_in,gender_female,gender_male,nb_campaigns,nb_campaigns_opened
0,1.0,-0.024425,-0.050722,-0.048383,-0.768294,-0.599511,0.755994,0.783940,-0.192978,-0.264693,-0.004316,1,1,1,0,0.607945,0.522567
1,1.0,-0.024425,-0.050722,-0.048383,-0.768294,-0.599511,0.755994,0.783940,-0.192978,-0.264693,-0.029071,1,1,0,0,0.306155,1.701843
2,1.0,-0.024425,-0.050722,-0.048383,-0.768294,-0.599511,0.755994,0.783940,-0.192978,-0.264693,-0.004316,1,1,0,1,0.708542,-0.420854
3,1.0,-0.024425,-0.050722,-0.048383,-0.768294,-0.599511,0.755994,0.783940,-0.192978,-0.264693,-0.029071,1,0,0,0,0.205558,-0.420854
4,1.0,-0.024425,-0.050722,-0.048383,-0.768294,-0.599511,0.755994,0.783940,-0.192978,-0.264693,-0.029071,1,0,0,0,-0.297426,-0.420854
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
354360,1.0,-0.024425,-0.050722,-0.048383,-0.768294,-0.599511,0.755994,0.783940,-0.192978,-0.264693,-0.029071,1,0,0,0,0.004365,-0.420854
354361,1.0,-0.024425,-0.050722,-0.048383,-0.768294,-0.599511,0.755994,0.783940,-0.192978,-0.264693,-0.029071,1,1,0,1,0.406752,0.050856
354362,1.0,-0.000838,0.092966,-0.009150,1.219633,-0.599511,-1.665887,-1.557073,-0.175269,-0.264693,0.069949,1,0,1,0,-0.096232,0.994277
354363,1.0,-0.012631,0.021122,-0.005227,1.219633,-0.599511,-1.871668,-1.755983,-0.175269,-0.264693,-0.004316,1,1,0,1,-0.398023,-0.420854


In [133]:
# Création du modèle de régression logistique avec poids équilibrés
# model_logit = sm.Logit(y, X, weights=weights)
model_logit = sm.Logit(y, X)

# Ajustement du modèle aux données
result = model_logit.fit()

# Affichage des résultats
print(result.summary())

  return 1/(1+np.exp(-X))
  return np.sum(np.log(self.cdf(q * linpred)))


         Current function value: inf
         Iterations: 35




                           Logit Regression Results                           
Dep. Variable:                      y   No. Observations:               354365
Model:                          Logit   Df Residuals:                   354349
Method:                           MLE   Df Model:                           15
Date:                Thu, 14 Mar 2024   Pseudo R-squ.:                    -inf
Time:                        10:47:16   Log-Likelihood:                   -inf
converged:                      False   LL-Null:                   -1.0540e+05
Covariance Type:            nonrobust   LLR p-value:                     1.000
                            coef    std err          z      P>|z|      [0.025      0.975]
-----------------------------------------------------------------------------------------
nb_tickets                4.9213      0.267     18.448      0.000       4.398       5.444
nb_purchases             -7.9446      0.140    -56.905      0.000      -8.218      -7.671
total_am

In [130]:
X["gender_female"].unique()

array([ 1.07107945, -0.93363755])

In [131]:
numeric_features = ['nb_tickets', 'nb_purchases', 'total_amount', 'nb_suppliers', 'vente_internet_max', 'purchase_date_min', 'purchase_date_max', 
            'time_between_purchase', 'nb_tickets_internet', 'fidelity',  'is_email_true', 'opt_in', #'is_partner',
            'gender_female', 'gender_male', 'gender_other', 'nb_campaigns', 'nb_campaigns_opened']

numeric_transformer = Pipeline(steps=[
    #("imputer", SimpleImputer(strategy="mean")),  
    ("scaler", StandardScaler()) 
])

categorical_features = ['opt_in']  

# Transformer for the categorical features
categorical_transformer = Pipeline(steps=[
    #("imputer", SimpleImputer(strategy="most_frequent")),  # Impute missing values with the most frequent
    ("onehot", OneHotEncoder(handle_unknown='ignore', sparse_output=False))
])

preproc = ColumnTransformer(
    transformers=[
        ("num", numeric_transformer, numeric_features),
        ("cat", categorical_transformer, categorical_features)
    ]
)

In [105]:
sm.add_constant(X["gender_other"])

Unnamed: 0,const,gender_other
0,1.0,0
1,1.0,1
2,1.0,0
3,1.0,1
4,1.0,1
...,...,...
354360,1.0,1
354361,1.0,0
354362,1.0,0
354363,1.0,0


In [106]:
X.drop("gender_other", axis=1)

Unnamed: 0,nb_tickets,nb_purchases,total_amount,nb_suppliers,vente_internet_max,purchase_date_min,purchase_date_max,time_between_purchase,nb_tickets_internet,fidelity,is_email_true,opt_in,gender_female,gender_male,nb_campaigns,nb_campaigns_opened
0,0,0,0,0,0,550,550,-1,0,1,1,1,1,0,13,4
1,0,0,0,0,0,550,550,-1,0,0,1,1,0,0,10,9
2,0,0,0,0,0,550,550,-1,0,1,1,1,0,1,14,0
3,0,0,0,0,0,550,550,-1,0,0,1,0,0,0,9,0
4,0,0,0,0,0,550,550,-1,0,0,1,0,0,0,4,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
354360,0,0,0,0,0,550,550,-1,0,0,1,0,0,0,7,0
354361,0,0,0,0,0,550,550,-1,0,0,1,1,0,1,11,2
354362,2,2,50,1,0,91,91,0,0,4,1,0,1,0,6,6
354363,1,1,55,1,0,52,52,0,0,1,1,1,0,1,3,0


In [107]:
X

Unnamed: 0,nb_tickets,nb_purchases,total_amount,nb_suppliers,vente_internet_max,purchase_date_min,purchase_date_max,time_between_purchase,nb_tickets_internet,fidelity,is_email_true,opt_in,gender_female,gender_male,gender_other,nb_campaigns,nb_campaigns_opened
0,0,0,0,0,0,550,550,-1,0,1,1,1,1,0,0,13,4
1,0,0,0,0,0,550,550,-1,0,0,1,1,0,0,1,10,9
2,0,0,0,0,0,550,550,-1,0,1,1,1,0,1,0,14,0
3,0,0,0,0,0,550,550,-1,0,0,1,0,0,0,1,9,0
4,0,0,0,0,0,550,550,-1,0,0,1,0,0,0,1,4,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
354360,0,0,0,0,0,550,550,-1,0,0,1,0,0,0,1,7,0
354361,0,0,0,0,0,550,550,-1,0,0,1,1,0,1,0,11,2
354362,2,2,50,1,0,91,91,0,0,4,1,0,1,0,0,6,6
354363,1,1,55,1,0,52,52,0,0,1,1,1,0,1,0,3,0


In [73]:
weights.shape

(354365,)

In [74]:
X.shape

(354365, 17)

In [75]:
y.shape

(354365,)

In [76]:
X_train.isna().sum()

nb_tickets               0
nb_purchases             0
total_amount             0
nb_suppliers             0
vente_internet_max       0
purchase_date_min        0
purchase_date_max        0
time_between_purchase    0
nb_tickets_internet      0
fidelity                 0
is_email_true            0
opt_in                   0
gender_female            0
gender_male              0
gender_other             0
nb_campaigns             0
nb_campaigns_opened      0
dtype: int64

In [80]:
y_train["y_has_purchased"].unique()

array([0., 1.])

In [134]:
X_train

Unnamed: 0,nb_tickets,nb_purchases,total_amount,nb_suppliers,vente_internet_max,purchase_date_min,purchase_date_max,time_between_purchase,nb_tickets_internet,fidelity,is_email_true,opt_in,gender_female,gender_male,gender_other,nb_campaigns,nb_campaigns_opened
0,0.0,0.0,0.0,0.0,0.0,550.000000,550.000000,-1.000000,0.0,1,True,True,1,0,0,13.0,4.0
1,0.0,0.0,0.0,0.0,0.0,550.000000,550.000000,-1.000000,0.0,0,True,True,0,0,1,10.0,9.0
2,0.0,0.0,0.0,0.0,0.0,550.000000,550.000000,-1.000000,0.0,1,True,True,0,1,0,14.0,0.0
3,0.0,0.0,0.0,0.0,0.0,550.000000,550.000000,-1.000000,0.0,0,True,False,0,0,1,9.0,0.0
4,0.0,0.0,0.0,0.0,0.0,550.000000,550.000000,-1.000000,0.0,0,True,False,0,0,1,4.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
354360,0.0,0.0,0.0,0.0,0.0,550.000000,550.000000,-1.000000,0.0,0,True,False,0,0,1,7.0,0.0
354361,0.0,0.0,0.0,0.0,0.0,550.000000,550.000000,-1.000000,0.0,0,True,True,0,1,0,11.0,2.0
354362,2.0,2.0,50.0,1.0,0.0,91.030556,91.020139,0.010417,0.0,4,True,False,1,0,0,6.0,6.0
354363,1.0,1.0,55.0,1.0,0.0,52.284028,52.284028,0.000000,0.0,1,True,True,0,1,0,3.0,0.0
