Merge branch 'main' into generalization
This commit is contained in:
commit f4b6f23394
@@ -1,5 +1,8 @@
-# Business Data Challenge - Team 1
+# Purpose of the script : Construction of training and test datasets for modelling by company
+# Input : KPI construction function and clean databases in the 0_Input folder
+# Output : Train and test datasets by companies

+# Packages
import pandas as pd
import numpy as np
import os
@@ -9,12 +12,10 @@ import warnings
from datetime import date, timedelta, datetime
from sklearn.model_selection import train_test_split


# Create filesystem object
S3_ENDPOINT_URL = "https://" + os.environ["AWS_S3_ENDPOINT"]
fs = s3fs.S3FileSystem(client_kwargs={'endpoint_url': S3_ENDPOINT_URL})


# Import KPI construction functions
exec(open('0_KPI_functions.py').read())

@@ -24,53 +25,69 @@ warnings.filterwarnings('ignore')

def dataset_construction(min_date, end_features_date, max_date, directory_path):

-    # Import customerplus
+    # Import of cleaned and merged datasets
    df_customerplus_clean_0 = display_input_databases(directory_path, file_name = "customerplus_cleaned")
    df_campaigns_information = display_input_databases(directory_path, file_name = "campaigns_information", datetime_col = ['opened_at', 'sent_at', 'campaign_sent_at'])
    df_products_purchased_reduced = display_input_databases(directory_path, file_name = "products_purchased_reduced", datetime_col = ['purchase_date'])
+    df_target_information = display_input_databases(directory_path, file_name = "target_information")

-    # if directory_path == "101":
+    # Dates in datetime format
-    #     df_products_purchased_reduced_1 = display_databases(directory_path, file_name = "products_purchased_reduced_1", datetime_col = ['purchase_date'])
-    #     df_products_purchased_reduced = pd.concat([df_products_purchased_reduced, df_products_purchased_reduced_1])

-    # Filtre de cohérence pour la mise en pratique de notre méthode
    max_date = pd.to_datetime(max_date, utc = True, format = 'ISO8601')
    end_features_date = pd.to_datetime(end_features_date, utc = True, format = 'ISO8601')
    min_date = pd.to_datetime(min_date, utc = True, format = 'ISO8601')

-    #Filtre de la base df_campaigns_information
+    # Filter for database df_campaigns_information
-    df_campaigns_information = df_campaigns_information[(df_campaigns_information['sent_at'] <= end_features_date) & (df_campaigns_information['sent_at'] >= min_date)]
+    df_campaigns_information = df_campaigns_information[(df_campaigns_information['sent_at'] < end_features_date) & (df_campaigns_information['sent_at'] >= min_date)]
    df_campaigns_information['opened_at'][df_campaigns_information['opened_at'] >= end_features_date] = np.datetime64('NaT')

-    #Filtre de la base df_products_purchased_reduced
+    # Filter for database df_products_purchased_reduced
-    df_products_purchased_features = df_products_purchased_reduced[(df_products_purchased_reduced['purchase_date'] <= end_features_date) & (df_products_purchased_reduced['purchase_date'] >= min_date)]
+    df_products_purchased_features = df_products_purchased_reduced[(df_products_purchased_reduced['purchase_date'] < end_features_date) & (df_products_purchased_reduced['purchase_date'] >= min_date)]

    print("Data filtering : SUCCESS")

-    # Fusion de l'ensemble et creation des KPI
+    # Building and merging features

-    # KPI sur les campagnes publicitaires
+    # Campaigns features
    df_campaigns_kpi = campaigns_kpi_function(campaigns_information = df_campaigns_information, max_date = end_features_date)

-    # KPI sur le comportement d'achat
+    # Purchasing behavior features
    df_tickets_kpi = tickets_kpi_function(tickets_information = df_products_purchased_features)

-    # KPI sur les données socio-démographiques
+    # Socio-demographic features
    df_customerplus_clean = customerplus_kpi_function(customerplus_clean = df_customerplus_clean_0)

+    # Targets features
+    df_targets_kpi = targets_KPI(df_target = df_target_information)

    print("KPIs construction : SUCCESS")

-    # Fusion avec KPI liés au customer
+    # Merge - campaigns features
    df_customer = pd.merge(df_customerplus_clean, df_campaigns_kpi, on = 'customer_id', how = 'left')

    # Fill NaN values
-    df_customer[['nb_campaigns', 'nb_campaigns_opened']] = df_customer[['nb_campaigns', 'nb_campaigns_opened']].fillna(0)
+    df_customer[['nb_campaigns', 'nb_campaigns_opened', 'taux_ouverture_mail']] = df_customer[['nb_campaigns', 'nb_campaigns_opened', 'taux_ouverture_mail']].fillna(0)
+    df_customer['time_to_open'] = df_customer['time_to_open'].fillna(df_customer['time_to_open'].mean())

-    # Fusion avec KPI liés au comportement d'achat
+    # Merge - targets features
-    df_customer_product = pd.merge(df_tickets_kpi, df_customer, on = 'customer_id', how = 'outer')
+    df_customer = pd.merge(df_customer, df_targets_kpi, on = 'customer_id', how = 'left')

    # Fill NaN values
-    df_customer_product[['nb_tickets', 'nb_purchases', 'total_amount', 'nb_suppliers', 'vente_internet_max', 'nb_tickets_internet']] = df_customer_product[['nb_tickets', 'nb_purchases', 'total_amount', 'nb_suppliers', 'vente_internet_max', 'nb_tickets_internet']].fillna(0)
+    targets_columns = list(df_targets_kpi.columns)
+    targets_columns.remove('customer_id')

+    df_customer[targets_columns] = df_customer[targets_columns].fillna(0)

+    # We standardise the number of targets closely linked to the company's operations
+    df_customer['nb_targets'] = (df_customer['nb_targets'] - (df_customer['nb_targets'].mean())) / (df_customer['nb_targets'].std())

+    # Merge - purchasing behavior features
+    df_customer_product = pd.merge(df_customer, df_tickets_kpi, on = 'customer_id', how = 'left')

+    # Fill NaN values
+    special_fill_nan = ['customer_id', 'purchase_date_min', 'purchase_date_max', 'time_between_purchase']
+    simple_fill_nan = [column for column in list(df_tickets_kpi.columns) if column not in special_fill_nan]

+    df_customer_product[simple_fill_nan] = df_customer_product[simple_fill_nan].fillna(0)

    max_interval = (end_features_date - min_date) / np.timedelta64(1, 'D') + 1
    df_customer_product[['purchase_date_max', 'purchase_date_min']] = df_customer_product[['purchase_date_max', 'purchase_date_min']].fillna(max_interval)
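As a side note (not part of the commit), a quick check of the fallback value max_interval takes with the dates defined further down in this script: customers with no ticket in the feature window are filled with the largest possible "days since purchase".

# Illustration only; mirrors the max_interval computation above with the script's dates.
import numpy as np
import pandas as pd

min_date = pd.to_datetime("2021-05-01", utc=True)
end_features_date = pd.to_datetime("2022-11-01", utc=True)

max_interval = (end_features_date - min_date) / np.timedelta64(1, 'D') + 1
print(max_interval)  # 550.0 -> value used to fill purchase_date_min / purchase_date_max for non-buyers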
@@ -82,9 +99,9 @@ def dataset_construction(min_date, end_features_date, max_date, directory_path):
    print("Explanatory variable construction : SUCCESS")

    # 2. Construction of the explained variable
-    df_products_purchased_to_predict = df_products_purchased_reduced[(df_products_purchased_reduced['purchase_date'] <= max_date) & (df_products_purchased_reduced['purchase_date'] > end_features_date)]
+    df_products_purchased_to_predict = df_products_purchased_reduced[(df_products_purchased_reduced['purchase_date'] < max_date) & (df_products_purchased_reduced['purchase_date'] >= end_features_date)]

-    # Indicatrice d'achat
+    # Construction of the dependent variable
    df_products_purchased_to_predict['y_has_purchased'] = 1

    y = df_products_purchased_to_predict[['customer_id', 'y_has_purchased']].drop_duplicates()
@@ -103,28 +120,24 @@ def dataset_construction(min_date, end_features_date, max_date, directory_path):
    return dataset


## Exportation

+# Sectors
companies = {'musee' : ['1', '2', '3', '4'], # , '101'
             'sport': ['5', '6', '7', '8', '9'],
             'musique' : ['10', '11', '12', '13', '14']}

+# Chosen sector
type_of_comp = input('Choisissez le type de compagnie : sport ? musique ? musee ?')
list_of_comp = companies[type_of_comp]

-# Dossier d'exportation
-BUCKET_OUT = f'projet-bdc2324-team1/Generalization/{type_of_comp}'

-# Create test dataset and train dataset for sport companies
+# Export folder
+BUCKET_OUT = f'projet-bdc2324-team1/Generalization_v2/{type_of_comp}'

-#start_date, end_of_features, final_date = df_coverage_modelization(list_of_comp, coverage_features = 0.7)
+# start_date, end_of_features, final_date = df_coverage_modelization(list_of_comp, coverage_train = 0.7)

+# Dates used for the construction of features and the dependent variable
start_date = "2021-05-01"
end_of_features = "2022-11-01"
final_date = "2023-11-01"

+# Anonymous customer to be deleted from the datasets
anonymous_customer = {'1' : '1_1', '2' : '2_12184', '3' : '3_1', '4' : '4_2', '101' : '101_1',
                      '5' : '5_191835', '6' : '6_591412', '7' : '7_49632', '8' : '8_1942', '9' : '9_19683',
                      '10' : '10_19521', '11' : '11_36', '12' : '12_1706757', '13' : '13_8422', '14' : '14_6354'}
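For readability, a small sketch (not part of the commit) of the time windows these three dates define, given the strict upper bounds introduced in dataset_construction above:

# Illustration only: feature window [start_date, end_of_features), target window [end_of_features, final_date).
import pandas as pd

start = pd.to_datetime("2021-05-01", utc=True, format='ISO8601')
end_features = pd.to_datetime("2022-11-01", utc=True, format='ISO8601')
final = pd.to_datetime("2023-11-01", utc=True, format='ISO8601')

purchase = pd.to_datetime("2022-11-01", utc=True, format='ISO8601')
print((purchase >= start) and (purchase < end_features))   # False: not in the feature window
print((purchase >= end_features) and (purchase < final))   # True: counts towards y_has_purchased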
@@ -133,33 +146,23 @@ for company in list_of_comp:
    dataset = dataset_construction(min_date = start_date, end_features_date = end_of_features,
                                   max_date = final_date, directory_path = company)

-    # On retire le client anonyme
+    # Deletion of the anonymous customer
    dataset = dataset[dataset['customer_id'] != anonymous_customer[company]]

+    # Split between train and test
-    # #train test set
-    # np.random.seed(42)
-    # split_ratio = 0.7
-    # split_index = int(len(dataset) * split_ratio)
-    # dataset = dataset.sample(frac=1).reset_index(drop=True)
-    # dataset_train = dataset.iloc[:split_index]
-    # dataset_test = dataset.iloc[split_index:]

    dataset_train, dataset_test = train_test_split(dataset, test_size=0.3, random_state=42)

    # Dataset Test
-    # Exportation
+    # Export
    FILE_KEY_OUT_S3 = "dataset_test" + company + ".csv"
    FILE_PATH_OUT_S3 = BUCKET_OUT + "/Test_set/" + FILE_KEY_OUT_S3

    with fs.open(FILE_PATH_OUT_S3, 'w') as file_out:
        dataset_test.to_csv(file_out, index = False)

-    print("Exportation dataset test : SUCCESS")
+    print("Export of dataset test : SUCCESS")

    # Dataset train

    # Export
    FILE_KEY_OUT_S3 = "dataset_train" + company + ".csv"
    FILE_PATH_OUT_S3 = BUCKET_OUT + "/Train_set/" + FILE_KEY_OUT_S3

@@ -167,7 +170,7 @@ for company in list_of_comp:
    with fs.open(FILE_PATH_OUT_S3, 'w') as file_out:
        dataset_train.to_csv(file_out, index = False)

-    print("Exportation dataset train : SUCCESS")
+    print("Export of dataset train : SUCCESS")


-print("FIN DE LA GENERATION DES DATASETS : SUCCESS")
+print("End of dataset generation for ", type_of_comp, " companies : SUCCESS")
@@ -21,7 +21,7 @@ warnings.filterwarnings('ignore')

# functions
def generate_test_set(type_of_comp):
-    file_path_list = fs.ls(f"projet-bdc2324-team1/Generalization/{type_of_comp}/Test_set")
+    file_path_list = fs.ls(f"projet-bdc2324-team1/Generalization_v2/{type_of_comp}/Test_set")
    test_set = pd.DataFrame()
    for file in file_path_list:
        print(file)

@@ -32,7 +32,7 @@ def generate_test_set(type_of_comp):

def generate_train_set(type_of_comp):
-    file_path_list = fs.ls(f"projet-bdc2324-team1/Generalization/{type_of_comp}/Train_set")
+    file_path_list = fs.ls(f"projet-bdc2324-team1/Generalization_v2/{type_of_comp}/Train_set")
    train_set = pd.DataFrame()
    for file in file_path_list:
        print(file)

@@ -43,7 +43,7 @@ def generate_train_set(type_of_comp):

type_of_comp = input('Choisissez le type de compagnie : sport ? musique ? musee ?')
-BUCKET_OUT = f'projet-bdc2324-team1/Generalization/{type_of_comp}/'
+BUCKET_OUT = f'projet-bdc2324-team1/Generalization_v2/{type_of_comp}/'

# create test and train datasets
test_set = generate_test_set(type_of_comp)
@@ -74,7 +74,7 @@ def preprocessing_customerplus(directory_path):
    cleaning_date(customerplus_copy, 'last_visiting_date')

    # Variable selection
-    customerplus_copy.drop(['lastname', 'firstname', 'birthdate', 'language', 'email', 'civility', 'note', 'extra', 'reference', 'extra_field', 'need_reload', 'preferred_category', 'preferred_supplier', 'preferred_formula', 'mcp_contact_id', 'last_visiting_date', 'deleted_at'], axis = 1, inplace=True)
+    customerplus_copy.drop(['lastname', 'firstname', 'birthdate', 'language', 'email', 'civility', 'note', 'extra', 'reference', 'extra_field', 'need_reload'], axis = 1, inplace=True) # 'preferred_category', 'preferred_supplier', 'preferred_formula', 'mcp_contact_id', 'last_visiting_date', 'deleted_at', 'last_buying_date', 'max_price', 'ticket_sum', 'average_price', 'average_purchase_delay' , 'average_price_basket', 'average_ticket_basket', 'total_price', 'purchase_count', 'first_buying_date', 'fidelity'
    customerplus_copy.rename(columns = {'id' : 'customer_id'}, inplace = True)

    return customerplus_copy
@@ -13,14 +13,14 @@ def display_input_databases(directory_path, file_name, datetime_col = None):
        df = pd.read_csv(file_in, sep=",", parse_dates = datetime_col, date_parser=custom_date_parser)
    return df

-def campaigns_kpi_function(campaigns_information = None, max_date = None):
+def campaigns_kpi_function(campaigns_information = None, max_date = "2023-12-01"):

    # Number of mail campaigns
    nb_campaigns = campaigns_information[['customer_id', 'campaign_name']].groupby('customer_id').count().reset_index()
    nb_campaigns.rename(columns = {'campaign_name' : 'nb_campaigns'}, inplace = True)

    # Average time to open (in hours)
-    campaigns_information['time_to_open'] = (pd.to_datetime(campaigns_information['opened_at'], utc = True, format = 'ISO8601') - pd.to_datetime(campaigns_information['delivered_at'], utc = True, format = 'ISO8601')) / np.timedelta64(1, 'h')
+    campaigns_information['time_to_open'] = ((pd.to_datetime(campaigns_information['opened_at'], utc = True, format = 'ISO8601') - pd.to_datetime(campaigns_information['delivered_at'], utc = True, format = 'ISO8601')) / np.timedelta64(1, 'h'))
    campaigns_information['time_to_open'] = campaigns_information['time_to_open'].fillna((pd.to_datetime(campaigns_information['delivered_at'], utc = True, format = 'ISO8601') - pd.to_datetime(max_date, utc = True, format = 'ISO8601')) / np.timedelta64(1, 'h'))

    time_to_open = campaigns_information[['customer_id', 'time_to_open']].groupby('customer_id').mean().reset_index()
@@ -44,7 +44,6 @@ def campaigns_kpi_function(campaigns_information = None, max_date = None):

    return campaigns_reduced


def tickets_kpi_function(tickets_information = None):

    tickets_information_copy = tickets_information.copy()
@@ -100,6 +99,8 @@ def customerplus_kpi_function(customerplus_clean = None):
    })
    gender_dummies = pd.get_dummies(customerplus_clean["gender_label"], prefix='gender').astype(int)
    customerplus_clean = pd.concat([customerplus_clean, gender_dummies], axis=1)
+    customerplus_clean.drop(columns = "gender", inplace = True)


    # Age
    customerplus_clean['categorie_age_0_10'] = ((customerplus_clean['age'] >= 0) & (customerplus_clean['age'] < 10)).astype(int)
@@ -112,19 +113,53 @@ def customerplus_kpi_function(customerplus_clean = None):
    customerplus_clean['categorie_age_70_80'] = ((customerplus_clean['age'] >= 70) & (customerplus_clean['age'] < 80)).astype(int)
    customerplus_clean['categorie_age_plus_80'] = (customerplus_clean['age'] >= 80).astype(int)
    customerplus_clean['categorie_age_inconnue'] = customerplus_clean['age'].apply(lambda x: 1 if pd.isna(x) else 0)
+    # customerplus_clean.drop(columns = "age", inplace = True)

    # Mailing consent
    customerplus_clean['opt_in'] = customerplus_clean['opt_in'].astype(int)

    # Indicator for whether the individual lives in France
    customerplus_clean["country_fr"] = customerplus_clean["country"].apply(lambda x : int(x=="fr") if pd.notna(x) else np.nan)
+    # customerplus_clean.drop(columns = "country", inplace = True)

    customerplus_clean['is_profession_known'] = customerplus_clean['profession'].notna().astype(int)
+    # customerplus_clean.drop(columns = "profession", inplace = True)

    customerplus_clean['is_zipcode_known'] = customerplus_clean['zipcode'].notna().astype(int)
+    # customerplus_clean.drop(columns = "zipcode", inplace = True)

-    # Dummy if the customer has a structure id (tags)
-    # customerplus_clean['has_tags'] = customerplus_clean['structure_id'].apply(lambda x: 1 if not pd.isna(x) else 0)

    return customerplus_clean


+def targets_KPI(df_target = None):
+
+    df_target['target_name'] = df_target['target_name'].fillna('').str.lower()
+
+    # Target name categories for musee companies
+    df_target['target_jeune'] = df_target['target_name'].str.contains('|'.join(['jeune', 'pass_culture', 'etudiant', '12-25 ans', 'student', 'jeunesse']), case=False).astype(int)
+    df_target['target_optin'] = df_target['target_name'].str.contains('|'.join(['optin' ,'opt-in']), case=False).astype(int)
+    df_target['target_optout'] = df_target['target_name'].str.contains('|'.join(['optout', 'unsubscribed']), case=False).astype(int)
+    df_target['target_scolaire'] = df_target['target_name'].str.contains('|'.join(['scolaire' , 'enseignant', 'chercheur', 'schulen', 'école']), case=False).astype(int)
+    df_target['target_entreprise'] = df_target['target_name'].str.contains('|'.join(['b2b', 'btob', 'cse']), case=False).astype(int)
+    df_target['target_famille'] = df_target['target_name'].str.contains('|'.join(['famille', 'enfants', 'family']), case=False).astype(int)
+    df_target['target_newsletter'] = df_target['target_name'].str.contains('|'.join(['nl', 'newsletter']), case=False).astype(int)

+    # Target name category for sport companies
+    df_target['target_abonne'] = ((
+        df_target['target_name']
+        .str.contains('|'.join(['abo', 'adh']), case=False)
+        & ~df_target['target_name'].str.contains('|'.join(['hors abo', 'anciens abo']), case=False)
+    ).astype(int))

+    df_target_categorie = df_target.groupby('customer_id')[['target_jeune', 'target_optin', 'target_optout', 'target_scolaire', 'target_entreprise', 'target_famille', 'target_newsletter', 'target_abonne']].max()

+    target_agg = df_target.groupby('customer_id').agg(
+        nb_targets=('target_name', 'nunique')  # tuples are used to specify the output column names
+        # all_targets=('target_name', concatenate_names),
+        # all_target_types=('target_type_name', concatenate_names)
+    ).reset_index()

+    target_agg = pd.merge(target_agg, df_target_categorie, how='left', on='customer_id')

+    return target_agg
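To make the pattern matching in targets_KPI above concrete, a tiny self-contained example (not part of the commit) of how '|'.join(...) builds an alternation pattern for str.contains:

# Illustration only: how the '|'.join(...) patterns used in targets_KPI behave.
import pandas as pd

names = pd.Series(['newsletter musees', 'abonnés 2022', 'hors abo', 'optin b2b']).str.lower()

# 'abo|adh' matches any name containing "abo" or "adh"
is_abo = names.str.contains('|'.join(['abo', 'adh']), case=False)
# the exclusion used for target_abonne removes "hors abo" / "anciens abo"
target_abonne = (is_abo & ~names.str.contains('|'.join(['hors abo', 'anciens abo']), case=False)).astype(int)

print(target_abonne.tolist())  # [0, 1, 0, 0]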
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long
@@ -65,7 +65,7 @@
},
{
"cell_type": "code",
-"execution_count": 9,
+"execution_count": 3,
"id": "2f0d08c9-5b26-4eff-9c89-4a46f427dbf7",
"metadata": {},
"outputs": [],
@@ -115,9 +115,9 @@
"name": "stderr",
"output_type": "stream",
"text": [
-"/tmp/ipykernel_570/3642896088.py:7: DtypeWarning: Columns (38) have mixed types. Specify dtype option on import or set low_memory=False.\n",
+"/tmp/ipykernel_426/3642896088.py:7: DtypeWarning: Columns (38) have mixed types. Specify dtype option on import or set low_memory=False.\n",
"  dataset_train = pd.read_csv(file_in, sep=\",\")\n",
-"/tmp/ipykernel_570/3642896088.py:11: DtypeWarning: Columns (38) have mixed types. Specify dtype option on import or set low_memory=False.\n",
+"/tmp/ipykernel_426/3642896088.py:11: DtypeWarning: Columns (38) have mixed types. Specify dtype option on import or set low_memory=False.\n",
"  dataset_test = pd.read_csv(file_in, sep=\",\")\n"
]
}
@@ -228,7 +228,7 @@
},
{
"cell_type": "code",
-"execution_count": 10,
+"execution_count": 9,
"id": "6224fd31-c190-4168-b395-e0bf5806d79d",
"metadata": {},
"outputs": [

@@ -238,7 +238,7 @@
"{0.0: 0.5481283836040216, 1.0: 5.694439980716696}"
]
},
-"execution_count": 10,
+"execution_count": 9,
"metadata": {},
"output_type": "execute_result"
}
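The dictionary {0.0: 0.548…, 1.0: 5.694…} shown in this output is a per-class weight for the imbalanced y_has_purchased target. A minimal sketch (not part of the notebook, assuming "balanced" weights n_samples / (n_classes * class_count)) of how such a dict can be obtained:

# Illustration only: balanced class weights for an imbalanced binary target.
import numpy as np
from sklearn.utils.class_weight import compute_class_weight

y = np.array([0]*10 + [1]*1)  # toy imbalanced target; the notebook uses y_has_purchased
classes = np.unique(y)
weights = compute_class_weight(class_weight='balanced', classes=classes, y=y)
weight_dict = dict(zip(classes, weights))  # e.g. {0: 0.55, 1: 5.5}
print(weight_dict)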
@@ -254,7 +254,7 @@
},
{
"cell_type": "code",
-"execution_count": 58,
+"execution_count": 10,
"id": "4680f202-979e-483f-89b8-9df877203bcf",
"metadata": {},
"outputs": [

@@ -265,7 +265,7 @@
" 0.54812838])"
]
},
-"execution_count": 58,
+"execution_count": 10,
"metadata": {},
"output_type": "execute_result"
}
@@ -282,7 +282,7 @@
},
{
"cell_type": "code",
-"execution_count": 65,
+"execution_count": 11,
"id": "5f747be4-e70b-491c-8f0a-46cb278a2dee",
"metadata": {},
"outputs": [

@@ -311,7 +311,7 @@
},
{
"cell_type": "code",
-"execution_count": 258,
+"execution_count": 12,
"id": "ab25a901-28da-4504-a7d1-bf41fa5068bc",
"metadata": {},
"outputs": [

@@ -650,7 +650,7 @@
"[354365 rows x 17 columns]"
]
},
-"execution_count": 258,
+"execution_count": 12,
"metadata": {},
"output_type": "execute_result"
}

@@ -662,7 +662,7 @@
},
{
"cell_type": "code",
-"execution_count": 259,
+"execution_count": 13,
"id": "648fb542-0186-493d-b274-be2c26a11967",
"metadata": {},
"outputs": [],

@@ -677,7 +677,7 @@
},
{
"cell_type": "code",
-"execution_count": 260,
+"execution_count": 14,
"id": "978b9ebc-aa97-41d7-a48f-d1f79c1ed482",
"metadata": {},
"outputs": [

@@ -1016,7 +1016,7 @@
"[354365 rows x 17 columns]"
]
},
-"execution_count": 260,
+"execution_count": 14,
"metadata": {},
"output_type": "execute_result"
}
@@ -1510,12 +1510,14 @@
"\n",
"- variables to drop: fidelity (values too large, exp -> +inf; also based on information not available over the study period, only over the evaluation period), time between purchase (its construction should be revisited), gender_other (collinear with the other gender variables)\n",
"- add an intercept\n",
-"- no need to standardise for now, but to do once we move to the LASSO model "
+"- no need to standardise for now, but to do once we move to the LASSO model\n",
+"\n",
+"#### To be copied into the pipeline -> section 2 bis"
]
},
{
"cell_type": "code",
-"execution_count": 266,
+"execution_count": 15,
"id": "e6c8ccc7-6ab8-4e3c-af28-e71d17c07bcb",
"metadata": {},
"outputs": [
@@ -1817,7 +1819,7 @@
"[354365 rows x 15 columns]"
]
},
-"execution_count": 266,
+"execution_count": 15,
"metadata": {},
"output_type": "execute_result"
}

@@ -1831,7 +1833,7 @@
},
{
"cell_type": "code",
-"execution_count": 267,
+"execution_count": 16,
"id": "0e968aa1-fbec-47db-b570-4730ef7eebf2",
"metadata": {},
"outputs": [

@@ -1847,8 +1849,8 @@
"Dep. Variable: y No. Observations: 354365\n",
"Model: Logit Df Residuals: 354350\n",
"Method: MLE Df Model: 14\n",
-"Date: Fri, 15 Mar 2024 Pseudo R-squ.: 0.2112\n",
+"Date: Thu, 21 Mar 2024 Pseudo R-squ.: 0.2112\n",
-"Time: 10:07:29 Log-Likelihood: -83135.\n",
+"Time: 07:57:46 Log-Likelihood: -83135.\n",
"converged: True LL-Null: -1.0540e+05\n",
"Covariance Type: nonrobust LLR p-value: 0.000\n",
"=======================================================================================\n",

@@ -1887,7 +1889,7 @@
},
{
"cell_type": "code",
-"execution_count": 268,
+"execution_count": 17,
"id": "2475f2fe-3d1f-4845-9ede-0416dac83271",
"metadata": {},
"outputs": [],

@@ -1908,7 +1910,7 @@
},
{
"cell_type": "code",
-"execution_count": 269,
+"execution_count": 18,
"id": "696fcc04-e5df-45dc-a1b9-57c30d4d671d",
"metadata": {},
"outputs": [

@@ -2210,7 +2212,7 @@
"[354365 rows x 15 columns]"
]
},
-"execution_count": 269,
+"execution_count": 18,
"metadata": {},
"output_type": "execute_result"
}

@@ -2221,7 +2223,7 @@
},
{
"cell_type": "code",
-"execution_count": 289,
+"execution_count": 19,
"id": "54421677-640f-4f37-9a0d-d9a2cc3572b0",
"metadata": {},
"outputs": [

@@ -2237,8 +2239,8 @@
"Dep. Variable: y No. Observations: 354365\n",
"Model: Logit Df Residuals: 354350\n",
"Method: MLE Df Model: 14\n",
-"Date: Fri, 15 Mar 2024 Pseudo R-squ.: 0.2112\n",
+"Date: Thu, 21 Mar 2024 Pseudo R-squ.: 0.2112\n",
-"Time: 10:26:14 Log-Likelihood: -83135.\n",
+"Time: 07:58:13 Log-Likelihood: -83135.\n",
"converged: True LL-Null: -1.0540e+05\n",
"Covariance Type: nonrobust LLR p-value: 0.000\n",
"=======================================================================================\n",

@@ -2276,12 +2278,226 @@
"print(result.summary())"
]
},
+{
+"cell_type": "code",
+"execution_count": 48,
+"id": "13cc3362-7bb2-46fa-8bd8-e5a8e53260b8",
+"metadata": {},
+"outputs": [
+{
+"name": "stdout",
+"output_type": "stream",
+"text": [
+"Optimization terminated successfully (Exit mode 0)\n",
+" Current function value: 0.23562928627877766\n",
+" Iterations: 240\n",
+" Function evaluations: 243\n",
+" Gradient evaluations: 240\n",
+"const 0.000000e+00\n",
+"nb_tickets 2.477006e-01\n",
+"nb_purchases 1.636902e-03\n",
+"total_amount 8.839088e-04\n",
+"nb_suppliers 1.906550e-65\n",
+"vente_internet_max 0.000000e+00\n",
+"purchase_date_min 0.000000e+00\n",
+"purchase_date_max 0.000000e+00\n",
+"nb_tickets_internet 7.232680e-112\n",
+"is_email_true 8.202187e-08\n",
+"opt_in 0.000000e+00\n",
+"gender_female 1.624424e-170\n",
+"gender_male 4.961315e-220\n",
+"nb_campaigns 6.276733e-205\n",
+"nb_campaigns_opened 2.228531e-176\n",
+"dtype: float64\n",
+" Logit Regression Results \n",
+"==============================================================================\n",
+"Dep. Variable: y No. Observations: 354365\n",
+"Model: Logit Df Residuals: 354350\n",
+"Method: MLE Df Model: 14\n",
+"Date: Thu, 21 Mar 2024 Pseudo R-squ.: 0.2111\n",
+"Time: 10:45:37 Log-Likelihood: -83152.\n",
+"converged: True LL-Null: -1.0540e+05\n",
+"Covariance Type: nonrobust LLR p-value: 0.000\n",
+"=======================================================================================\n",
+" coef std err z P>|z| [0.025 0.975]\n",
+"---------------------------------------------------------------------------------------\n",
+"const -3.1162 0.081 -38.383 0.000 -3.275 -2.957\n",
+"nb_tickets -0.0136 0.012 -1.156 0.248 -0.037 0.009\n",
+"nb_purchases -0.0385 0.012 -3.149 0.002 -0.063 -0.015\n",
+"total_amount 0.0588 0.018 3.325 0.001 0.024 0.094\n",
+"nb_suppliers 0.1638 0.010 17.085 0.000 0.145 0.183\n",
+"vente_internet_max -0.8651 0.011 -82.182 0.000 -0.886 -0.844\n",
+"purchase_date_min 0.5790 0.015 39.391 0.000 0.550 0.608\n",
+"purchase_date_max -1.4088 0.016 -89.101 0.000 -1.440 -1.378\n",
+"nb_tickets_internet 0.2857 0.013 22.475 0.000 0.261 0.311\n",
+"is_email_true 0.4224 0.079 5.363 0.000 0.268 0.577\n",
+"opt_in -1.9818 0.019 -106.856 0.000 -2.018 -1.945\n",
+"gender_female 0.6553 0.024 27.835 0.000 0.609 0.701\n",
+"gender_male 0.7578 0.024 31.663 0.000 0.711 0.805\n",
+"nb_campaigns 0.2835 0.009 30.547 0.000 0.265 0.302\n",
+"nb_campaigns_opened 0.2061 0.007 28.315 0.000 0.192 0.220\n",
+"=======================================================================================\n"
+]
+}
+],
+"source": [
+"# 2.bis we do the same for a logit model with a penalty\n",
+"# no need to redefine the model, just call fit_regularized\n",
+"\n",
+"# without specification, the optimal alpha is determined by cross validation\n",
+"# replace alpha=32 with the optimal value found by cross validation in the pipeline with .best_params\n",
+"# careful: in scikit-learn the hyperparameter is C = 1/alpha, do not forget to take the inverse of this optimal C\n",
+"\n",
+"result = model_logit.fit_regularized(method='l1', alpha = 32)\n",
+"\n",
+"print(result.pvalues)\n",
+"print(result.summary())"
+]
+},
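The comments in the cell above map scikit-learn's C to statsmodels' alpha. A minimal, illustrative sketch (not part of the commit; the toy data stands in for the notebook's X and y) of picking C by cross-validation and reusing it as alpha = 1 / C:

# Illustration only: choose C by cross-validation in scikit-learn,
# then pass alpha = 1 / C to statsmodels' fit_regularized.
import statsmodels.api as sm
from sklearn.datasets import make_classification
from sklearn.linear_model import LogisticRegressionCV

# toy data standing in for the notebook's X (without the constant) and y
X_toy, y_toy = make_classification(n_samples=2000, n_features=8, random_state=0)

logit_cv = LogisticRegressionCV(Cs=10, penalty='l1', solver='liblinear', cv=5)
logit_cv.fit(X_toy, y_toy)

best_C = logit_cv.C_[0]
best_alpha = 1.0 / best_C  # statsmodels' L1 strength is the inverse of scikit-learn's C

# statsmodels needs the constant added explicitly, as in the notebook
result = sm.Logit(y_toy, sm.add_constant(X_toy)).fit_regularized(method='l1', alpha=best_alpha)
print(best_C, best_alpha)
print(result.summary())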
+{
+"cell_type": "markdown",
+"id": "8c3dec50-7b9d-40f6-83b6-6cae26962cf8",
+"metadata": {},
+"source": [
+"### Other method : take into account the weights ! Pb : with this method, no penalty allowed"
+]
+},
+{
+"cell_type": "code",
+"execution_count": 247,
+"id": "2e3ca381-54e3-445b-bb37-d7ce953cb856",
+"metadata": {},
+"outputs": [],
+"source": [
+"# define a function to generate summaries of logit model\n",
+"\n",
+"def model_logit(X, y, weight_dict, add_constant=False) :\n",
+"    # Generate sample weights based on class weights computed earlier\n",
+"    sample_weights = np.array([weight_dict[class_] for class_ in y])\n",
+"\n",
+"    if add_constant :\n",
+"        X_const = sm.add_constant(X)\n",
+"    else :\n",
+"        X_const = X\n",
+"    \n",
+"    # Use GLM from statsmodels with Binomial family for logistic regression\n",
+"    model = sm.GLM(y, X_const, family=sm.families.Binomial(), freq_weights=sample_weights)\n",
+"    \n",
+"    # fit without penalty\n",
+"    result = model.fit()\n",
+"\n",
+"    result_summary = result.summary()\n",
+"    \n",
+"    return result_summary"
+]
+},
+{
+"cell_type": "code",
+"execution_count": 248,
+"id": "4cd424a0-7c55-47ff-840e-1354e8dcf863",
+"metadata": {},
+"outputs": [
+{
+"name": "stdout",
+"output_type": "stream",
+"text": [
+" Generalized Linear Model Regression Results \n",
+"==============================================================================\n",
+"Dep. Variable: y No. Observations: 354365\n",
+"Model: GLM Df Residuals: 354350\n",
+"Model Family: Binomial Df Model: 14\n",
+"Link Function: Logit Scale: 1.0000\n",
+"Method: IRLS Log-Likelihood: -1.8693e+05\n",
+"Date: Thu, 21 Mar 2024 Deviance: 3.7387e+05\n",
+"Time: 13:19:33 Pearson chi2: 1.97e+16\n",
+"No. Iterations: 100 Pseudo R-squ. (CS): 0.2820\n",
+"Covariance Type: nonrobust \n",
+"=======================================================================================\n",
+" coef std err z P>|z| [0.025 0.975]\n",
+"---------------------------------------------------------------------------------------\n",
+"const -1.3943 0.062 -22.456 0.000 -1.516 -1.273\n",
+"nb_tickets -0.3312 0.016 -20.967 0.000 -0.362 -0.300\n",
+"nb_purchases 0.9258 0.098 9.491 0.000 0.735 1.117\n",
+"total_amount 0.8922 0.042 21.393 0.000 0.810 0.974\n",
+"nb_suppliers 0.2238 0.007 32.137 0.000 0.210 0.237\n",
+"vente_internet_max -0.7453 0.007 -100.473 0.000 -0.760 -0.731\n",
+"purchase_date_min 0.7123 0.015 46.063 0.000 0.682 0.743\n",
+"purchase_date_max -1.3328 0.017 -79.297 0.000 -1.366 -1.300\n",
+"nb_tickets_internet 0.1784 0.011 16.366 0.000 0.157 0.200\n",
+"is_email_true 0.8635 0.061 14.086 0.000 0.743 0.984\n",
+"opt_in -1.7487 0.010 -174.737 0.000 -1.768 -1.729\n",
+"gender_female 0.8084 0.013 60.803 0.000 0.782 0.835\n",
+"gender_male 0.8731 0.014 64.332 0.000 0.846 0.900\n",
+"nb_campaigns 0.1751 0.006 31.101 0.000 0.164 0.186\n",
+"nb_campaigns_opened 0.2962 0.005 54.145 0.000 0.285 0.307\n",
+"=======================================================================================\n"
+]
+}
+],
+"source": [
+"# with the function\n",
+"\n",
+"# 1. logit with weights\n",
+"results_logit_weight = model_logit(X,y,weight_dict=weight_dict)\n",
+"print(results_logit_weight)"
+]
+},
+{
+"cell_type": "code",
+"execution_count": 252,
+"id": "84dd6242-a9c3-4dee-a58b-abc5f1c6f8fa",
+"metadata": {},
+"outputs": [
+{
+"name": "stdout",
+"output_type": "stream",
+"text": [
+" Generalized Linear Model Regression Results \n",
+"==============================================================================\n",
+"Dep. Variable: y No. Observations: 354365\n",
+"Model: GLM Df Residuals: 354350\n",
+"Model Family: Binomial Df Model: 14\n",
+"Link Function: Logit Scale: 1.0000\n",
+"Method: IRLS Log-Likelihood: -83141.\n",
+"Date: Thu, 21 Mar 2024 Deviance: 1.6628e+05\n",
+"Time: 13:20:06 Pearson chi2: 4.52e+15\n",
+"No. Iterations: 8 Pseudo R-squ. (CS): 0.1180\n",
+"Covariance Type: nonrobust \n",
+"=======================================================================================\n",
+" coef std err z P>|z| [0.025 0.975]\n",
+"---------------------------------------------------------------------------------------\n",
+"const -3.6025 0.091 -39.755 0.000 -3.780 -3.425\n",
+"nb_tickets -0.0230 0.010 -2.191 0.028 -0.044 -0.002\n",
+"nb_purchases -0.0519 0.014 -3.609 0.000 -0.080 -0.024\n",
+"total_amount 0.0799 0.021 3.841 0.000 0.039 0.121\n",
+"nb_suppliers 0.1694 0.010 17.662 0.000 0.151 0.188\n",
+"vente_internet_max -0.8764 0.011 -82.965 0.000 -0.897 -0.856\n",
+"purchase_date_min 0.5881 0.015 39.936 0.000 0.559 0.617\n",
+"purchase_date_max -1.4197 0.016 -89.592 0.000 -1.451 -1.389\n",
+"nb_tickets_internet 0.2895 0.013 22.652 0.000 0.264 0.315\n",
+"is_email_true 0.8651 0.088 9.797 0.000 0.692 1.038\n",
+"opt_in -1.9976 0.019 -107.305 0.000 -2.034 -1.961\n",
+"gender_female 0.7032 0.024 29.395 0.000 0.656 0.750\n",
+"gender_male 0.8071 0.024 33.201 0.000 0.759 0.855\n",
+"nb_campaigns 0.2850 0.009 30.633 0.000 0.267 0.303\n",
+"nb_campaigns_opened 0.2061 0.007 28.245 0.000 0.192 0.220\n",
+"=======================================================================================\n"
+]
+}
+],
+"source": [
+"# 2. logit without weights\n",
+"\n",
+"results_logit = model_logit(X.drop(\"const\", axis=1),y,weight_dict={0:1, 1:1}, add_constant=True)\n",
+"print(results_logit)"
+]
+},
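The markdown cell above notes that the weighted statsmodels GLM route does not allow a penalty. As a hedged aside (not something the notebook does): scikit-learn's LogisticRegression can combine class weighting with an L1 penalty, for example:

# Illustration only: class weighting and an L1 penalty at the same time with scikit-learn.
from sklearn.datasets import make_classification
from sklearn.linear_model import LogisticRegression

X_toy, y_toy = make_classification(n_samples=2000, n_features=8, weights=[0.9, 0.1], random_state=0)

# class_weight='balanced' reproduces the n_samples / (n_classes * count) weighting;
# an explicit dict such as {0: 0.55, 1: 5.69} works as well.
clf = LogisticRegression(penalty='l1', solver='liblinear', class_weight='balanced', C=1/32)
clf.fit(X_toy, y_toy)
print(clf.coef_)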
{
"cell_type": "markdown",
"id": "36c5e770-72b3-4482-ad61-45b511a11f06",
"metadata": {},
"source": [
-"## graphique LASSO - quelles variables sont impotantes dans le modèle ? "
+"## graphique LASSO - quelles variables sont importantes dans le modèle ? "
]
},
{