Merge branch 'main' into generalization
This commit is contained in:
commit f4b6f23394
@@ -1,5 +1,8 @@
-# Business Data Challenge - Team 1
+# Purpose of the script : Construction of training and test datasets for modelling by company
+# Input : KPI construction function and clean databases in the 0_Input folder
+# Output : Train and test datasets by companies

+# Packages
import pandas as pd
import numpy as np
import os
@@ -9,12 +12,10 @@ import warnings
from datetime import date, timedelta, datetime
from sklearn.model_selection import train_test_split


# Create filesystem object
S3_ENDPOINT_URL = "https://" + os.environ["AWS_S3_ENDPOINT"]
fs = s3fs.S3FileSystem(client_kwargs={'endpoint_url': S3_ENDPOINT_URL})


# Import KPI construction functions
exec(open('0_KPI_functions.py').read())

@@ -24,53 +25,69 @@ warnings.filterwarnings('ignore')

def dataset_construction(min_date, end_features_date, max_date, directory_path):

-    # Import customerplus
+    # Import of cleaned and merged datasets
    df_customerplus_clean_0 = display_input_databases(directory_path, file_name = "customerplus_cleaned")
    df_campaigns_information = display_input_databases(directory_path, file_name = "campaigns_information", datetime_col = ['opened_at', 'sent_at', 'campaign_sent_at'])
    df_products_purchased_reduced = display_input_databases(directory_path, file_name = "products_purchased_reduced", datetime_col = ['purchase_date'])
+    df_target_information = display_input_databases(directory_path, file_name = "target_information")

-    # if directory_path == "101":
+    # Dates in datetime format
-    #     df_products_purchased_reduced_1 = display_databases(directory_path, file_name = "products_purchased_reduced_1", datetime_col = ['purchase_date'])
-    #     df_products_purchased_reduced = pd.concat([df_products_purchased_reduced, df_products_purchased_reduced_1])

-    # Filtre de cohérence pour la mise en pratique de notre méthode
    max_date = pd.to_datetime(max_date, utc = True, format = 'ISO8601')
    end_features_date = pd.to_datetime(end_features_date, utc = True, format = 'ISO8601')
    min_date = pd.to_datetime(min_date, utc = True, format = 'ISO8601')

-    #Filtre de la base df_campaigns_information
+    # Filter for database df_campaigns_information
-    df_campaigns_information = df_campaigns_information[(df_campaigns_information['sent_at'] <= end_features_date) & (df_campaigns_information['sent_at'] >= min_date)]
+    df_campaigns_information = df_campaigns_information[(df_campaigns_information['sent_at'] < end_features_date) & (df_campaigns_information['sent_at'] >= min_date)]
    df_campaigns_information['opened_at'][df_campaigns_information['opened_at'] >= end_features_date] = np.datetime64('NaT')

-    #Filtre de la base df_products_purchased_reduced
+    # Filter for database df_products_purchased_reduced
-    df_products_purchased_features = df_products_purchased_reduced[(df_products_purchased_reduced['purchase_date'] <= end_features_date) & (df_products_purchased_reduced['purchase_date'] >= min_date)]
+    df_products_purchased_features = df_products_purchased_reduced[(df_products_purchased_reduced['purchase_date'] < end_features_date) & (df_products_purchased_reduced['purchase_date'] >= min_date)]

    print("Data filtering : SUCCESS")

-    # Fusion de l'ensemble et creation des KPI
+    # Building and merging features

-    # KPI sur les campagnes publicitaires
+    # Campaigns features
    df_campaigns_kpi = campaigns_kpi_function(campaigns_information = df_campaigns_information, max_date = end_features_date)

-    # KPI sur le comportement d'achat
+    # Purchasing behavior features
    df_tickets_kpi = tickets_kpi_function(tickets_information = df_products_purchased_features)

-    # KPI sur les données socio-démographiques
+    # Socio-demographic features
    df_customerplus_clean = customerplus_kpi_function(customerplus_clean = df_customerplus_clean_0)

+    # Targets features
+    df_targets_kpi = targets_KPI(df_target = df_target_information)

    print("KPIs construction : SUCCESS")

-    # Fusion avec KPI liés au customer
+    # Merge - campaigns features
    df_customer = pd.merge(df_customerplus_clean, df_campaigns_kpi, on = 'customer_id', how = 'left')

    # Fill NaN values
-    df_customer[['nb_campaigns', 'nb_campaigns_opened']] = df_customer[['nb_campaigns', 'nb_campaigns_opened']].fillna(0)
+    df_customer[['nb_campaigns', 'nb_campaigns_opened', 'taux_ouverture_mail']] = df_customer[['nb_campaigns', 'nb_campaigns_opened', 'taux_ouverture_mail']].fillna(0)
+    df_customer['time_to_open'] = df_customer['time_to_open'].fillna(df_customer['time_to_open'].mean())

-    # Fusion avec KPI liés au comportement d'achat
+    # Merge - targets features
-    df_customer_product = pd.merge(df_tickets_kpi, df_customer, on = 'customer_id', how = 'outer')
+    df_customer = pd.merge(df_customer, df_targets_kpi, on = 'customer_id', how = 'left')

    # Fill NaN values
-    df_customer_product[['nb_tickets', 'nb_purchases', 'total_amount', 'nb_suppliers', 'vente_internet_max', 'nb_tickets_internet']] = df_customer_product[['nb_tickets', 'nb_purchases', 'total_amount', 'nb_suppliers', 'vente_internet_max', 'nb_tickets_internet']].fillna(0)
+    targets_columns = list(df_targets_kpi.columns)
+    targets_columns.remove('customer_id')

+    df_customer[targets_columns] = df_customer[targets_columns].fillna(0)

+    # We standardise the number of targets closely linked to the company's operations
+    df_customer['nb_targets'] = (df_customer['nb_targets'] - (df_customer['nb_targets'].mean())) / (df_customer['nb_targets'].std())

+    # Merge - purchasing behavior features
+    df_customer_product = pd.merge(df_customer, df_tickets_kpi, on = 'customer_id', how = 'left')

+    # Fill NaN values
+    special_fill_nan = ['customer_id', 'purchase_date_min', 'purchase_date_max', 'time_between_purchase']
+    simple_fill_nan = [column for column in list(df_tickets_kpi.columns) if column not in special_fill_nan]

+    df_customer_product[simple_fill_nan] = df_customer_product[simple_fill_nan].fillna(0)

    max_interval = (end_features_date - min_date) / np.timedelta64(1, 'D') + 1
    df_customer_product[['purchase_date_max', 'purchase_date_min']] = df_customer_product[['purchase_date_max', 'purchase_date_min']].fillna(max_interval)
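As a side note (not part of the commit), a quick check of the fallback value max_interval takes with the dates defined further down in this script: customers with no ticket in the feature window are filled with the largest possible "days since purchase".

# Illustration only; mirrors the max_interval computation above with the script's dates.
import numpy as np
import pandas as pd

min_date = pd.to_datetime("2021-05-01", utc=True)
end_features_date = pd.to_datetime("2022-11-01", utc=True)

max_interval = (end_features_date - min_date) / np.timedelta64(1, 'D') + 1
print(max_interval)  # 550.0 -> value used to fill purchase_date_min / purchase_date_max for non-buyers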
@@ -82,9 +99,9 @@ def dataset_construction(min_date, end_features_date, max_date, directory_path):
    print("Explanatory variable construction : SUCCESS")

    # 2. Construction of the explained variable
-    df_products_purchased_to_predict = df_products_purchased_reduced[(df_products_purchased_reduced['purchase_date'] <= max_date) & (df_products_purchased_reduced['purchase_date'] > end_features_date)]
+    df_products_purchased_to_predict = df_products_purchased_reduced[(df_products_purchased_reduced['purchase_date'] < max_date) & (df_products_purchased_reduced['purchase_date'] >= end_features_date)]

-    # Indicatrice d'achat
+    # Construction of the dependent variable
    df_products_purchased_to_predict['y_has_purchased'] = 1

    y = df_products_purchased_to_predict[['customer_id', 'y_has_purchased']].drop_duplicates()
@@ -103,28 +120,24 @@ def dataset_construction(min_date, end_features_date, max_date, directory_path):
    return dataset


## Exportation

+# Sectors
companies = {'musee' : ['1', '2', '3', '4'], # , '101'
             'sport': ['5', '6', '7', '8', '9'],
             'musique' : ['10', '11', '12', '13', '14']}

+# Chosen sector
type_of_comp = input('Choisissez le type de compagnie : sport ? musique ? musee ?')
list_of_comp = companies[type_of_comp]

-# Dossier d'exportation
-BUCKET_OUT = f'projet-bdc2324-team1/Generalization/{type_of_comp}'

-# Create test dataset and train dataset for sport companies
+# Export folder
+BUCKET_OUT = f'projet-bdc2324-team1/Generalization_v2/{type_of_comp}'

-#start_date, end_of_features, final_date = df_coverage_modelization(list_of_comp, coverage_features = 0.7)
+# start_date, end_of_features, final_date = df_coverage_modelization(list_of_comp, coverage_train = 0.7)

+# Dates used for the construction of features and the dependent variable
start_date = "2021-05-01"
end_of_features = "2022-11-01"
final_date = "2023-11-01"

+# Anonymous customer to be deleted from the datasets
anonymous_customer = {'1' : '1_1', '2' : '2_12184', '3' : '3_1', '4' : '4_2', '101' : '101_1',
                      '5' : '5_191835', '6' : '6_591412', '7' : '7_49632', '8' : '8_1942', '9' : '9_19683',
                      '10' : '10_19521', '11' : '11_36', '12' : '12_1706757', '13' : '13_8422', '14' : '14_6354'}
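For readability, a small sketch (not part of the commit) of the time windows these three dates define, given the strict upper bounds introduced in dataset_construction above:

# Illustration only: feature window [start_date, end_of_features), target window [end_of_features, final_date).
import pandas as pd

start = pd.to_datetime("2021-05-01", utc=True, format='ISO8601')
end_features = pd.to_datetime("2022-11-01", utc=True, format='ISO8601')
final = pd.to_datetime("2023-11-01", utc=True, format='ISO8601')

purchase = pd.to_datetime("2022-11-01", utc=True, format='ISO8601')
print((purchase >= start) and (purchase < end_features))   # False: not in the feature window
print((purchase >= end_features) and (purchase < final))   # True: counts towards y_has_purchased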
@@ -133,33 +146,23 @@ for company in list_of_comp:
    dataset = dataset_construction(min_date = start_date, end_features_date = end_of_features,
                                   max_date = final_date, directory_path = company)

-    # On retire le client anonyme
+    # Deletion of the anonymous customer
    dataset = dataset[dataset['customer_id'] != anonymous_customer[company]]

+    # Split between train and test
-    # #train test set
-    # np.random.seed(42)
-    # split_ratio = 0.7
-    # split_index = int(len(dataset) * split_ratio)
-    # dataset = dataset.sample(frac=1).reset_index(drop=True)
-    # dataset_train = dataset.iloc[:split_index]
-    # dataset_test = dataset.iloc[split_index:]

    dataset_train, dataset_test = train_test_split(dataset, test_size=0.3, random_state=42)

    # Dataset Test
-    # Exportation
+    # Export
    FILE_KEY_OUT_S3 = "dataset_test" + company + ".csv"
    FILE_PATH_OUT_S3 = BUCKET_OUT + "/Test_set/" + FILE_KEY_OUT_S3

    with fs.open(FILE_PATH_OUT_S3, 'w') as file_out:
        dataset_test.to_csv(file_out, index = False)

-    print("Exportation dataset test : SUCCESS")
+    print("Export of dataset test : SUCCESS")

    # Dataset train

    # Export
    FILE_KEY_OUT_S3 = "dataset_train" + company + ".csv"
    FILE_PATH_OUT_S3 = BUCKET_OUT + "/Train_set/" + FILE_KEY_OUT_S3

@@ -167,7 +170,7 @@ for company in list_of_comp:
    with fs.open(FILE_PATH_OUT_S3, 'w') as file_out:
        dataset_train.to_csv(file_out, index = False)

-    print("Exportation dataset train : SUCCESS")
+    print("Export of dataset train : SUCCESS")


-print("FIN DE LA GENERATION DES DATASETS : SUCCESS")
+print("End of dataset generation for ", type_of_comp, " companies : SUCCESS")
@@ -21,7 +21,7 @@ warnings.filterwarnings('ignore')

# functions
def generate_test_set(type_of_comp):
-    file_path_list = fs.ls(f"projet-bdc2324-team1/Generalization/{type_of_comp}/Test_set")
+    file_path_list = fs.ls(f"projet-bdc2324-team1/Generalization_v2/{type_of_comp}/Test_set")
    test_set = pd.DataFrame()
    for file in file_path_list:
        print(file)

@@ -32,7 +32,7 @@ def generate_test_set(type_of_comp):

def generate_train_set(type_of_comp):
-    file_path_list = fs.ls(f"projet-bdc2324-team1/Generalization/{type_of_comp}/Train_set")
+    file_path_list = fs.ls(f"projet-bdc2324-team1/Generalization_v2/{type_of_comp}/Train_set")
    train_set = pd.DataFrame()
    for file in file_path_list:
        print(file)

@@ -43,7 +43,7 @@ def generate_train_set(type_of_comp):

type_of_comp = input('Choisissez le type de compagnie : sport ? musique ? musee ?')
-BUCKET_OUT = f'projet-bdc2324-team1/Generalization/{type_of_comp}/'
+BUCKET_OUT = f'projet-bdc2324-team1/Generalization_v2/{type_of_comp}/'

# create test and train datasets
test_set = generate_test_set(type_of_comp)
@@ -74,7 +74,7 @@ def preprocessing_customerplus(directory_path):
    cleaning_date(customerplus_copy, 'last_visiting_date')

    # Variable selection
-    customerplus_copy.drop(['lastname', 'firstname', 'birthdate', 'language', 'email', 'civility', 'note', 'extra', 'reference', 'extra_field', 'need_reload', 'preferred_category', 'preferred_supplier', 'preferred_formula', 'mcp_contact_id', 'last_visiting_date', 'deleted_at'], axis = 1, inplace=True)
+    customerplus_copy.drop(['lastname', 'firstname', 'birthdate', 'language', 'email', 'civility', 'note', 'extra', 'reference', 'extra_field', 'need_reload'], axis = 1, inplace=True) # 'preferred_category', 'preferred_supplier', 'preferred_formula', 'mcp_contact_id', 'last_visiting_date', 'deleted_at', 'last_buying_date', 'max_price', 'ticket_sum', 'average_price', 'average_purchase_delay' , 'average_price_basket', 'average_ticket_basket', 'total_price', 'purchase_count', 'first_buying_date', 'fidelity'
    customerplus_copy.rename(columns = {'id' : 'customer_id'}, inplace = True)

    return customerplus_copy
@@ -13,14 +13,14 @@ def display_input_databases(directory_path, file_name, datetime_col = None):
        df = pd.read_csv(file_in, sep=",", parse_dates = datetime_col, date_parser=custom_date_parser)
    return df

-def campaigns_kpi_function(campaigns_information = None, max_date = None):
+def campaigns_kpi_function(campaigns_information = None, max_date = "2023-12-01"):

    # Number of mail campaigns
    nb_campaigns = campaigns_information[['customer_id', 'campaign_name']].groupby('customer_id').count().reset_index()
    nb_campaigns.rename(columns = {'campaign_name' : 'nb_campaigns'}, inplace = True)

    # Average time to open (in hours)
-    campaigns_information['time_to_open'] = (pd.to_datetime(campaigns_information['opened_at'], utc = True, format = 'ISO8601') - pd.to_datetime(campaigns_information['delivered_at'], utc = True, format = 'ISO8601')) / np.timedelta64(1, 'h')
+    campaigns_information['time_to_open'] = ((pd.to_datetime(campaigns_information['opened_at'], utc = True, format = 'ISO8601') - pd.to_datetime(campaigns_information['delivered_at'], utc = True, format = 'ISO8601')) / np.timedelta64(1, 'h'))
    campaigns_information['time_to_open'] = campaigns_information['time_to_open'].fillna((pd.to_datetime(campaigns_information['delivered_at'], utc = True, format = 'ISO8601') - pd.to_datetime(max_date, utc = True, format = 'ISO8601')) / np.timedelta64(1, 'h'))

    time_to_open = campaigns_information[['customer_id', 'time_to_open']].groupby('customer_id').mean().reset_index()
@@ -44,7 +44,6 @@ def campaigns_kpi_function(campaigns_information = None, max_date = None):

    return campaigns_reduced


def tickets_kpi_function(tickets_information = None):

    tickets_information_copy = tickets_information.copy()
@@ -100,6 +99,8 @@ def customerplus_kpi_function(customerplus_clean = None):
    })
    gender_dummies = pd.get_dummies(customerplus_clean["gender_label"], prefix='gender').astype(int)
    customerplus_clean = pd.concat([customerplus_clean, gender_dummies], axis=1)
+    customerplus_clean.drop(columns = "gender", inplace = True)


    # Age
    customerplus_clean['categorie_age_0_10'] = ((customerplus_clean['age'] >= 0) & (customerplus_clean['age'] < 10)).astype(int)
@@ -112,19 +113,53 @@ def customerplus_kpi_function(customerplus_clean = None):
    customerplus_clean['categorie_age_70_80'] = ((customerplus_clean['age'] >= 70) & (customerplus_clean['age'] < 80)).astype(int)
    customerplus_clean['categorie_age_plus_80'] = (customerplus_clean['age'] >= 80).astype(int)
    customerplus_clean['categorie_age_inconnue'] = customerplus_clean['age'].apply(lambda x: 1 if pd.isna(x) else 0)
+    # customerplus_clean.drop(columns = "age", inplace = True)

    # Mailing consent
    customerplus_clean['opt_in'] = customerplus_clean['opt_in'].astype(int)

    # Indicator for whether the individual lives in France
    customerplus_clean["country_fr"] = customerplus_clean["country"].apply(lambda x : int(x=="fr") if pd.notna(x) else np.nan)
+    # customerplus_clean.drop(columns = "country", inplace = True)

    customerplus_clean['is_profession_known'] = customerplus_clean['profession'].notna().astype(int)
+    # customerplus_clean.drop(columns = "profession", inplace = True)

    customerplus_clean['is_zipcode_known'] = customerplus_clean['zipcode'].notna().astype(int)
+    # customerplus_clean.drop(columns = "zipcode", inplace = True)

-    # Dummy if the customer has a structure id (tags)
-    # customerplus_clean['has_tags'] = customerplus_clean['structure_id'].apply(lambda x: 1 if not pd.isna(x) else 0)

    return customerplus_clean


+def targets_KPI(df_target = None):
+
+    df_target['target_name'] = df_target['target_name'].fillna('').str.lower()
+
+    # Target name categories for musee companies
+    df_target['target_jeune'] = df_target['target_name'].str.contains('|'.join(['jeune', 'pass_culture', 'etudiant', '12-25 ans', 'student', 'jeunesse']), case=False).astype(int)
+    df_target['target_optin'] = df_target['target_name'].str.contains('|'.join(['optin' ,'opt-in']), case=False).astype(int)
+    df_target['target_optout'] = df_target['target_name'].str.contains('|'.join(['optout', 'unsubscribed']), case=False).astype(int)
+    df_target['target_scolaire'] = df_target['target_name'].str.contains('|'.join(['scolaire' , 'enseignant', 'chercheur', 'schulen', 'école']), case=False).astype(int)
+    df_target['target_entreprise'] = df_target['target_name'].str.contains('|'.join(['b2b', 'btob', 'cse']), case=False).astype(int)
+    df_target['target_famille'] = df_target['target_name'].str.contains('|'.join(['famille', 'enfants', 'family']), case=False).astype(int)
+    df_target['target_newsletter'] = df_target['target_name'].str.contains('|'.join(['nl', 'newsletter']), case=False).astype(int)

+    # Target name category for sport companies
+    df_target['target_abonne'] = ((
+        df_target['target_name']
+        .str.contains('|'.join(['abo', 'adh']), case=False)
+        & ~df_target['target_name'].str.contains('|'.join(['hors abo', 'anciens abo']), case=False)
+    ).astype(int))

+    df_target_categorie = df_target.groupby('customer_id')[['target_jeune', 'target_optin', 'target_optout', 'target_scolaire', 'target_entreprise', 'target_famille', 'target_newsletter', 'target_abonne']].max()

+    target_agg = df_target.groupby('customer_id').agg(
+        nb_targets=('target_name', 'nunique')  # tuples are used to specify the output column names
+        # all_targets=('target_name', concatenate_names),
+        # all_target_types=('target_type_name', concatenate_names)
+    ).reset_index()

+    target_agg = pd.merge(target_agg, df_target_categorie, how='left', on='customer_id')

+    return target_agg
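To make the pattern matching in targets_KPI above concrete, a tiny self-contained example (not part of the commit) of how '|'.join(...) builds an alternation pattern for str.contains:

# Illustration only: how the '|'.join(...) patterns used in targets_KPI behave.
import pandas as pd

names = pd.Series(['newsletter musees', 'abonnés 2022', 'hors abo', 'optin b2b']).str.lower()

# 'abo|adh' matches any name containing "abo" or "adh"
is_abo = names.str.contains('|'.join(['abo', 'adh']), case=False)
# the exclusion used for target_abonne removes "hors abo" / "anciens abo"
target_abonne = (is_abo & ~names.str.contains('|'.join(['hors abo', 'anciens abo']), case=False)).astype(int)

print(target_abonne.tolist())  # [0, 1, 0, 0]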
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long
@@ -65,7 +65,7 @@
},
{
"cell_type": "code",
-"execution_count": 9,
+"execution_count": 3,
"id": "2f0d08c9-5b26-4eff-9c89-4a46f427dbf7",
"metadata": {},
"outputs": [],
@@ -115,9 +115,9 @@
"name": "stderr",
"output_type": "stream",
"text": [
-"/tmp/ipykernel_570/3642896088.py:7: DtypeWarning: Columns (38) have mixed types. Specify dtype option on import or set low_memory=False.\n",
+"/tmp/ipykernel_426/3642896088.py:7: DtypeWarning: Columns (38) have mixed types. Specify dtype option on import or set low_memory=False.\n",
"  dataset_train = pd.read_csv(file_in, sep=\",\")\n",
-"/tmp/ipykernel_570/3642896088.py:11: DtypeWarning: Columns (38) have mixed types. Specify dtype option on import or set low_memory=False.\n",
+"/tmp/ipykernel_426/3642896088.py:11: DtypeWarning: Columns (38) have mixed types. Specify dtype option on import or set low_memory=False.\n",
"  dataset_test = pd.read_csv(file_in, sep=\",\")\n"
]
}
@@ -228,7 +228,7 @@
},
{
"cell_type": "code",
-"execution_count": 10,
+"execution_count": 9,
"id": "6224fd31-c190-4168-b395-e0bf5806d79d",
"metadata": {},
"outputs": [

@@ -238,7 +238,7 @@
"{0.0: 0.5481283836040216, 1.0: 5.694439980716696}"
]
},
-"execution_count": 10,
+"execution_count": 9,
"metadata": {},
"output_type": "execute_result"
}
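The dictionary {0.0: 0.548…, 1.0: 5.694…} shown in this output is a per-class weight for the imbalanced y_has_purchased target. A minimal sketch (not part of the notebook, assuming "balanced" weights n_samples / (n_classes * class_count)) of how such a dict can be obtained:

# Illustration only: balanced class weights for an imbalanced binary target.
import numpy as np
from sklearn.utils.class_weight import compute_class_weight

y = np.array([0]*10 + [1]*1)  # toy imbalanced target; the notebook uses y_has_purchased
classes = np.unique(y)
weights = compute_class_weight(class_weight='balanced', classes=classes, y=y)
weight_dict = dict(zip(classes, weights))  # e.g. {0: 0.55, 1: 5.5}
print(weight_dict)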
@@ -254,7 +254,7 @@
},
{
"cell_type": "code",
-"execution_count": 58,
+"execution_count": 10,
"id": "4680f202-979e-483f-89b8-9df877203bcf",
"metadata": {},
"outputs": [

@@ -265,7 +265,7 @@
" 0.54812838])"
]
},
-"execution_count": 58,
+"execution_count": 10,
"metadata": {},
"output_type": "execute_result"
}
@@ -282,7 +282,7 @@
},
{
"cell_type": "code",
-"execution_count": 65,
+"execution_count": 11,
"id": "5f747be4-e70b-491c-8f0a-46cb278a2dee",
"metadata": {},
"outputs": [

@@ -311,7 +311,7 @@
},
{
"cell_type": "code",
-"execution_count": 258,
+"execution_count": 12,
"id": "ab25a901-28da-4504-a7d1-bf41fa5068bc",
"metadata": {},
"outputs": [

@@ -650,7 +650,7 @@
"[354365 rows x 17 columns]"
]
},
-"execution_count": 258,
+"execution_count": 12,
"metadata": {},
"output_type": "execute_result"
}

@@ -662,7 +662,7 @@
},
{
"cell_type": "code",
-"execution_count": 259,
+"execution_count": 13,
"id": "648fb542-0186-493d-b274-be2c26a11967",
"metadata": {},
"outputs": [],

@@ -677,7 +677,7 @@
},
{
"cell_type": "code",
-"execution_count": 260,
+"execution_count": 14,
"id": "978b9ebc-aa97-41d7-a48f-d1f79c1ed482",
"metadata": {},
"outputs": [

@@ -1016,7 +1016,7 @@
"[354365 rows x 17 columns]"
]
},
-"execution_count": 260,
+"execution_count": 14,
"metadata": {},
"output_type": "execute_result"
}
@@ -1510,12 +1510,14 @@
"\n",
"- variables to drop: fidelity (values too large, exp -> +inf; also based on information not available over the study period, only over the evaluation period), time between purchase (its construction should be revisited), gender_other (collinear with the other gender variables)\n",
"- add an intercept\n",
-"- no need to standardise for now, but to do once we move to the LASSO model "
+"- no need to standardise for now, but to do once we move to the LASSO model\n",
+"\n",
+"#### To be copied into the pipeline -> section 2 bis"
]
},
{
"cell_type": "code",
-"execution_count": 266,
+"execution_count": 15,
"id": "e6c8ccc7-6ab8-4e3c-af28-e71d17c07bcb",
"metadata": {},
"outputs": [
@@ -1817,7 +1819,7 @@
"[354365 rows x 15 columns]"
]
},
-"execution_count": 266,
+"execution_count": 15,
"metadata": {},
"output_type": "execute_result"
}

@@ -1831,7 +1833,7 @@
},
{
"cell_type": "code",
-"execution_count": 267,
+"execution_count": 16,
"id": "0e968aa1-fbec-47db-b570-4730ef7eebf2",
"metadata": {},
"outputs": [

@@ -1847,8 +1849,8 @@
"Dep. Variable: y No. Observations: 354365\n",
"Model: Logit Df Residuals: 354350\n",
"Method: MLE Df Model: 14\n",
-"Date: Fri, 15 Mar 2024 Pseudo R-squ.: 0.2112\n",
+"Date: Thu, 21 Mar 2024 Pseudo R-squ.: 0.2112\n",
-"Time: 10:07:29 Log-Likelihood: -83135.\n",
+"Time: 07:57:46 Log-Likelihood: -83135.\n",
"converged: True LL-Null: -1.0540e+05\n",
"Covariance Type: nonrobust LLR p-value: 0.000\n",
"=======================================================================================\n",

@@ -1887,7 +1889,7 @@
},
{
"cell_type": "code",
-"execution_count": 268,
+"execution_count": 17,
"id": "2475f2fe-3d1f-4845-9ede-0416dac83271",
"metadata": {},
"outputs": [],

@@ -1908,7 +1910,7 @@
},
{
"cell_type": "code",
-"execution_count": 269,
+"execution_count": 18,
"id": "696fcc04-e5df-45dc-a1b9-57c30d4d671d",
"metadata": {},
"outputs": [

@@ -2210,7 +2212,7 @@
"[354365 rows x 15 columns]"
]
},
-"execution_count": 269,
+"execution_count": 18,
"metadata": {},
"output_type": "execute_result"
}

@@ -2221,7 +2223,7 @@
},
{
"cell_type": "code",
-"execution_count": 289,
+"execution_count": 19,
"id": "54421677-640f-4f37-9a0d-d9a2cc3572b0",
"metadata": {},
"outputs": [

@@ -2237,8 +2239,8 @@
"Dep. Variable: y No. Observations: 354365\n",
"Model: Logit Df Residuals: 354350\n",
"Method: MLE Df Model: 14\n",
-"Date: Fri, 15 Mar 2024 Pseudo R-squ.: 0.2112\n",
+"Date: Thu, 21 Mar 2024 Pseudo R-squ.: 0.2112\n",
-"Time: 10:26:14 Log-Likelihood: -83135.\n",
+"Time: 07:58:13 Log-Likelihood: -83135.\n",
"converged: True LL-Null: -1.0540e+05\n",
"Covariance Type: nonrobust LLR p-value: 0.000\n",
"=======================================================================================\n",

@@ -2276,12 +2278,226 @@
"print(result.summary())"
]
},
+{
+"cell_type": "code",
+"execution_count": 48,
+"id": "13cc3362-7bb2-46fa-8bd8-e5a8e53260b8",
+"metadata": {},
+"outputs": [
+{
+"name": "stdout",
+"output_type": "stream",
+"text": [
+"Optimization terminated successfully (Exit mode 0)\n",
+" Current function value: 0.23562928627877766\n",
+" Iterations: 240\n",
+" Function evaluations: 243\n",
+" Gradient evaluations: 240\n",
+"const 0.000000e+00\n",
+"nb_tickets 2.477006e-01\n",
+"nb_purchases 1.636902e-03\n",
+"total_amount 8.839088e-04\n",
+"nb_suppliers 1.906550e-65\n",
+"vente_internet_max 0.000000e+00\n",
+"purchase_date_min 0.000000e+00\n",
+"purchase_date_max 0.000000e+00\n",
+"nb_tickets_internet 7.232680e-112\n",
+"is_email_true 8.202187e-08\n",
+"opt_in 0.000000e+00\n",
+"gender_female 1.624424e-170\n",
+"gender_male 4.961315e-220\n",
+"nb_campaigns 6.276733e-205\n",
+"nb_campaigns_opened 2.228531e-176\n",
+"dtype: float64\n",
+" Logit Regression Results \n",
+"==============================================================================\n",
+"Dep. Variable: y No. Observations: 354365\n",
+"Model: Logit Df Residuals: 354350\n",
+"Method: MLE Df Model: 14\n",
+"Date: Thu, 21 Mar 2024 Pseudo R-squ.: 0.2111\n",
+"Time: 10:45:37 Log-Likelihood: -83152.\n",
+"converged: True LL-Null: -1.0540e+05\n",
+"Covariance Type: nonrobust LLR p-value: 0.000\n",
+"=======================================================================================\n",
+" coef std err z P>|z| [0.025 0.975]\n",
+"---------------------------------------------------------------------------------------\n",
+"const -3.1162 0.081 -38.383 0.000 -3.275 -2.957\n",
+"nb_tickets -0.0136 0.012 -1.156 0.248 -0.037 0.009\n",
+"nb_purchases -0.0385 0.012 -3.149 0.002 -0.063 -0.015\n",
+"total_amount 0.0588 0.018 3.325 0.001 0.024 0.094\n",
+"nb_suppliers 0.1638 0.010 17.085 0.000 0.145 0.183\n",
+"vente_internet_max -0.8651 0.011 -82.182 0.000 -0.886 -0.844\n",
+"purchase_date_min 0.5790 0.015 39.391 0.000 0.550 0.608\n",
+"purchase_date_max -1.4088 0.016 -89.101 0.000 -1.440 -1.378\n",
+"nb_tickets_internet 0.2857 0.013 22.475 0.000 0.261 0.311\n",
+"is_email_true 0.4224 0.079 5.363 0.000 0.268 0.577\n",
+"opt_in -1.9818 0.019 -106.856 0.000 -2.018 -1.945\n",
+"gender_female 0.6553 0.024 27.835 0.000 0.609 0.701\n",
+"gender_male 0.7578 0.024 31.663 0.000 0.711 0.805\n",
+"nb_campaigns 0.2835 0.009 30.547 0.000 0.265 0.302\n",
+"nb_campaigns_opened 0.2061 0.007 28.315 0.000 0.192 0.220\n",
+"=======================================================================================\n"
+]
+}
+],
+"source": [
+"# 2.bis we do the same for a logit model with a penalty\n",
+"# no need to redefine the model, just call fit_regularized\n",
+"\n",
+"# without specification, the optimal alpha is determined by cross validation\n",
+"# replace alpha=32 with the optimal value found by cross validation in the pipeline with .best_params\n",
+"# careful: in scikit-learn the hyperparameter is C = 1/alpha, do not forget to take the inverse of this optimal C\n",
+"\n",
+"result = model_logit.fit_regularized(method='l1', alpha = 32)\n",
+"\n",
+"print(result.pvalues)\n",
+"print(result.summary())"
+]
+},
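The comments in the cell above map scikit-learn's C to statsmodels' alpha. A minimal, illustrative sketch (not part of the commit; the toy data stands in for the notebook's X and y) of picking C by cross-validation and reusing it as alpha = 1 / C:

# Illustration only: choose C by cross-validation in scikit-learn,
# then pass alpha = 1 / C to statsmodels' fit_regularized.
import statsmodels.api as sm
from sklearn.datasets import make_classification
from sklearn.linear_model import LogisticRegressionCV

# toy data standing in for the notebook's X (without the constant) and y
X_toy, y_toy = make_classification(n_samples=2000, n_features=8, random_state=0)

logit_cv = LogisticRegressionCV(Cs=10, penalty='l1', solver='liblinear', cv=5)
logit_cv.fit(X_toy, y_toy)

best_C = logit_cv.C_[0]
best_alpha = 1.0 / best_C  # statsmodels' L1 strength is the inverse of scikit-learn's C

# statsmodels needs the constant added explicitly, as in the notebook
result = sm.Logit(y_toy, sm.add_constant(X_toy)).fit_regularized(method='l1', alpha=best_alpha)
print(best_C, best_alpha)
print(result.summary())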
+{
+"cell_type": "markdown",
+"id": "8c3dec50-7b9d-40f6-83b6-6cae26962cf8",
+"metadata": {},
+"source": [
+"### Other method : take into account the weights ! Pb : with this method, no penalty allowed"
+]
+},
+{
+"cell_type": "code",
+"execution_count": 247,
+"id": "2e3ca381-54e3-445b-bb37-d7ce953cb856",
+"metadata": {},
+"outputs": [],
+"source": [
+"# define a function to generate summaries of logit model\n",
+"\n",
+"def model_logit(X, y, weight_dict, add_constant=False) :\n",
+"    # Generate sample weights based on class weights computed earlier\n",
+"    sample_weights = np.array([weight_dict[class_] for class_ in y])\n",
+"\n",
+"    if add_constant :\n",
+"        X_const = sm.add_constant(X)\n",
+"    else :\n",
+"        X_const = X\n",
+"    \n",
+"    # Use GLM from statsmodels with Binomial family for logistic regression\n",
+"    model = sm.GLM(y, X_const, family=sm.families.Binomial(), freq_weights=sample_weights)\n",
+"    \n",
+"    # fit without penalty\n",
+"    result = model.fit()\n",
+"\n",
+"    result_summary = result.summary()\n",
+"    \n",
+"    return result_summary"
+]
+},
+{
+"cell_type": "code",
+"execution_count": 248,
+"id": "4cd424a0-7c55-47ff-840e-1354e8dcf863",
+"metadata": {},
+"outputs": [
+{
+"name": "stdout",
+"output_type": "stream",
+"text": [
+" Generalized Linear Model Regression Results \n",
+"==============================================================================\n",
+"Dep. Variable: y No. Observations: 354365\n",
+"Model: GLM Df Residuals: 354350\n",
+"Model Family: Binomial Df Model: 14\n",
+"Link Function: Logit Scale: 1.0000\n",
+"Method: IRLS Log-Likelihood: -1.8693e+05\n",
+"Date: Thu, 21 Mar 2024 Deviance: 3.7387e+05\n",
+"Time: 13:19:33 Pearson chi2: 1.97e+16\n",
+"No. Iterations: 100 Pseudo R-squ. (CS): 0.2820\n",
+"Covariance Type: nonrobust \n",
+"=======================================================================================\n",
+" coef std err z P>|z| [0.025 0.975]\n",
+"---------------------------------------------------------------------------------------\n",
+"const -1.3943 0.062 -22.456 0.000 -1.516 -1.273\n",
+"nb_tickets -0.3312 0.016 -20.967 0.000 -0.362 -0.300\n",
+"nb_purchases 0.9258 0.098 9.491 0.000 0.735 1.117\n",
+"total_amount 0.8922 0.042 21.393 0.000 0.810 0.974\n",
+"nb_suppliers 0.2238 0.007 32.137 0.000 0.210 0.237\n",
+"vente_internet_max -0.7453 0.007 -100.473 0.000 -0.760 -0.731\n",
+"purchase_date_min 0.7123 0.015 46.063 0.000 0.682 0.743\n",
+"purchase_date_max -1.3328 0.017 -79.297 0.000 -1.366 -1.300\n",
+"nb_tickets_internet 0.1784 0.011 16.366 0.000 0.157 0.200\n",
+"is_email_true 0.8635 0.061 14.086 0.000 0.743 0.984\n",
+"opt_in -1.7487 0.010 -174.737 0.000 -1.768 -1.729\n",
+"gender_female 0.8084 0.013 60.803 0.000 0.782 0.835\n",
+"gender_male 0.8731 0.014 64.332 0.000 0.846 0.900\n",
+"nb_campaigns 0.1751 0.006 31.101 0.000 0.164 0.186\n",
+"nb_campaigns_opened 0.2962 0.005 54.145 0.000 0.285 0.307\n",
+"=======================================================================================\n"
+]
+}
+],
+"source": [
+"# with the function\n",
+"\n",
+"# 1. logit with weights\n",
+"results_logit_weight = model_logit(X,y,weight_dict=weight_dict)\n",
+"print(results_logit_weight)"
+]
+},
+{
+"cell_type": "code",
+"execution_count": 252,
+"id": "84dd6242-a9c3-4dee-a58b-abc5f1c6f8fa",
+"metadata": {},
+"outputs": [
+{
+"name": "stdout",
+"output_type": "stream",
+"text": [
+" Generalized Linear Model Regression Results \n",
+"==============================================================================\n",
+"Dep. Variable: y No. Observations: 354365\n",
+"Model: GLM Df Residuals: 354350\n",
+"Model Family: Binomial Df Model: 14\n",
+"Link Function: Logit Scale: 1.0000\n",
+"Method: IRLS Log-Likelihood: -83141.\n",
+"Date: Thu, 21 Mar 2024 Deviance: 1.6628e+05\n",
+"Time: 13:20:06 Pearson chi2: 4.52e+15\n",
+"No. Iterations: 8 Pseudo R-squ. (CS): 0.1180\n",
+"Covariance Type: nonrobust \n",
+"=======================================================================================\n",
+" coef std err z P>|z| [0.025 0.975]\n",
+"---------------------------------------------------------------------------------------\n",
+"const -3.6025 0.091 -39.755 0.000 -3.780 -3.425\n",
+"nb_tickets -0.0230 0.010 -2.191 0.028 -0.044 -0.002\n",
+"nb_purchases -0.0519 0.014 -3.609 0.000 -0.080 -0.024\n",
+"total_amount 0.0799 0.021 3.841 0.000 0.039 0.121\n",
+"nb_suppliers 0.1694 0.010 17.662 0.000 0.151 0.188\n",
+"vente_internet_max -0.8764 0.011 -82.965 0.000 -0.897 -0.856\n",
+"purchase_date_min 0.5881 0.015 39.936 0.000 0.559 0.617\n",
+"purchase_date_max -1.4197 0.016 -89.592 0.000 -1.451 -1.389\n",
+"nb_tickets_internet 0.2895 0.013 22.652 0.000 0.264 0.315\n",
+"is_email_true 0.8651 0.088 9.797 0.000 0.692 1.038\n",
+"opt_in -1.9976 0.019 -107.305 0.000 -2.034 -1.961\n",
+"gender_female 0.7032 0.024 29.395 0.000 0.656 0.750\n",
+"gender_male 0.8071 0.024 33.201 0.000 0.759 0.855\n",
+"nb_campaigns 0.2850 0.009 30.633 0.000 0.267 0.303\n",
+"nb_campaigns_opened 0.2061 0.007 28.245 0.000 0.192 0.220\n",
+"=======================================================================================\n"
+]
+}
+],
+"source": [
+"# 2. logit without weights\n",
+"\n",
+"results_logit = model_logit(X.drop(\"const\", axis=1),y,weight_dict={0:1, 1:1}, add_constant=True)\n",
+"print(results_logit)"
+]
+},
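The markdown cell above notes that the weighted statsmodels GLM route does not allow a penalty. As a hedged aside (not something the notebook does): scikit-learn's LogisticRegression can combine class weighting with an L1 penalty, for example:

# Illustration only: class weighting and an L1 penalty at the same time with scikit-learn.
from sklearn.datasets import make_classification
from sklearn.linear_model import LogisticRegression

X_toy, y_toy = make_classification(n_samples=2000, n_features=8, weights=[0.9, 0.1], random_state=0)

# class_weight='balanced' reproduces the n_samples / (n_classes * count) weighting;
# an explicit dict such as {0: 0.55, 1: 5.69} works as well.
clf = LogisticRegression(penalty='l1', solver='liblinear', class_weight='balanced', C=1/32)
clf.fit(X_toy, y_toy)
print(clf.coef_)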
{
"cell_type": "markdown",
"id": "36c5e770-72b3-4482-ad61-45b511a11f06",
"metadata": {},
"source": [
-"## graphique LASSO - quelles variables sont impotantes dans le modèle ? "
+"## graphique LASSO - quelles variables sont importantes dans le modèle ? "
]
},
{