generalization #11

Merged
arevelle-ensae merged 10 commits from generalization into main 2024-03-28 09:40:04 +01:00
8 changed files with 1059 additions and 12764 deletions
Showing only changes of commit f4b6f23394 - Show all commits

View File

@@ -1,5 +1,8 @@
- # Business Data Challenge - Team 1
+ # Purpose of the script : Construction of training and test datasets for modelling by company
+ # Input : KPI construction function and clean databases in the 0_Input folder
+ # Output : Train and test datasets by companies
+ # Packages
import pandas as pd
import numpy as np
import os
@@ -9,12 +12,10 @@ import warnings
from datetime import date, timedelta, datetime
from sklearn.model_selection import train_test_split
# Create filesystem object
S3_ENDPOINT_URL = "https://" + os.environ["AWS_S3_ENDPOINT"]
fs = s3fs.S3FileSystem(client_kwargs={'endpoint_url': S3_ENDPOINT_URL})
# Import KPI construction functions
exec(open('0_KPI_functions.py').read())
@@ -24,53 +25,69 @@ warnings.filterwarnings('ignore')
def dataset_construction(min_date, end_features_date, max_date, directory_path):
- # Import customerplus
+ # Import of cleaned and merged datasets
df_customerplus_clean_0 = display_input_databases(directory_path, file_name = "customerplus_cleaned")
df_campaigns_information = display_input_databases(directory_path, file_name = "campaigns_information", datetime_col = ['opened_at', 'sent_at', 'campaign_sent_at'])
df_products_purchased_reduced = display_input_databases(directory_path, file_name = "products_purchased_reduced", datetime_col = ['purchase_date'])
+ df_target_information = display_input_databases(directory_path, file_name = "target_information")
- # if directory_path == "101":
- # df_products_purchased_reduced_1 = display_databases(directory_path, file_name = "products_purchased_reduced_1", datetime_col = ['purchase_date'])
- # df_products_purchased_reduced = pd.concat([df_products_purchased_reduced, df_products_purchased_reduced_1])
- # Filtre de cohérence pour la mise en pratique de notre méthode
+ # Dates in datetime format
max_date = pd.to_datetime(max_date, utc = True, format = 'ISO8601')
end_features_date = pd.to_datetime(end_features_date, utc = True, format = 'ISO8601')
min_date = pd.to_datetime(min_date, utc = True, format = 'ISO8601')
- #Filtre de la base df_campaigns_information
+ # Filter for database df_campaigns_information
- df_campaigns_information = df_campaigns_information[(df_campaigns_information['sent_at'] <= end_features_date) & (df_campaigns_information['sent_at'] >= min_date)]
+ df_campaigns_information = df_campaigns_information[(df_campaigns_information['sent_at'] < end_features_date) & (df_campaigns_information['sent_at'] >= min_date)]
df_campaigns_information['opened_at'][df_campaigns_information['opened_at'] >= end_features_date] = np.datetime64('NaT')
- #Filtre de la base df_products_purchased_reduced
+ # Filter for database df_products_purchased_reduced
- df_products_purchased_features = df_products_purchased_reduced[(df_products_purchased_reduced['purchase_date'] <= end_features_date) & (df_products_purchased_reduced['purchase_date'] >= min_date)]
+ df_products_purchased_features = df_products_purchased_reduced[(df_products_purchased_reduced['purchase_date'] < end_features_date) & (df_products_purchased_reduced['purchase_date'] >= min_date)]
print("Data filtering : SUCCESS")
- # Fusion de l'ensemble et creation des KPI
+ # Building and merging features
- # KPI sur les campagnes publicitaires
+ # Campaigns features
df_campaigns_kpi = campaigns_kpi_function(campaigns_information = df_campaigns_information, max_date = end_features_date)
- # KPI sur le comportement d'achat
+ # Purchasing behavior features
df_tickets_kpi = tickets_kpi_function(tickets_information = df_products_purchased_features)
- # KPI sur les données socio-démographiques
+ # Socio-demographic features
df_customerplus_clean = customerplus_kpi_function(customerplus_clean = df_customerplus_clean_0)
+ # Targets features
+ df_targets_kpi = targets_KPI(df_target = df_target_information)
print("KPIs construction : SUCCESS")
- # Fusion avec KPI liés au customer
+ # Merge - campaigns features
df_customer = pd.merge(df_customerplus_clean, df_campaigns_kpi, on = 'customer_id', how = 'left')
# Fill NaN values
- df_customer[['nb_campaigns', 'nb_campaigns_opened']] = df_customer[['nb_campaigns', 'nb_campaigns_opened']].fillna(0)
+ df_customer[['nb_campaigns', 'nb_campaigns_opened', 'taux_ouverture_mail']] = df_customer[['nb_campaigns', 'nb_campaigns_opened', 'taux_ouverture_mail']].fillna(0)
+ df_customer['time_to_open'] = df_customer['time_to_open'].fillna(df_customer['time_to_open'].mean())
- # Fusion avec KPI liés au comportement d'achat
+ # Merge - targets features
- df_customer_product = pd.merge(df_tickets_kpi, df_customer, on = 'customer_id', how = 'outer')
+ df_customer = pd.merge(df_customer, df_targets_kpi, on = 'customer_id', how = 'left')
# Fill NaN values
- df_customer_product[['nb_tickets', 'nb_purchases', 'total_amount', 'nb_suppliers', 'vente_internet_max', 'nb_tickets_internet']] = df_customer_product[['nb_tickets', 'nb_purchases', 'total_amount', 'nb_suppliers', 'vente_internet_max', 'nb_tickets_internet']].fillna(0)
+ targets_columns = list(df_targets_kpi.columns)
+ targets_columns.remove('customer_id')
+ df_customer[targets_columns] = df_customer[targets_columns].fillna(0)
+ # Standardise the number of targets, which is closely linked to the company's operations
+ df_customer['nb_targets'] = (df_customer['nb_targets'] - (df_customer['nb_targets'].mean())) / (df_customer['nb_targets'].std())
+ # Merge - purchasing behavior features
+ df_customer_product = pd.merge(df_customer, df_tickets_kpi, on = 'customer_id', how = 'left')
+ # Fill NaN values
+ special_fill_nan = ['customer_id', 'purchase_date_min', 'purchase_date_max', 'time_between_purchase']
+ simple_fill_nan = [column for column in list(df_tickets_kpi.columns) if column not in special_fill_nan]
+ df_customer_product[simple_fill_nan] = df_customer_product[simple_fill_nan].fillna(0)
max_interval = (end_features_date - min_date) / np.timedelta64(1, 'D') + 1
df_customer_product[['purchase_date_max', 'purchase_date_min']] = df_customer_product[['purchase_date_max', 'purchase_date_min']].fillna(max_interval)
@@ -82,9 +99,9 @@ def dataset_construction(min_date, end_features_date, max_date, directory_path):
print("Explanatory variable construction : SUCCESS")
# 2. Construction of the explained variable
- df_products_purchased_to_predict = df_products_purchased_reduced[(df_products_purchased_reduced['purchase_date'] <= max_date) & (df_products_purchased_reduced['purchase_date'] > end_features_date)]
+ df_products_purchased_to_predict = df_products_purchased_reduced[(df_products_purchased_reduced['purchase_date'] < max_date) & (df_products_purchased_reduced['purchase_date'] >= end_features_date)]
- # Indicatrice d'achat
+ # Construction of the dependent variable
df_products_purchased_to_predict['y_has_purchased'] = 1
y = df_products_purchased_to_predict[['customer_id', 'y_has_purchased']].drop_duplicates()
@@ -103,28 +120,24 @@ def dataset_construction(min_date, end_features_date, max_date, directory_path):
return dataset
## Exportation
+ # Sectors
companies = {'musee' : ['1', '2', '3', '4'], # , '101'
'sport': ['5', '6', '7', '8', '9'],
'musique' : ['10', '11', '12', '13', '14']}
+ # Chosen sector
type_of_comp = input('Choisissez le type de compagnie : sport ? musique ? musee ?')
list_of_comp = companies[type_of_comp]
- # Dossier d'exportation
- BUCKET_OUT = f'projet-bdc2324-team1/Generalization/{type_of_comp}'
- # Create test dataset and train dataset for sport companies
+ # Export folder
+ BUCKET_OUT = f'projet-bdc2324-team1/Generalization_v2/{type_of_comp}'
- #start_date, end_of_features, final_date = df_coverage_modelization(list_of_comp, coverage_features = 0.7)
+ # start_date, end_of_features, final_date = df_coverage_modelization(list_of_comp, coverage_train = 0.7)
+ # Dates used for the construction of features and the dependent variable
start_date = "2021-05-01"
end_of_features = "2022-11-01"
final_date = "2023-11-01"
+ # Anonymous customer to be deleted from the datasets
anonymous_customer = {'1' : '1_1', '2' : '2_12184', '3' : '3_1', '4' : '4_2', '101' : '101_1',
'5' : '5_191835', '6' : '6_591412', '7' : '7_49632', '8' : '8_1942', '9' : '9_19683',
'10' : '10_19521', '11' : '11_36', '12' : '12_1706757', '13' : '13_8422', '14' : '14_6354'}
@@ -133,33 +146,23 @@ for company in list_of_comp:
dataset = dataset_construction(min_date = start_date, end_features_date = end_of_features,
max_date = final_date, directory_path = company)
- # On retire le client anonyme
+ # Deletion of the anonymous customer
dataset = dataset[dataset['customer_id'] != anonymous_customer[company]]
+ # Split between train and test
- # #train test set
- # np.random.seed(42)
- # split_ratio = 0.7
- # split_index = int(len(dataset) * split_ratio)
- # dataset = dataset.sample(frac=1).reset_index(drop=True)
- # dataset_train = dataset.iloc[:split_index]
- # dataset_test = dataset.iloc[split_index:]
dataset_train, dataset_test = train_test_split(dataset, test_size=0.3, random_state=42)
# Dataset Test
- # Exportation
+ # Export
FILE_KEY_OUT_S3 = "dataset_test" + company + ".csv"
FILE_PATH_OUT_S3 = BUCKET_OUT + "/Test_set/" + FILE_KEY_OUT_S3
with fs.open(FILE_PATH_OUT_S3, 'w') as file_out:
dataset_test.to_csv(file_out, index = False)
- print("Exportation dataset test : SUCCESS")
+ print("Export of dataset test : SUCCESS")
# Dataset train
# Export
FILE_KEY_OUT_S3 = "dataset_train" + company + ".csv"
FILE_PATH_OUT_S3 = BUCKET_OUT + "/Train_set/" + FILE_KEY_OUT_S3
@@ -167,7 +170,7 @@ for company in list_of_comp:
with fs.open(FILE_PATH_OUT_S3, 'w') as file_out:
dataset_train.to_csv(file_out, index = False)
- print("Exportation dataset train : SUCCESS")
+ print("Export of dataset train : SUCCESS")
- print("FIN DE LA GENERATION DES DATASETS : SUCCESS")
+ print("End of dataset generation for ", type_of_comp, " companies : SUCCESS")

View File

@@ -21,7 +21,7 @@ warnings.filterwarnings('ignore')
# functions
def generate_test_set(type_of_comp):
- file_path_list = fs.ls(f"projet-bdc2324-team1/Generalization/{type_of_comp}/Test_set")
+ file_path_list = fs.ls(f"projet-bdc2324-team1/Generalization_v2/{type_of_comp}/Test_set")
test_set = pd.DataFrame()
for file in file_path_list:
print(file)
@@ -32,7 +32,7 @@ def generate_test_set(type_of_comp):
def generate_train_set(type_of_comp):
- file_path_list = fs.ls(f"projet-bdc2324-team1/Generalization/{type_of_comp}/Train_set")
+ file_path_list = fs.ls(f"projet-bdc2324-team1/Generalization_v2/{type_of_comp}/Train_set")
train_set = pd.DataFrame()
for file in file_path_list:
print(file)
@@ -43,7 +43,7 @@ def generate_train_set(type_of_comp):
type_of_comp = input('Choisissez le type de compagnie : sport ? musique ? musee ?')
- BUCKET_OUT = f'projet-bdc2324-team1/Generalization/{type_of_comp}/'
+ BUCKET_OUT = f'projet-bdc2324-team1/Generalization_v2/{type_of_comp}/'
# create test and train datasets
test_set = generate_test_set(type_of_comp)

View File

@@ -74,7 +74,7 @@ def preprocessing_customerplus(directory_path):
cleaning_date(customerplus_copy, 'last_visiting_date')
# Selection des variables
- customerplus_copy.drop(['lastname', 'firstname', 'birthdate', 'language', 'email', 'civility', 'note', 'extra', 'reference', 'extra_field', 'need_reload', 'preferred_category', 'preferred_supplier', 'preferred_formula', 'mcp_contact_id', 'last_visiting_date', 'deleted_at'], axis = 1, inplace=True)
+ customerplus_copy.drop(['lastname', 'firstname', 'birthdate', 'language', 'email', 'civility', 'note', 'extra', 'reference', 'extra_field', 'need_reload'], axis = 1, inplace=True) # 'preferred_category', 'preferred_supplier', 'preferred_formula', 'mcp_contact_id', 'last_visiting_date', 'deleted_at', 'last_buying_date', 'max_price', 'ticket_sum', 'average_price', 'average_purchase_delay' , 'average_price_basket', 'average_ticket_basket', 'total_price', 'purchase_count', 'first_buying_date', 'fidelity'
customerplus_copy.rename(columns = {'id' : 'customer_id'}, inplace = True)
return customerplus_copy

View File

@@ -13,14 +13,14 @@ def display_input_databases(directory_path, file_name, datetime_col = None):
df = pd.read_csv(file_in, sep=",", parse_dates = datetime_col, date_parser=custom_date_parser)
return df
- def campaigns_kpi_function(campaigns_information = None, max_date = None):
+ def campaigns_kpi_function(campaigns_information = None, max_date = "2023-12-01"):
# Nombre de campagnes de mails
nb_campaigns = campaigns_information[['customer_id', 'campaign_name']].groupby('customer_id').count().reset_index()
nb_campaigns.rename(columns = {'campaign_name' : 'nb_campaigns'}, inplace = True)
# Temps d'ouverture moyen (en heures)
- campaigns_information['time_to_open'] = (pd.to_datetime(campaigns_information['opened_at'], utc = True, format = 'ISO8601') - pd.to_datetime(campaigns_information['delivered_at'], utc = True, format = 'ISO8601')) / np.timedelta64(1, 'h')
+ campaigns_information['time_to_open'] = ((pd.to_datetime(campaigns_information['opened_at'], utc = True, format = 'ISO8601') - pd.to_datetime(campaigns_information['delivered_at'], utc = True, format = 'ISO8601')) / np.timedelta64(1, 'h'))
campaigns_information['time_to_open'] = campaigns_information['time_to_open'].fillna((pd.to_datetime(campaigns_information['delivered_at'], utc = True, format = 'ISO8601') - pd.to_datetime(max_date, utc = True, format = 'ISO8601')) / np.timedelta64(1, 'h'))
time_to_open = campaigns_information[['customer_id', 'time_to_open']].groupby('customer_id').mean().reset_index()
@@ -44,7 +44,6 @@ def campaigns_kpi_function(campaigns_information = None, max_date = None):
return campaigns_reduced
def tickets_kpi_function(tickets_information = None):
tickets_information_copy = tickets_information.copy()
@@ -100,6 +99,8 @@ def customerplus_kpi_function(customerplus_clean = None):
})
gender_dummies = pd.get_dummies(customerplus_clean["gender_label"], prefix='gender').astype(int)
customerplus_clean = pd.concat([customerplus_clean, gender_dummies], axis=1)
+ customerplus_clean.drop(columns = "gender", inplace = True)
# Age
customerplus_clean['categorie_age_0_10'] = ((customerplus_clean['age'] >= 0) & (customerplus_clean['age'] < 10)).astype(int)
@@ -112,19 +113,53 @@ def customerplus_kpi_function(customerplus_clean = None):
customerplus_clean['categorie_age_70_80'] = ((customerplus_clean['age'] >= 70) & (customerplus_clean['age'] < 80)).astype(int)
customerplus_clean['categorie_age_plus_80'] = (customerplus_clean['age'] >= 80).astype(int)
customerplus_clean['categorie_age_inconnue'] = customerplus_clean['age'].apply(lambda x: 1 if pd.isna(x) else 0)
+ # customerplus_clean.drop(columns = "age", inplace = True)
# Consentement au mailing
customerplus_clean['opt_in'] = customerplus_clean['opt_in'].astype(int)
# Indicatrice si l'individu vit en France
customerplus_clean["country_fr"] = customerplus_clean["country"].apply(lambda x : int(x=="fr") if pd.notna(x) else np.nan) customerplus_clean["country_fr"] = customerplus_clean["country"].apply(lambda x : int(x=="fr") if pd.notna(x) else np.nan)
# customerplus_clean.drop(columns = "country", inplace = True)
customerplus_clean['is_profession_known'] = customerplus_clean['profession'].notna().astype(int) customerplus_clean['is_profession_known'] = customerplus_clean['profession'].notna().astype(int)
# customerplus_clean.drop(columns = "profession", inplace = True)
customerplus_clean['is_zipcode_known'] = customerplus_clean['zipcode'].notna().astype(int)
# Dummy if the customer has a structure id (tags)
# customerplus_clean['has_tags'] = customerplus_clean['structure_id'].apply(lambda x: 1 if not pd.isna(x) else 0)
customerplus_clean['is_zipcode_known'] = customerplus_clean['zipcode'].notna().astype(int)
# customerplus_clean.drop(columns = "zipcode", inplace = True)
return customerplus_clean return customerplus_clean
+ def targets_KPI(df_target = None):
+ df_target['target_name'] = df_target['target_name'].fillna('').str.lower()
+ # Target name category for musees
+ df_target['target_jeune'] = df_target['target_name'].str.contains('|'.join(['jeune', 'pass_culture', 'etudiant', '12-25 ans', 'student', 'jeunesse']), case=False).astype(int)
+ df_target['target_optin'] = df_target['target_name'].str.contains('|'.join(['optin' ,'opt-in']), case=False).astype(int)
+ df_target['target_optout'] = df_target['target_name'].str.contains('|'.join(['optout', 'unsubscribed']), case=False).astype(int)
+ df_target['target_scolaire'] = df_target['target_name'].str.contains('|'.join(['scolaire' , 'enseignant', 'chercheur', 'schulen', 'école']), case=False).astype(int)
+ df_target['target_entreprise'] = df_target['target_name'].str.contains('|'.join(['b2b', 'btob', 'cse']), case=False).astype(int)
+ df_target['target_famille'] = df_target['target_name'].str.contains('|'.join(['famille', 'enfants', 'family']), case=False).astype(int)
+ df_target['target_newsletter'] = df_target['target_name'].str.contains('|'.join(['nl', 'newsletter']), case=False).astype(int)
+ # Target name category for sport companies
+ df_target['target_abonne'] = ((
+ df_target['target_name']
+ .str.contains('|'.join(['abo', 'adh']), case=False)
+ & ~df_target['target_name'].str.contains('|'.join(['hors abo', 'anciens abo']), case=False)
+ ).astype(int))
+ df_target_categorie = df_target.groupby('customer_id')[['target_jeune', 'target_optin', 'target_optout', 'target_scolaire', 'target_entreprise', 'target_famille', 'target_newsletter', 'target_abonne']].max()
+ target_agg = df_target.groupby('customer_id').agg(
+ nb_targets=('target_name', 'nunique') # Utilisation de tuples pour spécifier les noms de colonnes
+ # all_targets=('target_name', concatenate_names),
+ # all_target_types=('target_type_name', concatenate_names)
+ ).reset_index()
+ target_agg = pd.merge(target_agg, df_target_categorie, how='left', on='customer_id')
+ return target_agg
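A minimal usage sketch of targets_KPI on hypothetical data (the customer ids and target names below are invented for illustration; the real target_information table is loaded per company in the modelling script):

import pandas as pd

# Toy input: one row per (customer, target) assignment
df_target_example = pd.DataFrame({
    'customer_id': ['5_10', '5_10', '5_11'],
    'target_name': ['Newsletter opt-in', 'Jeunes 12-25 ans', 'Anciens abonnés']})
example_agg = targets_KPI(df_target = df_target_example)
# example_agg contains one row per customer_id with nb_targets (number of distinct
# target names) and the binary target_* flags aggregated with max()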

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

View File

@@ -65,7 +65,7 @@
},
{
"cell_type": "code",
- "execution_count": 9,
+ "execution_count": 3,
"id": "2f0d08c9-5b26-4eff-9c89-4a46f427dbf7",
"metadata": {},
"outputs": [],
@@ -115,9 +115,9 @@
"name": "stderr",
"output_type": "stream",
"text": [
- "/tmp/ipykernel_570/3642896088.py:7: DtypeWarning: Columns (38) have mixed types. Specify dtype option on import or set low_memory=False.\n",
+ "/tmp/ipykernel_426/3642896088.py:7: DtypeWarning: Columns (38) have mixed types. Specify dtype option on import or set low_memory=False.\n",
" dataset_train = pd.read_csv(file_in, sep=\",\")\n",
- "/tmp/ipykernel_570/3642896088.py:11: DtypeWarning: Columns (38) have mixed types. Specify dtype option on import or set low_memory=False.\n",
+ "/tmp/ipykernel_426/3642896088.py:11: DtypeWarning: Columns (38) have mixed types. Specify dtype option on import or set low_memory=False.\n",
" dataset_test = pd.read_csv(file_in, sep=\",\")\n"
]
}
@@ -228,7 +228,7 @@
},
{
"cell_type": "code",
- "execution_count": 10,
+ "execution_count": 9,
"id": "6224fd31-c190-4168-b395-e0bf5806d79d",
"metadata": {},
"outputs": [
@@ -238,7 +238,7 @@
"{0.0: 0.5481283836040216, 1.0: 5.694439980716696}"
]
},
- "execution_count": 10,
+ "execution_count": 9,
"metadata": {},
"output_type": "execute_result"
}
@@ -254,7 +254,7 @@
},
{
"cell_type": "code",
- "execution_count": 58,
+ "execution_count": 10,
"id": "4680f202-979e-483f-89b8-9df877203bcf",
"metadata": {},
"outputs": [
@@ -265,7 +265,7 @@
" 0.54812838])"
]
},
- "execution_count": 58,
+ "execution_count": 10,
"metadata": {},
"output_type": "execute_result"
}
@@ -282,7 +282,7 @@
},
{
"cell_type": "code",
- "execution_count": 65,
+ "execution_count": 11,
"id": "5f747be4-e70b-491c-8f0a-46cb278a2dee",
"metadata": {},
"outputs": [
@@ -311,7 +311,7 @@
},
{
"cell_type": "code",
- "execution_count": 258,
+ "execution_count": 12,
"id": "ab25a901-28da-4504-a7d1-bf41fa5068bc",
"metadata": {},
"outputs": [
@@ -650,7 +650,7 @@
"[354365 rows x 17 columns]"
]
},
- "execution_count": 258,
+ "execution_count": 12,
"metadata": {},
"output_type": "execute_result"
}
@@ -662,7 +662,7 @@
},
{
"cell_type": "code",
- "execution_count": 259,
+ "execution_count": 13,
"id": "648fb542-0186-493d-b274-be2c26a11967",
"metadata": {},
"outputs": [],
@@ -677,7 +677,7 @@
},
{
"cell_type": "code",
- "execution_count": 260,
+ "execution_count": 14,
"id": "978b9ebc-aa97-41d7-a48f-d1f79c1ed482",
"metadata": {},
"outputs": [
@@ -1016,7 +1016,7 @@
"[354365 rows x 17 columns]"
]
},
- "execution_count": 260,
+ "execution_count": 14,
"metadata": {},
"output_type": "execute_result"
}
@@ -1510,12 +1510,14 @@
"\n",
"- variables à retirer : fidelity (valeurs trop grandes dont l'exp -> +inf, autre problème : st basé sur des infos qu'on a pas sur la période étudiée mais slt sur période d'évaluation), time between purchase (revoir sa construction), gender_other (colinéarité avec les autres var de genre)\n",
"- ajouter un intercept\n",
- "- pas besoin de standardiser pour le moment, mais à faire quand on passera au modèle LASSO "
+ "- pas besoin de standardiser pour le moment, mais à faire quand on passera au modèle LASSO\n",
+ "\n",
+ "#### A recopier dans la pipeline -> section 2 bis"
]
},
{
"cell_type": "code",
- "execution_count": 266,
+ "execution_count": 15,
"id": "e6c8ccc7-6ab8-4e3c-af28-e71d17c07bcb",
"metadata": {},
"outputs": [
@@ -1817,7 +1819,7 @@
"[354365 rows x 15 columns]"
]
},
- "execution_count": 266,
+ "execution_count": 15,
"metadata": {},
"output_type": "execute_result"
}
@@ -1831,7 +1833,7 @@
},
{
"cell_type": "code",
- "execution_count": 267,
+ "execution_count": 16,
"id": "0e968aa1-fbec-47db-b570-4730ef7eebf2",
"metadata": {},
"outputs": [
@@ -1847,8 +1849,8 @@
"Dep. Variable: y No. Observations: 354365\n",
"Model: Logit Df Residuals: 354350\n",
"Method: MLE Df Model: 14\n",
- "Date: Fri, 15 Mar 2024 Pseudo R-squ.: 0.2112\n",
- "Time: 10:07:29 Log-Likelihood: -83135.\n",
+ "Date: Thu, 21 Mar 2024 Pseudo R-squ.: 0.2112\n",
+ "Time: 07:57:46 Log-Likelihood: -83135.\n",
"converged: True LL-Null: -1.0540e+05\n",
"Covariance Type: nonrobust LLR p-value: 0.000\n",
"=======================================================================================\n",
@@ -1887,7 +1889,7 @@
},
{
"cell_type": "code",
- "execution_count": 268,
+ "execution_count": 17,
"id": "2475f2fe-3d1f-4845-9ede-0416dac83271",
"metadata": {},
"outputs": [],
@@ -1908,7 +1910,7 @@
},
{
"cell_type": "code",
- "execution_count": 269,
+ "execution_count": 18,
"id": "696fcc04-e5df-45dc-a1b9-57c30d4d671d",
"metadata": {},
"outputs": [
@@ -2210,7 +2212,7 @@
"[354365 rows x 15 columns]"
]
},
- "execution_count": 269,
+ "execution_count": 18,
"metadata": {},
"output_type": "execute_result"
}
@@ -2221,7 +2223,7 @@
},
{
"cell_type": "code",
- "execution_count": 289,
+ "execution_count": 19,
"id": "54421677-640f-4f37-9a0d-d9a2cc3572b0",
"metadata": {},
"outputs": [
@@ -2237,8 +2239,8 @@
"Dep. Variable: y No. Observations: 354365\n",
"Model: Logit Df Residuals: 354350\n",
"Method: MLE Df Model: 14\n",
- "Date: Fri, 15 Mar 2024 Pseudo R-squ.: 0.2112\n",
- "Time: 10:26:14 Log-Likelihood: -83135.\n",
+ "Date: Thu, 21 Mar 2024 Pseudo R-squ.: 0.2112\n",
+ "Time: 07:58:13 Log-Likelihood: -83135.\n",
"converged: True LL-Null: -1.0540e+05\n",
"Covariance Type: nonrobust LLR p-value: 0.000\n",
"=======================================================================================\n",
@@ -2276,12 +2278,226 @@
"print(result.summary())"
]
},
{
"cell_type": "code",
"execution_count": 48,
"id": "13cc3362-7bb2-46fa-8bd8-e5a8e53260b8",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Optimization terminated successfully (Exit mode 0)\n",
" Current function value: 0.23562928627877766\n",
" Iterations: 240\n",
" Function evaluations: 243\n",
" Gradient evaluations: 240\n",
"const 0.000000e+00\n",
"nb_tickets 2.477006e-01\n",
"nb_purchases 1.636902e-03\n",
"total_amount 8.839088e-04\n",
"nb_suppliers 1.906550e-65\n",
"vente_internet_max 0.000000e+00\n",
"purchase_date_min 0.000000e+00\n",
"purchase_date_max 0.000000e+00\n",
"nb_tickets_internet 7.232680e-112\n",
"is_email_true 8.202187e-08\n",
"opt_in 0.000000e+00\n",
"gender_female 1.624424e-170\n",
"gender_male 4.961315e-220\n",
"nb_campaigns 6.276733e-205\n",
"nb_campaigns_opened 2.228531e-176\n",
"dtype: float64\n",
" Logit Regression Results \n",
"==============================================================================\n",
"Dep. Variable: y No. Observations: 354365\n",
"Model: Logit Df Residuals: 354350\n",
"Method: MLE Df Model: 14\n",
"Date: Thu, 21 Mar 2024 Pseudo R-squ.: 0.2111\n",
"Time: 10:45:37 Log-Likelihood: -83152.\n",
"converged: True LL-Null: -1.0540e+05\n",
"Covariance Type: nonrobust LLR p-value: 0.000\n",
"=======================================================================================\n",
" coef std err z P>|z| [0.025 0.975]\n",
"---------------------------------------------------------------------------------------\n",
"const -3.1162 0.081 -38.383 0.000 -3.275 -2.957\n",
"nb_tickets -0.0136 0.012 -1.156 0.248 -0.037 0.009\n",
"nb_purchases -0.0385 0.012 -3.149 0.002 -0.063 -0.015\n",
"total_amount 0.0588 0.018 3.325 0.001 0.024 0.094\n",
"nb_suppliers 0.1638 0.010 17.085 0.000 0.145 0.183\n",
"vente_internet_max -0.8651 0.011 -82.182 0.000 -0.886 -0.844\n",
"purchase_date_min 0.5790 0.015 39.391 0.000 0.550 0.608\n",
"purchase_date_max -1.4088 0.016 -89.101 0.000 -1.440 -1.378\n",
"nb_tickets_internet 0.2857 0.013 22.475 0.000 0.261 0.311\n",
"is_email_true 0.4224 0.079 5.363 0.000 0.268 0.577\n",
"opt_in -1.9818 0.019 -106.856 0.000 -2.018 -1.945\n",
"gender_female 0.6553 0.024 27.835 0.000 0.609 0.701\n",
"gender_male 0.7578 0.024 31.663 0.000 0.711 0.805\n",
"nb_campaigns 0.2835 0.009 30.547 0.000 0.265 0.302\n",
"nb_campaigns_opened 0.2061 0.007 28.315 0.000 0.192 0.220\n",
"=======================================================================================\n"
]
}
],
"source": [
"# 2.bis on fait de même pour un modèle logit avec pénalité \n",
"# pas besoin de redefinir le modèle, il faut faire un fit_regularized\n",
"\n",
"# sans spécification, le alpha optimal est déterminé par cross validation\n",
"# remplacer alpha=32 par la valeur optimale trouvée par cross validation dans la pipeline avec .best_params\n",
"# attention, dans scikit learn, l'hyperparamètre est C = 1/alpha, pas oublier de prendre l'inverse de ce C optimal\n",
"\n",
"result = model_logit.fit_regularized(method='l1', alpha = 32)\n",
"\n",
"print(result.pvalues)\n",
"print(result.summary())"
]
},
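As the comments in the cell above note, the penalty strength should come from cross-validation rather than the fixed alpha = 32. A minimal sketch, assuming scikit-learn's LogisticRegressionCV and the X, y and model_logit objects defined earlier in the notebook; treating statsmodels' alpha as 1/C is an approximation whose exact scaling should be checked:

from sklearn.linear_model import LogisticRegressionCV

# L1-penalised logistic regression, with C selected by 5-fold cross-validation
logit_cv = LogisticRegressionCV(penalty='l1', solver='liblinear', Cs=10, cv=5)
logit_cv.fit(X, y)
best_C = logit_cv.C_[0]

# Reuse the selected strength in statsmodels (alpha taken here as 1/C)
result_cv = model_logit.fit_regularized(method='l1', alpha=1/best_C)
print(result_cv.summary())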
{
"cell_type": "markdown",
"id": "8c3dec50-7b9d-40f6-83b6-6cae26962cf8",
"metadata": {},
"source": [
"### Other method : take into account the weigths ! Pb : with this method, no penalty allowed"
]
},
{
"cell_type": "code",
"execution_count": 247,
"id": "2e3ca381-54e3-445b-bb37-d7ce953cb856",
"metadata": {},
"outputs": [],
"source": [
"# define a function to generate summaries of logit model\n",
"\n",
"def model_logit(X, y, weight_dict, add_constant=False) :\n",
" # Generate sample weights based on class weights computed earlier\n",
" sample_weights = np.array([weight_dict[class_] for class_ in y])\n",
"\n",
" if add_constant :\n",
" X_const = sm.add_constant(X)\n",
" else :\n",
" X_const = X\n",
" \n",
" # Use GLM from statsmodels with Binomial family for logistic regression\n",
" model = sm.GLM(y, X_const, family=sm.families.Binomial(), freq_weights=sample_weights)\n",
" \n",
" # fit without penalty\n",
" result = model.fit()\n",
"\n",
" result_summary = result.summary()\n",
" \n",
" return result_summary"
]
},
{
"cell_type": "code",
"execution_count": 248,
"id": "4cd424a0-7c55-47ff-840e-1354e8dcf863",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
" Generalized Linear Model Regression Results \n",
"==============================================================================\n",
"Dep. Variable: y No. Observations: 354365\n",
"Model: GLM Df Residuals: 354350\n",
"Model Family: Binomial Df Model: 14\n",
"Link Function: Logit Scale: 1.0000\n",
"Method: IRLS Log-Likelihood: -1.8693e+05\n",
"Date: Thu, 21 Mar 2024 Deviance: 3.7387e+05\n",
"Time: 13:19:33 Pearson chi2: 1.97e+16\n",
"No. Iterations: 100 Pseudo R-squ. (CS): 0.2820\n",
"Covariance Type: nonrobust \n",
"=======================================================================================\n",
" coef std err z P>|z| [0.025 0.975]\n",
"---------------------------------------------------------------------------------------\n",
"const -1.3943 0.062 -22.456 0.000 -1.516 -1.273\n",
"nb_tickets -0.3312 0.016 -20.967 0.000 -0.362 -0.300\n",
"nb_purchases 0.9258 0.098 9.491 0.000 0.735 1.117\n",
"total_amount 0.8922 0.042 21.393 0.000 0.810 0.974\n",
"nb_suppliers 0.2238 0.007 32.137 0.000 0.210 0.237\n",
"vente_internet_max -0.7453 0.007 -100.473 0.000 -0.760 -0.731\n",
"purchase_date_min 0.7123 0.015 46.063 0.000 0.682 0.743\n",
"purchase_date_max -1.3328 0.017 -79.297 0.000 -1.366 -1.300\n",
"nb_tickets_internet 0.1784 0.011 16.366 0.000 0.157 0.200\n",
"is_email_true 0.8635 0.061 14.086 0.000 0.743 0.984\n",
"opt_in -1.7487 0.010 -174.737 0.000 -1.768 -1.729\n",
"gender_female 0.8084 0.013 60.803 0.000 0.782 0.835\n",
"gender_male 0.8731 0.014 64.332 0.000 0.846 0.900\n",
"nb_campaigns 0.1751 0.006 31.101 0.000 0.164 0.186\n",
"nb_campaigns_opened 0.2962 0.005 54.145 0.000 0.285 0.307\n",
"=======================================================================================\n"
]
}
],
"source": [
"# with the function\n",
"\n",
"# 1. logit with weights\n",
"results_logit_weight = model_logit(X,y,weight_dict=weight_dict)\n",
"print(results_logit_weight)"
]
},
{
"cell_type": "code",
"execution_count": 252,
"id": "84dd6242-a9c3-4dee-a58b-abc5f1c6f8fa",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
" Generalized Linear Model Regression Results \n",
"==============================================================================\n",
"Dep. Variable: y No. Observations: 354365\n",
"Model: GLM Df Residuals: 354350\n",
"Model Family: Binomial Df Model: 14\n",
"Link Function: Logit Scale: 1.0000\n",
"Method: IRLS Log-Likelihood: -83141.\n",
"Date: Thu, 21 Mar 2024 Deviance: 1.6628e+05\n",
"Time: 13:20:06 Pearson chi2: 4.52e+15\n",
"No. Iterations: 8 Pseudo R-squ. (CS): 0.1180\n",
"Covariance Type: nonrobust \n",
"=======================================================================================\n",
" coef std err z P>|z| [0.025 0.975]\n",
"---------------------------------------------------------------------------------------\n",
"const -3.6025 0.091 -39.755 0.000 -3.780 -3.425\n",
"nb_tickets -0.0230 0.010 -2.191 0.028 -0.044 -0.002\n",
"nb_purchases -0.0519 0.014 -3.609 0.000 -0.080 -0.024\n",
"total_amount 0.0799 0.021 3.841 0.000 0.039 0.121\n",
"nb_suppliers 0.1694 0.010 17.662 0.000 0.151 0.188\n",
"vente_internet_max -0.8764 0.011 -82.965 0.000 -0.897 -0.856\n",
"purchase_date_min 0.5881 0.015 39.936 0.000 0.559 0.617\n",
"purchase_date_max -1.4197 0.016 -89.592 0.000 -1.451 -1.389\n",
"nb_tickets_internet 0.2895 0.013 22.652 0.000 0.264 0.315\n",
"is_email_true 0.8651 0.088 9.797 0.000 0.692 1.038\n",
"opt_in -1.9976 0.019 -107.305 0.000 -2.034 -1.961\n",
"gender_female 0.7032 0.024 29.395 0.000 0.656 0.750\n",
"gender_male 0.8071 0.024 33.201 0.000 0.759 0.855\n",
"nb_campaigns 0.2850 0.009 30.633 0.000 0.267 0.303\n",
"nb_campaigns_opened 0.2061 0.007 28.245 0.000 0.192 0.220\n",
"=======================================================================================\n"
]
}
],
"source": [
"# 2. logit without weights\n",
"\n",
"results_logit = model_logit(X.drop(\"const\", axis=1),y,weight_dict={0:1, 1:1}, add_constant=True)\n",
"print(results_logit)"
]
},
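If both class weights and an L1 penalty are wanted at the same time, one option (an assumption, not used in the notebook) is scikit-learn's LogisticRegression, which accepts class_weight together with penalty='l1'; the sketch below assumes the X, y and weight_dict objects defined earlier and mirrors the alpha = 32 above with C = 1/32:

from sklearn.linear_model import LogisticRegression

# Hypothetical alternative: weighted and L1-penalised at the same time
clf = LogisticRegression(penalty='l1', solver='liblinear', C=1/32, class_weight=weight_dict, max_iter=1000)
clf.fit(X.drop("const", axis=1), y)
print(dict(zip(X.drop("const", axis=1).columns, clf.coef_[0])))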
{
"cell_type": "markdown",
"id": "36c5e770-72b3-4482-ad61-45b511a11f06",
"metadata": {},
"source": [
- "## graphique LASSO - quelles variables sont impotantes dans le modèle ? "
+ "## graphique LASSO - quelles variables sont importantes dans le modèle ? "
]
},
{