From c6abfbe76ed1e2a07d3d84b21ddadfc05f89209b Mon Sep 17 00:00:00 2001 From: ajoubrel-ensae Date: Sat, 9 Mar 2024 17:50:32 +0000 Subject: [PATCH] Correction construction --- 0_2_Dataset_construction.py | 12 ++++++++---- 0_KPI_functions.py | 3 ++- 2 files changed, 10 insertions(+), 5 deletions(-) diff --git a/0_2_Dataset_construction.py b/0_2_Dataset_construction.py index 543a286..75c8259 100644 --- a/0_2_Dataset_construction.py +++ b/0_2_Dataset_construction.py @@ -110,6 +110,10 @@ def dataset_construction(min_date, end_features_date, max_date, directory_path): # Fill NaN values df_customer_product[['nb_tickets', 'nb_purchases', 'total_amount', 'nb_suppliers', 'vente_internet_max', 'nb_tickets_internet']] = df_customer_product[['nb_tickets', 'nb_purchases', 'total_amount', 'nb_suppliers', 'vente_internet_max', 'nb_tickets_internet']].fillna(0) + max_interval = (end_features_date - min_date) / np.timedelta64(1, 'D') + 1 + df_customer_product[['purchase_date_max', 'purchase_date_min']] = df_customer_product[['purchase_date_max', 'purchase_date_min']].fillna(max_interval) + df_customer_product[['time_between_purchase']] = df_customer_product[['time_between_purchase']].fillna(-1) + print("Explanatory variable construction : SUCCESS") # 2. Construction of the explained variable @@ -126,7 +130,7 @@ def dataset_construction(min_date, end_features_date, max_date, directory_path): dataset = pd.merge(df_customer_product, y, on = ['customer_id'], how = 'left') # 0 if there is no purchase - dataset[['y_has_purchased']].fillna(0) + dataset[['y_has_purchased']] = dataset[['y_has_purchased']].fillna(0) # add id_company prefix to customer_id dataset['customer_id'] = directory_path + '_' + dataset['customer_id'].astype('str') @@ -156,9 +160,9 @@ end_of_features = "2022-11-01" final_date = "2023-11-01" -anonymous_customer = {'1' : 1_1, '2' : 2_12184, '3' : 3_1, '4' : 4_2, '101' : 101_1, - '5' : 5_191835, '6' : 6_591412, '7' : 7_49632, '8' : 8_1942, '9' : 9_19683, - '10' : 10_19521, '11' : 11_36 , '12' : 12_1706757, '13' : 13_8422, '14' : 14_6354 } +anonymous_customer = {'1' : '1_1', '2' : '2_12184', '3' : '3_1', '4' : '4_2', '101' : '101_1', + '5' : '5_191835', '6' : '6_591412', '7' : '7_49632', '8' : '8_1942', '9' : '9_19683', + '10' : '10_19521', '11' : '11_36', '12' : '12_1706757', '13' : '13_8422', '14' : '14_6354'} for company in list_of_comp: dataset = dataset_construction(min_date = start_date, end_features_date = end_of_features, diff --git a/0_KPI_functions.py b/0_KPI_functions.py index 837e785..f991ced 100644 --- a/0_KPI_functions.py +++ b/0_KPI_functions.py @@ -18,7 +18,8 @@ def campaigns_kpi_function(campaigns_information = None): # Nombre de campagnes de mails nb_campaigns = campaigns_information[['customer_id', 'campaign_name']].groupby('customer_id').count().reset_index() nb_campaigns.rename(columns = {'campaign_name' : 'nb_campaigns'}, inplace = True) - # Temps d'ouverture en min moyen + + # Temps d'ouverture moyen (en minutes) campaigns_information['time_to_open'] = pd.to_datetime(campaigns_information['opened_at'], utc = True, format = 'ISO8601') - pd.to_datetime(campaigns_information['delivered_at'], utc = True, format = 'ISO8601') time_to_open = campaigns_information[['customer_id', 'time_to_open']].groupby('customer_id').mean().reset_index()