From c6abfbe76ed1e2a07d3d84b21ddadfc05f89209b Mon Sep 17 00:00:00 2001
From: ajoubrel-ensae <antoine.joubrel@ensae.fr>
Date: Sat, 9 Mar 2024 17:50:32 +0000
Subject: [PATCH] Fix dataset construction: fill NaN features, keep fillna result, quote anonymous customer IDs

---
 0_2_Dataset_construction.py | 12 ++++++++----
 0_KPI_functions.py          |  3 ++-
 2 files changed, 10 insertions(+), 5 deletions(-)

diff --git a/0_2_Dataset_construction.py b/0_2_Dataset_construction.py
index 543a286..75c8259 100644
--- a/0_2_Dataset_construction.py
+++ b/0_2_Dataset_construction.py
@@ -110,6 +110,10 @@ def dataset_construction(min_date, end_features_date, max_date, directory_path):
     # Fill NaN values
     df_customer_product[['nb_tickets', 'nb_purchases', 'total_amount', 'nb_suppliers', 'vente_internet_max', 'nb_tickets_internet']] = df_customer_product[['nb_tickets', 'nb_purchases', 'total_amount', 'nb_suppliers', 'vente_internet_max', 'nb_tickets_internet']].fillna(0)
 
+    max_interval = (end_features_date - min_date) / np.timedelta64(1, 'D') + 1
+    df_customer_product[['purchase_date_max', 'purchase_date_min']] = df_customer_product[['purchase_date_max', 'purchase_date_min']].fillna(max_interval)
+    df_customer_product[['time_between_purchase']] = df_customer_product[['time_between_purchase']].fillna(-1)
+    
     print("Explanatory variable construction : SUCCESS")
 
     # 2. Construction of the explained variable 
@@ -126,7 +130,7 @@ def dataset_construction(min_date, end_features_date, max_date, directory_path):
     dataset = pd.merge(df_customer_product, y, on = ['customer_id'], how = 'left')
 
     # 0 if there is no purchase
-    dataset[['y_has_purchased']].fillna(0)
+    dataset[['y_has_purchased']] = dataset[['y_has_purchased']].fillna(0)
 
     # add id_company prefix to customer_id
     dataset['customer_id'] = directory_path + '_' + dataset['customer_id'].astype('str')
@@ -156,9 +160,9 @@ end_of_features = "2022-11-01"
 final_date = "2023-11-01"
 
 
-anonymous_customer = {'1' : 1_1, '2' : 2_12184, '3' : 3_1, '4' : 4_2, '101' : 101_1,
-                      '5' : 5_191835, '6' : 6_591412, '7' : 7_49632, '8' : 8_1942, '9' : 9_19683,
-                     '10' : 10_19521, '11' : 11_36 , '12' : 12_1706757, '13' : 13_8422, '14' : 14_6354 }
+anonymous_customer = {'1' : '1_1', '2' : '2_12184', '3' : '3_1', '4' : '4_2', '101' : '101_1',
+                      '5' : '5_191835', '6' : '6_591412', '7' : '7_49632', '8' : '8_1942', '9' : '9_19683',
+                     '10' : '10_19521', '11' : '11_36', '12' : '12_1706757', '13' : '13_8422', '14' : '14_6354'}
 
 for company in list_of_comp:
     dataset = dataset_construction(min_date = start_date, end_features_date = end_of_features,
diff --git a/0_KPI_functions.py b/0_KPI_functions.py
index 837e785..f991ced 100644
--- a/0_KPI_functions.py
+++ b/0_KPI_functions.py
@@ -18,7 +18,8 @@ def campaigns_kpi_function(campaigns_information = None):
     # Nombre de campagnes de mails
     nb_campaigns = campaigns_information[['customer_id', 'campaign_name']].groupby('customer_id').count().reset_index()
     nb_campaigns.rename(columns = {'campaign_name' : 'nb_campaigns'}, inplace = True)
-    # Temps d'ouverture en min moyen    
+    
+    # Average time to open (in minutes)
     campaigns_information['time_to_open'] = pd.to_datetime(campaigns_information['opened_at'], utc = True, format = 'ISO8601') - pd.to_datetime(campaigns_information['delivered_at'], utc = True, format = 'ISO8601')
     time_to_open = campaigns_information[['customer_id', 'time_to_open']].groupby('customer_id').mean().reset_index()