Merge branch 'main' into generalization

2024-03-10 08:46:23 +00:00 · 2024-03-10 08:46:23 +00:00 · 198ef45247
commit 198ef45247
parent d3fa9f6870 0eedea6e26
4 changed files with 3780 additions and 116 deletions
--- a/0_2_Dataset_construction.py
+++ b/0_2_Dataset_construction.py
@@ -110,6 +110,10 @@ def dataset_construction(min_date, end_features_date, max_date, directory_path):
    # Fill NaN values
    df_customer_product[['nb_tickets', 'nb_purchases', 'total_amount', 'nb_suppliers', 'vente_internet_max', 'nb_tickets_internet']] = df_customer_product[['nb_tickets', 'nb_purchases', 'total_amount', 'nb_suppliers', 'vente_internet_max', 'nb_tickets_internet']].fillna(0)

+    max_interval = (end_features_date - min_date) / np.timedelta64(1, 'D') + 1
+    df_customer_product[['purchase_date_max', 'purchase_date_min']] = df_customer_product[['purchase_date_max', 'purchase_date_min']].fillna(max_interval)
+    df_customer_product[['time_between_purchase']] = df_customer_product[['time_between_purchase']].fillna(-1)
+    
    print("Explanatory variable construction : SUCCESS")

    # 2. Construction of the explained variable 
@@ -126,7 +130,7 @@ def dataset_construction(min_date, end_features_date, max_date, directory_path):
    dataset = pd.merge(df_customer_product, y, on = ['customer_id'], how = 'left')

    # 0 if there is no purchase
-    dataset[['y_has_purchased']].fillna(0)
+    dataset[['y_has_purchased']] = dataset[['y_has_purchased']].fillna(0)

    # add id_company prefix to customer_id
    dataset['customer_id'] = directory_path + '_' + dataset['customer_id'].astype('str')
@@ -156,9 +160,9 @@ end_of_features = "2022-11-01"
 final_date = "2023-11-01"


-anonymous_customer = {'1' : 1_1, '2' : 2_12184, '3' : 3_1, '4' : 4_2, '101' : 101_1,
-                      '5' : 5_191835, '6' : 6_591412, '7' : 7_49632, '8' : 8_1942, '9' : 9_19683,
-                     '10' : 10_19521, '11' : 11_36 , '12' : 12_1706757, '13' : 13_8422, '14' : 14_6354 }
+anonymous_customer = {'1' : '1_1', '2' : '2_12184', '3' : '3_1', '4' : '4_2', '101' : '101_1',
+                      '5' : '5_191835', '6' : '6_591412', '7' : '7_49632', '8' : '8_1942', '9' : '9_19683',
+                     '10' : '10_19521', '11' : '11_36', '12' : '12_1706757', '13' : '13_8422', '14' : '14_6354'}

 for company in list_of_comp:
    dataset = dataset_construction(min_date = start_date, end_features_date = end_of_features,
--- a/0_KPI_functions.py
+++ b/0_KPI_functions.py
@@ -18,7 +18,8 @@ def campaigns_kpi_function(campaigns_information = None):
    # Nombre de campagnes de mails
    nb_campaigns = campaigns_information[['customer_id', 'campaign_name']].groupby('customer_id').count().reset_index()
    nb_campaigns.rename(columns = {'campaign_name' : 'nb_campaigns'}, inplace = True)
-    # Temps d'ouverture en min moyen    
+    
+    # Temps d'ouverture moyen (en minutes)
    campaigns_information['time_to_open'] = pd.to_datetime(campaigns_information['opened_at'], utc = True, format = 'ISO8601') - pd.to_datetime(campaigns_information['delivered_at'], utc = True, format = 'ISO8601')
    time_to_open = campaigns_information[['customer_id', 'time_to_open']].groupby('customer_id').mean().reset_index()

--- a/Musee/1_Descriptive_Statistics_Museum.ipynb
+++ b/Musee/1_Descriptive_Statistics_Museum.ipynb
--- a/Musee/2_modelisation_pipeline+visu.ipynb
+++ b/Musee/2_modelisation_pipeline+visu.ipynb