From c86c43cc7ed68a2007a91371a0c42af3c01194ee Mon Sep 17 00:00:00 2001
From: ajoubrel-ensae
Date: Sun, 24 Mar 2024 19:01:29 +0000
Subject: [PATCH] Nouveau datasets

---
 0_2_Dataset_construction.py | 25 ++++++++++++++++---------
 0_KPI_functions.py          | 23 ++++++++++++-----------
 2 files changed, 28 insertions(+), 20 deletions(-)

diff --git a/0_2_Dataset_construction.py b/0_2_Dataset_construction.py
index a5816cb..863fbf3 100644
--- a/0_2_Dataset_construction.py
+++ b/0_2_Dataset_construction.py
@@ -37,11 +37,11 @@ def dataset_construction(min_date, end_features_date, max_date, directory_path):
     min_date = pd.to_datetime(min_date, utc = True, format = 'ISO8601')
 
     # Filter for database df_campaigns_information
-    df_campaigns_information = df_campaigns_information[(df_campaigns_information['sent_at'] <= end_features_date) & (df_campaigns_information['sent_at'] >= min_date)]
+    df_campaigns_information = df_campaigns_information[(df_campaigns_information['sent_at'] < end_features_date) & (df_campaigns_information['sent_at'] >= min_date)]
     df_campaigns_information['opened_at'][df_campaigns_information['opened_at'] >= end_features_date] = np.datetime64('NaT')
 
     # Filter for database df_products_purchased_reduced
-    df_products_purchased_features = df_products_purchased_reduced[(df_products_purchased_reduced['purchase_date'] <= end_features_date) & (df_products_purchased_reduced['purchase_date'] >= min_date)]
+    df_products_purchased_features = df_products_purchased_reduced[(df_products_purchased_reduced['purchase_date'] < end_features_date) & (df_products_purchased_reduced['purchase_date'] >= min_date)]
 
     print("Data filtering : SUCCESS")
 
@@ -65,22 +65,29 @@ def dataset_construction(min_date, end_features_date, max_date, directory_path):
     df_customer = pd.merge(df_customerplus_clean, df_campaigns_kpi, on = 'customer_id', how = 'left')
 
     # Fill NaN values
-    df_customer[['nb_campaigns', 'nb_campaigns_opened']] = df_customer[['nb_campaigns', 'nb_campaigns_opened']].fillna(0)
-
+    df_customer[['nb_campaigns', 'nb_campaigns_opened', 'taux_ouverture_mail']] = df_customer[['nb_campaigns', 'nb_campaigns_opened', 'taux_ouverture_mail']].fillna(0)
+    df_customer['time_to_open'] = df_customer['time_to_open'].fillna(df_customer['time_to_open'].mean())
+
     # Merge - targets features
     df_customer = pd.merge(df_customer, df_targets_kpi, on = 'customer_id', how = 'left')
 
     # Fill NaN values
-    df_customer[['nb_targets', 'target_jeune', 'target_optin', 'target_optout', 'target_scolaire', 'target_entreprise', 'target_famille', 'target_newsletter', 'target_abonne']] = df_customer[['nb_targets', 'target_jeune', 'target_optin', 'target_optout', 'target_scolaire', 'target_entreprise', 'target_famille', 'target_newsletter', 'target_abonne']].fillna(0)
+    targets_columns = list(df_targets_kpi.columns)
+    targets_columns.remove('customer_id')
+
+    df_customer[targets_columns] = df_customer[targets_columns].fillna(0)
 
     # We standardise the number of targets closely linked to the company's operations
     df_customer['nb_targets'] = (df_customer['nb_targets'] - (df_customer['nb_targets'].mean())) / (df_customer['nb_targets'].std())
 
     # Merge - purchasing behavior features
-    df_customer_product = pd.merge(df_customer, df_tickets_kpi, on = 'customer_id', how = 'outer')
+    df_customer_product = pd.merge(df_customer, df_tickets_kpi, on = 'customer_id', how = 'left')
 
     # Fill NaN values
-    df_customer_product[['nb_tickets', 'nb_purchases', 'total_amount', 'nb_suppliers', 'vente_internet_max', 'nb_tickets_internet']] = df_customer_product[['nb_tickets', 'nb_purchases', 'total_amount', 'nb_suppliers', 'vente_internet_max', 'nb_tickets_internet']].fillna(0)
+    special_fill_nan = ['customer_id', 'purchase_date_min', 'purchase_date_max', 'time_between_purchase']
+    simple_fill_nan = [column for column in list(df_tickets_kpi.columns) if column not in special_fill_nan]
+
+    df_customer_product[simple_fill_nan] = df_customer_product[simple_fill_nan].fillna(0)
 
     max_interval = (end_features_date - min_date) / np.timedelta64(1, 'D') + 1
     df_customer_product[['purchase_date_max', 'purchase_date_min']] = df_customer_product[['purchase_date_max', 'purchase_date_min']].fillna(max_interval)
@@ -92,7 +99,7 @@ def dataset_construction(min_date, end_features_date, max_date, directory_path):
     print("Explanatory variable construction : SUCCESS")
 
     # 2. Construction of the explained variable
-    df_products_purchased_to_predict = df_products_purchased_reduced[(df_products_purchased_reduced['purchase_date'] <= max_date) & (df_products_purchased_reduced['purchase_date'] > end_features_date)]
+    df_products_purchased_to_predict = df_products_purchased_reduced[(df_products_purchased_reduced['purchase_date'] < max_date) & (df_products_purchased_reduced['purchase_date'] >= end_features_date)]
 
     # Construction of the dependant variable
     df_products_purchased_to_predict['y_has_purchased'] = 1
@@ -123,7 +130,7 @@ type_of_comp = input('Choisissez le type de compagnie : sport ? musique ? musee
 list_of_comp = companies[type_of_comp]
 
 # Export folder
-BUCKET_OUT = f'projet-bdc2324-team1/Generalization/{type_of_comp}'
+BUCKET_OUT = f'projet-bdc2324-team1/Generalization_v2/{type_of_comp}'
 
 # Dates used for the construction of features and the dependant variable
 start_date = "2021-05-01"

diff --git a/0_KPI_functions.py b/0_KPI_functions.py
index 2425532..229ec89 100644
--- a/0_KPI_functions.py
+++ b/0_KPI_functions.py
@@ -13,14 +13,14 @@ def display_input_databases(directory_path, file_name, datetime_col = None):
         df = pd.read_csv(file_in, sep=",", parse_dates = datetime_col, date_parser=custom_date_parser)
     return df
 
-def campaigns_kpi_function(campaigns_information = None, max_date = None):
-
+def campaigns_kpi_function(campaigns_information = None, max_date = "2023-12-01"):
+
     # Nombre de campagnes de mails
     nb_campaigns = campaigns_information[['customer_id', 'campaign_name']].groupby('customer_id').count().reset_index()
     nb_campaigns.rename(columns = {'campaign_name' : 'nb_campaigns'}, inplace = True)
 
     # Temps d'ouverture moyen (en minutes)
-    campaigns_information['time_to_open'] = (pd.to_datetime(campaigns_information['opened_at'], utc = True, format = 'ISO8601') - pd.to_datetime(campaigns_information['delivered_at'], utc = True, format = 'ISO8601')) / np.timedelta64(1, 'h')
+    campaigns_information['time_to_open'] = ((pd.to_datetime(campaigns_information['opened_at'], utc = True, format = 'ISO8601') - pd.to_datetime(campaigns_information['delivered_at'], utc = True, format = 'ISO8601')) / np.timedelta64(1, 'h'))
     campaigns_information['time_to_open'] = campaigns_information['time_to_open'].fillna((pd.to_datetime(campaigns_information['delivered_at'], utc = True, format = 'ISO8601') - pd.to_datetime(max_date, utc = True, format = 'ISO8601')) / np.timedelta64(1, 'h'))
 
     time_to_open = campaigns_information[['customer_id', 'time_to_open']].groupby('customer_id').mean().reset_index()
@@ -99,6 +99,8 @@ def customerplus_kpi_function(customerplus_clean = None):
     })
     gender_dummies = pd.get_dummies(customerplus_clean["gender_label"], prefix='gender').astype(int)
     customerplus_clean = pd.concat([customerplus_clean, gender_dummies], axis=1)
+    customerplus_clean.drop(columns = "gender", inplace = True)
+
 
     # Age
     customerplus_clean['categorie_age_0_10'] = ((customerplus_clean['age'] >= 0) & (customerplus_clean['age'] < 10)).astype(int)
@@ -111,25 +113,24 @@
     customerplus_clean['categorie_age_70_80'] = ((customerplus_clean['age'] >= 70) & (customerplus_clean['age'] < 80)).astype(int)
     customerplus_clean['categorie_age_plus_80'] = (customerplus_clean['age'] >= 80).astype(int)
     customerplus_clean['categorie_age_inconnue'] = customerplus_clean['age'].apply(lambda x: 1 if pd.isna(x) else 0)
+    customerplus_clean.drop(columns = "age", inplace = True)
 
     # Consentement au mailing
     customerplus_clean['opt_in'] = customerplus_clean['opt_in'].astype(int)
 
     # Indicatrice si individue vit en France
     customerplus_clean["country_fr"] = customerplus_clean["country"].apply(lambda x : int(x=="fr") if pd.notna(x) else np.nan)
-
+    customerplus_clean.drop(columns = "country", inplace = True)
+
     customerplus_clean['is_profession_known'] = customerplus_clean['profession'].notna().astype(int)
-
+    customerplus_clean.drop(columns = "profession", inplace = True)
+
     customerplus_clean['is_zipcode_known'] = customerplus_clean['zipcode'].notna().astype(int)
+    customerplus_clean.drop(columns = "zipcode", inplace = True)
+
-    # Dummy if the customer has a structure id (tags)
-    # customerplus_clean['has_tags'] = customerplus_clean['structure_id'].apply(lambda x: 1 if not pd.isna(x) else 0)
-
     return customerplus_clean
 
-def concatenate_names(names):
-    return ', '.join(names)
-
 def targets_KPI(df_target = None):
 
     df_target['target_name'] = df_target['target_name'].fillna('').str.lower()
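
Note: the sketch below is a minimal, self-contained illustration of the NaN-filling pattern this patch introduces in 0_2_Dataset_construction.py: the zero-filled columns are derived from df_tickets_kpi itself instead of a hard-coded list, while the purchase-date columns fall back to the maximum observable interval. The toy dataframes, the example end_features_date, and the day-offset encoding of purchase_date_min/purchase_date_max are assumptions made only for this illustration, not project data.

    import numpy as np
    import pandas as pd

    # Toy customer base and ticket KPIs (column names mirror the diff; values are made up).
    df_customer = pd.DataFrame({'customer_id': [1, 2, 3]})
    df_tickets_kpi = pd.DataFrame({
        'customer_id': [1, 2],
        'nb_tickets': [4, 1],
        'total_amount': [120.0, 15.0],
        # Assumed here to already be day offsets, since they are later filled with a day count.
        'purchase_date_min': [2.0, 30.0],
        'purchase_date_max': [1.0, 10.0],
        'time_between_purchase': [1.0, 20.0],
    })

    # Left merge keeps every known customer, as in the patched version.
    df_customer_product = pd.merge(df_customer, df_tickets_kpi, on='customer_id', how='left')

    # Zero-fill every KPI column except the identifier and the date-based columns.
    special_fill_nan = ['customer_id', 'purchase_date_min', 'purchase_date_max', 'time_between_purchase']
    simple_fill_nan = [col for col in df_tickets_kpi.columns if col not in special_fill_nan]
    df_customer_product[simple_fill_nan] = df_customer_product[simple_fill_nan].fillna(0)

    # Customers with no purchase history get the maximum observable interval (in days).
    min_date = pd.to_datetime('2021-05-01', utc=True)          # start_date from the patch
    end_features_date = pd.to_datetime('2023-11-01', utc=True)  # illustrative placeholder
    max_interval = (end_features_date - min_date) / np.timedelta64(1, 'D') + 1
    df_customer_product[['purchase_date_max', 'purchase_date_min']] = (
        df_customer_product[['purchase_date_max', 'purchase_date_min']].fillna(max_interval)
    )

    print(df_customer_product)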