Nouveaux datasets

2024-03-24 19:01:29 +00:00 · 2024-03-24 19:01:29 +00:00 · c86c43cc7e
commit c86c43cc7e
parent f5f993aba0
2 changed files with 28 additions and 20 deletions
--- a/0_2_Dataset_construction.py
+++ b/0_2_Dataset_construction.py
@ -37,11 +37,11 @@ def dataset_construction(min_date, end_features_date, max_date, directory_path):
    min_date = pd.to_datetime(min_date, utc = True, format = 'ISO8601')
    # Filter for database df_campaigns_information
-    df_campaigns_information = df_campaigns_information[(df_campaigns_information['sent_at'] <= end_features_date) & (df_campaigns_information['sent_at'] >= min_date)]
+    df_campaigns_information = df_campaigns_information[(df_campaigns_information['sent_at'] < end_features_date) & (df_campaigns_information['sent_at'] >= min_date)]
    df_campaigns_information['opened_at'][df_campaigns_information['opened_at'] >= end_features_date] = np.datetime64('NaT')
    # Filter for database df_products_purchased_reduced
-    df_products_purchased_features = df_products_purchased_reduced[(df_products_purchased_reduced['purchase_date'] <= end_features_date) & (df_products_purchased_reduced['purchase_date'] >= min_date)]
+    df_products_purchased_features = df_products_purchased_reduced[(df_products_purchased_reduced['purchase_date'] < end_features_date) & (df_products_purchased_reduced['purchase_date'] >= min_date)]
    print("Data filtering : SUCCESS")
@ -65,22 +65,29 @@ def dataset_construction(min_date, end_features_date, max_date, directory_path):
    df_customer = pd.merge(df_customerplus_clean, df_campaigns_kpi, on = 'customer_id', how = 'left')
    # Fill NaN values
-    df_customer[['nb_campaigns', 'nb_campaigns_opened']] = df_customer[['nb_campaigns', 'nb_campaigns_opened']].fillna(0)
+    df_customer[['nb_campaigns', 'nb_campaigns_opened', 'taux_ouverture_mail']] = df_customer[['nb_campaigns', 'nb_campaigns_opened', 'taux_ouverture_mail']].fillna(0)
-
+    df_customer['time_to_open'] = df_customer['time_to_open'].fillna(df_customer['time_to_open'].mean())
    # Merge - targets features
    df_customer = pd.merge(df_customer, df_targets_kpi, on = 'customer_id', how = 'left')
    # Fill NaN values
-    df_customer[['nb_targets', 'target_jeune', 'target_optin', 'target_optout', 'target_scolaire', 'target_entreprise', 'target_famille', 'target_newsletter', 'target_abonne']] = df_customer[['nb_targets', 'target_jeune', 'target_optin', 'target_optout', 'target_scolaire', 'target_entreprise', 'target_famille', 'target_newsletter', 'target_abonne']].fillna(0)
+    targets_columns = list(df_targets_kpi.columns)
    targets_columns.remove('customer_id')
    df_customer[targets_columns] = df_customer[targets_columns].fillna(0)
    # We standardise the number of targets closely linked to the company's operations
    df_customer['nb_targets'] = (df_customer['nb_targets'] - (df_customer['nb_targets'].mean())) / (df_customer['nb_targets'].std())
    # Merge - purchasing behavior features
-    df_customer_product = pd.merge(df_customer, df_tickets_kpi, on = 'customer_id', how = 'outer')
+    df_customer_product = pd.merge(df_customer, df_tickets_kpi, on = 'customer_id', how = 'left')
    # Fill NaN values
-    df_customer_product[['nb_tickets', 'nb_purchases', 'total_amount', 'nb_suppliers', 'vente_internet_max', 'nb_tickets_internet']] = df_customer_product[['nb_tickets', 'nb_purchases', 'total_amount', 'nb_suppliers', 'vente_internet_max', 'nb_tickets_internet']].fillna(0)
+    special_fill_nan = ['customer_id', 'purchase_date_min', 'purchase_date_max', 'time_between_purchase']    
    simple_fill_nan = [column for column in list(df_tickets_kpi.columns) if column not in special_fill_nan]
    df_customer_product[simple_fill_nan] = df_customer_product[simple_fill_nan].fillna(0)
    max_interval = (end_features_date - min_date) / np.timedelta64(1, 'D') + 1
    df_customer_product[['purchase_date_max', 'purchase_date_min']] = df_customer_product[['purchase_date_max', 'purchase_date_min']].fillna(max_interval)
@ -92,7 +99,7 @@ def dataset_construction(min_date, end_features_date, max_date, directory_path):
    print("Explanatory variable construction : SUCCESS")
    # 2. Construction of the explained variable 
-    df_products_purchased_to_predict = df_products_purchased_reduced[(df_products_purchased_reduced['purchase_date'] <= max_date) & (df_products_purchased_reduced['purchase_date'] > end_features_date)]
+    df_products_purchased_to_predict = df_products_purchased_reduced[(df_products_purchased_reduced['purchase_date'] < max_date) & (df_products_purchased_reduced['purchase_date'] >= end_features_date)]
    # Construction of the dependant variable
    df_products_purchased_to_predict['y_has_purchased'] = 1
@ -123,7 +130,7 @@ type_of_comp = input('Choisissez le type de compagnie : sport ? musique ? musee
 list_of_comp = companies[type_of_comp] 
 # Export folder
-BUCKET_OUT = f'projet-bdc2324-team1/Generalization/{type_of_comp}'
+BUCKET_OUT = f'projet-bdc2324-team1/Generalization_v2/{type_of_comp}'
 # Dates used for the construction of features and the dependant variable
 start_date = "2021-05-01"
--- a/0_KPI_functions.py
+++ b/0_KPI_functions.py
@ -13,14 +13,14 @@ def display_input_databases(directory_path, file_name, datetime_col = None):
        df = pd.read_csv(file_in, sep=",", parse_dates = datetime_col, date_parser=custom_date_parser)        
    return df       
-def campaigns_kpi_function(campaigns_information = None, max_date = None):
+def campaigns_kpi_function(campaigns_information = None, max_date = "2023-12-01"):
-     
+    
    # Nombre de campagnes de mails
    nb_campaigns = campaigns_information[['customer_id', 'campaign_name']].groupby('customer_id').count().reset_index()
    nb_campaigns.rename(columns = {'campaign_name' : 'nb_campaigns'}, inplace = True)
    # Temps d'ouverture moyen (en minutes)
-    campaigns_information['time_to_open'] = (pd.to_datetime(campaigns_information['opened_at'], utc = True, format = 'ISO8601') - pd.to_datetime(campaigns_information['delivered_at'], utc = True, format = 'ISO8601')) / np.timedelta64(1, 'h')
+    campaigns_information['time_to_open'] = ((pd.to_datetime(campaigns_information['opened_at'], utc = True, format = 'ISO8601') - pd.to_datetime(campaigns_information['delivered_at'], utc = True, format = 'ISO8601')) / np.timedelta64(1, 'h'))
    campaigns_information['time_to_open'] = campaigns_information['time_to_open'].fillna((pd.to_datetime(campaigns_information['delivered_at'], utc = True, format = 'ISO8601') - pd.to_datetime(max_date, utc = True, format = 'ISO8601')) / np.timedelta64(1, 'h'))
    time_to_open = campaigns_information[['customer_id', 'time_to_open']].groupby('customer_id').mean().reset_index()
@ -99,6 +99,8 @@ def customerplus_kpi_function(customerplus_clean = None):
    })
    gender_dummies = pd.get_dummies(customerplus_clean["gender_label"], prefix='gender').astype(int)
    customerplus_clean = pd.concat([customerplus_clean, gender_dummies], axis=1)
    customerplus_clean.drop(columns = "gender", inplace = True)
    # Age
    customerplus_clean['categorie_age_0_10'] = ((customerplus_clean['age'] >= 0) & (customerplus_clean['age'] < 10)).astype(int)
@ -111,25 +113,24 @@ def customerplus_kpi_function(customerplus_clean = None):
    customerplus_clean['categorie_age_70_80'] = ((customerplus_clean['age'] >= 70) & (customerplus_clean['age'] < 80)).astype(int)
    customerplus_clean['categorie_age_plus_80'] = (customerplus_clean['age'] >= 80).astype(int)
    customerplus_clean['categorie_age_inconnue'] = customerplus_clean['age'].apply(lambda x: 1 if pd.isna(x) else 0)
    customerplus_clean.drop(columns = "age", inplace = True)
    # Consentement au mailing
    customerplus_clean['opt_in'] = customerplus_clean['opt_in'].astype(int)
    # Indicatrice si individue vit en France
    customerplus_clean["country_fr"] = customerplus_clean["country"].apply(lambda x : int(x=="fr") if pd.notna(x) else np.nan)
-
+    customerplus_clean.drop(columns = "country", inplace = True)
    customerplus_clean['is_profession_known'] = customerplus_clean['profession'].notna().astype(int)
-    
+    customerplus_clean.drop(columns = "profession", inplace = True)
    customerplus_clean['is_zipcode_known'] = customerplus_clean['zipcode'].notna().astype(int)
    customerplus_clean.drop(columns = "zipcode", inplace = True)
    # Dummy if the customer has a structure id (tags)
    # customerplus_clean['has_tags'] = customerplus_clean['structure_id'].apply(lambda x: 1 if not pd.isna(x) else 0)
    return customerplus_clean
 def concatenate_names(names):
    return ', '.join(names)
 def targets_KPI(df_target = None):
    df_target['target_name'] = df_target['target_name'].fillna('').str.lower()