From c86c43cc7ed68a2007a91371a0c42af3c01194ee Mon Sep 17 00:00:00 2001
From: ajoubrel-ensae
Date: Sun, 24 Mar 2024 19:01:29 +0000
Subject: [PATCH] Nouveau datasets

---
 0_2_Dataset_construction.py | 25 ++++++++++++++++---------
 0_KPI_functions.py          | 23 ++++++++++++-----------
 2 files changed, 28 insertions(+), 20 deletions(-)

diff --git a/0_2_Dataset_construction.py b/0_2_Dataset_construction.py
index a5816cb..863fbf3 100644
--- a/0_2_Dataset_construction.py
+++ b/0_2_Dataset_construction.py
@@ -37,11 +37,11 @@ def dataset_construction(min_date, end_features_date, max_date, directory_path):
     min_date = pd.to_datetime(min_date, utc = True, format = 'ISO8601')
 
     # Filter for database df_campaigns_information
-    df_campaigns_information = df_campaigns_information[(df_campaigns_information['sent_at'] <= end_features_date) & (df_campaigns_information['sent_at'] >= min_date)]
+    df_campaigns_information = df_campaigns_information[(df_campaigns_information['sent_at'] < end_features_date) & (df_campaigns_information['sent_at'] >= min_date)]
     df_campaigns_information['opened_at'][df_campaigns_information['opened_at'] >= end_features_date] = np.datetime64('NaT')
 
     # Filter for database df_products_purchased_reduced
-    df_products_purchased_features = df_products_purchased_reduced[(df_products_purchased_reduced['purchase_date'] <= end_features_date) & (df_products_purchased_reduced['purchase_date'] >= min_date)]
+    df_products_purchased_features = df_products_purchased_reduced[(df_products_purchased_reduced['purchase_date'] < end_features_date) & (df_products_purchased_reduced['purchase_date'] >= min_date)]
 
     print("Data filtering : SUCCESS")
 
@@ -65,22 +65,29 @@ def dataset_construction(min_date, end_features_date, max_date, directory_path):
     df_customer = pd.merge(df_customerplus_clean, df_campaigns_kpi, on = 'customer_id', how = 'left')
 
     # Fill NaN values
-    df_customer[['nb_campaigns', 'nb_campaigns_opened']] = df_customer[['nb_campaigns', 'nb_campaigns_opened']].fillna(0)
-
+    df_customer[['nb_campaigns', 'nb_campaigns_opened', 'taux_ouverture_mail']] = df_customer[['nb_campaigns', 'nb_campaigns_opened', 'taux_ouverture_mail']].fillna(0)
+    df_customer['time_to_open'] = df_customer['time_to_open'].fillna(df_customer['time_to_open'].mean())
+
     # Merge - targets features
     df_customer = pd.merge(df_customer, df_targets_kpi, on = 'customer_id', how = 'left')
 
     # Fill NaN values
-    df_customer[['nb_targets', 'target_jeune', 'target_optin', 'target_optout', 'target_scolaire', 'target_entreprise', 'target_famille', 'target_newsletter', 'target_abonne']] = df_customer[['nb_targets', 'target_jeune', 'target_optin', 'target_optout', 'target_scolaire', 'target_entreprise', 'target_famille', 'target_newsletter', 'target_abonne']].fillna(0)
+    targets_columns = list(df_targets_kpi.columns)
+    targets_columns.remove('customer_id')
+
+    df_customer[targets_columns] = df_customer[targets_columns].fillna(0)
 
     # We standardise the number of targets closely linked to the company's operations
     df_customer['nb_targets'] = (df_customer['nb_targets'] - (df_customer['nb_targets'].mean())) / (df_customer['nb_targets'].std())
 
     # Merge - purchasing behavior features
-    df_customer_product = pd.merge(df_customer, df_tickets_kpi, on = 'customer_id', how = 'outer')
+    df_customer_product = pd.merge(df_customer, df_tickets_kpi, on = 'customer_id', how = 'left')
 
     # Fill NaN values
-    df_customer_product[['nb_tickets', 'nb_purchases', 'total_amount', 'nb_suppliers', 'vente_internet_max', 'nb_tickets_internet']] = df_customer_product[['nb_tickets', 'nb_purchases', 'total_amount', 'nb_suppliers', 'vente_internet_max', 'nb_tickets_internet']].fillna(0)
+    special_fill_nan = ['customer_id', 'purchase_date_min', 'purchase_date_max', 'time_between_purchase']
+    simple_fill_nan = [column for column in list(df_tickets_kpi.columns) if column not in special_fill_nan]
+
+    df_customer_product[simple_fill_nan] = df_customer_product[simple_fill_nan].fillna(0)
 
     max_interval = (end_features_date - min_date) / np.timedelta64(1, 'D') + 1
     df_customer_product[['purchase_date_max', 'purchase_date_min']] = df_customer_product[['purchase_date_max', 'purchase_date_min']].fillna(max_interval)
@@ -92,7 +99,7 @@ def dataset_construction(min_date, end_features_date, max_date, directory_path):
     print("Explanatory variable construction : SUCCESS")
 
     # 2. Construction of the explained variable
-    df_products_purchased_to_predict = df_products_purchased_reduced[(df_products_purchased_reduced['purchase_date'] <= max_date) & (df_products_purchased_reduced['purchase_date'] > end_features_date)]
+    df_products_purchased_to_predict = df_products_purchased_reduced[(df_products_purchased_reduced['purchase_date'] < max_date) & (df_products_purchased_reduced['purchase_date'] >= end_features_date)]
 
     # Construction of the dependant variable
     df_products_purchased_to_predict['y_has_purchased'] = 1
@@ -123,7 +130,7 @@ type_of_comp = input('Choisissez le type de compagnie : sport ? musique ? musee
 list_of_comp = companies[type_of_comp]
 
 # Export folder
-BUCKET_OUT = f'projet-bdc2324-team1/Generalization/{type_of_comp}'
+BUCKET_OUT = f'projet-bdc2324-team1/Generalization_v2/{type_of_comp}'
 
 # Dates used for the construction of features and the dependant variable
 start_date = "2021-05-01"

diff --git a/0_KPI_functions.py b/0_KPI_functions.py
index 2425532..229ec89 100644
--- a/0_KPI_functions.py
+++ b/0_KPI_functions.py
@@ -13,14 +13,14 @@ def display_input_databases(directory_path, file_name, datetime_col = None):
         df = pd.read_csv(file_in, sep=",", parse_dates = datetime_col, date_parser=custom_date_parser)
     return df
 
-def campaigns_kpi_function(campaigns_information = None, max_date = None):
-
+def campaigns_kpi_function(campaigns_information = None, max_date = "2023-12-01"):
+
     # Nombre de campagnes de mails
     nb_campaigns = campaigns_information[['customer_id', 'campaign_name']].groupby('customer_id').count().reset_index()
     nb_campaigns.rename(columns = {'campaign_name' : 'nb_campaigns'}, inplace = True)
 
     # Temps d'ouverture moyen (en minutes)
-    campaigns_information['time_to_open'] = (pd.to_datetime(campaigns_information['opened_at'], utc = True, format = 'ISO8601') - pd.to_datetime(campaigns_information['delivered_at'], utc = True, format = 'ISO8601')) / np.timedelta64(1, 'h')
+    campaigns_information['time_to_open'] = ((pd.to_datetime(campaigns_information['opened_at'], utc = True, format = 'ISO8601') - pd.to_datetime(campaigns_information['delivered_at'], utc = True, format = 'ISO8601')) / np.timedelta64(1, 'h'))
     campaigns_information['time_to_open'] = campaigns_information['time_to_open'].fillna((pd.to_datetime(campaigns_information['delivered_at'], utc = True, format = 'ISO8601') - pd.to_datetime(max_date, utc = True, format = 'ISO8601')) / np.timedelta64(1, 'h'))
 
     time_to_open = campaigns_information[['customer_id', 'time_to_open']].groupby('customer_id').mean().reset_index()
@@ -99,6 +99,8 @@ def customerplus_kpi_function(customerplus_clean = None):
     })
     gender_dummies = pd.get_dummies(customerplus_clean["gender_label"], prefix='gender').astype(int)
     customerplus_clean = pd.concat([customerplus_clean, gender_dummies], axis=1)
+    customerplus_clean.drop(columns = "gender", inplace = True)
+
 
     # Age
     customerplus_clean['categorie_age_0_10'] = ((customerplus_clean['age'] >= 0) & (customerplus_clean['age'] < 10)).astype(int)
@@ -111,25 +113,24 @@
     customerplus_clean['categorie_age_70_80'] = ((customerplus_clean['age'] >= 70) & (customerplus_clean['age'] < 80)).astype(int)
     customerplus_clean['categorie_age_plus_80'] = (customerplus_clean['age'] >= 80).astype(int)
     customerplus_clean['categorie_age_inconnue'] = customerplus_clean['age'].apply(lambda x: 1 if pd.isna(x) else 0)
+    customerplus_clean.drop(columns = "age", inplace = True)
 
     # Consentement au mailing
     customerplus_clean['opt_in'] = customerplus_clean['opt_in'].astype(int)
 
     # Indicatrice si individue vit en France
     customerplus_clean["country_fr"] = customerplus_clean["country"].apply(lambda x : int(x=="fr") if pd.notna(x) else np.nan)
-
+    customerplus_clean.drop(columns = "country", inplace = True)
+
     customerplus_clean['is_profession_known'] = customerplus_clean['profession'].notna().astype(int)
-
+    customerplus_clean.drop(columns = "profession", inplace = True)
+
     customerplus_clean['is_zipcode_known'] = customerplus_clean['zipcode'].notna().astype(int)
+    customerplus_clean.drop(columns = "zipcode", inplace = True)
+
-    # Dummy if the customer has a structure id (tags)
-    # customerplus_clean['has_tags'] = customerplus_clean['structure_id'].apply(lambda x: 1 if not pd.isna(x) else 0)
-
     return customerplus_clean
 
-def concatenate_names(names):
-    return ', '.join(names)
-
 def targets_KPI(df_target = None):
 
     df_target['target_name'] = df_target['target_name'].fillna('').str.lower()
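
Note: the sketch below is a minimal, self-contained illustration of the NaN-filling pattern this patch introduces in 0_2_Dataset_construction.py: the zero-filled columns are derived from df_tickets_kpi itself instead of a hard-coded list, while the purchase-date columns fall back to the maximum observable interval. The toy dataframes, the example end_features_date, and the day-offset encoding of purchase_date_min/purchase_date_max are assumptions made only for this illustration, not project data.

    import numpy as np
    import pandas as pd

    # Toy customer base and ticket KPIs (column names mirror the diff; values are made up).
    df_customer = pd.DataFrame({'customer_id': [1, 2, 3]})
    df_tickets_kpi = pd.DataFrame({
        'customer_id': [1, 2],
        'nb_tickets': [4, 1],
        'total_amount': [120.0, 15.0],
        # Assumed here to already be day offsets, since they are later filled with a day count.
        'purchase_date_min': [2.0, 30.0],
        'purchase_date_max': [1.0, 10.0],
        'time_between_purchase': [1.0, 20.0],
    })

    # Left merge keeps every known customer, as in the patched version.
    df_customer_product = pd.merge(df_customer, df_tickets_kpi, on='customer_id', how='left')

    # Zero-fill every KPI column except the identifier and the date-based columns.
    special_fill_nan = ['customer_id', 'purchase_date_min', 'purchase_date_max', 'time_between_purchase']
    simple_fill_nan = [col for col in df_tickets_kpi.columns if col not in special_fill_nan]
    df_customer_product[simple_fill_nan] = df_customer_product[simple_fill_nan].fillna(0)

    # Customers with no purchase history get the maximum observable interval (in days).
    min_date = pd.to_datetime('2021-05-01', utc=True)          # start_date from the patch
    end_features_date = pd.to_datetime('2023-11-01', utc=True)  # illustrative placeholder
    max_interval = (end_features_date - min_date) / np.timedelta64(1, 'D') + 1
    df_customer_product[['purchase_date_max', 'purchase_date_min']] = (
        df_customer_product[['purchase_date_max', 'purchase_date_min']].fillna(max_interval)
    )

    print(df_customer_product)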