Nouveaux datasets

This commit is contained in:
Antoine JOUBREL 2024-03-24 19:01:29 +00:00
parent f5f993aba0
commit c86c43cc7e
2 changed files with 28 additions and 20 deletions

View File

@@ -37,11 +37,11 @@ def dataset_construction(min_date, end_features_date, max_date, directory_path):
min_date = pd.to_datetime(min_date, utc = True, format = 'ISO8601')
# Filter for database df_campaigns_information
df_campaigns_information = df_campaigns_information[(df_campaigns_information['sent_at'] <= end_features_date) & (df_campaigns_information['sent_at'] >= min_date)]
df_campaigns_information = df_campaigns_information[(df_campaigns_information['sent_at'] < end_features_date) & (df_campaigns_information['sent_at'] >= min_date)]
df_campaigns_information['opened_at'][df_campaigns_information['opened_at'] >= end_features_date] = np.datetime64('NaT')
# Filter for database df_products_purchased_reduced
df_products_purchased_features = df_products_purchased_reduced[(df_products_purchased_reduced['purchase_date'] <= end_features_date) & (df_products_purchased_reduced['purchase_date'] >= min_date)]
df_products_purchased_features = df_products_purchased_reduced[(df_products_purchased_reduced['purchase_date'] < end_features_date) & (df_products_purchased_reduced['purchase_date'] >= min_date)]
print("Data filtering : SUCCESS")
@@ -65,22 +65,29 @@ def dataset_construction(min_date, end_features_date, max_date, directory_path):
df_customer = pd.merge(df_customerplus_clean, df_campaigns_kpi, on = 'customer_id', how = 'left')
# Fill NaN values
df_customer[['nb_campaigns', 'nb_campaigns_opened']] = df_customer[['nb_campaigns', 'nb_campaigns_opened']].fillna(0)
df_customer[['nb_campaigns', 'nb_campaigns_opened', 'taux_ouverture_mail']] = df_customer[['nb_campaigns', 'nb_campaigns_opened', 'taux_ouverture_mail']].fillna(0)
df_customer['time_to_open'] = df_customer['time_to_open'].fillna(df_customer['time_to_open'].mean())
# Merge - targets features
df_customer = pd.merge(df_customer, df_targets_kpi, on = 'customer_id', how = 'left')
# Fill NaN values
df_customer[['nb_targets', 'target_jeune', 'target_optin', 'target_optout', 'target_scolaire', 'target_entreprise', 'target_famille', 'target_newsletter', 'target_abonne']] = df_customer[['nb_targets', 'target_jeune', 'target_optin', 'target_optout', 'target_scolaire', 'target_entreprise', 'target_famille', 'target_newsletter', 'target_abonne']].fillna(0)
targets_columns = list(df_targets_kpi.columns)
targets_columns.remove('customer_id')
df_customer[targets_columns] = df_customer[targets_columns].fillna(0)
# We standardise the number of targets closely linked to the company's operations
df_customer['nb_targets'] = (df_customer['nb_targets'] - (df_customer['nb_targets'].mean())) / (df_customer['nb_targets'].std())
# Merge - purchasing behavior features
df_customer_product = pd.merge(df_customer, df_tickets_kpi, on = 'customer_id', how = 'outer')
df_customer_product = pd.merge(df_customer, df_tickets_kpi, on = 'customer_id', how = 'left')
# Fill NaN values
df_customer_product[['nb_tickets', 'nb_purchases', 'total_amount', 'nb_suppliers', 'vente_internet_max', 'nb_tickets_internet']] = df_customer_product[['nb_tickets', 'nb_purchases', 'total_amount', 'nb_suppliers', 'vente_internet_max', 'nb_tickets_internet']].fillna(0)
special_fill_nan = ['customer_id', 'purchase_date_min', 'purchase_date_max', 'time_between_purchase']
simple_fill_nan = [column for column in list(df_tickets_kpi.columns) if column not in special_fill_nan]
df_customer_product[simple_fill_nan] = df_customer_product[simple_fill_nan].fillna(0)
max_interval = (end_features_date - min_date) / np.timedelta64(1, 'D') + 1
df_customer_product[['purchase_date_max', 'purchase_date_min']] = df_customer_product[['purchase_date_max', 'purchase_date_min']].fillna(max_interval)
@@ -92,7 +99,7 @@ def dataset_construction(min_date, end_features_date, max_date, directory_path):
print("Explanatory variable construction : SUCCESS")
# 2. Construction of the explained variable
df_products_purchased_to_predict = df_products_purchased_reduced[(df_products_purchased_reduced['purchase_date'] <= max_date) & (df_products_purchased_reduced['purchase_date'] > end_features_date)]
df_products_purchased_to_predict = df_products_purchased_reduced[(df_products_purchased_reduced['purchase_date'] < max_date) & (df_products_purchased_reduced['purchase_date'] >= end_features_date)]
# Construction of the dependant variable
df_products_purchased_to_predict['y_has_purchased'] = 1
@@ -123,7 +130,7 @@ type_of_comp = input('Choisissez le type de compagnie : sport ? musique ? musee
list_of_comp = companies[type_of_comp]
# Export folder
BUCKET_OUT = f'projet-bdc2324-team1/Generalization/{type_of_comp}'
BUCKET_OUT = f'projet-bdc2324-team1/Generalization_v2/{type_of_comp}'
# Dates used for the construction of features and the dependant variable
start_date = "2021-05-01"

View File

@@ -13,14 +13,14 @@ def display_input_databases(directory_path, file_name, datetime_col = None):
df = pd.read_csv(file_in, sep=",", parse_dates = datetime_col, date_parser=custom_date_parser)
return df
def campaigns_kpi_function(campaigns_information = None, max_date = None):
def campaigns_kpi_function(campaigns_information = None, max_date = "2023-12-01"):
# Nombre de campagnes de mails
nb_campaigns = campaigns_information[['customer_id', 'campaign_name']].groupby('customer_id').count().reset_index()
nb_campaigns.rename(columns = {'campaign_name' : 'nb_campaigns'}, inplace = True)
# Temps d'ouverture moyen (en minutes)
campaigns_information['time_to_open'] = (pd.to_datetime(campaigns_information['opened_at'], utc = True, format = 'ISO8601') - pd.to_datetime(campaigns_information['delivered_at'], utc = True, format = 'ISO8601')) / np.timedelta64(1, 'h')
campaigns_information['time_to_open'] = ((pd.to_datetime(campaigns_information['opened_at'], utc = True, format = 'ISO8601') - pd.to_datetime(campaigns_information['delivered_at'], utc = True, format = 'ISO8601')) / np.timedelta64(1, 'h'))
campaigns_information['time_to_open'] = campaigns_information['time_to_open'].fillna((pd.to_datetime(campaigns_information['delivered_at'], utc = True, format = 'ISO8601') - pd.to_datetime(max_date, utc = True, format = 'ISO8601')) / np.timedelta64(1, 'h'))
time_to_open = campaigns_information[['customer_id', 'time_to_open']].groupby('customer_id').mean().reset_index()
@@ -99,6 +99,8 @@ def customerplus_kpi_function(customerplus_clean = None):
})
gender_dummies = pd.get_dummies(customerplus_clean["gender_label"], prefix='gender').astype(int)
customerplus_clean = pd.concat([customerplus_clean, gender_dummies], axis=1)
customerplus_clean.drop(columns = "gender", inplace = True)
# Age
customerplus_clean['categorie_age_0_10'] = ((customerplus_clean['age'] >= 0) & (customerplus_clean['age'] < 10)).astype(int)
@@ -111,25 +113,24 @@ def customerplus_kpi_function(customerplus_clean = None):
customerplus_clean['categorie_age_70_80'] = ((customerplus_clean['age'] >= 70) & (customerplus_clean['age'] < 80)).astype(int)
customerplus_clean['categorie_age_plus_80'] = (customerplus_clean['age'] >= 80).astype(int)
customerplus_clean['categorie_age_inconnue'] = customerplus_clean['age'].apply(lambda x: 1 if pd.isna(x) else 0)
customerplus_clean.drop(columns = "age", inplace = True)
# Consentement au mailing
customerplus_clean['opt_in'] = customerplus_clean['opt_in'].astype(int)
# Indicatrice si individue vit en France
customerplus_clean["country_fr"] = customerplus_clean["country"].apply(lambda x : int(x=="fr") if pd.notna(x) else np.nan)
customerplus_clean.drop(columns = "country", inplace = True)
customerplus_clean['is_profession_known'] = customerplus_clean['profession'].notna().astype(int)
customerplus_clean.drop(columns = "profession", inplace = True)
customerplus_clean['is_zipcode_known'] = customerplus_clean['zipcode'].notna().astype(int)
customerplus_clean.drop(columns = "zipcode", inplace = True)
# Dummy if the customer has a structure id (tags)
# customerplus_clean['has_tags'] = customerplus_clean['structure_id'].apply(lambda x: 1 if not pd.isna(x) else 0)
return customerplus_clean
def concatenate_names(names):
return ', '.join(names)
def targets_KPI(df_target = None):
df_target['target_name'] = df_target['target_name'].fillna('').str.lower()