Nouveaux datasets
This commit is contained in:
parent
f5f993aba0
commit
c86c43cc7e
|
@ -37,11 +37,11 @@ def dataset_construction(min_date, end_features_date, max_date, directory_path):
|
||||||
min_date = pd.to_datetime(min_date, utc = True, format = 'ISO8601')
|
min_date = pd.to_datetime(min_date, utc = True, format = 'ISO8601')
|
||||||
|
|
||||||
# Filter for database df_campaigns_information
|
# Filter for database df_campaigns_information
|
||||||
df_campaigns_information = df_campaigns_information[(df_campaigns_information['sent_at'] <= end_features_date) & (df_campaigns_information['sent_at'] >= min_date)]
|
df_campaigns_information = df_campaigns_information[(df_campaigns_information['sent_at'] < end_features_date) & (df_campaigns_information['sent_at'] >= min_date)]
|
||||||
df_campaigns_information['opened_at'][df_campaigns_information['opened_at'] >= end_features_date] = np.datetime64('NaT')
|
df_campaigns_information['opened_at'][df_campaigns_information['opened_at'] >= end_features_date] = np.datetime64('NaT')
|
||||||
|
|
||||||
# Filter for database df_products_purchased_reduced
|
# Filter for database df_products_purchased_reduced
|
||||||
df_products_purchased_features = df_products_purchased_reduced[(df_products_purchased_reduced['purchase_date'] <= end_features_date) & (df_products_purchased_reduced['purchase_date'] >= min_date)]
|
df_products_purchased_features = df_products_purchased_reduced[(df_products_purchased_reduced['purchase_date'] < end_features_date) & (df_products_purchased_reduced['purchase_date'] >= min_date)]
|
||||||
|
|
||||||
print("Data filtering : SUCCESS")
|
print("Data filtering : SUCCESS")
|
||||||
|
|
||||||
|
@ -65,22 +65,29 @@ def dataset_construction(min_date, end_features_date, max_date, directory_path):
|
||||||
df_customer = pd.merge(df_customerplus_clean, df_campaigns_kpi, on = 'customer_id', how = 'left')
|
df_customer = pd.merge(df_customerplus_clean, df_campaigns_kpi, on = 'customer_id', how = 'left')
|
||||||
|
|
||||||
# Fill NaN values
|
# Fill NaN values
|
||||||
df_customer[['nb_campaigns', 'nb_campaigns_opened']] = df_customer[['nb_campaigns', 'nb_campaigns_opened']].fillna(0)
|
df_customer[['nb_campaigns', 'nb_campaigns_opened', 'taux_ouverture_mail']] = df_customer[['nb_campaigns', 'nb_campaigns_opened', 'taux_ouverture_mail']].fillna(0)
|
||||||
|
df_customer['time_to_open'] = df_customer['time_to_open'].fillna(df_customer['time_to_open'].mean())
|
||||||
|
|
||||||
# Merge - targets features
|
# Merge - targets features
|
||||||
df_customer = pd.merge(df_customer, df_targets_kpi, on = 'customer_id', how = 'left')
|
df_customer = pd.merge(df_customer, df_targets_kpi, on = 'customer_id', how = 'left')
|
||||||
|
|
||||||
# Fill NaN values
|
# Fill NaN values
|
||||||
df_customer[['nb_targets', 'target_jeune', 'target_optin', 'target_optout', 'target_scolaire', 'target_entreprise', 'target_famille', 'target_newsletter', 'target_abonne']] = df_customer[['nb_targets', 'target_jeune', 'target_optin', 'target_optout', 'target_scolaire', 'target_entreprise', 'target_famille', 'target_newsletter', 'target_abonne']].fillna(0)
|
targets_columns = list(df_targets_kpi.columns)
|
||||||
|
targets_columns.remove('customer_id')
|
||||||
|
|
||||||
|
df_customer[targets_columns] = df_customer[targets_columns].fillna(0)
|
||||||
|
|
||||||
# We standardise the number of targets closely linked to the company's operations
|
# We standardise the number of targets closely linked to the company's operations
|
||||||
df_customer['nb_targets'] = (df_customer['nb_targets'] - (df_customer['nb_targets'].mean())) / (df_customer['nb_targets'].std())
|
df_customer['nb_targets'] = (df_customer['nb_targets'] - (df_customer['nb_targets'].mean())) / (df_customer['nb_targets'].std())
|
||||||
|
|
||||||
# Merge - purchasing behavior features
|
# Merge - purchasing behavior features
|
||||||
df_customer_product = pd.merge(df_customer, df_tickets_kpi, on = 'customer_id', how = 'outer')
|
df_customer_product = pd.merge(df_customer, df_tickets_kpi, on = 'customer_id', how = 'left')
|
||||||
|
|
||||||
# Fill NaN values
|
# Fill NaN values
|
||||||
df_customer_product[['nb_tickets', 'nb_purchases', 'total_amount', 'nb_suppliers', 'vente_internet_max', 'nb_tickets_internet']] = df_customer_product[['nb_tickets', 'nb_purchases', 'total_amount', 'nb_suppliers', 'vente_internet_max', 'nb_tickets_internet']].fillna(0)
|
special_fill_nan = ['customer_id', 'purchase_date_min', 'purchase_date_max', 'time_between_purchase']
|
||||||
|
simple_fill_nan = [column for column in list(df_tickets_kpi.columns) if column not in special_fill_nan]
|
||||||
|
|
||||||
|
df_customer_product[simple_fill_nan] = df_customer_product[simple_fill_nan].fillna(0)
|
||||||
|
|
||||||
max_interval = (end_features_date - min_date) / np.timedelta64(1, 'D') + 1
|
max_interval = (end_features_date - min_date) / np.timedelta64(1, 'D') + 1
|
||||||
df_customer_product[['purchase_date_max', 'purchase_date_min']] = df_customer_product[['purchase_date_max', 'purchase_date_min']].fillna(max_interval)
|
df_customer_product[['purchase_date_max', 'purchase_date_min']] = df_customer_product[['purchase_date_max', 'purchase_date_min']].fillna(max_interval)
|
||||||
|
@ -92,7 +99,7 @@ def dataset_construction(min_date, end_features_date, max_date, directory_path):
|
||||||
print("Explanatory variable construction : SUCCESS")
|
print("Explanatory variable construction : SUCCESS")
|
||||||
|
|
||||||
# 2. Construction of the explained variable
|
# 2. Construction of the explained variable
|
||||||
df_products_purchased_to_predict = df_products_purchased_reduced[(df_products_purchased_reduced['purchase_date'] <= max_date) & (df_products_purchased_reduced['purchase_date'] > end_features_date)]
|
df_products_purchased_to_predict = df_products_purchased_reduced[(df_products_purchased_reduced['purchase_date'] < max_date) & (df_products_purchased_reduced['purchase_date'] >= end_features_date)]
|
||||||
|
|
||||||
# Construction of the dependent variable
|
# Construction of the dependent variable
|
||||||
df_products_purchased_to_predict['y_has_purchased'] = 1
|
df_products_purchased_to_predict['y_has_purchased'] = 1
|
||||||
|
@ -123,7 +130,7 @@ type_of_comp = input('Choisissez le type de compagnie : sport ? musique ? musee
|
||||||
list_of_comp = companies[type_of_comp]
|
list_of_comp = companies[type_of_comp]
|
||||||
|
|
||||||
# Export folder
|
# Export folder
|
||||||
BUCKET_OUT = f'projet-bdc2324-team1/Generalization/{type_of_comp}'
|
BUCKET_OUT = f'projet-bdc2324-team1/Generalization_v2/{type_of_comp}'
|
||||||
|
|
||||||
# Dates used for the construction of features and the dependent variable
|
# Dates used for the construction of features and the dependent variable
|
||||||
start_date = "2021-05-01"
|
start_date = "2021-05-01"
|
||||||
|
|
|
@ -13,14 +13,14 @@ def display_input_databases(directory_path, file_name, datetime_col = None):
|
||||||
df = pd.read_csv(file_in, sep=",", parse_dates = datetime_col, date_parser=custom_date_parser)
|
df = pd.read_csv(file_in, sep=",", parse_dates = datetime_col, date_parser=custom_date_parser)
|
||||||
return df
|
return df
|
||||||
|
|
||||||
def campaigns_kpi_function(campaigns_information = None, max_date = None):
|
def campaigns_kpi_function(campaigns_information = None, max_date = "2023-12-01"):
|
||||||
|
|
||||||
# Nombre de campagnes de mails
|
# Nombre de campagnes de mails
|
||||||
nb_campaigns = campaigns_information[['customer_id', 'campaign_name']].groupby('customer_id').count().reset_index()
|
nb_campaigns = campaigns_information[['customer_id', 'campaign_name']].groupby('customer_id').count().reset_index()
|
||||||
nb_campaigns.rename(columns = {'campaign_name' : 'nb_campaigns'}, inplace = True)
|
nb_campaigns.rename(columns = {'campaign_name' : 'nb_campaigns'}, inplace = True)
|
||||||
|
|
||||||
# Temps d'ouverture moyen (en heures)
|
# Temps d'ouverture moyen (en heures)
|
||||||
campaigns_information['time_to_open'] = (pd.to_datetime(campaigns_information['opened_at'], utc = True, format = 'ISO8601') - pd.to_datetime(campaigns_information['delivered_at'], utc = True, format = 'ISO8601')) / np.timedelta64(1, 'h')
|
campaigns_information['time_to_open'] = ((pd.to_datetime(campaigns_information['opened_at'], utc = True, format = 'ISO8601') - pd.to_datetime(campaigns_information['delivered_at'], utc = True, format = 'ISO8601')) / np.timedelta64(1, 'h'))
|
||||||
campaigns_information['time_to_open'] = campaigns_information['time_to_open'].fillna((pd.to_datetime(campaigns_information['delivered_at'], utc = True, format = 'ISO8601') - pd.to_datetime(max_date, utc = True, format = 'ISO8601')) / np.timedelta64(1, 'h'))
|
campaigns_information['time_to_open'] = campaigns_information['time_to_open'].fillna((pd.to_datetime(campaigns_information['delivered_at'], utc = True, format = 'ISO8601') - pd.to_datetime(max_date, utc = True, format = 'ISO8601')) / np.timedelta64(1, 'h'))
|
||||||
|
|
||||||
time_to_open = campaigns_information[['customer_id', 'time_to_open']].groupby('customer_id').mean().reset_index()
|
time_to_open = campaigns_information[['customer_id', 'time_to_open']].groupby('customer_id').mean().reset_index()
|
||||||
|
@ -99,6 +99,8 @@ def customerplus_kpi_function(customerplus_clean = None):
|
||||||
})
|
})
|
||||||
gender_dummies = pd.get_dummies(customerplus_clean["gender_label"], prefix='gender').astype(int)
|
gender_dummies = pd.get_dummies(customerplus_clean["gender_label"], prefix='gender').astype(int)
|
||||||
customerplus_clean = pd.concat([customerplus_clean, gender_dummies], axis=1)
|
customerplus_clean = pd.concat([customerplus_clean, gender_dummies], axis=1)
|
||||||
|
customerplus_clean.drop(columns = "gender", inplace = True)
|
||||||
|
|
||||||
|
|
||||||
# Age
|
# Age
|
||||||
customerplus_clean['categorie_age_0_10'] = ((customerplus_clean['age'] >= 0) & (customerplus_clean['age'] < 10)).astype(int)
|
customerplus_clean['categorie_age_0_10'] = ((customerplus_clean['age'] >= 0) & (customerplus_clean['age'] < 10)).astype(int)
|
||||||
|
@ -111,25 +113,24 @@ def customerplus_kpi_function(customerplus_clean = None):
|
||||||
customerplus_clean['categorie_age_70_80'] = ((customerplus_clean['age'] >= 70) & (customerplus_clean['age'] < 80)).astype(int)
|
customerplus_clean['categorie_age_70_80'] = ((customerplus_clean['age'] >= 70) & (customerplus_clean['age'] < 80)).astype(int)
|
||||||
customerplus_clean['categorie_age_plus_80'] = (customerplus_clean['age'] >= 80).astype(int)
|
customerplus_clean['categorie_age_plus_80'] = (customerplus_clean['age'] >= 80).astype(int)
|
||||||
customerplus_clean['categorie_age_inconnue'] = customerplus_clean['age'].apply(lambda x: 1 if pd.isna(x) else 0)
|
customerplus_clean['categorie_age_inconnue'] = customerplus_clean['age'].apply(lambda x: 1 if pd.isna(x) else 0)
|
||||||
|
customerplus_clean.drop(columns = "age", inplace = True)
|
||||||
|
|
||||||
# Consentement au mailing
|
# Consentement au mailing
|
||||||
customerplus_clean['opt_in'] = customerplus_clean['opt_in'].astype(int)
|
customerplus_clean['opt_in'] = customerplus_clean['opt_in'].astype(int)
|
||||||
|
|
||||||
# Indicatrice si l'individu vit en France
|
# Indicatrice si l'individu vit en France
|
||||||
customerplus_clean["country_fr"] = customerplus_clean["country"].apply(lambda x : int(x=="fr") if pd.notna(x) else np.nan)
|
customerplus_clean["country_fr"] = customerplus_clean["country"].apply(lambda x : int(x=="fr") if pd.notna(x) else np.nan)
|
||||||
|
customerplus_clean.drop(columns = "country", inplace = True)
|
||||||
|
|
||||||
customerplus_clean['is_profession_known'] = customerplus_clean['profession'].notna().astype(int)
|
customerplus_clean['is_profession_known'] = customerplus_clean['profession'].notna().astype(int)
|
||||||
|
customerplus_clean.drop(columns = "profession", inplace = True)
|
||||||
|
|
||||||
customerplus_clean['is_zipcode_known'] = customerplus_clean['zipcode'].notna().astype(int)
|
customerplus_clean['is_zipcode_known'] = customerplus_clean['zipcode'].notna().astype(int)
|
||||||
|
customerplus_clean.drop(columns = "zipcode", inplace = True)
|
||||||
|
|
||||||
|
|
||||||
# Dummy if the customer has a structure id (tags)
|
|
||||||
# customerplus_clean['has_tags'] = customerplus_clean['structure_id'].apply(lambda x: 1 if not pd.isna(x) else 0)
|
|
||||||
|
|
||||||
return customerplus_clean
|
return customerplus_clean
|
||||||
|
|
||||||
def concatenate_names(names):
|
|
||||||
return ', '.join(names)
|
|
||||||
|
|
||||||
def targets_KPI(df_target = None):
|
def targets_KPI(df_target = None):
|
||||||
|
|
||||||
df_target['target_name'] = df_target['target_name'].fillna('').str.lower()
|
df_target['target_name'] = df_target['target_name'].fillna('').str.lower()
|
||||||
|
|
Loading…
Reference in New Issue
Block a user