From 52fd738fe512c4c1f25ca81619d59c8c86886a0b Mon Sep 17 00:00:00 2001
From: arevelle-ensae
Date: Thu, 21 Mar 2024 10:47:40 +0000
Subject: [PATCH] fix errors

---
Notes (this section is ignored when the patch is applied):

* 0_4_Generate_stat_desc.py: the maximum_price_paid() call is commented
  out and a call to the new target_proportion() is added.
* 0_5_Machine_Learning.py: 'is_email_true', 'opt_in' and the gender_*
  columns leave numeric_features; 'gender_male' and 'gender_female' join
  'opt_in' in categorical_features. 'gender_other' and 'is_email_true'
  end up in neither list.
* utils_stat_desc.py:
  - remove_outlier_total_amount() drops every customer that has at least
    one row with total_amount above Q3 + 1.5 * IQR.
  - load_files() calls it on df_tickets_kpi, then restricts the other
    per-company frames to the customer_ids that remain.
  - target_proportion() plots, per number_company, the share of customers
    with has_purchased_target_period set, then calls save_file_s3().
* Review fix: the harmonisation step was written as
      for dataset in [...]: dataset = dataset[...]
  which only rebinds the loop variable and leaves all four frames
  unfiltered. Each frame is now reassigned explicitly. Hunk offsets and
  the diffstat are adjusted for the extra lines; the index hashes are
  those of the original patch.
* This copy was rebuilt from a whitespace-flattened version of the patch:
  indentation and blank context lines are reconstructed and must be
  checked against the target files before applying.

 0_4_Generate_stat_desc.py |  4 +++-
 0_5_Machine_Learning.py   |  8 ++++----
 utils_stat_desc.py        | 35 ++++++++++++++++++++++++++++++++++-
 3 files changed, 41 insertions(+), 6 deletions(-)

diff --git a/0_4_Generate_stat_desc.py b/0_4_Generate_stat_desc.py
index c0821e0..160e568 100644
--- a/0_4_Generate_stat_desc.py
+++ b/0_4_Generate_stat_desc.py
@@ -47,7 +47,9 @@ customer['has_purchased_target_period'] = np.where(customer['customer_id'].isin(
 # Generate graph and automatically saved them in the bucket
 compute_nb_clients(customer, type_of_activity)
 
-maximum_price_paid(customer, type_of_activity)
+#maximum_price_paid(customer, type_of_activity)
+
+target_proportion(customer, type_of_activity)
 
 mailing_consent(customer, type_of_activity)
 
diff --git a/0_5_Machine_Learning.py b/0_5_Machine_Learning.py
index f6c162b..4e43afd 100644
--- a/0_5_Machine_Learning.py
+++ b/0_5_Machine_Learning.py
@@ -55,16 +55,16 @@ weights = class_weight.compute_class_weight(class_weight = 'balanced', classes =
 
 weight_dict = {np.unique(y_train['y_has_purchased'])[i]: weights[i] for i in range(len(np.unique(y_train['y_has_purchased'])))}
 
-numeric_features = ['nb_tickets', 'nb_purchases', 'total_amount', 'nb_suppliers', 'vente_internet_max', 'purchase_date_min', 'purchase_date_max',
-                    'time_between_purchase', 'nb_tickets_internet', 'is_email_true', 'opt_in', #'is_partner',
-                    'gender_female', 'gender_male', 'gender_other', 'nb_campaigns', 'nb_campaigns_opened']
+numeric_features = ['nb_tickets', 'nb_purchases', 'total_amount', 'nb_suppliers', 'vente_internet_max',
+                    'purchase_date_min', 'purchase_date_max', 'time_between_purchase', 'nb_tickets_internet',
+                    'nb_campaigns', 'nb_campaigns_opened']
 
 numeric_transformer = Pipeline(steps=[
     #("imputer", SimpleImputer(strategy="mean")),
     ("scaler", StandardScaler())
 ])
 
-categorical_features = ['opt_in']
+categorical_features = ['opt_in', 'gender_male', 'gender_female']
 
 # Transformer for the categorical features
 categorical_transformer = Pipeline(steps=[
diff --git a/utils_stat_desc.py b/utils_stat_desc.py
index 6372c63..469569a 100644
--- a/utils_stat_desc.py
+++ b/utils_stat_desc.py
@@ -42,7 +42,17 @@ def load_files(nb_compagnie):
         df_campaigns_kpi["customer_id"]= directory_path + '_' + df_campaigns_kpi['customer_id'].astype('str')
         df_customerplus_clean["customer_id"]= directory_path + '_' + df_customerplus_clean['customer_id'].astype('str')
         df_products_purchased_reduced["customer_id"]= directory_path + '_' + df_products_purchased_reduced['customer_id'].astype('str')
-        
+
+        # Remove companies' outliers
+        df_tickets_kpi = remove_outlier_total_amount(df_tickets_kpi)
+        # harmonize set of customers across databases; every frame is reassigned
+        # explicitly because rebinding a loop variable would filter nothing
+        customer_id = df_tickets_kpi['customer_id'].to_list()
+        df_campaigns_brut = df_campaigns_brut[df_campaigns_brut['customer_id'].isin(customer_id)]
+        df_campaigns_kpi = df_campaigns_kpi[df_campaigns_kpi['customer_id'].isin(customer_id)]
+        df_customerplus_clean = df_customerplus_clean[df_customerplus_clean['customer_id'].isin(customer_id)]
+        df_target_information = df_target_information[df_target_information['customer_id'].isin(customer_id)]
+
         # Concaténation
         customer = pd.concat([customer, df_customerplus_clean], ignore_index=True)
         campaigns_kpi = pd.concat([campaigns_kpi, df_campaigns_kpi], ignore_index=True)
@@ -53,6 +63,16 @@ def load_files(nb_compagnie):
     return customer, campaigns_kpi, campaigns_brut, tickets, products
 
 
+def remove_outlier_total_amount(tickets):
+    Q1 = tickets['total_amount'].quantile(0.25)
+    Q3 = tickets['total_amount'].quantile(0.75)
+    IQR = Q3 - Q1
+    upper = Q3 +1.5*IQR
+    outliers = tickets[tickets['total_amount'] > upper]['customer_id'].to_list()
+    tickets = tickets[~tickets['customer_id'].isin(outliers)]
+    return tickets
+
+
 def save_file_s3(File_name, type_of_activity):
     image_buffer = io.BytesIO()
     plt.savefig(image_buffer, format='png')
@@ -140,6 +160,19 @@ def maximum_price_paid(customer, type_of_activity):
     save_file_s3("Maximal_price_", type_of_activity)
 
 
+def target_proportion(customer, type_of_activity):
+    df_y = customer.groupby(["number_company"]).agg({"has_purchased_target_period" : 'sum',
+                                                     'customer_id' : 'nunique'}).reset_index()
+    df_y['prop_has_purchased_target_period'] = (df_y["has_purchased_target_period"]/df_y['customer_id'])*100
+    plt.bar(df_y["number_company"], df_y["prop_has_purchased_target_period"])
+    plt.xlabel('Company Number')
+    plt.ylabel('Share (%)')
+    plt.title(f'Share of Customers who Bought during the Target Period Across {type_of_activity} Companies')
+    plt.xticks(df_y["number_company"], ["{}".format(i) for i in df_y["number_company"]])
+    plt.show()
+    save_file_s3("share_target_", type_of_activity)
+
+
 def mailing_consent(customer, type_of_activity):
     mailing_consent = customer.groupby("number_company")["opt_in"].mean().reset_index()
     mailing_consent["opt_in"] *= 100