From 52fd738fe512c4c1f25ca81619d59c8c86886a0b Mon Sep 17 00:00:00 2001
From: arevelle-ensae <alexis.revelle@ensae.fr>
Date: Thu, 21 Mar 2024 10:47:40 +0000
Subject: [PATCH] fix errors

---
 0_4_Generate_stat_desc.py |  4 +++-
 0_5_Machine_Learning.py   |  8 ++++----
 utils_stat_desc.py        | 32 +++++++++++++++++++++++++++++++-
 3 files changed, 38 insertions(+), 6 deletions(-)

diff --git a/0_4_Generate_stat_desc.py b/0_4_Generate_stat_desc.py
index c0821e0..160e568 100644
--- a/0_4_Generate_stat_desc.py
+++ b/0_4_Generate_stat_desc.py
@@ -47,7 +47,9 @@ customer['has_purchased_target_period'] = np.where(customer['customer_id'].isin(
 # Generate graph and automatically saved them in the bucket
 compute_nb_clients(customer, type_of_activity)
 
-maximum_price_paid(customer, type_of_activity)
+#maximum_price_paid(customer, type_of_activity)
+
+target_proportion(customer, type_of_activity)
 
 mailing_consent(customer, type_of_activity)
 
diff --git a/0_5_Machine_Learning.py b/0_5_Machine_Learning.py
index f6c162b..4e43afd 100644
--- a/0_5_Machine_Learning.py
+++ b/0_5_Machine_Learning.py
@@ -55,16 +55,16 @@ weights = class_weight.compute_class_weight(class_weight = 'balanced', classes =
 weight_dict = {np.unique(y_train['y_has_purchased'])[i]: weights[i] for i in range(len(np.unique(y_train['y_has_purchased'])))}
 
 
-numeric_features = ['nb_tickets', 'nb_purchases', 'total_amount', 'nb_suppliers', 'vente_internet_max', 'purchase_date_min', 'purchase_date_max', 
-            'time_between_purchase', 'nb_tickets_internet', 'is_email_true', 'opt_in', #'is_partner',
-            'gender_female', 'gender_male', 'gender_other', 'nb_campaigns', 'nb_campaigns_opened']
+numeric_features = ['nb_tickets', 'nb_purchases', 'total_amount', 'nb_suppliers', 'vente_internet_max', 
+                    'purchase_date_min', 'purchase_date_max', 'time_between_purchase', 'nb_tickets_internet',
+                    'nb_campaigns', 'nb_campaigns_opened']
 
 numeric_transformer = Pipeline(steps=[
     #("imputer", SimpleImputer(strategy="mean")),  
     ("scaler", StandardScaler()) 
 ])
 
-categorical_features = ['opt_in']  
+categorical_features = ['opt_in', 'gender_male', 'gender_female']  
 
 # Transformer for the categorical features
 categorical_transformer = Pipeline(steps=[
diff --git a/utils_stat_desc.py b/utils_stat_desc.py
index 6372c63..469569a 100644
--- a/utils_stat_desc.py
+++ b/utils_stat_desc.py
@@ -42,7 +42,14 @@ def load_files(nb_compagnie):
         df_campaigns_kpi["customer_id"]= directory_path + '_' +  df_campaigns_kpi['customer_id'].astype('str') 
         df_customerplus_clean["customer_id"]= directory_path + '_' +  df_customerplus_clean['customer_id'].astype('str') 
         df_products_purchased_reduced["customer_id"]= directory_path + '_' +  df_products_purchased_reduced['customer_id'].astype('str') 
-    
+        
+        # Remove the company's outliers, then keep only the remaining customers in the other
+        # tables. Each name is rebound explicitly: assigning to a loop variable filters nothing.
+        df_tickets_kpi = remove_outlier_total_amount(df_tickets_kpi)
+        harmonized = (df_campaigns_brut, df_campaigns_kpi, df_customerplus_clean, df_target_information)
+        harmonized = [df[df['customer_id'].isin(df_tickets_kpi['customer_id'])] for df in harmonized]
+        df_campaigns_brut, df_campaigns_kpi, df_customerplus_clean, df_target_information = harmonized
+        
     # Concaténation
         customer = pd.concat([customer, df_customerplus_clean], ignore_index=True)
         campaigns_kpi = pd.concat([campaigns_kpi, df_campaigns_kpi], ignore_index=True)
@@ -53,6 +60,16 @@ def load_files(nb_compagnie):
     return customer, campaigns_kpi, campaigns_brut, tickets, products
 
 
+def remove_outlier_total_amount(tickets):
+    """Return tickets without any customer having a total_amount above Q3 + 1.5 * IQR."""
+    amounts = tickets['total_amount']
+    first_quartile, third_quartile = amounts.quantile(0.25), amounts.quantile(0.75)
+    upper_fence = third_quartile + 1.5 * (third_quartile - first_quartile)
+    # One row above the fence is enough to exclude every row of that customer.
+    flagged_ids = tickets.loc[amounts > upper_fence, 'customer_id'].to_list()
+    return tickets[~tickets['customer_id'].isin(flagged_ids)]
+    
+
 def save_file_s3(File_name, type_of_activity):
     image_buffer = io.BytesIO()
     plt.savefig(image_buffer, format='png')
@@ -140,6 +157,19 @@ def maximum_price_paid(customer, type_of_activity):
     save_file_s3("Maximal_price_", type_of_activity)
 
 
+def target_proportion(customer, type_of_activity):
+    """Bar-plot, per company, 100 * sum(has_purchased_target_period) / distinct customer_id, then save it."""
+    stats = customer.groupby(["number_company"]).agg(
+        {"has_purchased_target_period": 'sum', 'customer_id': 'nunique'}).reset_index()
+    plt.bar(stats["number_company"], (stats["has_purchased_target_period"] / stats['customer_id']) * 100)
+    plt.xlabel('Company Number')
+    plt.ylabel('Share (%)')
+    plt.title(f'Share of Customers who Bought during the Target Period Across {type_of_activity} Companies')
+    plt.xticks(stats["number_company"], [f"{company}" for company in stats["number_company"]])
+    plt.show()  # NOTE(review): the figure is saved after show(); confirm the backend has not cleared it
+    save_file_s3("share_target_", type_of_activity)
+
+
 def mailing_consent(customer, type_of_activity):
     mailing_consent = customer.groupby("number_company")["opt_in"].mean().reset_index()
     mailing_consent["opt_in"] *= 100