From 23981e3cbcf858c7d0271a1baf004f238f1ac3b0 Mon Sep 17 00:00:00 2001
From: ajoubrel-ensae
Date: Tue, 27 Feb 2024 21:01:20 +0000
Subject: [PATCH] Modify the products purchased part: add start and end dates
 and the open flag; clean company 101's tickets_1 table

---
 0_1_Input_cleaning.py             |  44 +++++----
 0_2_Dataset_construction.py       |   2 +-
 0_Cleaning_and_merge_functions.py | 143 ++++++++++++++++++------------
 3 files changed, 112 insertions(+), 77 deletions(-)

diff --git a/0_1_Input_cleaning.py b/0_1_Input_cleaning.py
index 814698b..ec7eeeb 100644
--- a/0_1_Input_cleaning.py
+++ b/0_1_Input_cleaning.py
@@ -30,33 +30,43 @@ def export_dataset(df, output_name):
     df.to_csv(file_out, index = False)
 
 ## 1 - Cleaning of the datasets
-for tenant_id in ("101"): #"1", "2", "3", "4", "5", "6", "7", "8", "9", "10", "11", "12", "13", "14",
+for tenant_id in ("1", "2", "3", "4", "5", "6", "7", "8", "9", "10", "11", "12", "13", "14", "101"):
+
     # Timer
     start = time.time()
 
-    # Cleaning customerplus
-    df1_customerplus_clean = preprocessing_customerplus(directory_path = tenant_id)
+    # # Cleaning customerplus
+    # df1_customerplus_clean = preprocessing_customerplus(directory_path = tenant_id)
 
-    ## Exportation
-    export_dataset(df = df1_customerplus_clean, output_name = "0_Input/Company_"+ tenant_id +"/customerplus_cleaned.csv")
+    # ## Exportation
+    # export_dataset(df = df1_customerplus_clean, output_name = "0_Input/Company_"+ tenant_id +"/customerplus_cleaned.csv")
 
-    # Cleaning target area
-    df1_target_information = preprocessing_target_area(directory_path = tenant_id)
-    ## Exportation
-    export_dataset(df = df1_target_information, output_name = "0_Input/Company_"+ tenant_id +"/target_information.csv")
+    # # Cleaning target area
+    # df1_target_information = preprocessing_target_area(directory_path = tenant_id)
+    # ## Exportation
+    # export_dataset(df = df1_target_information, output_name = "0_Input/Company_"+ tenant_id +"/target_information.csv")
 
-    # Cleaning campaign area
-    df1_campaigns_information = preprocessing_campaigns_area(directory_path = tenant_id)
-    ## Exportation
-    export_dataset(df = df1_campaigns_information, output_name = "0_Input/Company_"+ tenant_id +"/campaigns_information.csv")
+    # # Cleaning campaign area
+    # df1_campaigns_information = preprocessing_campaigns_area(directory_path = tenant_id)
+    # ## Exportation
+    # export_dataset(df = df1_campaigns_information, output_name = "0_Input/Company_"+ tenant_id +"/campaigns_information.csv")
 
     ## Exportation
     # export_dataset(df = df1_campaigns_information, output_name = "1_Temp/Company 1 - Campaigns dataset clean.csv")
+
+    if tenant_id == "101":
+        # Cleaning product area
+        products_purchased_reduced, products_purchased_reduced_1 = uniform_product_df(directory_path = tenant_id)
+        # Exportation
+        export_dataset(df = products_purchased_reduced, output_name = "0_Input/Company_"+ tenant_id +"/products_purchased_reduced.csv")
+        export_dataset(df = products_purchased_reduced_1, output_name = "0_Input/Company_"+ tenant_id +"/products_purchased_reduced_1.csv")
+    else:
+        # Cleaning product area
+        products_purchased_reduced = uniform_product_df(directory_path = tenant_id)
+        # Exportation
+        export_dataset(df = products_purchased_reduced, output_name = "0_Input/Company_"+ tenant_id +"/products_purchased_reduced.csv")
+
-    # Cleaning product area
-    df1_products_purchased_reduced = uniform_product_df(directory_path = tenant_id)
-    ## Exportation
-    export_dataset(df = df1_products_purchased_reduced, output_name = "0_Input/Company_"+ tenant_id +"/products_purchased_reduced.csv")
 
     #Exportation
     # export_dataset(df = df1_products_purchased_reduced, output_name = "1_Temp/Company 1 - Purchases.csv")
 
     print("Time to run the cleaning of company ", tenant_id, " : ", time.time() - start)

diff --git a/0_2_Dataset_construction.py b/0_2_Dataset_construction.py
index 1561efa..2e9b9e0 100644
--- a/0_2_Dataset_construction.py
+++ b/0_2_Dataset_construction.py
@@ -13,7 +13,7 @@ S3_ENDPOINT_URL = "https://" + os.environ["AWS_S3_ENDPOINT"]
 fs = s3fs.S3FileSystem(client_kwargs={'endpoint_url': S3_ENDPOINT_URL})
 
-# Import cleaning and merge functions
+# Import KPI construction functions
 exec(open('0_KPI_functions.py').read())
 
 # Ignore warning

diff --git a/0_Cleaning_and_merge_functions.py b/0_Cleaning_and_merge_functions.py
index d2ddb86..c8144f5 100644
--- a/0_Cleaning_and_merge_functions.py
+++ b/0_Cleaning_and_merge_functions.py
@@ -79,48 +79,6 @@ def preprocessing_customerplus(directory_path):
 
     return customerplus_copy
 
-def preprocessing_tickets_area(directory_path):
-
-    # Datasets loading
-    tickets = load_dataset(directory_path, name = "tickets")
-    purchases = load_dataset(directory_path, name = "purchases")
-    suppliers = load_dataset(directory_path, name = "suppliers")
-    # type_ofs = load_dataset(directory_path, name = "type_ofs")
-
-    # Tickets table
-    tickets = tickets[['id', 'purchase_id', 'product_id', 'is_from_subscription', 'type_of', 'supplier_id']]
-    tickets.rename(columns = {'id' : 'ticket_id'}, inplace = True)
-
-    # Suppliers table
-    suppliers = suppliers[['id', 'name']]
-    suppliers.rename(columns = {'name' : 'supplier_name'}, inplace = True)
-    suppliers['supplier_name'] = suppliers['supplier_name'].fillna('')
-
-    # Ticket types table
-    # type_ofs = type_ofs[['id', 'name', 'children']]
-    # type_ofs.rename(columns = {'name' : 'type_of_ticket_name'}, inplace = True)
-
-    # Purchases table
-    # Purchase date cleaning
-    # cleaning_date(purchases, 'purchase_date')
-    # Variable selection
-    purchases = purchases[['id', 'purchase_date', 'customer_id']]
-
-    # Merges
-    # Merge with suppliers
-    ticket_information = pd.merge(tickets, suppliers, left_on = 'supplier_id', right_on = 'id', how = 'inner')
-    ticket_information.drop(['supplier_id', 'id'], axis = 1, inplace=True)
-
-    # Merge with ticket types
-    # ticket_information = pd.merge(ticket_information, type_ofs, left_on = 'type_of', right_on = 'id', how = 'inner')
-    # ticket_information.drop(['type_of', 'id'], axis = 1, inplace=True)
-
-    # Merge with purchases
-    ticket_information = pd.merge(ticket_information, purchases, left_on = 'purchase_id', right_on = 'id', how = 'inner')
-    ticket_information.drop(['id'], axis = 1, inplace=True)
-
-    return ticket_information
-
 def preprocessing_target_area(directory_path):
 
     # Datasets loading
@@ -169,6 +127,69 @@ def preprocessing_campaigns_area(directory_path):
 
     return campaigns_full
 
+def preprocessing_tickets_area(directory_path):
+
+    # Datasets loading
+    tickets = load_dataset(directory_path, name = "tickets")
+
+    # Supplementary tickets dataset for tenant 101
+    if directory_path == '101':
+        tickets_1 = load_dataset(directory_path, name = "tickets_1")
+
+    purchases = load_dataset(directory_path, name = "purchases")
+    suppliers = load_dataset(directory_path, name = "suppliers")
+    # type_ofs = load_dataset(directory_path, name = "type_ofs")
+
+    # Tickets table
+    tickets = tickets[['id', 'purchase_id', 'product_id', 'is_from_subscription', 'type_of', 'supplier_id']]
+    tickets.rename(columns = {'id' : 'ticket_id'}, inplace = True)
+
+    if directory_path == '101':
+        # Select from tickets_1, not tickets (tickets has already been renamed above)
+        tickets_1 = tickets_1[['id', 'purchase_id', 'product_id', 'is_from_subscription', 'type_of', 'supplier_id']]
+        tickets_1.rename(columns = {'id' : 'ticket_id'}, inplace = True)
+
+    # Suppliers table
+    suppliers = suppliers[['id', 'name']]
+    suppliers.rename(columns = {'name' : 'supplier_name'}, inplace = True)
+    suppliers['supplier_name'] = suppliers['supplier_name'].fillna('')
+
+    # Ticket types table
+    # type_ofs = type_ofs[['id', 'name', 'children']]
+    # type_ofs.rename(columns = {'name' : 'type_of_ticket_name'}, inplace = True)
+
+    # Purchases table
+    # Purchase date cleaning
+    # cleaning_date(purchases, 'purchase_date')
+
+    # Variable selection
+    purchases = purchases[['id', 'purchase_date', 'customer_id']]
+
+    # Merges
+    # Merge with suppliers
+    ticket_information = pd.merge(tickets, suppliers, left_on = 'supplier_id', right_on = 'id', how = 'inner')
+    ticket_information.drop(['supplier_id', 'id'], axis = 1, inplace=True)
+
+    # Merge with ticket types
+    # ticket_information = pd.merge(ticket_information, type_ofs, left_on = 'type_of', right_on = 'id', how = 'inner')
+    # ticket_information.drop(['type_of', 'id'], axis = 1, inplace=True)
+
+    # Merge with purchases
+    ticket_information = pd.merge(ticket_information, purchases, left_on = 'purchase_id', right_on = 'id', how = 'inner')
+    ticket_information.drop(['id'], axis = 1, inplace=True)
+
+    if directory_path == '101':
+        # Merge with suppliers
+        ticket_information_1 = pd.merge(tickets_1, suppliers, left_on = 'supplier_id', right_on = 'id', how = 'inner')
+        ticket_information_1.drop(['supplier_id', 'id'], axis = 1, inplace=True)
+
+        # Merge with purchases
+        ticket_information_1 = pd.merge(ticket_information_1, purchases, left_on = 'purchase_id', right_on = 'id', how = 'inner')
+        ticket_information_1.drop(['id'], axis = 1, inplace=True)
+
+        return ticket_information, ticket_information_1
+    else:
+        return ticket_information
+
 def create_products_table(directory_path):
     # first merge products and categories
     print("first merge products and categories")
@@ -179,8 +200,7 @@ def create_products_table(directory_path):
     categories = categories.drop(columns = ['extra_field', 'quota'])
 
     #Merge
-    products_theme = products.merge(categories, how = 'left', left_on = 'category_id',
-                                    right_on = 'id', suffixes=('_products', '_categories'))
+    products_theme = products.merge(categories, how = 'left', left_on = 'category_id', right_on = 'id', suffixes=('_products', '_categories'))
     products_theme = products_theme.rename(columns = {"name" : "name_categories"})
 
     # Second merge products_theme and type of categories
@@ -195,7 +215,6 @@ def create_products_table(directory_path):
     products_theme = order_columns_id(products_theme)
     return products_theme
 
-
 def create_events_table(directory_path):
     # first merge events and seasons :
     print("first merge events and seasons : ")
@@ -233,16 +252,12 @@ def create_events_table(directory_path):
 def create_representations_table(directory_path):
     representations = load_dataset(directory_path, name = "representations")
 
-    representations = representations.drop(columns = ['serial', 'open', 'satisfaction', 'is_display', 'expected_filling',
-                                                      'max_filling', 'extra_field', 'start_date_time', 'end_date_time', 'name',
-                                                      'representation_type_id'])
+    representations = representations.drop(columns = ['serial', 'satisfaction', 'is_display', 'expected_filling', 'max_filling', 'extra_field', 'name', 'representation_type_id']) # kept: 'start_date_time', 'end_date_time', 'open'
 
     representations_capacity = load_dataset(directory_path, name = "representation_category_capacities")
     representations_capacity = representations_capacity.drop(columns = ['expected_filling', 'max_filling'])
 
-    representations_theme = representations.merge(representations_capacity, how='left',
-                                                  left_on='id', right_on='representation_id',
-                                                  suffixes=('_representation', '_representation_cap'))
+    representations_theme = representations.merge(representations_capacity, how='left', left_on='id', right_on='representation_id', suffixes=('_representation', '_representation_cap'))
     # index cleaning
     representations_theme = representations_theme.drop(columns = ["id_representation"])
     representations_theme = order_columns_id(representations_theme)
@@ -255,20 +270,30 @@ def uniform_product_df(directory_path):
     products_theme = create_products_table(directory_path)
     representation_theme = create_representations_table(directory_path)
    events_theme = create_events_table(directory_path)
-    ticket_information = preprocessing_tickets_area(directory_path)
+
+    if directory_path == '101':
+        ticket_information, ticket_information_1 = preprocessing_tickets_area(directory_path)
+    else:
+        ticket_information = preprocessing_tickets_area(directory_path)
 
     print("Products theme columns : ", products_theme.columns)
     print("\n Representation theme columns : ", representation_theme.columns)
     print("\n Events theme columns : ", events_theme.columns)
 
-    products_global = pd.merge(products_theme, representation_theme, how='left',
-                               on=["representation_id", "category_id"])
+    products_global = pd.merge(products_theme, representation_theme, how='left', on=["representation_id", "category_id"])
 
-    products_global = pd.merge(products_global, events_theme, how='left', on='event_id',
-                               suffixes = ("_representation", "_event"))
+    products_global = pd.merge(products_global, events_theme, how='left', on='event_id', suffixes = ("_representation", "_event"))
 
     products_purchased = pd.merge(ticket_information, products_global, left_on = 'product_id', right_on = 'id_products', how = 'inner')
 
-    products_purchased_reduced = products_purchased[['ticket_id', 'customer_id', 'purchase_id', 'event_type_id', 'supplier_name', 'purchase_date', 'amount', 'is_full_price', 'name_event_types', 'name_facilities', 'name_categories', 'name_events', 'name_seasons']] # 'type_of_ticket_name', 'children'
-
-    return products_purchased_reduced
\ No newline at end of file
+    products_purchased_reduced = products_purchased[['ticket_id', 'customer_id', 'purchase_id', 'event_type_id', 'supplier_name', 'purchase_date', 'amount', 'is_full_price', 'name_event_types', 'name_facilities', 'name_categories', 'name_events', 'name_seasons', 'start_date_time', 'end_date_time', 'open']] # 'type_of_ticket_name', 'children'
+
+    if directory_path == '101':
+        products_purchased_1 = pd.merge(ticket_information_1, products_global, left_on = 'product_id', right_on = 'id_products', how = 'inner')
+
+        products_purchased_reduced_1 = products_purchased_1[['ticket_id', 'customer_id', 'purchase_id', 'event_type_id', 'supplier_name', 'purchase_date', 'amount', 'is_full_price', 'name_event_types', 'name_facilities', 'name_categories', 'name_events', 'name_seasons', 'start_date_time', 'end_date_time', 'open']] # 'type_of_ticket_name', 'children'
+
+        return products_purchased_reduced, products_purchased_reduced_1
+    else:
+        return products_purchased_reduced
\ No newline at end of file
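
Note: with this change, preprocessing_tickets_area and uniform_product_df return a
(DataFrame, DataFrame) tuple for company "101" but a single DataFrame for every other
tenant, so each caller has to branch on the tenant id. Below is a minimal sketch of how
a caller could absorb that variable arity without duplicating the export logic;
export_products_purchased is a hypothetical helper, not part of this patch, and it
assumes uniform_product_df and export_dataset as defined above:

    # Hypothetical helper (not in this patch): normalizes the tenant-dependent
    # return arity of uniform_product_df so all exports share one code path.
    def export_products_purchased(tenant_id):
        result = uniform_product_df(directory_path = tenant_id)
        # Company "101" yields (reduced, reduced_1); other tenants a single frame.
        frames = result if isinstance(result, tuple) else (result,)
        for suffix, df in zip(("", "_1"), frames):
            export_dataset(df = df, output_name = "0_Input/Company_" + tenant_id + "/products_purchased_reduced" + suffix + ".csv")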