From 23981e3cbcf858c7d0271a1baf004f238f1ac3b0 Mon Sep 17 00:00:00 2001
From: ajoubrel-ensae
Date: Tue, 27 Feb 2024 21:01:20 +0000
Subject: [PATCH] Modify the products purchased part: add start and end dates
 and the open flag; clean company 101's tickets_1 table

---
 0_1_Input_cleaning.py             |  44 +++++----
 0_2_Dataset_construction.py       |   2 +-
 0_Cleaning_and_merge_functions.py | 143 ++++++++++++++++++------------
 3 files changed, 112 insertions(+), 77 deletions(-)

diff --git a/0_1_Input_cleaning.py b/0_1_Input_cleaning.py
index 814698b..ec7eeeb 100644
--- a/0_1_Input_cleaning.py
+++ b/0_1_Input_cleaning.py
@@ -30,33 +30,43 @@ def export_dataset(df, output_name):
     df.to_csv(file_out, index = False)
 
 ## 1 - Cleaning of the datasets
-for tenant_id in ("101"): #"1", "2", "3", "4", "5", "6", "7", "8", "9", "10", "11", "12", "13", "14",
+for tenant_id in ("1", "2", "3", "4", "5", "6", "7", "8", "9", "10", "11", "12", "13", "14", "101"):
+
     # Timer
     start = time.time()
 
-    # Cleaning customerplus
-    df1_customerplus_clean = preprocessing_customerplus(directory_path = tenant_id)
+    # # Cleaning customerplus
+    # df1_customerplus_clean = preprocessing_customerplus(directory_path = tenant_id)
 
-    ## Exportation
-    export_dataset(df = df1_customerplus_clean, output_name = "0_Input/Company_"+ tenant_id +"/customerplus_cleaned.csv")
+    # ## Exportation
+    # export_dataset(df = df1_customerplus_clean, output_name = "0_Input/Company_"+ tenant_id +"/customerplus_cleaned.csv")
 
-    # Cleaning target area
-    df1_target_information = preprocessing_target_area(directory_path = tenant_id)
-    ## Exportation
-    export_dataset(df = df1_target_information, output_name = "0_Input/Company_"+ tenant_id +"/target_information.csv")
+    # # Cleaning target area
+    # df1_target_information = preprocessing_target_area(directory_path = tenant_id)
+    # ## Exportation
+    # export_dataset(df = df1_target_information, output_name = "0_Input/Company_"+ tenant_id +"/target_information.csv")
 
-    # Cleaning campaign area
-    df1_campaigns_information = preprocessing_campaigns_area(directory_path = tenant_id)
-    ## Exportation
-    export_dataset(df = df1_campaigns_information, output_name = "0_Input/Company_"+ tenant_id +"/campaigns_information.csv")
+    # # Cleaning campaign area
+    # df1_campaigns_information = preprocessing_campaigns_area(directory_path = tenant_id)
+    # ## Exportation
+    # export_dataset(df = df1_campaigns_information, output_name = "0_Input/Company_"+ tenant_id +"/campaigns_information.csv")
 
     ## Exportation
     # export_dataset(df = df1_campaigns_information, output_name = "1_Temp/Company 1 - Campaigns dataset clean.csv")
+
+    if tenant_id == "101":
+        # Cleaning product area
+        products_purchased_reduced, products_purchased_reduced_1 = uniform_product_df(directory_path = tenant_id)
+        # Exportation
+        export_dataset(df = products_purchased_reduced, output_name = "0_Input/Company_"+ tenant_id +"/products_purchased_reduced.csv")
+        export_dataset(df = products_purchased_reduced_1, output_name = "0_Input/Company_"+ tenant_id +"/products_purchased_reduced_1.csv")
+    else:
+        # Cleaning product area
+        products_purchased_reduced = uniform_product_df(directory_path = tenant_id)
+        # Exportation
+        export_dataset(df = products_purchased_reduced, output_name = "0_Input/Company_"+ tenant_id +"/products_purchased_reduced.csv")
+
-    # Cleaning product area
-    df1_products_purchased_reduced = uniform_product_df(directory_path = tenant_id)
-    ## Exportation
-    export_dataset(df = df1_products_purchased_reduced, output_name = "0_Input/Company_"+ tenant_id +"/products_purchased_reduced.csv")
 
     #Exportation
     # export_dataset(df = df1_products_purchased_reduced, output_name = "1_Temp/Company 1 - Purchases.csv")
 
     print("Time to run the cleaning of company ", tenant_id, " : ", time.time() - start)

diff --git a/0_2_Dataset_construction.py b/0_2_Dataset_construction.py
index 1561efa..2e9b9e0 100644
--- a/0_2_Dataset_construction.py
+++ b/0_2_Dataset_construction.py
@@ -13,7 +13,7 @@ S3_ENDPOINT_URL = "https://" + os.environ["AWS_S3_ENDPOINT"]
 fs = s3fs.S3FileSystem(client_kwargs={'endpoint_url': S3_ENDPOINT_URL})
 
-# Import cleaning and merge functions
+# Import KPI construction functions
 exec(open('0_KPI_functions.py').read())
 
 # Ignore warning

diff --git a/0_Cleaning_and_merge_functions.py b/0_Cleaning_and_merge_functions.py
index d2ddb86..c8144f5 100644
--- a/0_Cleaning_and_merge_functions.py
+++ b/0_Cleaning_and_merge_functions.py
@@ -79,48 +79,6 @@ def preprocessing_customerplus(directory_path):
 
     return customerplus_copy
 
-def preprocessing_tickets_area(directory_path):
-
-    # Datasets loading
-    tickets = load_dataset(directory_path, name = "tickets")
-    purchases = load_dataset(directory_path, name = "purchases")
-    suppliers = load_dataset(directory_path, name = "suppliers")
-    # type_ofs = load_dataset(directory_path, name = "type_ofs")
-
-    # Tickets table
-    tickets = tickets[['id', 'purchase_id', 'product_id', 'is_from_subscription', 'type_of', 'supplier_id']]
-    tickets.rename(columns = {'id' : 'ticket_id'}, inplace = True)
-
-    # Suppliers table
-    suppliers = suppliers[['id', 'name']]
-    suppliers.rename(columns = {'name' : 'supplier_name'}, inplace = True)
-    suppliers['supplier_name'] = suppliers['supplier_name'].fillna('')
-
-    # Ticket types table
-    # type_ofs = type_ofs[['id', 'name', 'children']]
-    # type_ofs.rename(columns = {'name' : 'type_of_ticket_name'}, inplace = True)
-
-    # Purchases table
-    # Purchase date cleaning
-    # cleaning_date(purchases, 'purchase_date')
-    # Variable selection
-    purchases = purchases[['id', 'purchase_date', 'customer_id']]
-
-    # Merges
-    # Merge with suppliers
-    ticket_information = pd.merge(tickets, suppliers, left_on = 'supplier_id', right_on = 'id', how = 'inner')
-    ticket_information.drop(['supplier_id', 'id'], axis = 1, inplace=True)
-
-    # Merge with ticket types
-    # ticket_information = pd.merge(ticket_information, type_ofs, left_on = 'type_of', right_on = 'id', how = 'inner')
-    # ticket_information.drop(['type_of', 'id'], axis = 1, inplace=True)
-
-    # Merge with purchases
-    ticket_information = pd.merge(ticket_information, purchases, left_on = 'purchase_id', right_on = 'id', how = 'inner')
-    ticket_information.drop(['id'], axis = 1, inplace=True)
-
-    return ticket_information
-
 def preprocessing_target_area(directory_path):
 
     # Datasets loading
@@ -169,6 +127,69 @@ def preprocessing_campaigns_area(directory_path):
 
     return campaigns_full
 
+def preprocessing_tickets_area(directory_path):
+
+    # Datasets loading
+    tickets = load_dataset(directory_path, name = "tickets")
+
+    # Supplementary tickets dataset for tenant 101
+    if directory_path == '101':
+        tickets_1 = load_dataset(directory_path, name = "tickets_1")
+
+    purchases = load_dataset(directory_path, name = "purchases")
+    suppliers = load_dataset(directory_path, name = "suppliers")
+    # type_ofs = load_dataset(directory_path, name = "type_ofs")
+
+    # Tickets table
+    tickets = tickets[['id', 'purchase_id', 'product_id', 'is_from_subscription', 'type_of', 'supplier_id']]
+    tickets.rename(columns = {'id' : 'ticket_id'}, inplace = True)
+
+    if directory_path == '101':
+        # Select from tickets_1, not tickets (tickets has already been renamed above)
+        tickets_1 = tickets_1[['id', 'purchase_id', 'product_id', 'is_from_subscription', 'type_of', 'supplier_id']]
+        tickets_1.rename(columns = {'id' : 'ticket_id'}, inplace = True)
+
+    # Suppliers table
+    suppliers = suppliers[['id', 'name']]
+    suppliers.rename(columns = {'name' : 'supplier_name'}, inplace = True)
+    suppliers['supplier_name'] = suppliers['supplier_name'].fillna('')
+
+    # Ticket types table
+    # type_ofs = type_ofs[['id', 'name', 'children']]
+    # type_ofs.rename(columns = {'name' : 'type_of_ticket_name'}, inplace = True)
+
+    # Purchases table
+    # Purchase date cleaning
+    # cleaning_date(purchases, 'purchase_date')
+
+    # Variable selection
+    purchases = purchases[['id', 'purchase_date', 'customer_id']]
+
+    # Merges
+    # Merge with suppliers
+    ticket_information = pd.merge(tickets, suppliers, left_on = 'supplier_id', right_on = 'id', how = 'inner')
+    ticket_information.drop(['supplier_id', 'id'], axis = 1, inplace=True)
+
+    # Merge with ticket types
+    # ticket_information = pd.merge(ticket_information, type_ofs, left_on = 'type_of', right_on = 'id', how = 'inner')
+    # ticket_information.drop(['type_of', 'id'], axis = 1, inplace=True)
+
+    # Merge with purchases
+    ticket_information = pd.merge(ticket_information, purchases, left_on = 'purchase_id', right_on = 'id', how = 'inner')
+    ticket_information.drop(['id'], axis = 1, inplace=True)
+
+    if directory_path == '101':
+        # Merge with suppliers
+        ticket_information_1 = pd.merge(tickets_1, suppliers, left_on = 'supplier_id', right_on = 'id', how = 'inner')
+        ticket_information_1.drop(['supplier_id', 'id'], axis = 1, inplace=True)
+
+        # Merge with purchases
+        ticket_information_1 = pd.merge(ticket_information_1, purchases, left_on = 'purchase_id', right_on = 'id', how = 'inner')
+        ticket_information_1.drop(['id'], axis = 1, inplace=True)
+
+        return ticket_information, ticket_information_1
+    else:
+        return ticket_information
+
 def create_products_table(directory_path):
     # first merge products and categories
     print("first merge products and categories")
@@ -179,8 +200,7 @@ def create_products_table(directory_path):
     categories = categories.drop(columns = ['extra_field', 'quota'])
 
     #Merge
-    products_theme = products.merge(categories, how = 'left', left_on = 'category_id',
-                                    right_on = 'id', suffixes=('_products', '_categories'))
+    products_theme = products.merge(categories, how = 'left', left_on = 'category_id', right_on = 'id', suffixes=('_products', '_categories'))
     products_theme = products_theme.rename(columns = {"name" : "name_categories"})
 
     # Second merge products_theme and type of categories
@@ -195,7 +215,6 @@ def create_products_table(directory_path):
     products_theme = order_columns_id(products_theme)
     return products_theme
 
-
 def create_events_table(directory_path):
     # first merge events and seasons :
     print("first merge events and seasons : ")
@@ -233,16 +252,12 @@ def create_events_table(directory_path):
 def create_representations_table(directory_path):
     representations = load_dataset(directory_path, name = "representations")
 
-    representations = representations.drop(columns = ['serial', 'open', 'satisfaction', 'is_display', 'expected_filling',
-                                                      'max_filling', 'extra_field', 'start_date_time', 'end_date_time', 'name',
-                                                      'representation_type_id'])
+    representations = representations.drop(columns = ['serial', 'satisfaction', 'is_display', 'expected_filling', 'max_filling', 'extra_field', 'name', 'representation_type_id']) # kept: 'start_date_time', 'end_date_time', 'open'
 
     representations_capacity = load_dataset(directory_path, name = "representation_category_capacities")
     representations_capacity = representations_capacity.drop(columns = ['expected_filling', 'max_filling'])
 
-    representations_theme = representations.merge(representations_capacity, how='left',
-                                                  left_on='id', right_on='representation_id',
-                                                  suffixes=('_representation', '_representation_cap'))
+    representations_theme = representations.merge(representations_capacity, how='left', left_on='id', right_on='representation_id', suffixes=('_representation', '_representation_cap'))
     # index cleaning
     representations_theme = representations_theme.drop(columns = ["id_representation"])
     representations_theme = order_columns_id(representations_theme)
@@ -255,20 +270,30 @@ def uniform_product_df(directory_path):
     products_theme = create_products_table(directory_path)
     representation_theme = create_representations_table(directory_path)
    events_theme = create_events_table(directory_path)
-    ticket_information = preprocessing_tickets_area(directory_path)
+
+    if directory_path == '101':
+        ticket_information, ticket_information_1 = preprocessing_tickets_area(directory_path)
+    else:
+        ticket_information = preprocessing_tickets_area(directory_path)
 
     print("Products theme columns : ", products_theme.columns)
     print("\n Representation theme columns : ", representation_theme.columns)
     print("\n Events theme columns : ", events_theme.columns)
 
-    products_global = pd.merge(products_theme, representation_theme, how='left',
-                               on=["representation_id", "category_id"])
+    products_global = pd.merge(products_theme, representation_theme, how='left', on=["representation_id", "category_id"])
 
-    products_global = pd.merge(products_global, events_theme, how='left', on='event_id',
-                               suffixes = ("_representation", "_event"))
+    products_global = pd.merge(products_global, events_theme, how='left', on='event_id', suffixes = ("_representation", "_event"))
 
     products_purchased = pd.merge(ticket_information, products_global, left_on = 'product_id', right_on = 'id_products', how = 'inner')
 
-    products_purchased_reduced = products_purchased[['ticket_id', 'customer_id', 'purchase_id', 'event_type_id', 'supplier_name', 'purchase_date', 'amount', 'is_full_price', 'name_event_types', 'name_facilities', 'name_categories', 'name_events', 'name_seasons']] # 'type_of_ticket_name', 'children'
-
-    return products_purchased_reduced
\ No newline at end of file
+    products_purchased_reduced = products_purchased[['ticket_id', 'customer_id', 'purchase_id', 'event_type_id', 'supplier_name', 'purchase_date', 'amount', 'is_full_price', 'name_event_types', 'name_facilities', 'name_categories', 'name_events', 'name_seasons', 'start_date_time', 'end_date_time', 'open']] # 'type_of_ticket_name', 'children'
+
+    if directory_path == '101':
+        products_purchased_1 = pd.merge(ticket_information_1, products_global, left_on = 'product_id', right_on = 'id_products', how = 'inner')
+
+        products_purchased_reduced_1 = products_purchased_1[['ticket_id', 'customer_id', 'purchase_id', 'event_type_id', 'supplier_name', 'purchase_date', 'amount', 'is_full_price', 'name_event_types', 'name_facilities', 'name_categories', 'name_events', 'name_seasons', 'start_date_time', 'end_date_time', 'open']] # 'type_of_ticket_name', 'children'
+
+        return products_purchased_reduced, products_purchased_reduced_1
+    else:
+        return products_purchased_reduced
\ No newline at end of file
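
Note: with this change, preprocessing_tickets_area and uniform_product_df return a
(DataFrame, DataFrame) tuple for company "101" but a single DataFrame for every other
tenant, so each caller has to branch on the tenant id. Below is a minimal sketch of how
a caller could absorb that variable arity without duplicating the export logic;
export_products_purchased is a hypothetical helper, not part of this patch, and it
assumes uniform_product_df and export_dataset as defined above:

    # Hypothetical helper (not in this patch): normalizes the tenant-dependent
    # return arity of uniform_product_df so all exports share one code path.
    def export_products_purchased(tenant_id):
        result = uniform_product_df(directory_path = tenant_id)
        # Company "101" yields (reduced, reduced_1); other tenants a single frame.
        frames = result if isinstance(result, tuple) else (result,)
        for suffix, df in zip(("", "_1"), frames):
            export_dataset(df = df, output_name = "0_Input/Company_" + tenant_id + "/products_purchased_reduced" + suffix + ".csv")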