Update to the products purchased section: add start date, end date, and open; clean the ticket_1 table of company 101
This commit is contained in:
parent d0c980f788
commit 23981e3cbc
@@ -30,33 +30,43 @@ def export_dataset(df, output_name):
    df.to_csv(file_out, index = False)

## 1 - Cleaning of the datasets
-for tenant_id in ("101"): #"1", "2", "3", "4", "5", "6", "7", "8", "9", "10", "11", "12", "13", "14",
+for tenant_id in ("1", "2", "3", "4", "5", "6", "7", "8", "9", "10", "11", "12", "13", "14", "101"):

    # Timer
    start = time.time()

-    # Cleaning customerplus
-    df1_customerplus_clean = preprocessing_customerplus(directory_path = tenant_id)
+    # # Cleaning customerplus
+    # df1_customerplus_clean = preprocessing_customerplus(directory_path = tenant_id)

-    ## Exportation
-    export_dataset(df = df1_customerplus_clean, output_name = "0_Input/Company_"+ tenant_id +"/customerplus_cleaned.csv")
+    # ## Exportation
+    # export_dataset(df = df1_customerplus_clean, output_name = "0_Input/Company_"+ tenant_id +"/customerplus_cleaned.csv")

-    # Cleaning target area
-    df1_target_information = preprocessing_target_area(directory_path = tenant_id)
-    ## Exportation
-    export_dataset(df = df1_target_information, output_name = "0_Input/Company_"+ tenant_id +"/target_information.csv")
+    # # Cleaning target area
+    # df1_target_information = preprocessing_target_area(directory_path = tenant_id)
+    # ## Exportation
+    # export_dataset(df = df1_target_information, output_name = "0_Input/Company_"+ tenant_id +"/target_information.csv")

-    # Cleaning campaign area
-    df1_campaigns_information = preprocessing_campaigns_area(directory_path = tenant_id)
-    ## Exportation
-    export_dataset(df = df1_campaigns_information, output_name = "0_Input/Company_"+ tenant_id +"/campaigns_information.csv")
+    # # Cleaning campaign area
+    # df1_campaigns_information = preprocessing_campaigns_area(directory_path = tenant_id)
+    # ## Exportation
+    # export_dataset(df = df1_campaigns_information, output_name = "0_Input/Company_"+ tenant_id +"/campaigns_information.csv")

    ## Exportation
    # export_dataset(df = df1_campaigns_information, output_name = "1_Temp/Company 1 - Campaigns dataset clean.csv")

-    # Cleaning product area
-    df1_products_purchased_reduced = uniform_product_df(directory_path = tenant_id)
-
-    ## Exportation
-    export_dataset(df = df1_products_purchased_reduced, output_name = "0_Input/Company_"+ tenant_id +"/products_purchased_reduced.csv")
+    if tenant_id == "101":
+        # Cleaning product area
+        products_purchased_reduced, products_purchased_reduced_1 = uniform_product_df(directory_path = tenant_id)
+
+        # Exportation
+        export_dataset(df = products_purchased_reduced, output_name = "0_Input/Company_"+ tenant_id +"/products_purchased_reduced.csv")
+        export_dataset(df = products_purchased_reduced_1, output_name = "0_Input/Company_"+ tenant_id +"/products_purchased_reduced_1.csv")
+    else:
+        # Cleaning product area
+        products_purchased_reduced = uniform_product_df(directory_path = tenant_id)
+
+        # Exportation
+        export_dataset(df = products_purchased_reduced, output_name = "0_Input/Company_"+ tenant_id +"/products_purchased_reduced.csv")

    #Exportation
    # export_dataset(df = df1_products_purchased_reduced, output_name = "1_Temp/Company 1 - Purchases.csv")

    print("Time to run the cleaning of company ", tenant_id , " : " ,time.time() - start)
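Note on the hunk above: for tenant "101", uniform_product_df now returns two reduced dataframes, one per tickets table, so the export step branches. A minimal alternative sketch (hypothetical refactor, not part of the commit; it reuses this repo's export_dataset and uniform_product_df and assumes the loop variable tenant_id):

    # Normalize the return value to a tuple so one export path handles both cases.
    frames = uniform_product_df(directory_path = tenant_id)
    if not isinstance(frames, tuple):
        frames = (frames,)
    for i, df in enumerate(frames):
        suffix = "" if i == 0 else "_" + str(i)   # second frame gets the _1 suffix
        export_dataset(df = df, output_name = "0_Input/Company_" + tenant_id + "/products_purchased_reduced" + suffix + ".csv")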
@@ -13,7 +13,7 @@ S3_ENDPOINT_URL = "https://" + os.environ["AWS_S3_ENDPOINT"]
fs = s3fs.S3FileSystem(client_kwargs={'endpoint_url': S3_ENDPOINT_URL})


-# Import cleaning and merge functions
+# Import KPI construction functions
exec(open('0_KPI_functions.py').read())

# Ignore warning
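Side note: exec(open('0_KPI_functions.py').read()) runs the helper file in the current namespace. A sketch of an import-based alternative (an assumption, not in the commit; the explicit loader is needed because the file name starts with a digit and so is not a valid module identifier):

    import importlib.util

    # Load 0_KPI_functions.py as a proper module object instead of exec'ing its source.
    spec = importlib.util.spec_from_file_location("kpi_functions", "0_KPI_functions.py")
    kpi_functions = importlib.util.module_from_spec(spec)
    spec.loader.exec_module(kpi_functions)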
@@ -79,48 +79,6 @@ def preprocessing_customerplus(directory_path):

    return customerplus_copy

-def preprocessing_tickets_area(directory_path):
-
-    # Datasets loading
-    tickets = load_dataset(directory_path, name = "tickets")
-    purchases = load_dataset(directory_path, name = "purchases")
-    suppliers = load_dataset(directory_path, name = "suppliers")
-    # type_ofs = load_dataset(directory_path, name = "type_ofs")
-
-    # Tickets table
-    tickets = tickets[['id', 'purchase_id', 'product_id', 'is_from_subscription', 'type_of', 'supplier_id']]
-    tickets.rename(columns = {'id' : 'ticket_id'}, inplace = True)
-
-    # Suppliers table
-    suppliers = suppliers[['id', 'name']]
-    suppliers.rename(columns = {'name' : 'supplier_name'}, inplace = True)
-    suppliers['supplier_name'] = suppliers['supplier_name'].fillna('')
-
-    # Ticket types table
-    # type_ofs = type_ofs[['id', 'name', 'children']]
-    # type_ofs.rename(columns = {'name' : 'type_of_ticket_name'}, inplace = True)
-
-    # Purchases table
-    # Purchase-date cleaning
-    # cleaning_date(purchases, 'purchase_date')
-    # Variable selection
-    purchases = purchases[['id', 'purchase_date', 'customer_id']]
-
-    # Merges
-    # Merge with suppliers
-    ticket_information = pd.merge(tickets, suppliers, left_on = 'supplier_id', right_on = 'id', how = 'inner')
-    ticket_information.drop(['supplier_id', 'id'], axis = 1, inplace=True)
-
-    # Merge with ticket types
-    # ticket_information = pd.merge(ticket_information, type_ofs, left_on = 'type_of', right_on = 'id', how = 'inner')
-    # ticket_information.drop(['type_of', 'id'], axis = 1, inplace=True)
-
-    # Merge with purchases
-    ticket_information = pd.merge(ticket_information, purchases, left_on = 'purchase_id', right_on = 'id', how = 'inner')
-    ticket_information.drop(['id'], axis = 1, inplace=True)
-
-    return ticket_information
-
def preprocessing_target_area(directory_path):

    # Datasets loading
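Worth noting before the function reappears below: every merge in preprocessing_tickets_area is an inner join, so tickets whose supplier_id or purchase_id has no match are dropped silently. A toy illustration (made-up data, not from the repo):

    import pandas as pd

    tickets = pd.DataFrame({'ticket_id': [1, 2], 'supplier_id': [10, 99]})
    suppliers = pd.DataFrame({'id': [10], 'supplier_name': ['box office']})

    # Inner join: ticket 2 disappears because supplier 99 does not exist.
    merged = pd.merge(tickets, suppliers, left_on = 'supplier_id', right_on = 'id', how = 'inner')
    print(merged)   # one row: ticket 1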
@@ -169,6 +127,69 @@ def preprocessing_campaigns_area(directory_path):

    return campaigns_full

+def preprocessing_tickets_area(directory_path):
+
+    # Datasets loading
+    tickets = load_dataset(directory_path, name = "tickets")
+
+    # Supplementary tickets dataset for tenant 101
+    if directory_path == '101':
+        tickets_1 = load_dataset(directory_path, name = "tickets_1")
+
+    purchases = load_dataset(directory_path, name = "purchases")
+    suppliers = load_dataset(directory_path, name = "suppliers")
+    # type_ofs = load_dataset(directory_path, name = "type_ofs")
+
+    # Tickets table
+    tickets = tickets[['id', 'purchase_id', 'product_id', 'is_from_subscription', 'type_of', 'supplier_id']]
+    tickets.rename(columns = {'id' : 'ticket_id'}, inplace = True)
+
+    if directory_path == '101':
+        tickets_1 = tickets_1[['id', 'purchase_id', 'product_id', 'is_from_subscription', 'type_of', 'supplier_id']]
+        tickets_1.rename(columns = {'id' : 'ticket_id'}, inplace = True)
+
+    # Suppliers table
+    suppliers = suppliers[['id', 'name']]
+    suppliers.rename(columns = {'name' : 'supplier_name'}, inplace = True)
+    suppliers['supplier_name'] = suppliers['supplier_name'].fillna('')
+
+    # Ticket types table
+    # type_ofs = type_ofs[['id', 'name', 'children']]
+    # type_ofs.rename(columns = {'name' : 'type_of_ticket_name'}, inplace = True)
+
+    # Purchases table
+    # Purchase-date cleaning
+    # cleaning_date(purchases, 'purchase_date')
+
+    # Variable selection
+    purchases = purchases[['id', 'purchase_date', 'customer_id']]
+
+    # Merges
+    # Merge with suppliers
+    ticket_information = pd.merge(tickets, suppliers, left_on = 'supplier_id', right_on = 'id', how = 'inner')
+    ticket_information.drop(['supplier_id', 'id'], axis = 1, inplace=True)
+
+    # Merge with ticket types
+    # ticket_information = pd.merge(ticket_information, type_ofs, left_on = 'type_of', right_on = 'id', how = 'inner')
+    # ticket_information.drop(['type_of', 'id'], axis = 1, inplace=True)
+
+    # Merge with purchases
+    ticket_information = pd.merge(ticket_information, purchases, left_on = 'purchase_id', right_on = 'id', how = 'inner')
+    ticket_information.drop(['id'], axis = 1, inplace=True)
+
+    if directory_path == '101':
+        # Merge with suppliers
+        ticket_information_1 = pd.merge(tickets_1, suppliers, left_on = 'supplier_id', right_on = 'id', how = 'inner')
+        ticket_information_1.drop(['supplier_id', 'id'], axis = 1, inplace=True)
+
+        # Merge with purchases
+        ticket_information_1 = pd.merge(ticket_information_1, purchases, left_on = 'purchase_id', right_on = 'id', how = 'inner')
+        ticket_information_1.drop(['id'], axis = 1, inplace=True)
+
+        return ticket_information, ticket_information_1
+    else:
+        return ticket_information

def create_products_table(directory_path):
    # first merge products and categories
    print("first merge products and categories")
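The tenant-101 branch above repeats the subset/rename/merge sequence for tickets_1. A possible factoring (hypothetical sketch, not part of the commit; clean_tickets is an invented helper name, and load_dataset comes from this file):

    import pandas as pd

    def clean_tickets(raw_tickets, suppliers, purchases):
        # Same pipeline the function applies to each tickets table.
        t = raw_tickets[['id', 'purchase_id', 'product_id', 'is_from_subscription', 'type_of', 'supplier_id']].copy()
        t = t.rename(columns = {'id': 'ticket_id'})
        t = pd.merge(t, suppliers, left_on = 'supplier_id', right_on = 'id', how = 'inner').drop(columns = ['supplier_id', 'id'])
        t = pd.merge(t, purchases, left_on = 'purchase_id', right_on = 'id', how = 'inner').drop(columns = ['id'])
        return t

preprocessing_tickets_area could then call clean_tickets once for tickets and, for tenant 101, once more for tickets_1.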
@@ -179,8 +200,7 @@ def create_products_table(directory_path):
    categories = categories.drop(columns = ['extra_field', 'quota'])

    #Merge
-    products_theme = products.merge(categories, how = 'left', left_on = 'category_id',
-                                    right_on = 'id', suffixes=('_products', '_categories'))
+    products_theme = products.merge(categories, how = 'left', left_on = 'category_id', right_on = 'id', suffixes=('_products', '_categories'))
    products_theme = products_theme.rename(columns = {"name" : "name_categories"})

    # Second merge products_theme and type of categories
@@ -195,7 +215,6 @@ def create_products_table(directory_path):
    products_theme = order_columns_id(products_theme)
    return products_theme

-
def create_events_table(directory_path):
    # first merge events and seasons :
    print("first merge events and seasons : ")
@@ -233,16 +252,12 @@ def create_events_table(directory_path):

def create_representations_table(directory_path):
    representations = load_dataset(directory_path, name = "representations")
-    representations = representations.drop(columns = ['serial', 'open', 'satisfaction', 'is_display', 'expected_filling',
-                                                      'max_filling', 'extra_field', 'start_date_time', 'end_date_time', 'name',
-                                                      'representation_type_id'])
+    representations = representations.drop(columns = ['serial', 'satisfaction', 'is_display', 'expected_filling', 'max_filling', 'extra_field', 'name', 'representation_type_id']) # 'start_date_time', 'end_date_time', 'open'

    representations_capacity = load_dataset(directory_path, name = "representation_category_capacities")
    representations_capacity = representations_capacity.drop(columns = ['expected_filling', 'max_filling'])

-    representations_theme = representations.merge(representations_capacity, how='left',
-                                                  left_on='id', right_on='representation_id',
-                                                  suffixes=('_representation', '_representation_cap'))
+    representations_theme = representations.merge(representations_capacity, how='left', left_on='id', right_on='representation_id', suffixes=('_representation', '_representation_cap'))
    # index cleaning
    representations_theme = representations_theme.drop(columns = ["id_representation"])
    representations_theme = order_columns_id(representations_theme)
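Since the hunk above stops dropping start_date_time, end_date_time, and open, those columns now flow into the purchased-products output. A downstream parsing sketch (an assumption, not in the commit; it presumes the raw columns arrive as strings):

    # Coerce the newly kept columns; unparseable values become NaT instead of raising.
    representations['start_date_time'] = pd.to_datetime(representations['start_date_time'], errors = 'coerce')
    representations['end_date_time'] = pd.to_datetime(representations['end_date_time'], errors = 'coerce')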
@@ -255,20 +270,30 @@ def uniform_product_df(directory_path):
    products_theme = create_products_table(directory_path)
    representation_theme = create_representations_table(directory_path)
    events_theme = create_events_table(directory_path)
-    ticket_information = preprocessing_tickets_area(directory_path)
+
+    if directory_path == '101':
+        ticket_information, ticket_information_1 = preprocessing_tickets_area(directory_path)
+    else:
+        ticket_information = preprocessing_tickets_area(directory_path)

    print("Products theme columns : ", products_theme.columns)
    print("\n Representation theme columns : ", representation_theme.columns)
    print("\n Events theme columns : ", events_theme.columns)

-    products_global = pd.merge(products_theme, representation_theme, how='left',
-                               on= ["representation_id", "category_id"])
+    products_global = pd.merge(products_theme, representation_theme, how='left', on= ["representation_id", "category_id"])

-    products_global = pd.merge(products_global, events_theme, how='left', on='event_id',
-                               suffixes = ("_representation", "_event"))
+    products_global = pd.merge(products_global, events_theme, how='left', on='event_id', suffixes = ("_representation", "_event"))

    products_purchased = pd.merge(ticket_information, products_global, left_on = 'product_id', right_on = 'id_products', how = 'inner')

-    products_purchased_reduced = products_purchased[['ticket_id', 'customer_id', 'purchase_id' ,'event_type_id', 'supplier_name', 'purchase_date', 'amount', 'is_full_price', 'name_event_types', 'name_facilities', 'name_categories', 'name_events', 'name_seasons']] # 'type_of_ticket_name', 'children',
+    products_purchased_reduced = products_purchased[['ticket_id', 'customer_id', 'purchase_id' ,'event_type_id', 'supplier_name', 'purchase_date', 'amount', 'is_full_price', 'name_event_types', 'name_facilities', 'name_categories', 'name_events', 'name_seasons', 'start_date_time', 'end_date_time', 'open']] # 'type_of_ticket_name', 'children',

-    return products_purchased_reduced
+    if directory_path == '101':
+        products_purchased_1 = pd.merge(ticket_information_1, products_global, left_on = 'product_id', right_on = 'id_products', how = 'inner')
+
+        products_purchased_reduced_1 = products_purchased_1[['ticket_id', 'customer_id', 'purchase_id' ,'event_type_id', 'supplier_name', 'purchase_date', 'amount', 'is_full_price', 'name_event_types', 'name_facilities', 'name_categories', 'name_events', 'name_seasons', 'start_date_time', 'end_date_time', 'open']] # 'type_of_ticket_name', 'children',
+
+        return products_purchased_reduced, products_purchased_reduced_1
+    else:
+        return products_purchased_reduced
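Usage after this change (a sketch, assuming the functions above are already defined in the session): tenant "101" returns a pair, every other tenant a single dataframe.

    # Tenant 101: two reduced tables, one per tickets source.
    products_purchased_reduced, products_purchased_reduced_1 = uniform_product_df(directory_path = "101")

    # Any other tenant: the original single table.
    products_purchased_reduced = uniform_product_df(directory_path = "1")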