Update

parent 29eafcc6b2
commit c26b5b11d8
@@ -6,6 +6,7 @@ import os
 import s3fs
 import re
 import warnings
+import time
 
 # Create filesystem object
 S3_ENDPOINT_URL = "https://" + os.environ["AWS_S3_ENDPOINT"]
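Note: the hunk above only shows the endpoint URL being assembled; the fs object that export_dataset below relies on is created outside the changed lines. A minimal sketch of how such a filesystem object is typically built with s3fs (the credential handling here is an assumption, it is not shown in this commit):

    import os
    import s3fs

    # Custom endpoint taken from the environment, as in the script.
    S3_ENDPOINT_URL = "https://" + os.environ["AWS_S3_ENDPOINT"]

    # Hypothetical setup: s3fs picks up AWS credentials from the usual
    # environment variables; only the non-default endpoint must be passed.
    fs = s3fs.S3FileSystem(client_kwargs={"endpoint_url": S3_ENDPOINT_URL})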
@@ -20,15 +21,19 @@ BUCKET_OUT = "projet-bdc2324-team1"
 # Ignore warning
 warnings.filterwarnings('ignore')
 
+start_all = time.time()
 
 def export_dataset(df, output_name):
-    print('Exportation of dataset :', output_name)
+    print('Export of dataset :', output_name)
     FILE_PATH_OUT_S3 = BUCKET_OUT + "/" + output_name
     with fs.open(FILE_PATH_OUT_S3, 'w') as file_out:
         df.to_csv(file_out, index = False)
 
 ## 1 - Cleaning of the datasets
-for tenant_id in ("1", "2", "3", "4", "5", "6", "7", "8", "9", "10", "11", "12", "13", "14", "101"):
+for tenant_id in ("101"): #"1", "2", "3", "4", "5", "6", "7", "8", "9", "10", "11", "12", "13", "14",
+    # Timer
+    start = time.time()
+
     # Cleaning customerplus
     df1_customerplus_clean = preprocessing_customerplus(directory_path = tenant_id)
 
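One caveat with the new loop header: in Python, ("101") is just the string "101", not a one-element tuple, so the rewritten loop iterates over the characters "1", "0", "1" rather than over the single company id. If the intent is to process company 101 only, a trailing comma (or a list) is needed; a minimal sketch:

    # A parenthesised single string is NOT a tuple:
    for tenant_id in ("101"):
        print(tenant_id)        # prints "1", then "0", then "1"

    # A trailing comma makes it a one-element tuple:
    for tenant_id in ("101",):
        print(tenant_id)        # prints "101"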
@@ -46,13 +51,15 @@ for tenant_id in ("1", "2", "3", "4", "5", "6", "7", "8", "9", "10", "11", "12",
     export_dataset(df = df1_campaigns_information, output_name = "0_Input/Company_"+ tenant_id +"/campaigns_information.csv")
 
     ## Exportation
-    # export_dataset(df = df1_campaigns_information, output_name = "0_Temp/Company 1 - Campaigns dataset clean.csv")
+    # export_dataset(df = df1_campaigns_information, output_name = "1_Temp/Company 1 - Campaigns dataset clean.csv")
 
     # Cleaning product area
     df1_products_purchased_reduced = uniform_product_df(directory_path = tenant_id)
     ## Exportation
     export_dataset(df = df1_products_purchased_reduced, output_name = "0_Input/Company_"+ tenant_id +"/products_purchased_reduced.csv")
     #Exportation
-    # export_dataset(df = df1_products_purchased_reduced, output_name = "0_Temp/Company 1 - Purchases.csv")
+    # export_dataset(df = df1_products_purchased_reduced, output_name = "1_Temp/Company 1 - Purchases.csv")
+    print("Time to run the cleaning of company ", tenant_id , " : " ,time.time() - start)
     print("\n ------------------------------------------------------------------ \n --------------------- END CLEANING COMPANY " + tenant_id + " --------------------- \n ------------------------------------------------------------------")
 
+print("Time to run the cleaning of all used datasets : " , time.time() - start_all)
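The two added print calls report elapsed wall-clock time in raw seconds. If a more readable report is wanted, the same measurement can be wrapped in a tiny helper; a purely illustrative sketch (the helper name and formatting are not part of the commit):

    import time

    def timed(label):
        # Hypothetical helper: returns a function that prints the elapsed time for the label.
        start = time.time()
        def report():
            print(f"Time to run {label}: {time.time() - start:.1f} s")
        return report

    # Usage mirroring the per-company and overall timers in the diff:
    report_all = timed("the cleaning of all used datasets")
    report_company = timed("the cleaning of company 101")
    # ... cleaning work would happen here ...
    report_company()
    report_all()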
@@ -98,31 +98,31 @@ def dataset_construction(min_date, end_features_date, max_date, directory_path):
 ## Exportation
 
 # Dossier d'exportation
-BUCKET_OUT = "projet-bdc2324-team1/1_Output/Logistique Regression databases - First approach"
+BUCKET_OUT = "projet-bdc2324-team1/2_Output/Logistique Regression databases - First approach"
 
 # Dataset test
 dataset_test = dataset_construction(min_date = "2021-08-01", end_features_date = "2023-08-01", max_date = "2023-11-01", directory_path = "1")
 
-# # Exportation
-# FILE_KEY_OUT_S3 = "dataset_test.csv"
-# FILE_PATH_OUT_S3 = BUCKET_OUT + "/" + FILE_KEY_OUT_S3
+# Exportation
+FILE_KEY_OUT_S3 = "dataset_test.csv"
+FILE_PATH_OUT_S3 = BUCKET_OUT + "/" + FILE_KEY_OUT_S3
 
-# with fs.open(FILE_PATH_OUT_S3, 'w') as file_out:
-#     dataset_test.to_csv(file_out, index = False)
+with fs.open(FILE_PATH_OUT_S3, 'w') as file_out:
+    dataset_test.to_csv(file_out, index = False)
 
-# print("Exportation dataset test : SUCCESS")
+print("Exportation dataset test : SUCCESS")
 
 # Dataset train
 dataset_train = dataset_construction(min_date = "2021-05-01", end_features_date = "2023-05-01", max_date = "2023-08-01", directory_path = "1")
 
-# Exportation
-# FILE_KEY_OUT_S3 = "dataset_train.csv"
-# FILE_PATH_OUT_S3 = BUCKET_OUT + "/" + FILE_KEY_OUT_S3
+# Export
+FILE_KEY_OUT_S3 = "dataset_train.csv"
+FILE_PATH_OUT_S3 = BUCKET_OUT + "/" + FILE_KEY_OUT_S3
 
-# with fs.open(FILE_PATH_OUT_S3, 'w') as file_out:
-#     dataset_train.to_csv(file_out, index = False)
+with fs.open(FILE_PATH_OUT_S3, 'w') as file_out:
+    dataset_train.to_csv(file_out, index = False)
 
-# print("Exportation dataset train : SUCCESS")
+print("Exportation dataset train : SUCCESS")
 
 
 print("FIN DE LA GENERATION DES DATASETS : SUCCESS")
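Uncommenting the export block twice leaves the open/write/print sequence duplicated for the test and train sets. A compact alternative is to let pandas write to S3 directly through its fsspec/s3fs backend; this is a sketch only, with the endpoint taken from the same environment variable the script already uses, and the helper name is hypothetical:

    import os
    import pandas as pd

    endpoint = "https://" + os.environ["AWS_S3_ENDPOINT"]
    storage_options = {"client_kwargs": {"endpoint_url": endpoint}}

    def export_to_s3(df: pd.DataFrame, s3_path: str) -> None:
        # pandas (>= 1.2) forwards storage_options to s3fs, so no explicit fs.open is needed.
        df.to_csv(s3_path, index=False, storage_options=storage_options)
        print("Exportation", s3_path, ": SUCCESS")

    # Usage with the paths from the diff (commented out because dataset_test and
    # dataset_train only exist inside the original script):
    # export_to_s3(dataset_test, "s3://" + BUCKET_OUT + "/dataset_test.csv")
    # export_to_s3(dataset_train, "s3://" + BUCKET_OUT + "/dataset_train.csv")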
@@ -85,7 +85,7 @@ def preprocessing_tickets_area(directory_path):
     tickets = load_dataset(directory_path, name = "tickets")
     purchases = load_dataset(directory_path, name = "purchases")
     suppliers = load_dataset(directory_path, name = "suppliers")
-    type_ofs = load_dataset(directory_path, name = "type_ofs")
+    # type_ofs = load_dataset(directory_path, name = "type_ofs")
 
     # Base des tickets
     tickets = tickets[['id', 'purchase_id', 'product_id', 'is_from_subscription', 'type_of', 'supplier_id']]
@@ -97,8 +97,8 @@ def preprocessing_tickets_area(directory_path):
     suppliers['supplier_name'] = suppliers['supplier_name'].fillna('')
 
     # Base des types de billets
-    type_ofs = type_ofs[['id', 'name', 'children']]
-    type_ofs.rename(columns = {'name' : 'type_of_ticket_name'}, inplace = True)
+    # type_ofs = type_ofs[['id', 'name', 'children']]
+    # type_ofs.rename(columns = {'name' : 'type_of_ticket_name'}, inplace = True)
 
     # Base des achats
     # Nettoyage de la date d'achat
@@ -112,8 +112,8 @@ def preprocessing_tickets_area(directory_path):
     ticket_information.drop(['supplier_id', 'id'], axis = 1, inplace=True)
 
     # Fusion avec type de tickets
-    ticket_information = pd.merge(ticket_information, type_ofs, left_on = 'type_of', right_on = 'id', how = 'inner')
-    ticket_information.drop(['type_of', 'id'], axis = 1, inplace=True)
+    # ticket_information = pd.merge(ticket_information, type_ofs, left_on = 'type_of', right_on = 'id', how = 'inner')
+    # ticket_information.drop(['type_of', 'id'], axis = 1, inplace=True)
 
     # Fusion avec achats
     ticket_information = pd.merge(ticket_information, purchases, left_on = 'purchase_id', right_on = 'id', how = 'inner')
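For readers less familiar with the merge kept at the end of this hunk: pd.merge with left_on/right_on joins each ticket to its purchase row, and the duplicated key column can then be dropped. A toy, self-contained illustration (the column names are shortened, not the project's real schema):

    import pandas as pd

    tickets = pd.DataFrame({"id": [1, 2], "purchase_id": [10, 11]})
    purchases = pd.DataFrame({"id": [10, 11], "purchase_date": ["2023-01-01", "2023-02-01"]})

    # Inner join keeps only tickets whose purchase exists; suffixes disambiguate the two 'id' columns.
    merged = pd.merge(tickets, purchases, left_on="purchase_id", right_on="id",
                      how="inner", suffixes=("", "_purchase"))
    merged = merged.drop(columns=["id_purchase"])
    print(merged)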
@@ -184,11 +184,11 @@ def create_products_table(directory_path):
     products_theme = products_theme.rename(columns = {"name" : "name_categories"})
 
     # Second merge products_theme and type of categories
-    print("Second merge products_theme and type of categories")
-    type_of_categories = load_dataset(directory_path, name = "type_of_categories")
-    type_of_categories = type_of_categories.drop(columns = 'id')
-    products_theme = products_theme.merge(type_of_categories, how = 'left', left_on = 'category_id',
-                                          right_on = 'category_id' )
+    # print("Second merge products_theme and type of categories")
+    # type_of_categories = load_dataset(directory_path, name = "type_of_categories")
+    # type_of_categories = type_of_categories.drop(columns = 'id')
+    # products_theme = products_theme.merge(type_of_categories, how = 'left', left_on = 'category_id',
+    #                                       right_on = 'category_id' )
 
     # Index cleaning
     products_theme = products_theme.drop(columns = ['id_categories'])
@@ -269,6 +269,6 @@ def uniform_product_df(directory_path):
 
     products_purchased = pd.merge(ticket_information, products_global, left_on = 'product_id', right_on = 'id_products', how = 'inner')
 
-    products_purchased_reduced = products_purchased[['ticket_id', 'customer_id', 'purchase_id' ,'event_type_id', 'supplier_name', 'purchase_date', 'type_of_ticket_name', 'amount', 'children', 'is_full_price', 'name_event_types', 'name_facilities', 'name_categories', 'name_events', 'name_seasons']]
+    products_purchased_reduced = products_purchased[['ticket_id', 'customer_id', 'purchase_id' ,'event_type_id', 'supplier_name', 'purchase_date', 'amount', 'is_full_price', 'name_event_types', 'name_facilities', 'name_categories', 'name_events', 'name_seasons']] # 'type_of_ticket_name', 'children',
 
     return products_purchased_reduced
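The narrowed column list drops 'type_of_ticket_name' and 'children', which only exist when the (now commented-out) type_ofs merge runs, so the two edits have to stay in sync by hand. A defensive variant (illustrative only, not the project's code) keeps whichever requested columns are actually present:

    import pandas as pd

    def select_existing(df: pd.DataFrame, wanted: list) -> pd.DataFrame:
        # Keep only the requested columns that exist in the frame.
        present = [col for col in wanted if col in df.columns]
        return df[present]

    # 'children' is requested but absent in this tiny demo frame, so it is skipped.
    demo = pd.DataFrame({"ticket_id": [1], "amount": [10.0]})
    print(select_existing(demo, ["ticket_id", "amount", "children"]))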
File diff suppressed because one or more lines are too long