# Time-coverage helpers and dataset export script: post-patch state of the
# code visible in this excerpt of 0_2_Dataset_construction.py.
#
# NOTE(review): `fs`, `display_databases` and `dataset_construction` are not
# defined in this excerpt. They are expected to be in scope from the parts of
# the script that are not shown here (S3 filesystem setup, the `exec` of
# 0_KPI_functions.py, and the `dataset_construction` definition).
from datetime import date, timedelta, datetime

# Day-level format shared by every date string produced below.
DATE_FORMAT = "%Y-%m-%d"

# Company identifiers grouped by type of company.
companies = {'musee': ['1', '2', '3', '4', '101'],
             'sport': ['5', '6', '7', '8', '9'],
             'musique': ['10', '11', '12', '13', '14']}


def display_covering_time(df, company, datecover):
    """Record and print the span of purchase dates covered by one company.

    Args:
        df: Table whose ``purchase_date`` column supports ``.min()`` /
            ``.max()`` returning objects with ``strftime`` (e.g. a pandas
            DataFrame with a datetime column).
        company: Key under which the coverage is stored; also printed.
        datecover: Dict mapping company -> list of covered days. It is
            updated in place.

    Returns:
        The same ``datecover`` dict, where ``datecover[company]`` holds one
        midnight ``datetime`` per calendar day from the first to the last
        purchase date, both included.
    """
    min_date = df['purchase_date'].min().strftime(DATE_FORMAT)
    max_date = df['purchase_date'].max().strftime(DATE_FORMAT)

    # Parsing the formatted strings back truncates both bounds to midnight,
    # so that days from different companies compare equal.
    first_day = datetime.strptime(min_date, DATE_FORMAT)
    last_day = datetime.strptime(max_date, DATE_FORMAT)

    # +1 so the last purchase day is part of the coverage; without it a
    # company with a single day of purchases would have an empty coverage.
    n_days = (last_day - first_day).days + 1
    datecover[company] = [first_day + timedelta(days=offset)
                          for offset in range(n_days)]

    print(f'Couverture Company {company} : {min_date} - {max_date}')
    return datecover


def compute_time_intersection(datecover):
    """Return the days covered by every company.

    Args:
        datecover: Dict mapping company -> iterable of ``datetime`` days.

    Returns:
        Sorted list of ``YYYY-MM-DD`` strings present in every company's
        coverage. Empty when ``datecover`` is empty or nothing is shared.
    """
    day_sets = [set(days) for days in datecover.values()]
    if not day_sets:
        # set.intersection() needs at least one set to intersect.
        return []
    common_days = set.intersection(*day_sets)
    # ISO-formatted dates sort chronologically as plain strings.
    return sorted(day.strftime(DATE_FORMAT) for day in common_days)


def _select_split_dates(dt_coverage, coverage_train):
    """Pick (start, end_of_features, final) dates from a sorted date list."""
    if not 0 <= coverage_train <= 1:
        raise ValueError(
            f"coverage_train must be between 0 and 1, got {coverage_train!r}")
    if not dt_coverage:
        raise ValueError(
            "No date is covered by every company: cannot derive the dates")
    # Clamp so that coverage_train == 1 maps to the last available day.
    split_index = min(int(coverage_train * len(dt_coverage)),
                      len(dt_coverage) - 1)
    return dt_coverage[0], dt_coverage[split_index], dt_coverage[-1]


def df_coverage_modelization(sport, coverage_train=0.7):
    """Compute the dates used to build the train and test datasets.

    Args:
        sport: Iterable of company identifiers to load. Despite its name,
            the script passes companies of whichever type was selected.
        coverage_train: Fraction (between 0 and 1) of the common coverage
            that precedes ``end_of_features``.

    Returns:
        Tuple ``(start_date, end_of_features, final_date)`` of
        ``YYYY-MM-DD`` strings taken from the days covered by all companies.

    Raises:
        ValueError: If ``coverage_train`` is outside [0, 1] or the companies
            share no covered day.
    """
    datecover = {}
    for company in sport:
        purchases = display_databases(company,
                                      file_name="products_purchased_reduced",
                                      datetime_col=['purchase_date'])
        display_covering_time(purchases, company, datecover)
    dt_coverage = compute_time_intersection(datecover)
    return _select_split_dates(dt_coverage, coverage_train)


def _export_dataset(dataset, bucket_out, file_key):
    """Write ``dataset`` as CSV (without index) to ``bucket_out/file_key``."""
    file_path = bucket_out + "/" + file_key
    with fs.open(file_path, 'w') as file_out:
        dataset.to_csv(file_out, index=False)


## Exportation
# Guarded so the helpers above can be imported without prompting the user.
# The variables stay at module level, as in the unguarded script.
if __name__ == "__main__":
    type_of_comp = input(
        'Choisissez le type de compagnie : sport ? musique ? musee ?'
    ).strip().lower()
    if type_of_comp not in companies:
        raise ValueError(
            f"Unknown company type {type_of_comp!r}; "
            f"expected one of {sorted(companies)}")
    list_of_comp = companies[type_of_comp]

    # Export folder
    BUCKET_OUT = f'projet-bdc2324-team1/Generalization/{type_of_comp}'

    # Dates shared by every company of the selected type
    start_date, end_of_features, final_date = df_coverage_modelization(
        list_of_comp, coverage_train=0.7)

    for company in list_of_comp:
        # Dataset test
        dataset_test = dataset_construction(min_date=start_date,
                                            end_features_date=end_of_features,
                                            max_date=final_date,
                                            directory_path=company)
        _export_dataset(dataset_test, BUCKET_OUT,
                        "dataset_test" + company + ".csv")
        print("Exportation dataset test : SUCCESS")

        # Dataset train
        # NOTE(review): built with exactly the same dates as the test
        # dataset, so both exported files cover the same period. Confirm
        # the intended train/test windows.
        dataset_train = dataset_construction(min_date=start_date,
                                             end_features_date=end_of_features,
                                             max_date=final_date,
                                             directory_path=company)
        _export_dataset(dataset_train, BUCKET_OUT,
                        "dataset_train" + company + ".csv")
        print("Exportation dataset train : SUCCESS")

    print("FIN DE LA GENERATION DES DATASETS : SUCCESS")