diff --git a/0_2_Dataset_construction.py b/0_2_Dataset_construction.py
index 75c8259..0ceb67a 100644
--- a/0_2_Dataset_construction.py
+++ b/0_2_Dataset_construction.py
@@ -7,6 +7,8 @@ import s3fs
 import re
 import warnings
 from datetime import date, timedelta, datetime
+from sklearn.model_selection import train_test_split
+
 
 # Create filesystem object
 S3_ENDPOINT_URL = "https://" + os.environ["AWS_S3_ENDPOINT"]
@@ -172,15 +174,17 @@ for company in list_of_comp:
     dataset = dataset[dataset['customer_id'] != anonymous_customer[company]]
 
-    #train test set
-    np.random.seed(42)
+    # #train test set
+    # np.random.seed(42)
 
-    split_ratio = 0.7
-    split_index = int(len(dataset) * split_ratio)
-    dataset = dataset.sample(frac=1).reset_index(drop=True)
-    dataset_train = dataset.iloc[:split_index]
-    dataset_test = dataset.iloc[split_index:]
+    # split_ratio = 0.7
+    # split_index = int(len(dataset) * split_ratio)
+    # dataset = dataset.sample(frac=1).reset_index(drop=True)
+    # dataset_train = dataset.iloc[:split_index]
+    # dataset_test = dataset.iloc[split_index:]
+    dataset_train, dataset_test = train_test_split(dataset, test_size=0.3, random_state=42)
+
 
     # Dataset Test
     # Exportation
     FILE_KEY_OUT_S3 = "dataset_test" + company + ".csv"
 
diff --git a/0_3_General_modelization_dataset.py b/0_3_General_modelization_dataset.py
index 2ba6a75..2feb2a0 100644
--- a/0_3_General_modelization_dataset.py
+++ b/0_3_General_modelization_dataset.py
@@ -38,7 +38,7 @@ def generate_train_set(type_of_comp):
 
         print(file)
         with fs.open(file, mode="rb") as file_in:
             df = pd.read_csv(file_in, sep=",")
-            train_set = pd.concat([test_set, df], ignore_index = True)
+            train_set = pd.concat([train_set, df], ignore_index = True)
 
     return train_set