Correction de l'erreur de construction des datasets de modélisation

This commit is contained in:
Antoine JOUBREL 2024-03-10 15:30:23 +00:00
parent 27ef78a486
commit f40ae6ead0
2 changed files with 12 additions and 8 deletions

View File

@@ -7,6 +7,8 @@ import s3fs
import re import re
import warnings import warnings
from datetime import date, timedelta, datetime from datetime import date, timedelta, datetime
from sklearn.model_selection import train_test_split
# Create filesystem object # Create filesystem object
S3_ENDPOINT_URL = "https://" + os.environ["AWS_S3_ENDPOINT"] S3_ENDPOINT_URL = "https://" + os.environ["AWS_S3_ENDPOINT"]
@@ -172,15 +174,17 @@ for company in list_of_comp:
dataset = dataset[dataset['customer_id'] != anonymous_customer[company]] dataset = dataset[dataset['customer_id'] != anonymous_customer[company]]
#train test set # #train test set
np.random.seed(42) # np.random.seed(42)
split_ratio = 0.7 # split_ratio = 0.7
split_index = int(len(dataset) * split_ratio) # split_index = int(len(dataset) * split_ratio)
dataset = dataset.sample(frac=1).reset_index(drop=True) # dataset = dataset.sample(frac=1).reset_index(drop=True)
dataset_train = dataset.iloc[:split_index] # dataset_train = dataset.iloc[:split_index]
dataset_test = dataset.iloc[split_index:] # dataset_test = dataset.iloc[split_index:]
dataset_train, dataset_test = train_test_split(dataset, test_size=0.3, random_state=42)
# Dataset Test # Dataset Test
# Exportation # Exportation
FILE_KEY_OUT_S3 = "dataset_test" + company + ".csv" FILE_KEY_OUT_S3 = "dataset_test" + company + ".csv"

View File

@@ -38,7 +38,7 @@ def generate_train_set(type_of_comp):
print(file) print(file)
with fs.open(file, mode="rb") as file_in: with fs.open(file, mode="rb") as file_in:
df = pd.read_csv(file_in, sep=",") df = pd.read_csv(file_in, sep=",")
train_set = pd.concat([test_set, df], ignore_index = True) train_set = pd.concat([train_set, df], ignore_index = True)
return train_set return train_set