Correction d'une erreur de construction des datasets de modélisation
This commit is contained in:
parent
27ef78a486
commit
f40ae6ead0
|
@ -7,6 +7,8 @@ import s3fs
|
||||||
import re
|
import re
|
||||||
import warnings
|
import warnings
|
||||||
from datetime import date, timedelta, datetime
|
from datetime import date, timedelta, datetime
|
||||||
|
from sklearn.model_selection import train_test_split
|
||||||
|
|
||||||
|
|
||||||
# Create filesystem object
|
# Create filesystem object
|
||||||
S3_ENDPOINT_URL = "https://" + os.environ["AWS_S3_ENDPOINT"]
|
S3_ENDPOINT_URL = "https://" + os.environ["AWS_S3_ENDPOINT"]
|
||||||
|
@ -172,15 +174,17 @@ for company in list_of_comp:
|
||||||
dataset = dataset[dataset['customer_id'] != anonymous_customer[company]]
|
dataset = dataset[dataset['customer_id'] != anonymous_customer[company]]
|
||||||
|
|
||||||
|
|
||||||
#train test set
|
# #train test set
|
||||||
np.random.seed(42)
|
# np.random.seed(42)
|
||||||
|
|
||||||
split_ratio = 0.7
|
# split_ratio = 0.7
|
||||||
split_index = int(len(dataset) * split_ratio)
|
# split_index = int(len(dataset) * split_ratio)
|
||||||
dataset = dataset.sample(frac=1).reset_index(drop=True)
|
# dataset = dataset.sample(frac=1).reset_index(drop=True)
|
||||||
dataset_train = dataset.iloc[:split_index]
|
# dataset_train = dataset.iloc[:split_index]
|
||||||
dataset_test = dataset.iloc[split_index:]
|
# dataset_test = dataset.iloc[split_index:]
|
||||||
|
|
||||||
|
dataset_train, dataset_test = train_test_split(dataset, test_size=0.3, random_state=42)
|
||||||
|
|
||||||
# Dataset Test
|
# Dataset Test
|
||||||
# Exportation
|
# Exportation
|
||||||
FILE_KEY_OUT_S3 = "dataset_test" + company + ".csv"
|
FILE_KEY_OUT_S3 = "dataset_test" + company + ".csv"
|
||||||
|
|
|
@ -38,7 +38,7 @@ def generate_train_set(type_of_comp):
|
||||||
print(file)
|
print(file)
|
||||||
with fs.open(file, mode="rb") as file_in:
|
with fs.open(file, mode="rb") as file_in:
|
||||||
df = pd.read_csv(file_in, sep=",")
|
df = pd.read_csv(file_in, sep=",")
|
||||||
train_set = pd.concat([test_set, df], ignore_index = True)
|
train_set = pd.concat([train_set, df], ignore_index = True)
|
||||||
return train_set
|
return train_set
|
||||||
|
|
||||||
|
|
||||||
|
|
Loading…
Reference in New Issue
Block a user