generalization #7
@@ -42,7 +42,7 @@ def compute_time_intersection(datecover):
     return sorted(formated_dates)
 
 
-def df_coverage_modelization(sport, coverage_train = 0.7):
+def df_coverage_modelization(sport, coverage_features = 0.7):
     """
     This function returns start_date, end_of_features and final dates
     that help to construct train and test datasets
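The rename from `coverage_train` to `coverage_features` fits the docstring: the parameter controls how much of the covered period is used to build features before the target window starts. A minimal sketch of that cutoff idea, assuming a simple linear split of the date range (`feature_cutoff` is illustrative, not the project function):

```python
import pandas as pd

def feature_cutoff(min_date, max_date, coverage_features=0.7):
    """Place the end-of-features date at a fraction of the covered period (assumed logic)."""
    start, end = pd.Timestamp(min_date), pd.Timestamp(max_date)
    # coverage_features of the timeline feeds the features; the rest is the target window
    return start + coverage_features * (end - start)

print(feature_cutoff("2021-05-01", "2023-11-01"))  # -> 2023-01-30 (70% through the range)
```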
@@ -81,7 +81,7 @@ def dataset_construction(min_date, end_features_date, max_date, directory_path):
     df_campaigns_information['opened_at'][df_campaigns_information['opened_at'] >= end_features_date] = np.datetime64('NaT')
 
     # Filter the df_products_purchased_reduced table
-    df_products_purchased_reduced = df_products_purchased_reduced[(df_products_purchased_reduced['purchase_date'] <= end_features_date) & (df_products_purchased_reduced['purchase_date'] >= min_date)]
+    df_products_purchased_features = df_products_purchased_reduced[(df_products_purchased_reduced['purchase_date'] <= end_features_date) & (df_products_purchased_reduced['purchase_date'] >= min_date)]
 
     print("Data filtering : SUCCESS")
 
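One line kept as context above censors `opened_at` through chained indexing (`df[col][mask] = ...`), which pandas may silently apply to a copy. A self-contained sketch of the same censoring via a single `.loc` write (toy frame, not the project data):

```python
import numpy as np
import pandas as pd

df_campaigns_information = pd.DataFrame(
    {"opened_at": pd.to_datetime(["2022-06-01", "2023-01-15"])}
)
end_features_date = pd.Timestamp("2022-11-01")

# Opens that fall after the feature window become NaT, written to the frame itself
mask = df_campaigns_information["opened_at"] >= end_features_date
df_campaigns_information.loc[mask, "opened_at"] = np.datetime64("NaT")
```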
@@ -91,7 +91,7 @@ def dataset_construction(min_date, end_features_date, max_date, directory_path):
     df_campaigns_kpi = campaigns_kpi_function(campaigns_information = df_campaigns_information)
 
     # KPIs on purchasing behaviour
-    df_tickets_kpi = tickets_kpi_function(tickets_information = df_products_purchased_reduced)
+    df_tickets_kpi = tickets_kpi_function(tickets_information = df_products_purchased_features)
 
     # KPIs on socio-demographic data
     df_customerplus_clean = customerplus_kpi_function(customerplus_clean = df_customerplus_clean_0)
@@ -146,20 +146,44 @@ BUCKET_OUT = f'projet-bdc2324-team1/Generalization/{type_of_comp}'
 
 # Create test dataset and train dataset for sport companies
 
+<<<<<<< HEAD
+#start_date, end_of_features, final_date = df_coverage_modelization(list_of_comp, coverage_features = 0.7)
+=======
 # start_date, end_of_features, final_date = df_coverage_modelization(list_of_comp, coverage_train = 0.7)
+>>>>>>> main
 start_date = "2021-05-01"
 end_of_features = "2022-11-01"
 final_date = "2023-11-01"
 
+<<<<<<< HEAD
+anonymous_customer = {'1' : 1_1, '2' : 2_12184, '3' : 3_1, '4' : 4_2, '101' : 101_1,
+                      '5' : 5_191835, '6' : 6_591412, '7' : 7_49632, '8' : 8_1942, '9' : 9_19683}
+=======
 anonymous_customer = {'1' : 1, '2' : 12184, '3' : 1, '4' : 2, '101' : 1,
                       '5' : 191835, '6' : 591412, '7' : 49632, '8' : 1942, '9' : 19683}
+>>>>>>> main
 
 for company in list_of_comp:
-    dataset_test = dataset_construction(min_date = start_date, end_features_date = end_of_features,
+    dataset = dataset_construction(min_date = start_date, end_features_date = end_of_features,
                                    max_date = final_date, directory_path = company)
 
+    # Remove the anonymous customer
+    dataset = dataset[dataset['customer_id'] != anonymous_customer[company]]
+
+<<<<<<< HEAD
+    # Train/test split
+    np.random.seed(42)
+
+    # Dataset Test
+    split_ratio = 0.7
+    split_index = int(len(dataset) * split_ratio)
+    dataset = dataset.sample(frac=1).reset_index(drop=True)
+    dataset_train = dataset.iloc[:split_index]
+    dataset_test = dataset.iloc[split_index:]
+=======
     # Remove the anonymous customer
     dataset_test = dataset_test[dataset_test['customer_id'] != anonymous_customer[company]]
+>>>>>>> main
 
     # Export
     FILE_KEY_OUT_S3 = "dataset_test" + company + ".csv"
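One thing to settle before resolving the second conflict: the HEAD values are not company/id pairs. Underscores in Python numeric literals are digit separators, so each HEAD entry is a single integer that differs from its counterpart on main:

```python
# Digit separators, not delimiters: '2' maps to 212184 on HEAD vs 12184 on main
assert 1_1 == 11
assert 2_12184 == 212184
assert 101_1 == 1011
```

If the main-branch ids are the intended ones, taking the HEAD side would silently change every anonymous-customer id.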
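The HEAD side of the last conflict does the 70/30 split by hand (seed, `sample(frac=1)`, `iloc`). If scikit-learn is available in the environment (an assumption; the scripts here only show pandas and numpy), one call covers the same intent:

```python
import pandas as pd
from sklearn.model_selection import train_test_split

# Toy stand-in for the per-company dataset built by dataset_construction
dataset = pd.DataFrame({"customer_id": range(10), "purchases": [0, 1] * 5})

# train_size=0.7 and random_state=42 mirror the manual seed/shuffle/iloc split
dataset_train, dataset_test = train_test_split(dataset, train_size=0.7, random_state=42)
```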
@@ -170,12 +194,11 @@ for company in list_of_comp:
 
     print("Exportation dataset test : SUCCESS")
 
     # Dataset train
-    dataset_train = dataset_construction(min_date = start_date, end_features_date = end_of_features,
-                                   max_date = final_date, directory_path = company)
+
     # Export
     FILE_KEY_OUT_S3 = "dataset_train" + company + ".csv"
-    FILE_PATH_OUT_S3 = BUCKET_OUT + "/Train_test/" + FILE_KEY_OUT_S3
+    FILE_PATH_OUT_S3 = BUCKET_OUT + "/Train_set/" + FILE_KEY_OUT_S3
 
     with fs.open(FILE_PATH_OUT_S3, 'w') as file_out:
         dataset_train.to_csv(file_out, index = False)
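The functional change in this hunk is only the output prefix (`Train_test` → `Train_set`), since the train set now comes from the in-loop split rather than a second `dataset_construction` call. For reference, a hedged round-trip sketch of the export pattern the script uses; the bucket path below is illustrative, patterned on `BUCKET_OUT` with an assumed `type_of_comp = 'sport'` and `company = '1'`:

```python
import pandas as pd
import s3fs

fs = s3fs.S3FileSystem()  # endpoint configured as in the project setup

dataset_train = pd.DataFrame({"customer_id": [1, 2], "purchases": [0, 1]})
path = "projet-bdc2324-team1/Generalization/sport/Train_set/dataset_train1.csv"

# Write exactly as the diff does: text-mode handle plus to_csv ...
with fs.open(path, "w") as file_out:
    dataset_train.to_csv(file_out, index=False)

# ... then read back to confirm the round trip
with fs.open(path, "r") as file_in:
    assert pd.read_csv(file_in).equals(dataset_train)
```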
@@ -1,14 +0,0 @@
-import pandas as pd
-import numpy as np
-import os
-import s3fs
-import re
-import warnings
-
-# Create filesystem object
-S3_ENDPOINT_URL = "https://" + os.environ["AWS_S3_ENDPOINT"]
-fs = s3fs.S3FileSystem(client_kwargs={'endpoint_url': S3_ENDPOINT_URL})
-
-# Ignore warning
-warnings.filterwarnings('ignore')
-
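The deleted helper above duplicated the S3 filesystem setup that the remaining scripts keep inline. If it ever comes back, a slightly more defensive variant (a sketch, not project code) fails fast with a clear message when the endpoint variable is missing, instead of a bare `KeyError`:

```python
import os
import s3fs

endpoint = os.environ.get("AWS_S3_ENDPOINT")
if endpoint is None:
    raise RuntimeError("AWS_S3_ENDPOINT is not set; run inside the datalab environment")
fs = s3fs.S3FileSystem(client_kwargs={"endpoint_url": f"https://{endpoint}"})
```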
Sport/Descriptive_statistics/stat_desc_sport.ipynb (new file, 1434 lines)
File diff suppressed because one or more lines are too long