take new databases as input

This commit is contained in:
Thomas PIQUE 2024-03-30 11:00:49 +00:00
parent 78aab14164
commit 0a7900c07f
3 changed files with 1071 additions and 1142 deletions

View File

@ -27,7 +27,8 @@ type_of_activity = input('Choisissez le type de compagnie : sport ? musique ? mu
PATH = f"projet-bdc2324-team1/Output_expected_CA/{type_of_activity}/"
# type of model for the score
type_of_model = "LogisticRegression_cv"
# type_of_model = "LogisticRegression_cv"
type_of_model = "LogisticRegression_Benchmark"
# load train and test sets
dataset_train, dataset_test = load_train_test(type_of_activity)
@ -68,6 +69,8 @@ save_file_s3_ca("hist_score_adjusted_", type_of_activity)
X_test_table_adjusted_scores = (100 * X_test_segment.groupby("quartile")[["score","score_adjusted", "has_purchased"]].mean()).round(2).reset_index()
X_test_table_adjusted_scores = X_test_table_adjusted_scores.rename(columns = {col : f"{col} (%)" for col in X_test_table_adjusted_scores.columns if col in ["score","score_adjusted", "has_purchased"]})
print(X_test_table_adjusted_scores)
# save table
file_name = "table_adjusted_score_"
FILE_PATH_OUT_S3 = PATH + file_name + type_of_activity + ".csv"

File diff suppressed because one or more lines are too long

View File

@ -13,7 +13,8 @@ import io
# functions
def load_train_test(type_of_activity):
BUCKET = f"projet-bdc2324-team1/Generalization/{type_of_activity}"
# BUCKET = f"projet-bdc2324-team1/Generalization/{type_of_activity}"
BUCKET = f"projet-bdc2324-team1/Generalization_v2/{type_of_activity}"
File_path_train = BUCKET + "/Train_set.csv"
File_path_test = BUCKET + "/Test_set.csv"
@ -31,7 +32,7 @@ def load_train_test(type_of_activity):
def features_target_split(dataset_train, dataset_test):
features_l = ['nb_tickets', 'nb_purchases', 'total_amount', 'nb_suppliers', 'vente_internet_max', 'purchase_date_min', 'purchase_date_max',
'time_between_purchase', 'nb_tickets_internet', 'fidelity', 'is_email_true', 'opt_in', #'is_partner',
'time_between_purchase', 'fidelity', 'is_email_true', 'opt_in', #'is_partner', 'nb_tickets_internet',
'gender_female', 'gender_male', 'gender_other', 'nb_campaigns', 'nb_campaigns_opened']
# we drop fidelity, time between purchase, and gender other (collinearity issue)
@ -41,17 +42,18 @@ def features_target_split(dataset_train, dataset_test):
'opt_in', 'gender_female', 'gender_male', 'nb_campaigns', 'nb_campaigns_opened']
"""
X_train = dataset_train[features_l]
X_train = dataset_train # [features_l]
y_train = dataset_train[['y_has_purchased']]
X_test = dataset_test[features_l]
X_test = dataset_test # [features_l]
y_test = dataset_test[['y_has_purchased']]
return X_train, X_test, y_train, y_test
def load_model(type_of_activity, model):
BUCKET = f"projet-bdc2324-team1/Output_model/{type_of_activity}/{model}/"
# BUCKET = f"projet-bdc2324-team1/Output_model/{type_of_activity}/{model}/"
BUCKET = f"projet-bdc2324-team1/basique/{type_of_activity}/{model}/"
filename = model + '.pkl'
file_path = BUCKET + filename
with fs.open(file_path, mode="rb") as f: