take new databases as input

2024-03-30 11:00:49 +00:00 · 2024-03-30 11:00:49 +00:00 · 0a7900c07f
commit 0a7900c07f
parent 78aab14164
3 changed files with 1071 additions and 1142 deletions
--- a/0_7_CA_segment.py
+++ b/0_7_CA_segment.py
@ -27,7 +27,8 @@ type_of_activity = input('Choisissez le type de compagnie : sport ? musique ? mu
 PATH = f"projet-bdc2324-team1/Output_expected_CA/{type_of_activity}/"

 # type of model for the score
-type_of_model = "LogisticRegression_cv"
+# type_of_model = "LogisticRegression_cv"
+type_of_model = "LogisticRegression_Benchmark"

 # load train and test sets
 dataset_train, dataset_test = load_train_test(type_of_activity)
@ -68,6 +69,8 @@ save_file_s3_ca("hist_score_adjusted_", type_of_activity)
 X_test_table_adjusted_scores = (100 * X_test_segment.groupby("quartile")[["score","score_adjusted", "has_purchased"]].mean()).round(2).reset_index()
 X_test_table_adjusted_scores = X_test_table_adjusted_scores.rename(columns = {col : f"{col} (%)" for col in X_test_table_adjusted_scores.columns if col in ["score","score_adjusted", "has_purchased"]})

+print(X_test_table_adjusted_scores)
+
 # save table
 file_name = "table_adjusted_score_"
 FILE_PATH_OUT_S3 = PATH + file_name +  type_of_activity + ".csv"
--- a/Sport/Modelization/CA_segment_sport.ipynb
+++ b/Sport/Modelization/CA_segment_sport.ipynb
--- a/utils_CA_segment.py
+++ b/utils_CA_segment.py
@ -13,7 +13,8 @@ import io
 # functions

 def load_train_test(type_of_activity):
-    BUCKET = f"projet-bdc2324-team1/Generalization/{type_of_activity}"
+    # BUCKET = f"projet-bdc2324-team1/Generalization/{type_of_activity}"
+    BUCKET = f"projet-bdc2324-team1/Generalization_v2/{type_of_activity}"
    File_path_train = BUCKET + "/Train_set.csv"
    File_path_test = BUCKET + "/Test_set.csv"
    
@ -31,7 +32,7 @@ def load_train_test(type_of_activity):
 def features_target_split(dataset_train, dataset_test):
    
    features_l = ['nb_tickets', 'nb_purchases', 'total_amount', 'nb_suppliers', 'vente_internet_max', 'purchase_date_min', 'purchase_date_max', 
-            'time_between_purchase', 'nb_tickets_internet', 'fidelity',  'is_email_true', 'opt_in', #'is_partner',
+            'time_between_purchase', 'fidelity',  'is_email_true', 'opt_in', #'is_partner', 'nb_tickets_internet',
            'gender_female', 'gender_male', 'gender_other', 'nb_campaigns', 'nb_campaigns_opened']

    # we drop fidelity, time between purchase, and gender other (collinearity issue)
@ -41,17 +42,18 @@ def features_target_split(dataset_train, dataset_test):
                  'opt_in', 'gender_female', 'gender_male', 'nb_campaigns', 'nb_campaigns_opened']
    """
    
-    X_train = dataset_train[features_l]
+    X_train = dataset_train # [features_l]
    y_train = dataset_train[['y_has_purchased']]

-    X_test = dataset_test[features_l]
+    X_test = dataset_test # [features_l]
    y_test = dataset_test[['y_has_purchased']]
    
    return X_train, X_test, y_train, y_test
    

 def load_model(type_of_activity, model):
-    BUCKET = f"projet-bdc2324-team1/Output_model/{type_of_activity}/{model}/"
+    # BUCKET = f"projet-bdc2324-team1/Output_model/{type_of_activity}/{model}/"
+    BUCKET = f"projet-bdc2324-team1/basique/{type_of_activity}/{model}/"
    filename = model + '.pkl'
    file_path = BUCKET + filename
    with fs.open(file_path, mode="rb") as f: