From 122c4c1f825548d56c5b355d998b441de904faef Mon Sep 17 00:00:00 2001 From: arevelle-ensae Date: Thu, 28 Mar 2024 08:35:02 +0000 Subject: [PATCH] fix features --- 0_5_Machine_Learning.py | 2 +- utils_ml.py | 17 ++++++++++++++--- 2 files changed, 15 insertions(+), 4 deletions(-) diff --git a/0_5_Machine_Learning.py b/0_5_Machine_Learning.py index 79ee8c1..b893aed 100644 --- a/0_5_Machine_Learning.py +++ b/0_5_Machine_Learning.py @@ -56,7 +56,7 @@ weights = class_weight.compute_class_weight(class_weight = 'balanced', classes = weight_dict = {np.unique(y_train['y_has_purchased'])[i]: weights[i] for i in range(len(np.unique(y_train['y_has_purchased'])))} -preproc = preprocess(type_of_model) +preproc = preprocess(type_of_model, type_of_activity) # Object for storing results model_result = pd.DataFrame(columns= ["Model", "Accuracy", "Recall", "F1_score", "AUC"]) diff --git a/utils_ml.py b/utils_ml.py index e8598d3..1955ef9 100644 --- a/utils_ml.py +++ b/utils_ml.py @@ -83,7 +83,14 @@ def compute_recall_companies(dataset_test, y_pred, type_of_activity, model): def features_target_split(dataset_train, dataset_test): - features_l = ['] + features_l = ['nb_campaigns', 'taux_ouverture_mail', 'prop_purchases_internet', 'nb_tickets', 'nb_purchases', 'total_amount', 'nb_suppliers', 'time_to_open', + 'purchases_10_2021','purchases_10_2022', 'purchases_11_2021', 'purchases_12_2021','purchases_1_2022', 'purchases_2_2022', 'purchases_3_2022', + 'purchases_4_2022', 'purchases_5_2021', 'purchases_5_2022', 'purchases_6_2021', 'purchases_6_2022', 'purchases_7_2021', 'purchases_7_2022', 'purchases_8_2021', + 'purchases_8_2022','purchases_9_2021', 'purchases_9_2022', 'purchase_date_min', 'purchase_date_max', 'nb_targets', 'gender_female', 'gender_male', + 'achat_internet', 'categorie_age_0_10', 'categorie_age_10_20', 'categorie_age_20_30','categorie_age_30_40', + 'categorie_age_40_50', 'categorie_age_50_60', 'categorie_age_60_70', 'categorie_age_70_80', 
'categorie_age_plus_80','categorie_age_inconnue', + 'country_fr', 'is_profession_known', 'is_zipcode_known', 'opt_in', 'target_optin', 'target_newsletter', 'target_scolaire', 'target_entreprise', 'target_famille', + 'target_jeune', 'target_abonne'] X_train = dataset_train[features_l] y_train = dataset_train[['y_has_purchased']] @@ -97,12 +104,15 @@ def preprocess(type_of_model, type_of_activity): numeric_features = ['nb_campaigns', 'taux_ouverture_mail', 'prop_purchases_internet', 'nb_tickets', 'nb_purchases', 'total_amount', 'nb_suppliers', 'purchases_10_2021','purchases_10_2022', 'purchases_11_2021', 'purchases_12_2021','purchases_1_2022', 'purchases_2_2022', 'purchases_3_2022', 'purchases_4_2022', 'purchases_5_2021', 'purchases_5_2022', 'purchases_6_2021', 'purchases_6_2022', 'purchases_7_2021', 'purchases_7_2022', 'purchases_8_2021', - 'purchases_8_2022','purchases_9_2021', 'purchases_9_2022', 'purchase_date_min', 'purchase_date_max', 'nb_targets'] + 'purchases_8_2022','purchases_9_2021', 'purchases_9_2022', 'purchase_date_min', 'purchase_date_max', 'nb_targets', 'time_to_open'] - binary_features = ['gender_female', 'gender_male', 'country_fr', 'achat_internet', 'categorie_age_0_10', 'categorie_age_10_20', 'categorie_age_20_30','categorie_age_30_40', + binary_features = ['gender_female', 'gender_male', 'achat_internet', 'categorie_age_0_10', 'categorie_age_10_20', 'categorie_age_20_30','categorie_age_30_40', 'categorie_age_40_50', 'categorie_age_50_60', 'categorie_age_60_70', 'categorie_age_70_80', 'categorie_age_plus_80','categorie_age_inconnue', 'country_fr', 'is_profession_known', 'is_zipcode_known', 'opt_in'] + if type_of_activity=='musee': + numeric_features.remove('time_to_open') + if type_of_model=='premium': if type_of_activity=='musique': binary_features.extend(['target_optin', 'target_newsletter']) @@ -113,6 +123,7 @@ def preprocess(type_of_model, type_of_activity): numeric_transformer = Pipeline(steps=[ + ("imputer", SimpleImputer(strategy="constant", 
fill_value=0)), ("scaler", StandardScaler()) ])