import warnings

# Segmentation script: load the test set and the benchmark model for one
# type of company, predict a purchase probability for every row and attach
# it to the test set as the 'Probability_to_buy' column.
#
# NOTE(review): `os` and `s3fs` are used below but are not imported in this
# excerpt; they are presumably imported earlier in the file - confirm.

# The helpers are exec'd into this namespace (instead of being imported)
# because load_test_file() reads the module-level `fs` defined below.
# The file is opened in a `with` block so the handle is closed right away.
# NOTE(review): exec() runs whatever the file contains; keep it trusted.
with open('utils_segmentation.py') as utils_source:
    exec(utils_source.read())
warnings.filterwarnings('ignore')

# Create filesystem object
S3_ENDPOINT_URL = "https://" + os.environ["AWS_S3_ENDPOINT"]
fs = s3fs.S3FileSystem(client_kwargs={'endpoint_url': S3_ENDPOINT_URL})

# Company types offered by the prompt below.
# NOTE(review): assumed to be the complete list of valid types - confirm.
ACTIVITY_TYPES = ('sport', 'musique', 'musee')

# Feature columns fed to the model. This list mirrors `features_l` in
# utils_ml.features_target_split; the two must be changed together.
FEATURES = [
    'nb_tickets', 'nb_purchases', 'total_amount', 'nb_suppliers',
    'vente_internet_max', 'purchase_date_min', 'purchase_date_max',
    'time_between_purchase', 'nb_tickets_internet', 'is_email_true',
    'opt_in',  # 'is_partner',
    'gender_female', 'gender_male', 'gender_other', 'nb_campaigns',
    'nb_campaigns_opened',
]

# Choose the type of companies for which the pipeline is run. The answer is
# interpolated into storage paths, so reject anything unexpected up front
# instead of failing later with an obscure storage error.
type_of_activity = input('Choisissez le type de compagnie : sport ? musique ? musee ?').strip()
if type_of_activity not in ACTIVITY_TYPES:
    raise ValueError(
        f"Unknown type of activity {type_of_activity!r}; "
        f"expected one of {ACTIVITY_TYPES}"
    )

# Load test set
dataset_test = load_test_file(type_of_activity)

# Load model
model = load_model(type_of_activity, 'LogisticRegression_Benchmark')

# Processing
X_test = dataset_test[FEATURES]
y_test = dataset_test[['y_has_purchased']]

# Prediction: keep column 1 of predict_proba
# (presumably the positive "has purchased" class - confirm).
y_pred_prob = model.predict_proba(X_test)[:, 1]

# Add probability to dataset_test
dataset_test['Probability_to_buy'] = y_pred_prob
print('probability added to dataset_test')
print(dataset_test.head())
def load_test_file(type_of_activity, filesystem=None):
    """Load the generalization test set of one type of activity.

    Reads ``projet-bdc2324-team1/Generalization/<type_of_activity>/Test_set.csv``
    as a comma-separated file.

    Args:
        type_of_activity: Name of the sub-directory holding the test set.
        filesystem: Object exposing ``open(path, mode=...)`` that returns a
            binary file usable as a context manager. When omitted, the
            module-level ``fs`` is used, which keeps existing callers
            working unchanged.

    Returns:
        The ``pandas.DataFrame`` parsed from the CSV file.

    Raises:
        NameError: If ``filesystem`` is omitted and no global ``fs`` exists
            in the namespace this function runs in.
    """
    if filesystem is None:
        # `fs` is not defined in this helper file itself; it is expected to
        # be created by the script that loads these helpers.
        filesystem = fs

    file_path_test = f"projet-bdc2324-team1/Generalization/{type_of_activity}/Test_set.csv"
    with filesystem.open(file_path_test, mode="rb") as file_in:
        return pd.read_csv(file_in, sep=",")