# BDC-team-1/0_6_segmentation_V2TP.py
#
# 105 lines
# 3.3 KiB
# Python

### imports ###
### probably not necessary, as we exec the associated utils .py file (which is expected to provide these names)
"""
import pandas as pd
import numpy as np
import os
import io
import s3fs
import re
import pickle
import warnings
import matplotlib.pyplot as plt
"""
### --- beginning of the code --- ###
### hyperparameters of the code ###
###################################
# Type of companies for which the pipeline is run.
activity = "sport"
# Name of the model used for the segmentation.
model_name = "LogisticRegression_Benchmark"
###################################

# Execute the utils file in this script's namespace. The helper functions
# called below (load_test_file, load_model, ...) are not defined here, and the
# import list at the top of this script is disabled, so the modules used below
# (os, warnings, s3fs, ...) are presumably imported by that file as well.
# NOTE(review): exec runs arbitrary code; keep this pointed at a trusted,
# local file only.
with open('utils_segmentation_2TP.py', encoding="utf-8") as utils_file:
    utils_source = utils_file.read()
exec(utils_source)

# Silence every warning for the remainder of the run.
warnings.filterwarnings('ignore')

# Create filesystem object
S3_ENDPOINT_URL = "https://" + os.environ["AWS_S3_ENDPOINT"]
fs = s3fs.S3FileSystem(client_kwargs={'endpoint_url': S3_ENDPOINT_URL})

# Load test set for the configured `activity` (the same setting that is passed
# to save_file_s3_mp further down in this script).
dataset_test = load_test_file(activity)

# Load the model identified by `model_name` for the same activity.
model = load_model(activity, model_name)
### Preprocessing of data
# Feature columns selected from the test set.
feature_columns = [
    'nb_tickets', 'nb_purchases', 'total_amount', 'nb_suppliers',
    'vente_internet_max', 'purchase_date_min', 'purchase_date_max',
    'time_between_purchase', 'nb_tickets_internet', 'is_email_true',
    'opt_in',  # 'is_partner',
    'gender_female', 'gender_male', 'gender_other',
    'nb_campaigns', 'nb_campaigns_opened',
]
X_test = dataset_test[feature_columns]
y_test = dataset_test[['y_has_purchased']]

# Build the segmentation table on an independent copy, so that the columns
# appended below do not also appear in X_test.
X_test_segment = X_test.copy()
X_test_segment.insert(X_test_segment.shape[1], "country_fr", dataset_test["country_fr"])

# Freeze the model input BEFORE the label and prediction columns are added:
# the true label (has_purchased) and the model's own outputs must never be
# part of what is handed to predict / predict_proba.
# NOTE(review): country_fr is kept in the model input because it is unknown
# from this script whether the loaded model uses it; this assumes the model
# selects its columns by name and ignores extras - confirm, and pass X_test
# instead if the model was fitted on the feature columns only.
X_model_input = X_test_segment.copy()

# Add the observed outcome to the segmentation table.
X_test_segment["has_purchased"] = y_test["y_has_purchased"]

# Add the predicted class and the probability of the positive class
# (second column of predict_proba) to the segmentation table.
y_pred = model.predict(X_model_input)
X_test_segment["has_purchased_estim"] = y_pred
y_pred_prob = model.predict_proba(X_model_input)[:, 1]
X_test_segment['score'] = y_pred_prob

# Bucket the score into four segments labelled '1'..'4':
# score < 0.25 -> '1', < 0.5 -> '2', < 0.75 -> '3', anything else -> '4'.
X_test_segment["segment"] = np.where(
    X_test_segment['score'] < 0.25, '1',
    np.where(
        X_test_segment['score'] < 0.5, '2',
        np.where(X_test_segment['score'] < 0.75, '3', '4'),
    ),
)
### 1. business KPIs
# Business variables handed to df_business_fig together with the "segment" column.
business_var = ["nb_tickets", "nb_purchases", "total_amount", "nb_campaigns"]
X_test_business_fig = df_business_fig(X_test_segment, "segment", business_var)

# Draw the histogram of business KPIs by segment; the four business variables
# are passed positionally, in the order listed in business_var.
hist_segment_business_KPIs(X_test_business_fig, "segment", "size", *business_var)

# Save the histogram to MinIO (presumably the current figure - see
# save_file_s3_mp in the utils file).
save_file_s3_mp(File_name="segments_business_KPIs_", type_of_activity=activity)
### 2. description of marketing personae (spider chart)
# Table summarizing the variables relative to marketing personae, built by
# df_segment_mp from the segment, gender and country columns.
X_test_segment_mp = df_segment_mp(
    X_test_segment,
    "segment",
    "gender_female",
    "gender_male",
    "gender_other",
    "country_fr",
)

# Table relative to purchasing behaviour, built by df_segment_pb from the
# segment, ticket, campaign and opt-in columns.
X_test_segment_pb = df_segment_pb(
    X_test_segment,
    "segment",
    "nb_tickets_internet",
    "nb_tickets",
    "nb_campaigns_opened",
    "nb_campaigns",
    "opt_in",
)

# Column-wise concatenation of the purchasing-behaviour table with three
# columns of the marketing-personae table, to prepare the plot.
X_test_segment_caract = pd.concat(
    [
        X_test_segment_pb,
        X_test_segment_mp[['share_known_gender', 'share_of_women', 'country_fr']],
    ],
    axis=1,
)

# Every column except "segment" is passed to the radar chart as a category.
categories = X_test_segment_caract.drop(columns="segment").columns.tolist()
radar_mp_plot_all(df=X_test_segment_caract, categories=categories)

# Save the graphic to MinIO (presumably the current figure - see
# save_file_s3_mp in the utils file).
save_file_s3_mp(File_name="spider_chart_all_", type_of_activity=activity)