from notebook to .py for segment analysis

This commit is contained in:
Thomas PIQUE 2024-03-27 14:59:33 +00:00
parent 28cc7b94ea
commit 10fde045e5
3 changed files with 1208 additions and 195 deletions

104
0_6_segmentation_V2TP.py Normal file
View File

@ -0,0 +1,104 @@
### importations ###
### Not necessary here: the utils file executed below runs these imports in
### this script's namespace. Kept as a disabled block for reference.
"""
import pandas as pd
import numpy as np
import os
import io
import s3fs
import re
import pickle
import warnings
import matplotlib.pyplot as plt
"""
### --- beginning of the code --- ###
### hyperparameters of the code ###
###################################
# choose the type of companies for which you want to run the pipeline
activity = "sport"
# choose the model we use for the segmentation
model_name = "LogisticRegression_Benchmark"
###################################

# Execute the file holding the functions we need. exec() (rather than import)
# is used on purpose: the helpers read the global `fs` created below, and the
# imports of the utils file (pd, np, os, s3fs, warnings, plt, ...) must land in
# this namespace.
# NOTE(review): the helper file shipped with this script is named
# utils_segmentation_V2TP.py; the previous path 'utils_segmentation_2TP.py'
# looked like a typo - confirm.
with open('utils_segmentation_V2TP.py') as utils_file:
    exec(utils_file.read())

warnings.filterwarnings('ignore')

# Create filesystem object
S3_ENDPOINT_URL = "https://" + os.environ["AWS_S3_ENDPOINT"]
fs = s3fs.S3FileSystem(client_kwargs={'endpoint_url': S3_ENDPOINT_URL})

# load test set (the hyperparameter is called `activity`; `type_of_activity`
# is only the parameter name inside the helpers)
dataset_test = load_test_file(activity)

# Load Model
model = load_model(activity, model_name)

### Preprocessing of data
feature_columns = [
    'nb_tickets', 'nb_purchases', 'total_amount', 'nb_suppliers', 'vente_internet_max',
    'purchase_date_min', 'purchase_date_max', 'time_between_purchase',
    'nb_tickets_internet', 'is_email_true', 'opt_in',  # 'is_partner',
    'gender_female', 'gender_male', 'gender_other', 'nb_campaigns', 'nb_campaigns_opened',
]
X_test = dataset_test[feature_columns]
y_test = dataset_test[['y_has_purchased']]

# Work on an independent copy: X_test must keep exactly the feature columns,
# otherwise the columns added below (country, true label, predictions) would
# be part of the frame handed to the model.
X_test_segment = X_test.copy()
X_test_segment["country_fr"] = dataset_test["country_fr"]

# add y_has_purchased to the segment table
X_test_segment["has_purchased"] = dataset_test["y_has_purchased"]

# Add prediction and probability to the segment table
y_pred = model.predict(X_test)
X_test_segment["has_purchased_estim"] = y_pred

y_pred_prob = model.predict_proba(X_test)[:, 1]
X_test_segment['score'] = y_pred_prob

# Four segments by score quartile bounds: [0, .25) -> '1', [.25, .5) -> '2',
# [.5, .75) -> '3', everything else -> '4'
X_test_segment["segment"] = np.where(X_test_segment['score']<0.25, '1',
                            np.where(X_test_segment['score']<0.5, '2',
                            np.where(X_test_segment['score']<0.75, '3', '4')))

### 1. business KPIs
business_var = ["nb_tickets", "nb_purchases", "total_amount", "nb_campaigns"]
X_test_business_fig = df_business_fig(X_test_segment, "segment", business_var)

# save histogram to Minio
hist_segment_business_KPIs(X_test_business_fig, "segment", "size", "nb_tickets",
                           "nb_purchases", "total_amount", "nb_campaigns")
save_file_s3_mp(File_name = "segments_business_KPIs_", type_of_activity = activity)

### 2. description of marketing personae (spider chart)
# table summarizing variables relative to marketing personae
X_test_segment_mp = df_segment_mp(X_test_segment, "segment", "gender_female",
                                  "gender_male", "gender_other", "country_fr")

# table relative to purchasing behaviour
X_test_segment_pb = df_segment_pb(X_test_segment, "segment", "nb_tickets_internet", "nb_tickets",
                                  "nb_campaigns_opened", "nb_campaigns", "opt_in")

# concatenation of tables to prepare the plot
X_test_segment_caract = pd.concat([X_test_segment_pb, X_test_segment_mp[['share_known_gender', 'share_of_women', 'country_fr']]], axis=1)

# visualization and save the graphic to the MinIo
categories = list(X_test_segment_caract.drop("segment", axis=1).columns)
radar_mp_plot_all(df=X_test_segment_caract, categories=categories)
save_file_s3_mp(File_name = "spider_chart_all_", type_of_activity = activity)

File diff suppressed because one or more lines are too long

204
utils_segmentation_V2TP.py Normal file
View File

@ -0,0 +1,204 @@
### importations ###
import pandas as pd
import numpy as np
import os
import io
import s3fs
import re
import pickle
import warnings
import matplotlib.pyplot as plt
### functions for segmentation and graphics associated ###
def load_model(type_of_activity, model):
    """Read a pickled model from the project bucket and return the unpickled object.

    The object is looked up under
    ``projet-bdc2324-team1/Output_model/<type_of_activity>/<model>/<model>.pkl``
    and opened through the global filesystem object ``fs``.

    Parameters
    ----------
    type_of_activity : str
        Sub-folder of ``Output_model`` to read from.
    model : str
        Name of the model; used both as folder name and as file stem.

    Returns
    -------
    object
        Whatever object was stored in the pickle file.
    """
    model_dir = f"projet-bdc2324-team1/Output_model/{type_of_activity}/{model}/"
    pickle_path = model_dir + (model + '.pkl')

    with fs.open(pickle_path, mode="rb") as handle:
        payload = handle.read()

    # NOTE(review): unpickling runs arbitrary code from the file; only load
    # objects from a bucket you trust.
    return pickle.loads(payload)
def load_test_file(type_of_activity):
    """Load the test set CSV of one activity type into a DataFrame.

    Reads ``projet-bdc2324-team1/Generalization/<type_of_activity>/Test_set.csv``
    through the global filesystem object ``fs``, parsed as comma-separated.

    Parameters
    ----------
    type_of_activity : str
        Sub-folder of ``Generalization`` to read from.

    Returns
    -------
    pandas.DataFrame
        Content of the CSV file.
    """
    file_path_test = f"projet-bdc2324-team1/Generalization/{type_of_activity}/Test_set.csv"
    with fs.open(file_path_test, mode="rb") as csv_stream:
        return pd.read_csv(csv_stream, sep=",")
def save_file_s3_mp(File_name, type_of_activity):
    """Write the current matplotlib figure as a PNG to the project bucket, then close it.

    The target object is
    ``projet-bdc2324-team1/Output_marketing_personae_analysis/<type_of_activity>/<File_name><type_of_activity>.png``,
    written through the global filesystem object ``fs``.

    Parameters
    ----------
    File_name : str
        Prefix of the output file name (the activity type and ``.png`` are appended).
    type_of_activity : str
        Used both as sub-folder and as file name suffix.
    """
    # Render the figure in memory so it can be pushed without a local file
    png_buffer = io.BytesIO()
    plt.savefig(png_buffer, format='png')

    out_dir = f"projet-bdc2324-team1/Output_marketing_personae_analysis/{type_of_activity}/"
    out_path = out_dir + File_name + type_of_activity + '.png'

    with fs.open(out_path, 'wb') as s3_file:
        # getvalue() returns the whole buffer, whatever the stream position
        s3_file.write(png_buffer.getvalue())

    plt.close()
def df_business_fig(df, segment, list_var):
    """Compute, per segment, its percentage share of the row count and of each KPI.

    Parameters
    ----------
    df : pandas.DataFrame
        Table with one row per observation.
    segment : str
        Name of the column to group by.
    list_var : list of str
        Names of the columns to sum within each group.

    Returns
    -------
    pandas.DataFrame
        One row per group with the columns ``segment``, ``"size"`` and
        ``list_var``; every column except the group key is expressed in
        percent of its column total.
    """
    by_segment = df.groupby(segment)

    # Raw totals per segment, with the group size placed right after the key
    kpi_shares = by_segment[list_var].sum().reset_index()
    kpi_shares.insert(1, "size", by_segment.size().values)

    # Convert every numeric column into a percentage of its own total
    share_columns = ["size"] + list_var
    column_totals = kpi_shares[share_columns].sum()
    kpi_shares[share_columns] = kpi_shares[share_columns].mul(100).div(column_totals)
    return kpi_shares
def hist_segment_business_KPIs(df, segment, size, nb_tickets, nb_purchases, total_amount, nb_campaigns):
    """Draw a stacked bar chart of the share each segment holds in five KPIs.

    One bar is drawn per KPI and one stacked layer per row of ``df``. The
    function opens a new matplotlib figure and leaves it as the current
    figure; it neither shows nor returns it.

    The number of layers follows the number of rows in ``df`` (it used to be
    fixed to four rows labelled 0..3, which failed when a segment was missing).

    Parameters
    ----------
    df : pandas.DataFrame
        One row per segment.
    segment : str
        Name of the column holding the segment label (used in the legend).
    size, nb_tickets, nb_purchases, total_amount, nb_campaigns : str
        Names of the five columns to plot, in bar order.
    """
    kpi_columns = [size, nb_tickets, nb_purchases, total_amount, nb_campaigns]
    bar_labels = ["number of\ncustomers", "number of\ntickets", "number of\npurchases", "total\namount",
                  "number of\ncampaigns"]

    plt.figure()

    segment_labels = [str(label) for label in df[segment]]
    heights_per_segment = df[kpi_columns].to_numpy(dtype=float)

    # types of blue color, one shade per segment
    colors = plt.cm.Blues(np.linspace(0.1, 0.9, len(segment_labels)))

    # running top of the stack for each of the bars
    bottom = np.zeros(len(kpi_columns))
    for label, heights, color in zip(segment_labels, heights_per_segment, colors):
        plt.bar(x=bar_labels, height=heights, label=label, bottom=bottom, color=color)
        # rebind instead of += so the array already handed to bar() is not modified
        bottom = bottom + heights

    # Adjust margins (room on the right for the legend placed outside the axes)
    plt.subplots_adjust(left = 0.125, right = 0.8, bottom = 0.1, top = 0.9)

    plt.legend(title = "segment", loc = "upper right", bbox_to_anchor=(1.2, 1))
    plt.ylabel("Fraction represented by the segment (%)")
    plt.title("Relative weight of each segment regarding business KPIs")
    # plt.show()
def df_segment_mp(df, segment, gender_female, gender_male, gender_other, country_fr):
    """Summarise gender and country variables per segment.

    The four input columns are averaged within each segment, then two derived
    columns are added from those averages:

    * ``share_known_gender`` = mean(gender_female) + mean(gender_male)
    * ``share_of_women`` = mean(gender_female) / share_known_gender
      (NaN when ``share_known_gender`` is 0)

    Parameters
    ----------
    df : pandas.DataFrame
        Table with one row per observation.
    segment : str
        Name of the column to group by.
    gender_female, gender_male, gender_other, country_fr : str
        Names of the columns to average.

    Returns
    -------
    pandas.DataFrame
        One row per segment, columns in this order: segment, gender_female,
        gender_male, share_known_gender, share_of_women, gender_other,
        country_fr.
    """
    df_mp = df.groupby(segment)[[gender_female, gender_male, gender_other, country_fr]].mean().reset_index()

    # Derive the shares from the local table (not from a global variable that
    # does not exist yet when this function is first called)
    share_known_gender = df_mp[gender_female] + df_mp[gender_male]
    df_mp.insert(3, "share_known_gender", share_known_gender)
    df_mp.insert(4, "share_of_women", df_mp[gender_female] / share_known_gender)
    return df_mp
def df_segment_pb(df, segment, nb_tickets_internet, nb_tickets, nb_campaigns_opened, nb_campaigns, opt_in):
    """Summarise purchasing-behaviour variables per segment.

    Two ratios are computed row by row, then averaged within each segment
    together with ``opt_in``:

    * ``share_tickets_internet`` = nb_tickets_internet / nb_tickets
    * ``share_campaigns_opened`` = nb_campaigns_opened / nb_campaigns

    The input frame is left untouched: the ratios are computed on a new frame
    instead of being added as columns to the caller's ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        Table with one row per observation.
    segment : str
        Name of the column to group by.
    nb_tickets_internet, nb_tickets, nb_campaigns_opened, nb_campaigns : str
        Names of the numerator / denominator columns of the two ratios.
    opt_in : str
        Name of an additional column to average.

    Returns
    -------
    pandas.DataFrame
        One row per segment with the columns segment,
        ``share_tickets_internet``, ``share_campaigns_opened`` and ``opt_in``.
    """
    # assign() returns a new DataFrame, so the caller's table is not modified
    df_used = df.assign(
        share_tickets_internet=df[nb_tickets_internet] / df[nb_tickets],
        share_campaigns_opened=df[nb_campaigns_opened] / df[nb_campaigns],
    )
    df_pb = df_used.groupby(segment)[["share_tickets_internet", "share_campaigns_opened", opt_in]].mean().reset_index()
    return df_pb
def radar_mp_plot(df, categories, index):
    """Draw the radar (spider) chart of one row of ``df`` on a new polar figure.

    Each category is divided by the maximum of that category over all rows of
    ``df``, so a point with the largest radius for its axis marks the row
    holding the maximum for that category. The raw value is printed in
    parentheses, as a percentage, under each axis label. The figure is left
    open as the current figure; nothing is shown or returned.

    Parameters
    ----------
    df : pandas.DataFrame
        Table with one row per segment.
    categories : list of str
        Names of the columns to plot, one axis each.
    index : int
        Row label passed to ``df.loc``; the title displays ``index + 1``.
    """
    segment_row = df.loc[index, categories]

    # raw values, only used for the figures printed in the labels
    true_values = list(segment_row)

    # values relative to the maximum among all rows, category by category
    relative_values = list(segment_row / df[categories].max())

    # rescale by the largest relative value: if that maximum is 0.8, the
    # corresponding point sits at 8/10th of the radius instead of the edge
    largest_relative = max(relative_values)
    scaled_values = [largest_relative * value for value in relative_values]

    # one evenly spaced angle per category; repeat the first to close the polygon
    axis_angles = np.linspace(0, 2 * np.pi, len(categories), endpoint=False).tolist()
    closed_angles = axis_angles + axis_angles[:1]

    # Initialize graphic
    fig, ax = plt.subplots(figsize=(6, 6), subplot_kw=dict(polar=True))

    # invisible outline (alpha=0) of the relative values: it only serves to
    # fix the radius of the circle, which is based on their maximum
    ax.plot(closed_angles, relative_values + relative_values[:1], color='skyblue', alpha=0, linewidth=1.5)
    # visible outline and filled area of the rescaled values
    ax.plot(closed_angles, scaled_values + scaled_values[:1], color='black', alpha = 0.5, linewidth=1.2)
    ax.fill(axis_angles, scaled_values, color='orange', alpha=0.4)

    # labels: category name plus raw value in percent
    tick_labels = [
        name.replace("_"," ") + f"\n({round(100 * raw,2)}%)"
        for name, raw in zip(categories, true_values)
    ]
    ax.set_yticklabels([])
    ax.set_xticks(axis_angles)
    ax.set_xticklabels(tick_labels, color="black")
    ax.spines['polar'].set_visible(False)

    plt.title(f'Characteristics of the segment {index+1}\n')
    # plt.show()
def radar_mp_plot_all(df, categories):
    """Draw one radar (spider) chart per row of ``df`` on a grid of polar axes.

    Each category is divided by the maximum of that category over all rows of
    ``df``, so a point with the largest radius for its axis marks the row
    holding the maximum for that category. The raw value is printed in
    parentheses, as a percentage, under each axis label. The figure is left
    open as the current figure; nothing is shown or returned.

    The charts are laid out on two columns. With up to four rows the grid is
    2x2 with a 25x20 figure; more rows add grid lines, and grid cells without
    a segment are hidden.

    Parameters
    ----------
    df : pandas.DataFrame
        Table with one row per segment. Rows are taken by position, and the
        title of the k-th chart displays ``k + 1``.
    categories : list of str
        Names of the columns to plot, one axis each.
    """
    nb_segments = df.shape[0]
    nb_cols = 2
    # at least the historical 2 rows, more when there are more than 4 segments
    nb_rows = max(2, -(-nb_segments // nb_cols))

    # Initialize graphic; squeeze=False keeps a 2-D array of axes in all cases
    fig, ax = plt.subplots(nb_rows, nb_cols, figsize=(25, 10 * nb_rows),
                           subplot_kw=dict(polar=True), squeeze=False)

    # Work on the table received as argument (not on a global variable)
    category_values = df[categories]
    max_values = category_values.max()

    # one evenly spaced angle per category; repeat the first to close the polygon
    num_categories = len(categories)
    angles = np.linspace(0, 2 * np.pi, num_categories, endpoint=False).tolist()
    closed_angles = angles + angles[:1]

    for index in range(nb_segments):
        row = index // nb_cols  # integer division gives the grid row
        col = index % nb_cols
        current_ax = ax[row, col]

        segment_row = category_values.iloc[index]

        # true values are used to print the true value in parenthesis
        tvalues = list(segment_row)

        # values are true values / max among the segments, which puts them in
        # relation with the values of the other segments
        values = list(segment_row / max_values)

        # values normalized are used to adjust the value around the circle:
        # if the maximum of values is equal to 0.8, we want the point to be
        # at 8/10th of the circle radius, not at the edge
        largest_value = max(values)
        values_normalized = [largest_value * elt for elt in values]

        # we have to draw first a transparent line (alpha=0) of values to
        # adjust the radius of the circle, which is based on max(values)
        current_ax.plot(closed_angles, values + values[:1], color='skyblue', alpha=0, linewidth=1.5)
        current_ax.plot(closed_angles, values_normalized + values_normalized[:1], color='black', alpha = 0.5,
                        linewidth=1.2)

        # fill the sector
        current_ax.fill(angles, values_normalized, color='orange', alpha=0.4, label = index)

        # labels
        current_ax.set_yticklabels([])
        current_ax.set_xticks(angles)
        ticks = [name.replace("_"," ") + f"\n({round(100 * raw,2)}%)" for name, raw in zip(categories, tvalues)]
        current_ax.set_xticklabels(ticks, color="black", size = 20)

        current_ax.spines['polar'].set_visible(False)
        current_ax.set_title(f'Characteristics of the segment {index+1}\n', size = 24)

    # hide the grid cells that did not receive a segment
    for unused_ax in ax.flat[nb_segments:]:
        unused_ax.set_visible(False)

    # plt.show()