Ajout description marketing personae

This commit is contained in:
Antoine JOUBREL 2024-03-31 16:35:58 +00:00
parent 7341752be0
commit 8e61e9d2a4
5 changed files with 288 additions and 350 deletions

View File

@ -1,40 +0,0 @@
import pandas as pd
import numpy as np
import os
import io
import s3fs
import re
import pickle
import warnings
# Load the helper functions (load_test_file, load_model, ...) into this
# namespace. The file is exec'd rather than imported because the helpers use
# the module-level ``fs`` object created just below. The context manager makes
# sure the file handle is closed once its content has been read.
with open('utils_segmentation.py') as utils_file:
    exec(utils_file.read())

warnings.filterwarnings('ignore')

# Create filesystem object
S3_ENDPOINT_URL = "https://" + os.environ["AWS_S3_ENDPOINT"]
fs = s3fs.S3FileSystem(client_kwargs={'endpoint_url': S3_ENDPOINT_URL})

# choose the type of companies for which you want to run the pipeline
type_of_activity = input('Choisissez le type de compagnie : sport ? musique ? musee ?')

# load test set
dataset_test = load_test_file(type_of_activity)

# Load Model
model = load_model(type_of_activity, 'LogisticRegression_Benchmark')

# Processing: keep only the explanatory variables passed to the model
X_test = dataset_test[['nb_tickets', 'nb_purchases', 'total_amount', 'nb_suppliers', 'vente_internet_max', 'purchase_date_min', 'purchase_date_max',
                       'time_between_purchase', 'nb_tickets_internet', 'is_email_true', 'opt_in', #'is_partner',
                       'gender_female', 'gender_male', 'gender_other', 'nb_campaigns', 'nb_campaigns_opened']]
y_test = dataset_test[['y_has_purchased']]

# Prediction: second column of predict_proba
y_pred_prob = model.predict_proba(X_test)[:, 1]

# Add probability to dataset_test
dataset_test['Probability_to_buy'] = y_pred_prob
print('probability added to dataset_test')
print(dataset_test.head())

View File

@ -1,99 +0,0 @@
### importations ###
### possibly not necessary, since we exec the associated utils .py file below
"""
import pandas as pd
import numpy as np
import os
import io
import s3fs
import re
import pickle
import warnings
import matplotlib.pyplot as plt
"""
### --- beginning of the code --- ###

### hyperparameters of the code ###
###################################
# choose the type of companies for which you want to run the pipeline
type_of_activity = input('Choisissez le type de compagnie : sport ? musique ? musee ?')
# choose the model we use for the segmentation
model_name = "LogisticRegression_Benchmark"
###################################

# execute file including functions we need; the context manager closes the
# file handle once its content has been read.
# NOTE(review): the import list above is wrapped in a string literal, so
# ``warnings``, ``os``, ``s3fs``, ``np`` and ``pd`` are expected to be brought
# into scope by the executed file - confirm.
with open('utils_segmentation_V2TP.py') as utils_file:
    exec(utils_file.read())

warnings.filterwarnings('ignore')

# Create filesystem object
S3_ENDPOINT_URL = "https://" + os.environ["AWS_S3_ENDPOINT"]
fs = s3fs.S3FileSystem(client_kwargs={'endpoint_url': S3_ENDPOINT_URL})

# load test set
dataset_test = load_test_file(type_of_activity)

# Load Model
model = load_model(type_of_activity, model_name)

### Preprocessing of data
X_test = dataset_test[['nb_tickets', 'nb_purchases', 'total_amount', 'nb_suppliers', 'vente_internet_max', 'purchase_date_min', 'purchase_date_max',
                       'time_between_purchase', 'nb_tickets_internet', 'is_email_true', 'opt_in', #'is_partner',
                       'gender_female', 'gender_male', 'gender_other', 'nb_campaigns', 'nb_campaigns_opened', 'country_fr']]
y_test = dataset_test[['y_has_purchased']]

# Work on an independent copy: the columns added below (observed outcome,
# prediction, score, segment) must not end up in X_test, which is the frame
# handed to the model.
X_test_segment = X_test.copy()

# add y_has_purchased to the segmentation table
X_test_segment["has_purchased"] = y_test

# Add prediction and probability to the segmentation table
y_pred = model.predict(X_test)
X_test_segment["has_purchased_estim"] = y_pred
y_pred_prob = model.predict_proba(X_test)[:, 1]
X_test_segment['score'] = y_pred_prob

# Four segments built from the score: <0.25, <0.5, <0.75, otherwise
X_test_segment["segment"] = np.where(X_test_segment['score']<0.25, '1',
                            np.where(X_test_segment['score']<0.5, '2',
                            np.where(X_test_segment['score']<0.75, '3', '4')))

### 1. business KPIs
business_var = ["nb_tickets", "nb_purchases", "total_amount", "nb_campaigns"]
X_test_business_fig = df_business_fig(X_test_segment, "segment", business_var)

# save histogram to Minio
hist_segment_business_KPIs(X_test_business_fig, "segment", "size", "nb_tickets",
                           "nb_purchases", "total_amount", "nb_campaigns")
save_file_s3_mp(File_name = "segments_business_KPI_", type_of_activity = type_of_activity)

### 2. description of marketing personae (spider chart)
# table summarizing variables relative to marketing personae
X_test_segment_mp = df_segment_mp(X_test_segment, "segment", "gender_female",
                                  "gender_male", "gender_other", "country_fr")

# table relative to purchasing behaviour
X_test_segment_pb = df_segment_pb(X_test_segment, "segment", "nb_tickets_internet", "nb_tickets",
                                  "nb_campaigns_opened", "nb_campaigns", "opt_in")

# concatenation of tables to prepare the plot
X_test_segment_caract = pd.concat([X_test_segment_pb, X_test_segment_mp[['share_known_gender', 'share_of_women', 'country_fr']]], axis=1)

# visualization and save the graphic to the MinIo
categories = list(X_test_segment_caract.drop("segment", axis=1).columns)
radar_mp_plot_all(df=X_test_segment_caract, categories=categories)
save_file_s3_mp(File_name = "spider_chart_all_", type_of_activity = type_of_activity)

View File

@ -0,0 +1,82 @@
# Packages
import pandas as pd
import numpy as np
import os
import io
import s3fs
import re
import pickle
import warnings
import matplotlib.pyplot as plt
from tabulate import tabulate
###################################
# choose the model we use for the segmentation
model_name = "LogisticRegression_Benchmark"
###################################

# execute file including functions we need; the context manager closes the
# file handle once its content has been read. The file is exec'd (not
# imported) so that its functions see the ``fs`` object defined below.
with open('utils_segmentation.py') as utils_file:
    exec(utils_file.read())

warnings.filterwarnings('ignore')

# Create filesystem object
S3_ENDPOINT_URL = "https://" + os.environ["AWS_S3_ENDPOINT"]
fs = s3fs.S3FileSystem(client_kwargs={'endpoint_url': S3_ENDPOINT_URL})

# choose the type of companies for which you want to run the pipeline
# type_of_activity = input('Choisissez le type de compagnie : sport ? musique ? musee ?')
for type_of_activity in ['musee', 'sport', 'musique'] :

    # load test set
    dataset_test = load_test_file(type_of_activity)

    # Load Model
    model = load_model(type_of_activity, model_name)

    ### Preprocessing of data
    X_test = dataset_test.drop(columns = 'y_has_purchased')
    y_test = dataset_test[['y_has_purchased']]

    # Work on an independent copy: the columns added below (observed outcome,
    # prediction, score, segment) must not end up in X_test, which is the
    # frame handed to the model.
    X_test_segment = X_test.copy()

    # add y_has_purchased to the segmentation table
    X_test_segment["has_purchased"] = y_test

    # Add prediction and probability to the segmentation table
    y_pred = model.predict(X_test)
    X_test_segment["has_purchased_estim"] = y_pred
    y_pred_prob = model.predict_proba(X_test)[:, 1]
    X_test_segment['score'] = y_pred_prob

    # Four segments built from the score: <0.25, <0.5, <0.75, otherwise
    X_test_segment["segment"] = np.where(X_test_segment['score']<0.25, '1',
                                np.where(X_test_segment['score']<0.5, '2',
                                np.where(X_test_segment['score']<0.75, '3', '4')))

    ### 1. business KPIs
    business_var = ["nb_tickets", "nb_purchases", "total_amount", "nb_campaigns"]
    X_test_business_fig = df_business_fig(X_test_segment, "segment", business_var)

    # save histogram to Minio
    hist_segment_business_KPIs(X_test_business_fig, "segment", "size", "nb_tickets",
                               "nb_purchases", "total_amount", "nb_campaigns", type_of_activity)
    save_file_s3_mp(File_name = "segments_business_KPI_", type_of_activity = type_of_activity)

    ### 2. description of marketing personae
    ## A. Spider chart
    radar_mp_plot_all(df = X_test_segment, type_of_activity = type_of_activity)
    save_file_s3_mp(File_name = "spider_chart_all_", type_of_activity = type_of_activity)

    ## B. Latex table
    known_sociodemo_caracteristics(df = X_test_segment, type_of_activity = type_of_activity)

View File

@ -1,15 +1,12 @@
import pandas as pd ### importations ###
import numpy as np
import os
import io
import s3fs
import re
import pickle
import warnings
### functions for segmentation and graphics associated ###
def load_model(type_of_activity, model): def load_model(type_of_activity, model):
BUCKET = f"projet-bdc2324-team1/Output_model/{type_of_activity}/{model}/" BUCKET = f"projet-bdc2324-team1/2_Output/2_1_Modeling_results/standard/{type_of_activity}/{model}/"
filename = model + '.pkl' filename = model + '.pkl'
file_path = BUCKET + filename file_path = BUCKET + filename
with fs.open(file_path, mode="rb") as f: with fs.open(file_path, mode="rb") as f:
@ -20,8 +17,207 @@ def load_model(type_of_activity, model):
def load_test_file(type_of_activity):
    """Read the test set of one activity type from the S3 file system.

    The split diff view had fused the previous and the current text of every
    line; only the current version (new storage path) is kept here.

    Args:
        type_of_activity: name of the activity, used as a folder name in the
            storage path.

    Returns:
        The content of ``Test_set.csv`` as a pandas DataFrame.
    """
    file_path_test = f"projet-bdc2324-team1/1_Temp/1_0_Modelling_Datasets/{type_of_activity}/Test_set.csv"
    # ``fs`` is the module-level file system object created by the calling script.
    with fs.open(file_path_test, mode="rb") as file_in:
        dataset_test = pd.read_csv(file_in, sep=",")
    return dataset_test
def save_file_s3_mp(File_name, type_of_activity):
    """Write the current matplotlib figure as a PNG on the S3 file system, then close it.

    Args:
        File_name: prefix of the output file name.
        type_of_activity: activity name; used both as the output sub-folder
            and as the suffix of the file name.
    """
    # Render the figure in memory first
    png_buffer = io.BytesIO()
    plt.savefig(png_buffer, format='png', dpi=110)

    target_dir = f"projet-bdc2324-team1/2_Output/2_2_Segmentation_and_Marketing_Personae/{type_of_activity}/"
    target_path = target_dir + File_name + type_of_activity + '.png'

    # ``fs`` is the module-level file system object created by the calling script
    with fs.open(target_path, 'wb') as s3_file:
        s3_file.write(png_buffer.getvalue())
    plt.close()
def save_txt_file_s3(file_name, type_of_activity, content):
    """Write ``content`` to a ``.txt`` file on the S3 file system.

    Args:
        file_name: prefix of the output file name.
        type_of_activity: activity name; used both as the output sub-folder
            and as the suffix of the file name.
        content: text written to the file.
    """
    out_dir = f"projet-bdc2324-team1/2_Output/2_2_Segmentation_and_Marketing_Personae/{type_of_activity}/"
    out_path = out_dir + file_name + type_of_activity + '.txt'
    # ``fs`` is the module-level file system object created by the calling script
    with fs.open(out_path, 'w') as txt_out:
        txt_out.write(content)
def df_business_fig(df, segment, list_var):
    """Compute, per segment, its percentage share of each business variable.

    Args:
        df: DataFrame with one row per observation.
        segment: name of the column holding the segment label.
        list_var: list of column names to sum per segment.

    Returns:
        DataFrame with one row per segment and the columns ``segment``,
        ``"size"`` (share of rows) and every variable of ``list_var``, each
        expressed as a percentage of the column total.
    """
    by_segment = df.groupby(segment)

    # Totals per segment, plus the number of rows in each segment
    kpi_table = by_segment[list_var].sum().reset_index()
    kpi_table.insert(1, "size", by_segment.size().values)

    # Turn every column into a percentage of its own total
    share_cols = ["size"] + list_var
    column_totals = kpi_table[share_cols].sum()
    kpi_table[share_cols] = 100 * kpi_table[share_cols] / column_totals
    return kpi_table
def hist_segment_business_KPIs(df, segment, size, nb_tickets, nb_purchases, total_amount, nb_campaigns, type_of_activity) :
    """Draw a stacked bar chart of the weight of each segment for five KPIs.

    One layer is stacked per row of ``df`` (the previous version assumed
    exactly four rows labelled 0..3 and failed or dropped data otherwise).
    The figure is left open so that the caller can save it.

    Args:
        df: table with one row per segment, typically the output of
            ``df_business_fig``.
        segment: name of the column holding the segment label.
        size, nb_tickets, nb_purchases, total_amount, nb_campaigns: names of
            the five columns to plot, in that order.
        type_of_activity: activity name inserted in the title.
    """
    plt.figure()

    kpi_columns = [size, nb_tickets, nb_purchases, total_amount, nb_campaigns]
    df_plot = df[[segment] + kpi_columns]
    x = ["number of\ncustomers", "number of\ntickets", "number of\npurchases", "total\namount",
         "number of\ncampaigns"]

    bottom = np.zeros(len(x))
    nb_segments = len(df_plot)

    # types of blue color; at least four shades so that the usual
    # four-segment palette is unchanged
    colors = plt.cm.Blues(np.linspace(0.1, 0.9, max(nb_segments, 4)))

    for position in range(nb_segments) :
        # positional access: does not depend on the index labels of df
        height = np.asarray(df_plot[kpi_columns].iloc[position].values, dtype=float)
        plt.bar(x=x, height=height, label = str(df_plot[segment].iloc[position]),
                bottom=bottom, color=colors[position])
        bottom = bottom + height

    # Adjust margins
    plt.subplots_adjust(left = 0.125, right = 0.8, bottom = 0.1, top = 0.9)

    plt.legend(title = "segment", loc = "upper right", bbox_to_anchor=(1.2, 1))
    plt.ylabel("Fraction represented by the segment (%)")
    plt.title(f"Relative weight of each segment regarding business KPIs\nfor {type_of_activity} companies", size=12)
    # plt.show()
# def df_segment_mp(df) :
# df_mp = df.groupby("segment")[["gender_female", "gender_male", "gender_other", "country_fr"]].mean().reset_index()
# df_mp.insert(3, "share_known_gender", df_mp["gender_female"]+df_mp["gender_male"])
# df_mp.insert(4, "share_of_women", df_mp["gender_female"]/(df_mp["share_known_gender"]))
# return df_mp
# def df_segment_pb (df) :
# df_pb = df.groupby("segment")[["prop_purchases_internet", "taux_ouverture_mail", "opt_in"]].mean().reset_index()
# return df_pb
def radar_mp_plot(df, categories, index) :
    """Draw the radar (spider) chart of one row of ``df``.

    Args:
        df: table with one row per segment.
        categories: names of the columns drawn around the circle.
        index: index label of the row to plot; the title shows ``index+1``.
    """
    segment_row = df.loc[index, categories]

    # true values are printed in parentheses next to each label
    true_values = list(segment_row)

    # relative values = true value / maximum among the segments: a point with
    # a relative value of 1 is the maximum for that variable
    relative = list(segment_row / df[categories].max())

    # rescaled values are used to place the points around the circle: if the
    # maximum relative value is 0.8, the farthest point must sit at 8/10th of
    # the radius, not on the edge
    peak = max(relative)
    rescaled = [peak * elt for elt in relative]

    # one angle per category
    angles = np.linspace(0, 2 * np.pi, len(categories), endpoint=False).tolist()
    closed_angles = angles + angles[:1]

    # Initialize graphic
    fig, ax = plt.subplots(figsize=(6, 6), subplot_kw=dict(polar=True))

    # a transparent line (alpha=0) of the relative values is drawn first to
    # set the radius of the circle, which is based on max(relative)
    ax.plot(closed_angles, relative + relative[:1], color='skyblue', alpha=0, linewidth=1.5)
    ax.plot(closed_angles, rescaled + rescaled[:1], color='black', alpha = 0.5, linewidth=1.2)

    # fill the sector
    ax.fill(angles, rescaled, color='orange', alpha=0.4)

    # labels
    ax.set_yticklabels([])
    ax.set_xticks(angles)
    ticks = [name.replace("_"," ") + f"\n({round(100 * true_value,2)}%)"
             for name, true_value in zip(categories, true_values)]
    ax.set_xticklabels(ticks, color="black")

    ax.spines['polar'].set_visible(False)
    plt.title(f'Characteristics of the segment {index+1}\n')
    # plt.show()
def radar_mp_plot_all(df, type_of_activity) :
    """Draw one radar (spider) chart per segment on a 2x2 grid.

    The function averages, per value of the ``segment`` column, the columns
    ``gender_female``, ``gender_male``, ``gender_other``, ``country_fr``,
    ``prop_purchases_internet``, ``taux_ouverture_mail`` and ``opt_in`` of
    ``df``, then plots the resulting profile of each segment. The figure is
    left open so that the caller can save it.

    Each panel is titled with the actual segment label (the previous version
    used the row position + 1, which is wrong when a segment is missing).

    Args:
        df: table with one row per customer, holding the columns listed above.
        type_of_activity: activity name inserted in the figure title.
    """
    # table summarizing variables relative to marketing personae
    df_mp = df.groupby("segment")[["gender_female", "gender_male", "gender_other", "country_fr"]].mean().reset_index()
    df_mp.insert(3, "share_known_gender", df_mp["gender_female"]+df_mp["gender_male"])
    df_mp.insert(4, "share_of_women", df_mp["gender_female"]/(df_mp["share_known_gender"]))

    # table relative to purchasing behaviour
    df_pb = df.groupby("segment")[["prop_purchases_internet", "taux_ouverture_mail", "opt_in"]].mean().reset_index()

    # concatenation of tables to prepare the plot
    df_used = pd.concat([df_pb, df_mp[['share_known_gender', 'share_of_women', 'country_fr']]], axis=1)

    # visualization
    nb_segments = df_used.shape[0]
    categories = list(df_used.drop("segment", axis=1).columns)

    # quantities shared by every panel, computed once
    max_values = df_used[categories].max()
    angles = np.linspace(0, 2 * np.pi, len(categories), endpoint=False).tolist()
    closed_angles = angles + angles[:1]

    # Initialize graphic
    fig, ax = plt.subplots(2,2, figsize=(25, 20), subplot_kw=dict(polar=True))

    for index in range(nb_segments) :
        # integer division gives the row of the grid, the remainder its column
        row, col = divmod(index, 2)
        panel = ax[row, col]
        segment_label = df_used.loc[index, "segment"]

        # true values are used to print the true value in parenthesis
        tvalues = list(df_used.loc[index,categories])

        # values are true values / max among the segments, allows to
        # put values in relation with the values for other segments
        # if the point has a maximal abscissa it means that value is maximal
        # for the segment considered, even if not equal to 1
        values = list(df_used.loc[index,categories]/max_values)

        # values normalized are used to adjust the value around the circle
        # for instance if the maximum of values is equal to 0.8, we want the point to be
        # at 8/10th of the circle radius, not at the edge
        values_normalized = [ max(values) * elt for elt in values]

        # we have to draw first a transparent line (alpha=0) of values to adjust the radius of the circle
        # which is based on max(value)
        panel.plot(closed_angles, values + values[:1], color='skyblue', alpha=0, linewidth=1.5)
        panel.plot(closed_angles, values_normalized + values_normalized[:1], color='black', alpha = 0.5,
                   linewidth=1.2)

        # fill the sector
        panel.fill(angles, values_normalized, color='orange', alpha=0.4, label = segment_label)

        # labels
        panel.set_yticklabels([])
        panel.set_xticks(angles)
        ticks = [categories[i].replace("_"," ") + f"\n({round(100 * tvalues[i],2)}%)" for i in range(len(categories))]
        panel.set_xticklabels(ticks, color="black", size = 20)

        panel.spines['polar'].set_visible(False)

        panel.set_title(f'Segment {segment_label}\n', size = 24)

    fig.suptitle(f"Characteristics of marketing personae of {type_of_activity} companies", size=32)
    # plt.show()
def known_sociodemo_caracteristics(df, type_of_activity) :
    """Build a LaTeX table of per-segment average indicators and save it as text.

    The columns ``is_profession_known``, ``is_zipcode_known``,
    ``categorie_age_inconnue`` and ``gender_other`` of ``df`` are averaged per
    ``segment``, multiplied by 100, pivoted so that segments become columns,
    rounded to one decimal and written through ``save_txt_file_s3``.

    Args:
        df: table with one row per customer, holding the columns listed above.
        type_of_activity: activity name forwarded to ``save_txt_file_s3``.
    """
    indicator_cols = ["is_profession_known", "is_zipcode_known", "categorie_age_inconnue", "gender_other"]
    shares = df.groupby("segment")[indicator_cols].mean().mul(100).reset_index()
    shares.columns = ['Segment', 'Share of Known Profession (%)', 'Share of Known Zipcode (%)', 'Share of Unknown Age (%)', 'Share of Unknown Gender (%)']

    # one column per segment, values rounded to one decimal
    shares_by_segment = shares.pivot_table(index=None, columns='Segment').round(1)

    # convert to LaTeX; 'latex_raw' leaves the text unescaped, so the '%'
    # symbols are escaped by hand afterwards
    latex_code = tabulate(shares_by_segment, headers='keys', tablefmt='latex_raw', floatfmt=".1f")
    latex_code = latex_code.replace('%', '\\%')

    save_txt_file_s3("table_known_socio_demo_caracteristics", type_of_activity, latex_code)

View File

@ -1,201 +0,0 @@
### importations ###
import pandas as pd
import numpy as np
import os
import io
import s3fs
import re
import pickle
import warnings
import matplotlib.pyplot as plt
### functions for segmentation and graphics associated ###
def load_model(type_of_activity, model):
    """Load a pickled model from the S3 file system.

    Args:
        type_of_activity: activity name, used as a folder name in the path.
        model: model name, used both as a folder name and as the file name
            (with the ``.pkl`` extension).

    Returns:
        The object obtained by unpickling the file.
    """
    model_dir = f"projet-bdc2324-team1/Output_model/{type_of_activity}/{model}/"
    pickle_path = model_dir + (model + '.pkl')
    # ``fs`` is the module-level file system object created by the calling script
    with fs.open(pickle_path, mode="rb") as pickle_file:
        raw_bytes = pickle_file.read()
    # NOTE(review): unpickling executes arbitrary code; the bucket content
    # must be trusted.
    return pickle.loads(raw_bytes)
def load_test_file(type_of_activity):
    """Read the test set of one activity type from the S3 file system.

    Args:
        type_of_activity: activity name, used as a folder name in the path.

    Returns:
        The content of ``Test_set.csv`` as a pandas DataFrame.
    """
    csv_path = f"projet-bdc2324-team1/Generalization/{type_of_activity}/Test_set.csv"
    # ``fs`` is the module-level file system object created by the calling script
    with fs.open(csv_path, mode="rb") as csv_stream:
        return pd.read_csv(csv_stream, sep=",")
def save_file_s3_mp(File_name, type_of_activity):
    """Write the current matplotlib figure as a PNG on the S3 file system, then close it.

    Args:
        File_name: prefix of the output file name.
        type_of_activity: activity name; used both as the output sub-folder
            and as the suffix of the file name.
    """
    # Render the figure in memory first
    png_buffer = io.BytesIO()
    plt.savefig(png_buffer, format='png', dpi=110)

    target_dir = f"projet-bdc2324-team1/Output_marketing_personae_analysis/{type_of_activity}/"
    target_path = target_dir + File_name + type_of_activity + '.png'

    # ``fs`` is the module-level file system object created by the calling script
    with fs.open(target_path, 'wb') as s3_file:
        s3_file.write(png_buffer.getvalue())
    plt.close()
def df_business_fig(df, segment, list_var):
    """Compute, per segment, its percentage share of each business variable.

    Args:
        df: DataFrame with one row per observation.
        segment: name of the column holding the segment label.
        list_var: list of column names to sum per segment.

    Returns:
        DataFrame with one row per segment and the columns ``segment``,
        ``"size"`` (share of rows) and every variable of ``list_var``, each
        expressed as a percentage of the column total.
    """
    by_segment = df.groupby(segment)

    # Totals per segment, plus the number of rows in each segment
    kpi_table = by_segment[list_var].sum().reset_index()
    kpi_table.insert(1, "size", by_segment.size().values)

    # Turn every column into a percentage of its own total
    share_cols = ["size"] + list_var
    column_totals = kpi_table[share_cols].sum()
    kpi_table[share_cols] = 100 * kpi_table[share_cols] / column_totals
    return kpi_table
def hist_segment_business_KPIs(df, segment, size, nb_tickets, nb_purchases, total_amount, nb_campaigns,
                               type_of_activity=None) :
    """Draw a stacked bar chart of the weight of each segment for five KPIs.

    One layer is stacked per row of ``df`` (the previous version assumed
    exactly four rows labelled 0..3). The figure is left open so that the
    caller can save it.

    Args:
        df: table with one row per segment, typically the output of
            ``df_business_fig``.
        segment: name of the column holding the segment label.
        size, nb_tickets, nb_purchases, total_amount, nb_campaigns: names of
            the five columns to plot, in that order.
        type_of_activity: activity name inserted in the title. When omitted,
            the module-level variable of the same name is used, which is what
            the previous version silently relied on.

    Raises:
        KeyError: if ``type_of_activity`` is omitted and no module-level
            variable of that name exists.
    """
    if type_of_activity is None :
        # legacy call sites do not pass the activity: read the global instead
        type_of_activity = globals()["type_of_activity"]

    plt.figure()

    kpi_columns = [size, nb_tickets, nb_purchases, total_amount, nb_campaigns]
    df_plot = df[[segment] + kpi_columns]
    x = ["number of\ncustomers", "number of\ntickets", "number of\npurchases", "total\namount",
         "number of\ncampaigns"]

    bottom = np.zeros(len(x))
    nb_segments = len(df_plot)

    # types of blue color; at least four shades so that the usual
    # four-segment palette is unchanged
    colors = plt.cm.Blues(np.linspace(0.1, 0.9, max(nb_segments, 4)))

    for position in range(nb_segments) :
        # positional access: does not depend on the index labels of df
        height = np.asarray(df_plot[kpi_columns].iloc[position].values, dtype=float)
        plt.bar(x=x, height=height, label = str(df_plot[segment].iloc[position]),
                bottom=bottom, color=colors[position])
        bottom = bottom + height

    # Adjust margins
    plt.subplots_adjust(left = 0.125, right = 0.8, bottom = 0.1, top = 0.9)

    plt.legend(title = "segment", loc = "upper right", bbox_to_anchor=(1.2, 1))
    plt.ylabel("Fraction represented by the segment (%)")
    plt.title(f"Relative weight of each segment regarding business KPIs\nfor {type_of_activity} companies", size=12)
    # plt.show()
def df_segment_mp(df, segment, gender_female, gender_male, gender_other, country_fr) :
    """Average the socio-demographic columns per segment and derive gender shares.

    Args:
        df: DataFrame with one row per observation.
        segment: name of the column holding the segment label.
        gender_female, gender_male, gender_other, country_fr: names of the
            columns to average per segment.

    Returns:
        DataFrame with one row per segment. ``share_known_gender`` (female +
        male averages) and ``share_of_women`` (female average divided by
        ``share_known_gender``) are inserted at positions 3 and 4.
    """
    averaged_cols = [gender_female, gender_male, gender_other, country_fr]
    persona_table = df.groupby(segment)[averaged_cols].mean().reset_index()

    known_share = persona_table[gender_female] + persona_table[gender_male]
    persona_table.insert(3, "share_known_gender", known_share)

    women_share = persona_table[gender_female] / persona_table["share_known_gender"]
    persona_table.insert(4, "share_of_women", women_share)
    return persona_table
def df_segment_pb (df, segment, nb_tickets_internet, nb_tickets, nb_campaigns_opened, nb_campaigns, opt_in) :
    """Average purchasing-behaviour ratios per segment.

    Two row-level ratios are computed, ``share_tickets_internet``
    (``nb_tickets_internet / nb_tickets``) and ``share_campaigns_opened``
    (``nb_campaigns_opened / nb_campaigns``), then averaged per segment
    together with ``opt_in``.

    The ratios are built on a copy returned by ``DataFrame.assign``; the
    previous version wrote both columns into the caller's DataFrame.

    Args:
        df: DataFrame with one row per observation; left unchanged.
        segment: name of the column holding the segment label.
        nb_tickets_internet, nb_tickets: names of the numerator and
            denominator columns of the first ratio.
        nb_campaigns_opened, nb_campaigns: names of the numerator and
            denominator columns of the second ratio.
        opt_in: name of the opt-in column.

    Returns:
        DataFrame with one row per segment and the columns ``segment``,
        ``share_tickets_internet``, ``share_campaigns_opened`` and ``opt_in``.
    """
    df_used = df.assign(
        share_tickets_internet=df[nb_tickets_internet] / df[nb_tickets],
        share_campaigns_opened=df[nb_campaigns_opened] / df[nb_campaigns],
    )
    df_pb = df_used.groupby(segment)[["share_tickets_internet", "share_campaigns_opened", opt_in]].mean().reset_index()
    return df_pb
def radar_mp_plot(df, categories, index) :
    """Draw the radar (spider) chart of one row of ``df``.

    Args:
        df: table with one row per segment.
        categories: names of the columns drawn around the circle.
        index: index label of the row to plot; the title shows ``index+1``.
    """
    segment_row = df.loc[index, categories]

    # true values are printed in parentheses next to each label
    true_values = list(segment_row)

    # relative values = true value / maximum among the segments: a point with
    # a relative value of 1 is the maximum for that variable
    relative = list(segment_row / df[categories].max())

    # rescaled values are used to place the points around the circle: if the
    # maximum relative value is 0.8, the farthest point must sit at 8/10th of
    # the radius, not on the edge
    peak = max(relative)
    rescaled = [peak * elt for elt in relative]

    # one angle per category
    angles = np.linspace(0, 2 * np.pi, len(categories), endpoint=False).tolist()
    closed_angles = angles + angles[:1]

    # Initialize graphic
    fig, ax = plt.subplots(figsize=(6, 6), subplot_kw=dict(polar=True))

    # a transparent line (alpha=0) of the relative values is drawn first to
    # set the radius of the circle, which is based on max(relative)
    ax.plot(closed_angles, relative + relative[:1], color='skyblue', alpha=0, linewidth=1.5)
    ax.plot(closed_angles, rescaled + rescaled[:1], color='black', alpha = 0.5, linewidth=1.2)

    # fill the sector
    ax.fill(angles, rescaled, color='orange', alpha=0.4)

    # labels
    ax.set_yticklabels([])
    ax.set_xticks(angles)
    ticks = [name.replace("_"," ") + f"\n({round(100 * true_value,2)}%)"
             for name, true_value in zip(categories, true_values)]
    ax.set_xticklabels(ticks, color="black")

    ax.spines['polar'].set_visible(False)
    plt.title(f'Characteristics of the segment {index+1}\n')
    # plt.show()
def radar_mp_plot_all(df, categories, type_of_activity=None) :
    """Draw one radar (spider) chart per row of ``df`` on a 2x2 grid.

    The figure is left open so that the caller can save it.

    Args:
        df: table with one row per segment, indexed 0..n-1.
        categories: names of the columns drawn around each circle.
        type_of_activity: activity name inserted in the figure title. When
            omitted, the module-level variable of the same name is used,
            which is what the previous version silently relied on.

    Raises:
        KeyError: if ``type_of_activity`` is omitted and no module-level
            variable of that name exists.
    """
    if type_of_activity is None :
        # legacy call sites do not pass the activity: read the global instead
        type_of_activity = globals()["type_of_activity"]

    nb_segments = df.shape[0]

    # quantities shared by every panel, computed once
    max_values = df[categories].max()
    angles = np.linspace(0, 2 * np.pi, len(categories), endpoint=False).tolist()
    closed_angles = angles + angles[:1]

    # Initialize graphic
    fig, ax = plt.subplots(2,2, figsize=(25, 20), subplot_kw=dict(polar=True))

    for index in range(nb_segments) :
        # integer division gives the row of the grid, the remainder its column
        row, col = divmod(index, 2)
        panel = ax[row, col]

        # title with the real segment label when the table provides it, so a
        # missing segment does not shift the numbering; otherwise position + 1
        segment_label = df.loc[index, "segment"] if "segment" in df.columns else index + 1

        # true values are used to print the true value in parenthesis
        tvalues = list(df.loc[index,categories])

        # values are true values / max among the segments, allows to
        # put values in relation with the values for other segments
        # if the point has a maximal abscissa it means that value is maximal
        # for the segment considered, even if not equal to 1
        values = list(df.loc[index,categories]/max_values)

        # values normalized are used to adjust the value around the circle
        # for instance if the maximum of values is equal to 0.8, we want the point to be
        # at 8/10th of the circle radius, not at the edge
        values_normalized = [ max(values) * elt for elt in values]

        # we have to draw first a transparent line (alpha=0) of values to adjust the radius of the circle
        # which is based on max(value)
        panel.plot(closed_angles, values + values[:1], color='skyblue', alpha=0, linewidth=1.5)
        panel.plot(closed_angles, values_normalized + values_normalized[:1], color='black', alpha = 0.5,
                   linewidth=1.2)

        # fill the sector
        panel.fill(angles, values_normalized, color='orange', alpha=0.4, label = segment_label)

        # labels
        panel.set_yticklabels([])
        panel.set_xticks(angles)
        ticks = [categories[i].replace("_"," ") + f"\n({round(100 * tvalues[i],2)}%)" for i in range(len(categories))]
        panel.set_xticklabels(ticks, color="black", size = 20)

        panel.spines['polar'].set_visible(False)

        panel.set_title(f'Segment {segment_label}\n', size = 24)

    fig.suptitle(f"Characteristics of marketing personae of {type_of_activity} companies", size=32)
    # plt.show()