Ajout description marketing personae
This commit is contained in:
parent
7341752be0
commit
8e61e9d2a4
|
@ -1,40 +0,0 @@
|
|||
import pandas as pd
|
||||
import numpy as np
|
||||
import os
|
||||
import io
|
||||
import s3fs
|
||||
import re
|
||||
import pickle
|
||||
import warnings
|
||||
|
||||
|
||||
# Load the helper functions (load_model, load_test_file, ...) into this
# namespace. The file is opened in a `with` block so the handle is closed
# deterministically instead of being left to the garbage collector.
with open('utils_segmentation.py') as utils_file:
    exec(utils_file.read())

warnings.filterwarnings('ignore')

# Create filesystem object (the helpers read this module-level `fs`)
S3_ENDPOINT_URL = "https://" + os.environ["AWS_S3_ENDPOINT"]
fs = s3fs.S3FileSystem(client_kwargs={'endpoint_url': S3_ENDPOINT_URL})

# Choose the type of companies for which you want to run the pipeline.
# The answer is interpolated into S3 paths, so stray whitespace is removed.
type_of_activity = input('Choisissez le type de compagnie : sport ? musique ? musee ?').strip()

# Load test set
dataset_test = load_test_file(type_of_activity)

# Load model
model = load_model(type_of_activity, 'LogisticRegression_Benchmark')

# Processing: keep only the explanatory variables passed to the model
X_test = dataset_test[['nb_tickets', 'nb_purchases', 'total_amount', 'nb_suppliers', 'vente_internet_max', 'purchase_date_min', 'purchase_date_max',
                       'time_between_purchase', 'nb_tickets_internet', 'is_email_true', 'opt_in',  # 'is_partner',
                       'gender_female', 'gender_male', 'gender_other', 'nb_campaigns', 'nb_campaigns_opened']]

y_test = dataset_test[['y_has_purchased']]

# Prediction: second column of predict_proba
y_pred_prob = model.predict_proba(X_test)[:, 1]

# Add probability to dataset_test
dataset_test['Probability_to_buy'] = y_pred_prob
print('probability added to dataset_test')
print(dataset_test.head())
|
|
@ -1,99 +0,0 @@
|
|||
### importations ###
|
||||
### not necessary? The utils .py file that we exec below already performs these imports
|
||||
|
||||
"""
|
||||
import pandas as pd
|
||||
import numpy as np
|
||||
import os
|
||||
import io
|
||||
import s3fs
|
||||
import re
|
||||
import pickle
|
||||
import warnings
|
||||
import matplotlib.pyplot as plt
|
||||
"""
|
||||
|
||||
### --- beginning of the code --- ###
|
||||
|
||||
|
||||
### hyperparameters of the code ###
|
||||
|
||||
###################################
|
||||
|
||||
# Choose the type of companies for which you want to run the pipeline.
# The answer is interpolated into S3 paths, so stray whitespace is removed.
type_of_activity = input('Choisissez le type de compagnie : sport ? musique ? musee ?').strip()

# Choose the model we use for the segmentation
model_name = "LogisticRegression_Benchmark"

###################################

# Execute the file including the functions we need. The imports used below
# (warnings, os, s3fs, np, pd) are brought into scope by this exec.
with open('utils_segmentation_V2TP.py') as utils_file:
    exec(utils_file.read())

warnings.filterwarnings('ignore')

# Create filesystem object (the helpers read this module-level `fs`)
S3_ENDPOINT_URL = "https://" + os.environ["AWS_S3_ENDPOINT"]
fs = s3fs.S3FileSystem(client_kwargs={'endpoint_url': S3_ENDPOINT_URL})

# Load test set
dataset_test = load_test_file(type_of_activity)

# Load model
model = load_model(type_of_activity, model_name)

### Preprocessing of data
X_test = dataset_test[['nb_tickets', 'nb_purchases', 'total_amount', 'nb_suppliers', 'vente_internet_max', 'purchase_date_min', 'purchase_date_max',
                       'time_between_purchase', 'nb_tickets_internet', 'is_email_true', 'opt_in',  # 'is_partner',
                       'gender_female', 'gender_male', 'gender_other', 'nb_campaigns', 'nb_campaigns_opened', 'country_fr']]

y_test = dataset_test[['y_has_purchased']]

# Work on an independent copy: a plain assignment would alias X_test, so the
# label and prediction columns added below would also end up in the frame
# handed to model.predict / model.predict_proba.
X_test_segment = X_test.copy()

# Add y_has_purchased to the segmentation table
X_test_segment["has_purchased"] = y_test

# Add prediction and probability to the segmentation table
y_pred = model.predict(X_test)
X_test_segment["has_purchased_estim"] = y_pred

y_pred_prob = model.predict_proba(X_test)[:, 1]
X_test_segment['score'] = y_pred_prob

# Four segments defined by score thresholds 0.25 / 0.5 / 0.75
X_test_segment["segment"] = np.where(X_test_segment['score'] < 0.25, '1',
                                     np.where(X_test_segment['score'] < 0.5, '2',
                                              np.where(X_test_segment['score'] < 0.75, '3', '4')))

### 1. business KPIs

business_var = ["nb_tickets", "nb_purchases", "total_amount", "nb_campaigns"]
X_test_business_fig = df_business_fig(X_test_segment, "segment", business_var)

# Save histogram to MinIO
hist_segment_business_KPIs(X_test_business_fig, "segment", "size", "nb_tickets",
                           "nb_purchases", "total_amount", "nb_campaigns")
save_file_s3_mp(File_name="segments_business_KPI_", type_of_activity=type_of_activity)

### 2. description of marketing personae (spider chart)

# Table summarizing variables relative to marketing personae
X_test_segment_mp = df_segment_mp(X_test_segment, "segment", "gender_female",
                                  "gender_male", "gender_other", "country_fr")

# Table relative to purchasing behaviour
X_test_segment_pb = df_segment_pb(X_test_segment, "segment", "nb_tickets_internet", "nb_tickets",
                                  "nb_campaigns_opened", "nb_campaigns", "opt_in")

# Concatenation of tables to prepare the plot
X_test_segment_caract = pd.concat([X_test_segment_pb, X_test_segment_mp[['share_known_gender', 'share_of_women', 'country_fr']]], axis=1)

# Visualization and save the graphic to MinIO
categories = list(X_test_segment_caract.drop("segment", axis=1).columns)
radar_mp_plot_all(df=X_test_segment_caract, categories=categories)
save_file_s3_mp(File_name="spider_chart_all_", type_of_activity=type_of_activity)
|
||||
|
82
6_Segmentation_and_Marketing_Personae.py
Normal file
82
6_Segmentation_and_Marketing_Personae.py
Normal file
|
@ -0,0 +1,82 @@
|
|||
|
||||
# Packages
|
||||
import pandas as pd
|
||||
import numpy as np
|
||||
import os
|
||||
import io
|
||||
import s3fs
|
||||
import re
|
||||
import pickle
|
||||
import warnings
|
||||
import matplotlib.pyplot as plt
|
||||
from tabulate import tabulate
|
||||
|
||||
###################################
|
||||
|
||||
# Choose the model we use for the segmentation
model_name = "LogisticRegression_Benchmark"

###################################

# Execute the file including the functions we need. The file is opened in a
# `with` block so the handle is closed deterministically.
with open('utils_segmentation.py') as utils_file:
    exec(utils_file.read())

warnings.filterwarnings('ignore')

# Create filesystem object (the helpers read this module-level `fs`)
S3_ENDPOINT_URL = "https://" + os.environ["AWS_S3_ENDPOINT"]
fs = s3fs.S3FileSystem(client_kwargs={'endpoint_url': S3_ENDPOINT_URL})

# Run the pipeline for every type of company
# type_of_activity = input('Choisissez le type de compagnie : sport ? musique ? musee ?')
for type_of_activity in ['musee', 'sport', 'musique']:

    # Load test set
    dataset_test = load_test_file(type_of_activity)

    # Load model
    model = load_model(type_of_activity, model_name)

    ### Preprocessing of data
    X_test = dataset_test.drop(columns='y_has_purchased')

    y_test = dataset_test[['y_has_purchased']]

    # Work on an independent copy: a plain assignment would alias X_test, so
    # the label and prediction columns added below would also end up in the
    # frame handed to model.predict / model.predict_proba.
    X_test_segment = X_test.copy()

    # Add y_has_purchased to the segmentation table
    X_test_segment["has_purchased"] = y_test

    # Add prediction and probability to the segmentation table
    y_pred = model.predict(X_test)
    X_test_segment["has_purchased_estim"] = y_pred

    y_pred_prob = model.predict_proba(X_test)[:, 1]
    X_test_segment['score'] = y_pred_prob

    # Four segments defined by score thresholds 0.25 / 0.5 / 0.75
    X_test_segment["segment"] = np.where(X_test_segment['score'] < 0.25, '1',
                                         np.where(X_test_segment['score'] < 0.5, '2',
                                                  np.where(X_test_segment['score'] < 0.75, '3', '4')))

    ### 1. business KPIs

    business_var = ["nb_tickets", "nb_purchases", "total_amount", "nb_campaigns"]
    X_test_business_fig = df_business_fig(X_test_segment, "segment", business_var)

    # Save histogram to MinIO
    hist_segment_business_KPIs(X_test_business_fig, "segment", "size", "nb_tickets",
                               "nb_purchases", "total_amount", "nb_campaigns", type_of_activity)
    save_file_s3_mp(File_name="segments_business_KPI_", type_of_activity=type_of_activity)

    ### 2. description of marketing personae

    ## A. Spider chart
    radar_mp_plot_all(df=X_test_segment, type_of_activity=type_of_activity)
    save_file_s3_mp(File_name="spider_chart_all_", type_of_activity=type_of_activity)

    ## B. Latex table
    known_sociodemo_caracteristics(df=X_test_segment, type_of_activity=type_of_activity)
|
|
@ -1,15 +1,12 @@
|
|||
import pandas as pd
|
||||
import numpy as np
|
||||
import os
|
||||
import io
|
||||
import s3fs
|
||||
import re
|
||||
import pickle
|
||||
import warnings
|
||||
### importations ###
|
||||
|
||||
|
||||
|
||||
|
||||
### functions for segmentation and graphics associated ###
|
||||
|
||||
def load_model(type_of_activity, model):
|
||||
BUCKET = f"projet-bdc2324-team1/Output_model/{type_of_activity}/{model}/"
|
||||
BUCKET = f"projet-bdc2324-team1/2_Output/2_1_Modeling_results/standard/{type_of_activity}/{model}/"
|
||||
filename = model + '.pkl'
|
||||
file_path = BUCKET + filename
|
||||
with fs.open(file_path, mode="rb") as f:
|
||||
|
@ -20,8 +17,207 @@ def load_model(type_of_activity, model):
|
|||
|
||||
|
||||
def load_test_file(type_of_activity):
    """Read the test set of one type of activity from S3.

    Args:
        type_of_activity: name of the activity; it is interpolated into the
            S3 path of ``Test_set.csv``.

    Returns:
        The content of the CSV file as a pandas DataFrame.

    Relies on the module-level ``fs`` filesystem object being defined by the
    calling script before this function is called.
    """
    # Only the path that was effective at runtime is kept; an earlier
    # assignment to the same variable was immediately overwritten.
    file_path_test = f"projet-bdc2324-team1/1_Temp/1_0_Modelling_Datasets/{type_of_activity}/Test_set.csv"
    with fs.open(file_path_test, mode="rb") as file_in:
        dataset_test = pd.read_csv(file_in, sep=",")
    return dataset_test
|
||||
|
||||
|
||||
def save_file_s3_mp(File_name, type_of_activity):
    """Save the current matplotlib figure as a PNG on S3, then close it.

    Args:
        File_name: prefix of the output file name.
        type_of_activity: name of the activity; used both as the output
            sub-folder and as the suffix of the file name.

    Relies on the module-level ``fs`` filesystem object and on ``plt``.
    """
    PATH = f"projet-bdc2324-team1/2_Output/2_2_Segmentation_and_Marketing_Personae/{type_of_activity}/"
    FILE_PATH_OUT_S3 = PATH + File_name + type_of_activity + '.png'

    image_buffer = io.BytesIO()
    try:
        plt.savefig(image_buffer, format='png', dpi=110)
        with fs.open(FILE_PATH_OUT_S3, 'wb') as s3_file:
            s3_file.write(image_buffer.getvalue())
    finally:
        # Close the figure even when rendering or the upload fails, so a
        # failed run does not leave open figures behind.
        plt.close()
|
||||
|
||||
def save_txt_file_s3(file_name, type_of_activity, content):
    """Write a text content to a ``.txt`` file on S3.

    Args:
        file_name: prefix of the output file name.
        type_of_activity: name of the activity; used both as the output
            sub-folder and as the suffix of the file name.
        content: string written to the file.

    Relies on the module-level ``fs`` filesystem object.
    """
    FILE_PATH = f"projet-bdc2324-team1/2_Output/2_2_Segmentation_and_Marketing_Personae/{type_of_activity}/"
    FILE_PATH_OUT_S3 = FILE_PATH + file_name + type_of_activity + '.txt'
    # Explicit encoding: the text-mode default depends on the locale of the
    # machine running the script.
    with fs.open(FILE_PATH_OUT_S3, 'w', encoding='utf-8') as s3_file:
        s3_file.write(content)
|
||||
|
||||
def df_business_fig(df, segment, list_var):
    """Compute the share (in %) of each segment in the size and in the KPIs.

    Args:
        df: DataFrame with one row per observation.
        segment: name of the column holding the segment label.
        list_var: names of the numeric columns to sum per segment; any
            iterable of column names is accepted.

    Returns:
        DataFrame with one row per segment and the columns
        ``[segment, "size", *list_var]``. ``size`` and every variable are
        expressed as a percentage of their total over all segments, so each
        of these columns sums to 100. A column whose total is 0 yields NaN.
    """
    list_var = list(list_var)
    grouped = df.groupby(segment)

    df_business_kpi = grouped[list_var].sum().reset_index()
    df_business_kpi.insert(1, "size", grouped.size().values)

    all_var = ["size"] + list_var
    df_business_kpi[all_var] = 100 * df_business_kpi[all_var] / df_business_kpi[all_var].sum()

    return df_business_kpi
|
||||
|
||||
|
||||
def hist_segment_business_KPIs(df, segment, size, nb_tickets, nb_purchases, total_amount, nb_campaigns, type_of_activity):
    """Draw a stacked bar chart of the weight of each segment in the KPIs.

    Args:
        df: DataFrame with one row per segment, holding percentages.
        segment: name of the column holding the segment label.
        size, nb_tickets, nb_purchases, total_amount, nb_campaigns: names of
            the five columns to plot, in the order of the bars.
        type_of_activity: name of the activity, displayed in the title.

    The chart is drawn on a new matplotlib figure that is left open.
    """
    plt.figure()

    kpi_columns = [size, nb_tickets, nb_purchases, total_amount, nb_campaigns]
    # Positional index, so the loop below does not depend on the index of df
    df_plot = df[[segment] + kpi_columns].reset_index(drop=True)

    x = ["number of\ncustomers", "number of\ntickets", "number of\npurchases", "total\namount",
         "number of\ncampaigns"]

    # One stacked layer per segment actually present; a score threshold may
    # leave a segment empty, so the number of rows is not assumed to be 4.
    nb_segments = len(df_plot)
    bottom = np.zeros(len(x))

    # Types of blue color
    colors = plt.cm.Blues(np.linspace(0.1, 0.9, nb_segments))

    for i in range(nb_segments):
        height = df_plot.loc[i, kpi_columns].to_numpy(dtype=float)
        plt.bar(x=x, height=height, label=str(df_plot[segment][i]), bottom=bottom, color=colors[i])
        bottom = bottom + height

    # Adjust margins
    plt.subplots_adjust(left=0.125, right=0.8, bottom=0.1, top=0.9)

    plt.legend(title="segment", loc="upper right", bbox_to_anchor=(1.2, 1))
    plt.ylabel("Fraction represented by the segment (%)")
    plt.title(f"Relative weight of each segment regarding business KPIs\nfor {type_of_activity} companies", size=12)
    # plt.show()
|
||||
|
||||
|
||||
# def df_segment_mp(df) :
|
||||
# df_mp = df.groupby("segment")[["gender_female", "gender_male", "gender_other", "country_fr"]].mean().reset_index()
|
||||
# df_mp.insert(3, "share_known_gender", df_mp["gender_female"]+df_mp["gender_male"])
|
||||
# df_mp.insert(4, "share_of_women", df_mp["gender_female"]/(df_mp["share_known_gender"]))
|
||||
# return df_mp
|
||||
|
||||
|
||||
# def df_segment_pb (df) :
|
||||
# df_pb = df.groupby("segment")[["prop_purchases_internet", "taux_ouverture_mail", "opt_in"]].mean().reset_index()
|
||||
# return df_pb
|
||||
|
||||
|
||||
def radar_mp_plot(df, categories, index):
    """Draw a radar (spider) chart of the characteristics of one segment.

    Args:
        df: DataFrame with one row per segment.
        categories: names of the numeric columns to display.
        index: index label of the row of ``df`` to plot; the title shows
            ``index + 1``.

    The chart is drawn on a new matplotlib figure that is left open.
    """
    categories = list(categories)

    # True values are used to print the true value in parentheses
    segment_row = df.loc[index, categories]
    tvalues = list(segment_row)

    max_values = df[categories].max()

    # Values are true values / max among the segments, which puts each value
    # in relation with the values of the other segments. A ratio that is not
    # a number (0/0 or missing value) is plotted at the center: NaN would
    # otherwise make max() below depend on the order of the categories.
    values = list((segment_row.astype(float) / max_values).fillna(0.0))

    # Each ratio is multiplied by the largest ratio of the segment; for
    # instance if the maximum of values is 0.8, the farthest point is drawn
    # at 8/10th of the circle radius, not at the edge.
    values_normalized = [max(values) * elt for elt in values]

    # One angle per category
    angles = np.linspace(0, 2 * np.pi, len(categories), endpoint=False).tolist()

    # Initialize graphic
    fig, ax = plt.subplots(figsize=(6, 6), subplot_kw=dict(polar=True))

    # A transparent line (alpha=0) of values is drawn first to adjust the
    # radius of the circle, which is based on max(values)
    ax.plot(angles + angles[:1], values + values[:1], color='skyblue', alpha=0, linewidth=1.5)
    ax.plot(angles + angles[:1], values_normalized + values_normalized[:1], color='black', alpha=0.5, linewidth=1.2)

    # Fill the sector
    ax.fill(angles, values_normalized, color='orange', alpha=0.4)

    # Labels
    ax.set_yticklabels([])
    ax.set_xticks(angles)
    ticks = [category.replace("_", " ") + f"\n({round(100 * tvalue, 2)}%)"
             for category, tvalue in zip(categories, tvalues)]
    ax.set_xticklabels(ticks, color="black")

    ax.spines['polar'].set_visible(False)

    plt.title(f'Characteristics of the segment {index+1}\n')

    # plt.show()
|
||||
|
||||
|
||||
def radar_mp_plot_all(df, type_of_activity):
    """Draw one radar (spider) chart per segment on a single figure.

    Args:
        df: DataFrame with one row per observation, holding the columns
            ``segment``, ``gender_female``, ``gender_male``,
            ``gender_other``, ``country_fr``, ``prop_purchases_internet``,
            ``taux_ouverture_mail`` and ``opt_in``.
        type_of_activity: name of the activity, displayed in the title.

    The charts are drawn on a new matplotlib figure that is left open.
    """
    # Table summarizing variables relative to marketing personae
    df_mp = df.groupby("segment")[["gender_female", "gender_male", "gender_other", "country_fr"]].mean().reset_index()
    df_mp.insert(3, "share_known_gender", df_mp["gender_female"] + df_mp["gender_male"])
    df_mp.insert(4, "share_of_women", df_mp["gender_female"] / df_mp["share_known_gender"])

    # Table relative to purchasing behaviour
    df_pb = df.groupby("segment")[["prop_purchases_internet", "taux_ouverture_mail", "opt_in"]].mean().reset_index()

    # Concatenation of tables to prepare the plot
    df_used = pd.concat([df_pb, df_mp[['share_known_gender', 'share_of_women', 'country_fr']]], axis=1)

    # Visualization
    nb_segments = df_used.shape[0]
    categories = list(df_used.drop("segment", axis=1).columns)

    # These do not depend on the segment, so they are computed once
    max_values = df_used[categories].max()
    angles = np.linspace(0, 2 * np.pi, len(categories), endpoint=False).tolist()
    closed_angles = angles + angles[:1]

    # Initialize graphic: two charts per row, at least the 2x2 grid, and more
    # rows when there are more than four segments
    nb_rows = max(2, (nb_segments + 1) // 2)
    fig, ax = plt.subplots(nb_rows, 2, figsize=(25, 10 * nb_rows), subplot_kw=dict(polar=True), squeeze=False)

    for index in range(nb_segments):
        row = index // 2  # integer division gives the row number
        col = index % 2
        current_ax = ax[row, col]

        # True values are used to print the true value in parentheses
        segment_row = df_used.loc[index, categories]
        tvalues = list(segment_row)

        # Values are true values / max among the segments, which puts each
        # value in relation with the values of the other segments. A ratio
        # that is not a number (0/0 or missing value) is plotted at the
        # center: NaN would otherwise make max() below order-dependent.
        values = list((segment_row.astype(float) / max_values).fillna(0.0))

        # Each ratio is multiplied by the largest ratio of the segment; for
        # instance if the maximum of values is 0.8, the farthest point is
        # drawn at 8/10th of the circle radius, not at the edge.
        values_normalized = [max(values) * elt for elt in values]

        # A transparent line (alpha=0) of values is drawn first to adjust
        # the radius of the circle, which is based on max(values)
        current_ax.plot(closed_angles, values + values[:1], color='skyblue', alpha=0, linewidth=1.5)
        current_ax.plot(closed_angles, values_normalized + values_normalized[:1], color='black', alpha=0.5,
                        linewidth=1.2)

        # Fill the sector
        current_ax.fill(angles, values_normalized, color='orange', alpha=0.4, label=index)

        # Labels
        current_ax.set_yticklabels([])
        current_ax.set_xticks(angles)
        ticks = [category.replace("_", " ") + f"\n({round(100 * tvalue, 2)}%)"
                 for category, tvalue in zip(categories, tvalues)]
        current_ax.set_xticklabels(ticks, color="black", size=20)

        current_ax.spines['polar'].set_visible(False)

        # Use the real segment label: the row position differs from it when
        # a segment is empty and therefore absent from the grouped table.
        segment_label = df_used.loc[index, "segment"]
        current_ax.set_title(f'Segment {segment_label}\n', size=24)

    # Hide the grid cells that received no chart
    for unused in range(nb_segments, nb_rows * 2):
        ax[unused // 2, unused % 2].set_visible(False)

    fig.suptitle(f"Characteristics of marketing personae of {type_of_activity} companies", size=32)

    # plt.show()
|
||||
|
||||
def known_sociodemo_caracteristics(df, type_of_activity):
    """Save a LaTeX table of socio-demographic indicator shares per segment.

    Args:
        df: DataFrame with one row per observation, holding the columns
            ``segment``, ``is_profession_known``, ``is_zipcode_known``,
            ``categorie_age_inconnue`` and ``gender_other``.
        type_of_activity: name of the activity, forwarded to
            ``save_txt_file_s3``.

    The table is passed to ``save_txt_file_s3``; nothing is returned.
    """
    indicator_columns = ["is_profession_known", "is_zipcode_known", "categorie_age_inconnue", "gender_other"]

    # Mean of each indicator per segment, expressed in percent
    shares = df.groupby("segment")[indicator_columns].mean().mul(100).reset_index()
    shares.columns = ['Segment', 'Share of Known Profession (%)', 'Share of Known Zipcode (%)', 'Share of Unknown Age (%)', 'Share of Unknown Gender (%)']

    # One column per segment, one row per indicator
    shares = shares.pivot_table(index=None, columns='Segment')

    # Round the values of the DataFrame to one decimal
    shares_rounded = shares.round(1)

    # Convert the DataFrame to LaTeX, then escape the '%' symbol
    latex_table = tabulate(shares_rounded, headers='keys', tablefmt='latex_raw', floatfmt=".1f")
    latex_table = latex_table.replace('%', '\\%')

    save_txt_file_s3("table_known_socio_demo_caracteristics", type_of_activity, latex_table)
|
||||
|
||||
|
||||
|
|
|
@ -1,201 +0,0 @@
|
|||
### importations ###
|
||||
|
||||
import pandas as pd
|
||||
import numpy as np
|
||||
import os
|
||||
import io
|
||||
import s3fs
|
||||
import re
|
||||
import pickle
|
||||
import warnings
|
||||
import matplotlib.pyplot as plt
|
||||
|
||||
|
||||
### functions for segmentation and graphics associated ###
|
||||
|
||||
def load_model(type_of_activity, model):
    """Load a pickled model from S3.

    Args:
        type_of_activity: name of the activity; interpolated into the path.
        model: name of the model; used both as the folder name and as the
            name of the ``.pkl`` file.

    Returns:
        The object obtained by unpickling the file.

    Relies on the module-level ``fs`` filesystem object.
    """
    # NOTE(review): unpickling executes code stored in the file; this
    # presumably only reads artifacts written by the project itself - confirm.
    model_folder = f"projet-bdc2324-team1/Output_model/{type_of_activity}/{model}/"
    pickle_path = model_folder + (model + '.pkl')

    with fs.open(pickle_path, mode="rb") as handle:
        raw_bytes = handle.read()

    return pickle.loads(raw_bytes)
|
||||
|
||||
|
||||
def load_test_file(type_of_activity):
    """Read the test set of one type of activity from S3.

    Args:
        type_of_activity: name of the activity; interpolated into the S3
            path of ``Test_set.csv``.

    Returns:
        The content of the CSV file as a pandas DataFrame.

    Relies on the module-level ``fs`` filesystem object.
    """
    csv_path = f"projet-bdc2324-team1/Generalization/{type_of_activity}/Test_set.csv"
    with fs.open(csv_path, mode="rb") as csv_handle:
        return pd.read_csv(csv_handle, sep=",")
|
||||
|
||||
|
||||
def save_file_s3_mp(File_name, type_of_activity):
    """Save the current matplotlib figure as a PNG on S3, then close it.

    Args:
        File_name: prefix of the output file name.
        type_of_activity: name of the activity; used both as the output
            sub-folder and as the suffix of the file name.

    Relies on the module-level ``fs`` filesystem object and on ``plt``.
    """
    PATH = f"projet-bdc2324-team1/Output_marketing_personae_analysis/{type_of_activity}/"
    FILE_PATH_OUT_S3 = PATH + File_name + type_of_activity + '.png'

    image_buffer = io.BytesIO()
    try:
        plt.savefig(image_buffer, format='png', dpi=110)
        with fs.open(FILE_PATH_OUT_S3, 'wb') as s3_file:
            s3_file.write(image_buffer.getvalue())
    finally:
        # Close the figure even when rendering or the upload fails, so a
        # failed run does not leave open figures behind.
        plt.close()
|
||||
|
||||
|
||||
def df_business_fig(df, segment, list_var):
    """Compute the share (in %) of each segment in the size and in the KPIs.

    Args:
        df: DataFrame with one row per observation.
        segment: name of the column holding the segment label.
        list_var: names of the numeric columns to sum per segment; any
            iterable of column names is accepted.

    Returns:
        DataFrame with one row per segment and the columns
        ``[segment, "size", *list_var]``. ``size`` and every variable are
        expressed as a percentage of their total over all segments, so each
        of these columns sums to 100. A column whose total is 0 yields NaN.
    """
    list_var = list(list_var)
    grouped = df.groupby(segment)

    df_business_kpi = grouped[list_var].sum().reset_index()
    df_business_kpi.insert(1, "size", grouped.size().values)

    all_var = ["size"] + list_var
    df_business_kpi[all_var] = 100 * df_business_kpi[all_var] / df_business_kpi[all_var].sum()

    return df_business_kpi
|
||||
|
||||
|
||||
def hist_segment_business_KPIs(df, segment, size, nb_tickets, nb_purchases, total_amount, nb_campaigns,
                               type_of_activity=None):
    """Draw a stacked bar chart of the weight of each segment in the KPIs.

    Args:
        df: DataFrame with one row per segment, holding percentages.
        segment: name of the column holding the segment label.
        size, nb_tickets, nb_purchases, total_amount, nb_campaigns: names of
            the five columns to plot, in the order of the bars.
        type_of_activity: name of the activity, displayed in the title. When
            omitted, the global variable of the same name is used, as the
            function did before this parameter existed.

    Raises:
        NameError: if ``type_of_activity`` is neither passed nor defined as
            a global variable.

    The chart is drawn on a new matplotlib figure that is left open.
    """
    if type_of_activity is None:
        # Backward compatibility with callers relying on the global variable
        try:
            type_of_activity = globals()["type_of_activity"]
        except KeyError:
            raise NameError("name 'type_of_activity' is not defined") from None

    plt.figure()

    kpi_columns = [size, nb_tickets, nb_purchases, total_amount, nb_campaigns]
    # Positional index, so the loop below does not depend on the index of df
    df_plot = df[[segment] + kpi_columns].reset_index(drop=True)

    x = ["number of\ncustomers", "number of\ntickets", "number of\npurchases", "total\namount",
         "number of\ncampaigns"]

    # One stacked layer per segment actually present; a score threshold may
    # leave a segment empty, so the number of rows is not assumed to be 4.
    nb_segments = len(df_plot)
    bottom = np.zeros(len(x))

    # Types of blue color
    colors = plt.cm.Blues(np.linspace(0.1, 0.9, nb_segments))

    for i in range(nb_segments):
        height = df_plot.loc[i, kpi_columns].to_numpy(dtype=float)
        plt.bar(x=x, height=height, label=str(df_plot[segment][i]), bottom=bottom, color=colors[i])
        bottom = bottom + height

    # Adjust margins
    plt.subplots_adjust(left=0.125, right=0.8, bottom=0.1, top=0.9)

    plt.legend(title="segment", loc="upper right", bbox_to_anchor=(1.2, 1))
    plt.ylabel("Fraction represented by the segment (%)")
    plt.title(f"Relative weight of each segment regarding business KPIs\nfor {type_of_activity} companies", size=12)
    # plt.show()
|
||||
|
||||
|
||||
def df_segment_mp(df, segment, gender_female, gender_male, gender_other, country_fr):
    """Summarize the marketing-personae variables of each segment.

    Args:
        df: DataFrame with one row per observation.
        segment: name of the column holding the segment label.
        gender_female, gender_male, gender_other, country_fr: names of the
            numeric columns averaged per segment.

    Returns:
        DataFrame with one row per segment and the columns, in order:
        the segment, the means of ``gender_female`` and ``gender_male``,
        ``share_known_gender`` (sum of these two means), ``share_of_women``
        (female mean divided by ``share_known_gender``, NaN when that share
        is 0), then the means of ``gender_other`` and ``country_fr``.
    """
    df_mp = df.groupby(segment)[[gender_female, gender_male, gender_other, country_fr]].mean().reset_index()
    df_mp.insert(3, "share_known_gender", df_mp[gender_female] + df_mp[gender_male])
    df_mp.insert(4, "share_of_women", df_mp[gender_female] / df_mp["share_known_gender"])
    return df_mp
|
||||
|
||||
|
||||
def df_segment_pb(df, segment, nb_tickets_internet, nb_tickets, nb_campaigns_opened, nb_campaigns, opt_in):
    """Summarize the purchasing behaviour of each segment.

    Args:
        df: DataFrame with one row per observation; it is not modified.
        segment: name of the column holding the segment label.
        nb_tickets_internet, nb_tickets: names of the columns whose ratio
            gives ``share_tickets_internet``.
        nb_campaigns_opened, nb_campaigns: names of the columns whose ratio
            gives ``share_campaigns_opened``.
        opt_in: name of the column averaged as is.

    Returns:
        DataFrame with one row per segment holding the segment and the mean
        of ``share_tickets_internet``, ``share_campaigns_opened`` and the
        ``opt_in`` column. Rows where a ratio is 0/0 are NaN and are ignored
        by the mean.
    """
    # assign() returns a new frame, so the two ratio columns are not added
    # to the DataFrame of the caller.
    df_used = df.assign(
        share_tickets_internet=df[nb_tickets_internet] / df[nb_tickets],
        share_campaigns_opened=df[nb_campaigns_opened] / df[nb_campaigns],
    )
    df_pb = df_used.groupby(segment)[["share_tickets_internet", "share_campaigns_opened", opt_in]].mean().reset_index()
    return df_pb
|
||||
|
||||
|
||||
def radar_mp_plot(df, categories, index):
    """Draw a radar (spider) chart of the characteristics of one segment.

    Args:
        df: DataFrame with one row per segment.
        categories: names of the numeric columns to display.
        index: index label of the row of ``df`` to plot; the title shows
            ``index + 1``.

    The chart is drawn on a new matplotlib figure that is left open.
    """
    categories = list(categories)

    # True values are used to print the true value in parentheses
    segment_row = df.loc[index, categories]
    tvalues = list(segment_row)

    max_values = df[categories].max()

    # Values are true values / max among the segments, which puts each value
    # in relation with the values of the other segments. A ratio that is not
    # a number (0/0 or missing value) is plotted at the center: NaN would
    # otherwise make max() below depend on the order of the categories.
    values = list((segment_row.astype(float) / max_values).fillna(0.0))

    # Each ratio is multiplied by the largest ratio of the segment; for
    # instance if the maximum of values is 0.8, the farthest point is drawn
    # at 8/10th of the circle radius, not at the edge.
    values_normalized = [max(values) * elt for elt in values]

    # One angle per category
    angles = np.linspace(0, 2 * np.pi, len(categories), endpoint=False).tolist()

    # Initialize graphic
    fig, ax = plt.subplots(figsize=(6, 6), subplot_kw=dict(polar=True))

    # A transparent line (alpha=0) of values is drawn first to adjust the
    # radius of the circle, which is based on max(values)
    ax.plot(angles + angles[:1], values + values[:1], color='skyblue', alpha=0, linewidth=1.5)
    ax.plot(angles + angles[:1], values_normalized + values_normalized[:1], color='black', alpha=0.5, linewidth=1.2)

    # Fill the sector
    ax.fill(angles, values_normalized, color='orange', alpha=0.4)

    # Labels
    ax.set_yticklabels([])
    ax.set_xticks(angles)
    ticks = [category.replace("_", " ") + f"\n({round(100 * tvalue, 2)}%)"
             for category, tvalue in zip(categories, tvalues)]
    ax.set_xticklabels(ticks, color="black")

    ax.spines['polar'].set_visible(False)

    plt.title(f'Characteristics of the segment {index+1}\n')

    # plt.show()
|
||||
|
||||
|
||||
def radar_mp_plot_all(df, categories, type_of_activity=None):
    """Draw one radar (spider) chart per row of ``df`` on a single figure.

    Args:
        df: DataFrame with one row per segment, indexed 0..n-1.
        categories: names of the numeric columns to display.
        type_of_activity: name of the activity, displayed in the title. When
            omitted, the global variable of the same name is used, as the
            function did before this parameter existed.

    Raises:
        NameError: if ``type_of_activity`` is neither passed nor defined as
            a global variable.

    The charts are drawn on a new matplotlib figure that is left open.
    """
    if type_of_activity is None:
        # Backward compatibility with callers relying on the global variable
        try:
            type_of_activity = globals()["type_of_activity"]
        except KeyError:
            raise NameError("name 'type_of_activity' is not defined") from None

    nb_segments = df.shape[0]
    categories = list(categories)

    # These do not depend on the segment, so they are computed once
    max_values = df[categories].max()
    angles = np.linspace(0, 2 * np.pi, len(categories), endpoint=False).tolist()
    closed_angles = angles + angles[:1]

    # Initialize graphic: two charts per row, at least the 2x2 grid, and more
    # rows when there are more than four segments
    nb_rows = max(2, (nb_segments + 1) // 2)
    fig, ax = plt.subplots(nb_rows, 2, figsize=(25, 10 * nb_rows), subplot_kw=dict(polar=True), squeeze=False)

    for index in range(nb_segments):
        row = index // 2  # integer division gives the row number
        col = index % 2
        current_ax = ax[row, col]

        # True values are used to print the true value in parentheses
        segment_row = df.loc[index, categories]
        tvalues = list(segment_row)

        # Values are true values / max among the segments, which puts each
        # value in relation with the values of the other segments. A ratio
        # that is not a number (0/0 or missing value) is plotted at the
        # center: NaN would otherwise make max() below order-dependent.
        values = list((segment_row.astype(float) / max_values).fillna(0.0))

        # Each ratio is multiplied by the largest ratio of the segment; for
        # instance if the maximum of values is 0.8, the farthest point is
        # drawn at 8/10th of the circle radius, not at the edge.
        values_normalized = [max(values) * elt for elt in values]

        # A transparent line (alpha=0) of values is drawn first to adjust
        # the radius of the circle, which is based on max(values)
        current_ax.plot(closed_angles, values + values[:1], color='skyblue', alpha=0, linewidth=1.5)
        current_ax.plot(closed_angles, values_normalized + values_normalized[:1], color='black', alpha=0.5,
                        linewidth=1.2)

        # Fill the sector
        current_ax.fill(angles, values_normalized, color='orange', alpha=0.4, label=index)

        # Labels
        current_ax.set_yticklabels([])
        current_ax.set_xticks(angles)
        ticks = [category.replace("_", " ") + f"\n({round(100 * tvalue, 2)}%)"
                 for category, tvalue in zip(categories, tvalues)]
        current_ax.set_xticklabels(ticks, color="black", size=20)

        current_ax.spines['polar'].set_visible(False)

        current_ax.set_title(f'Segment {index+1}\n', size=24)

    # Hide the grid cells that received no chart
    for unused in range(nb_segments, nb_rows * 2):
        ax[unused // 2, unused % 2].set_visible(False)

    fig.suptitle(f"Characteristics of marketing personae of {type_of_activity} companies", size=32)

    # plt.show()
|
||||
|
Loading…
Reference in New Issue
Block a user