BDC-team-1/utils_segmentation.py

336 lines
13 KiB
Python

# Functions for customer segmentation and the associated graphics
def load_model(type_of_activity, model):
    """
    Read a pickled model from S3 storage and deserialize it.

    The file is opened through the module-level ``fs`` object at
    ``projet-bdc2324-team1/2_Output/2_1_Modeling_results/standard/<type_of_activity>/<model>/<model>.pkl``.

    Args:
    - type_of_activity (str): folder name inserted in the storage path.
    - model (str): model name, used both as folder name and as stem of the pickle file.
    Returns:
    Model: the object stored in the pickle file (expected to be a machine learning
    model pre-trained with a scikit learn pipeline).
    """
    model_dir = f"projet-bdc2324-team1/2_Output/2_1_Modeling_results/standard/{type_of_activity}/{model}/"
    pickle_path = model_dir + model + '.pkl'

    with fs.open(pickle_path, mode="rb") as pickle_file:
        raw_bytes = pickle_file.read()

    # NOTE(review): unpickling can execute arbitrary code; only load files
    # coming from a storage location that is trusted.
    return pickle.loads(raw_bytes)
def load_test_file(type_of_activity):
    """
    Read the comma-separated test dataset of a type of activity from S3 storage.

    The file is opened through the module-level ``fs`` object at
    ``projet-bdc2324-team1/1_Temp/1_0_Modelling_Datasets/<type_of_activity>/Test_set.csv``.

    Args:
    - type_of_activity (str): folder name inserted in the storage path.
    Returns:
    DataFrame: Test dataset.
    """
    test_set_path = f"projet-bdc2324-team1/1_Temp/1_0_Modelling_Datasets/{type_of_activity}/Test_set.csv"
    with fs.open(test_set_path, mode="rb") as csv_stream:
        return pd.read_csv(csv_stream, sep=",")
def save_file_s3_mp(File_name, type_of_activity):
    """
    Export the current matplotlib figure as a PNG file on S3 storage, then close it.

    The figure is rendered in memory (110 dpi) and written through the module-level
    ``fs`` object to
    ``projet-bdc2324-team1/2_Output/2_2_Segmentation_and_Marketing_Personae/<type_of_activity>/<File_name><type_of_activity>.png``.

    Args:
    - File_name (str): prefix of the output file name.
    - type_of_activity (str): used as output folder and as suffix of the file name.
    Returns:
    None
    """
    output_dir = f"projet-bdc2324-team1/2_Output/2_2_Segmentation_and_Marketing_Personae/{type_of_activity}/"
    target_path = output_dir + File_name + type_of_activity + '.png'

    # Render the figure into an in-memory buffer instead of a local file
    png_buffer = io.BytesIO()
    plt.savefig(png_buffer, format='png', dpi=110)
    # Rewind so that the whole image is read back from the start
    png_buffer.seek(0)

    with fs.open(target_path, 'wb') as s3_file:
        s3_file.write(png_buffer.read())

    plt.close()
def save_txt_file_s3(file_name, type_of_activity, content):
    """
    Write a text content to a .txt file on S3 storage.

    The file is written through the module-level ``fs`` object to
    ``projet-bdc2324-team1/2_Output/2_2_Segmentation_and_Marketing_Personae/<type_of_activity>/<file_name><type_of_activity>.txt``.

    Args:
    - file_name (str): prefix of the output file name.
    - type_of_activity (str): used as output folder and as suffix of the file name.
    - content (str): text written to the file.
    Returns:
    None
    """
    output_dir = f"projet-bdc2324-team1/2_Output/2_2_Segmentation_and_Marketing_Personae/{type_of_activity}/"
    target_path = output_dir + file_name + type_of_activity + '.txt'
    with fs.open(target_path, 'w') as txt_file:
        txt_file.write(content)
def df_business_fig(df, segment, list_var):
    """
    Build a table giving, for each segment, its share (in %) of the customers and of
    the total of each variable of list_var.

    Args:
    - df (DataFrame): The DataFrame containing data.
    - segment (str): The column name representing segments.
    - list_var (list of str): The list of variable names to be aggregated.
    Returns:
    DataFrame: one row per segment, with the segment column, a "size" column
    (share of rows belonging to the segment) and one column per variable of
    list_var (share of the variable's total held by the segment), all in percent.
    """
    grouped = df.groupby(segment)

    # Totals of each variable within each segment
    kpi_table = grouped[list_var].sum().reset_index()
    # Number of rows per segment, placed right after the segment column
    kpi_table.insert(1, "size", grouped.size().values)

    # Turn absolute figures into percentages of the column totals
    share_columns = ["size", *list_var]
    column_totals = kpi_table[share_columns].sum()
    kpi_table[share_columns] = 100 * kpi_table[share_columns] / column_totals

    return kpi_table
def hist_segment_business_KPIs(df, segment, size, nb_tickets, nb_purchases, total_amount, nb_campaigns, type_of_activity):
    """
    Plot a stacked bar chart of the relative weight of each segment regarding some key business indicators.

    A new matplotlib figure is created and left open (nothing is shown or saved here).
    One layer is stacked per row of df, in row order, so any number of segments is
    supported; with 4 rows the colours match the historical 4-segment chart.

    Args:
    - df (DataFrame): The DataFrame containing pre aggregated data about some key business indicators
      (one row per segment, e.g. the output of df_business_fig).
    - segment (str): The column name representing segments.
    - size (str): The column name representing the size.
    - nb_tickets (str): The column name representing the number of tickets.
    - nb_purchases (str): The column name representing the number of purchases.
    - total_amount (str): The column name representing the total amount.
    - nb_campaigns (str): The column name representing the number of campaigns.
    - type_of_activity (str): displayed in the title of the chart.
    Returns:
    None
    """
    plt.figure()

    # Order of the columns must match the order of the x labels below
    kpi_columns = [size, nb_tickets, nb_purchases, total_amount, nb_campaigns]
    df_plot = df[[segment] + kpi_columns]
    x = ["number of\ncustomers", "number of\ntickets", "number of\npurchases", "total\namount",
         "number of\ncampaigns"]

    # Positional access: does not rely on df having a default 0..n-1 index
    heights = df_plot[kpi_columns].to_numpy(dtype=float)
    nb_segments = len(df_plot)

    # One shade of blue per segment, from light to dark
    colors = plt.cm.Blues(np.linspace(0.1, 0.9, nb_segments))

    # Running top of the stack for each of the bars
    bottom = np.zeros(len(x))
    for label, height, color in zip(df_plot[segment], heights, colors):
        plt.bar(x=x, height=height, label=str(label), bottom=bottom, color=color)
        # Rebind instead of mutating in place so the array passed to plt.bar is never altered
        bottom = bottom + height

    # Adjust margins to leave room for the legend on the right
    plt.subplots_adjust(left=0.125, right=0.8, bottom=0.1, top=0.9)

    plt.legend(title="segment", loc="upper right", bbox_to_anchor=(1.2, 1))
    plt.ylabel("Fraction represented by the segment (%)")
    plt.title(f"Relative weight of each segment regarding business KPIs\nfor {type_of_activity} companies", size=12)
# plt.show()
# def df_segment_mp(df) :
# df_mp = df.groupby("segment")[["gender_female", "gender_male", "gender_other", "country_fr"]].mean().reset_index()
# df_mp.insert(3, "share_known_gender", df_mp["gender_female"]+df_mp["gender_male"])
# df_mp.insert(4, "share_of_women", df_mp["gender_female"]/(df_mp["share_known_gender"]))
# return df_mp
# def df_segment_pb (df) :
# df_pb = df.groupby("segment")[["prop_purchases_internet", "taux_ouverture_mail", "opt_in"]].mean().reset_index()
# return df_pb
def radar_mp_plot(df, categories, index):
    """
    Draw a radar chart of the marketing persona of one segment, for the given categories.

    A new matplotlib figure is created and left open (nothing is shown or saved here).

    Args:
    - df (DataFrame): The DataFrame containing data about categories describing the marketing personae associated to each segment
    - categories (list of str): column names of df displayed around the radar chart
    - index (int): row label of df identifying the segment; the title displays index + 1 as segment number
    Returns:
    None
    """
    segment_row = df.loc[index, categories]

    # Raw values of the segment, only used for the figures printed in parentheses
    true_values = list(segment_row)

    # Each value divided by the maximum of its category over all rows of df,
    # so that a segment can be compared with the others
    relative_values = list(segment_row / df[categories].max())

    # Relative values shrunk by their own maximum: when that maximum is for
    # instance 0.8, the furthest point sits at 8/10th of the radius instead of the edge
    largest_relative = max(relative_values)
    scaled_values = [largest_relative * value for value in relative_values]

    # One evenly spaced angle per category
    angles = np.linspace(0, 2 * np.pi, len(categories), endpoint=False).tolist()
    closed_angles = angles + angles[:1]

    fig, ax = plt.subplots(figsize=(6, 6), subplot_kw=dict(polar=True))

    # Invisible outline (alpha=0) of the relative values: it only serves to
    # set the radius of the circle, which would otherwise be too small
    ax.plot(closed_angles, relative_values + relative_values[:1], color='skyblue', alpha=0, linewidth=1.5)
    # Visible outline of the scaled values
    ax.plot(closed_angles, scaled_values + scaled_values[:1], color='black', alpha=0.5, linewidth=1.2)

    # Coloured area
    ax.fill(angles, scaled_values, color='orange', alpha=0.4)

    # Axis labels: category name followed by the raw value as a percentage
    ax.set_yticklabels([])
    ax.set_xticks(angles)
    tick_labels = [
        name.replace("_", " ") + f"\n({round(100 * raw_value, 2)}%)"
        for name, raw_value in zip(categories, true_values)
    ]
    ax.set_xticklabels(tick_labels, color="black")

    ax.spines['polar'].set_visible(False)

    plt.title(f'Characteristics of the segment {index+1}\n')
# plt.show()
def radar_mp_plot_all(df, type_of_activity):
    """
    Plot one radar chart per segment, describing its marketing persona, on a single figure.

    The charts are laid out on a grid of 2 columns. The grid has 2 rows (figure
    size 20 x 21) up to 4 segments and grows by one row for every 2 additional
    segments. The figure is left open (nothing is shown or saved here).

    Categories displayed: share of internet purchases, share of mails opened,
    opt-in rate, share of women (among customers flagged female or male) and
    average age. Every category is printed as a percentage except age.

    Args:
    - df (DataFrame): must contain the columns "segment", "gender_female", "gender_male",
      "gender_other", "age", "prop_purchases_internet", "taux_ouverture_mail" and "opt_in".
    - type_of_activity (str): displayed in the title of the figure.
    Returns:
    None
    """
    # table summarizing variables relative to marketing personae
    df_mp = df.groupby("segment")[["gender_female", "gender_male", "gender_other", "age"]].mean().reset_index()
    df_mp.insert(4, "share_of_women", df_mp["gender_female"] / (df_mp["gender_female"] + df_mp["gender_male"]))

    # table relative to purchasing behaviour
    df_pb = df.groupby("segment")[["prop_purchases_internet", "taux_ouverture_mail", "opt_in"]].mean().reset_index()

    # concatenation of tables to prepare the plot
    df_used = pd.concat([df_pb, df_mp[['share_of_women', 'age']]], axis=1)

    # rename columns for the plot
    df_used = df_used.rename(columns={'taux_ouverture_mail': 'mails_opened', 'prop_purchases_internet': 'purchases_internet'})

    nb_segments = df_used.shape[0]
    categories = list(df_used.drop("segment", axis=1).columns)
    # variables printed as raw numbers instead of percentages
    var_not_perc = ["age"]

    # Values that do not depend on the segment are computed once, outside the loop
    max_values = df_used[categories].max()
    angles = np.linspace(0, 2 * np.pi, len(categories), endpoint=False).tolist()
    closed_angles = angles + angles[:1]

    # Grid of charts: at least 2 x 2 (historical layout), more rows when there
    # are more than 4 segments so that no segment falls outside the grid
    nb_cols = 2
    nb_rows = max(2, (nb_segments + nb_cols - 1) // nb_cols)
    fig, ax = plt.subplots(nb_rows, nb_cols, figsize=(20, 10.5 * nb_rows), subplot_kw=dict(polar=True))

    for index in range(nb_segments):
        row, col = divmod(index, nb_cols)
        current_ax = ax[row, col]

        segment_values = df_used.loc[index, categories]

        # true values are used to print the true value in parentheses
        tvalues = list(segment_values)

        # values are true values / max among the segments, which puts each
        # value in relation with the values of the other segments
        values = list(segment_values / max_values)

        # values normalized are used to adjust the value around the circle:
        # for instance if the maximum of values is equal to 0.8, we want the
        # point to be at 8/10th of the circle radius, not at the edge
        largest_value = max(values)
        values_normalized = [largest_value * elt for elt in values]

        # a transparent line (alpha=0) of values is drawn first to set the radius
        # of the circle; without it the radius of the circle would be too small
        current_ax.plot(closed_angles, values + values[:1], color='skyblue', alpha=0, linewidth=1.5)
        current_ax.plot(closed_angles, values_normalized + values_normalized[:1], color='black', alpha=0.5,
                        linewidth=1.2)

        # fill the sector
        current_ax.fill(angles, values_normalized, color='orange', alpha=0.4, label=index)

        # labels
        current_ax.set_yticklabels([])
        current_ax.set_xticks(angles)

        # define the ticks: category name and true value in parentheses
        values_printed = [
            str(round(tvalue, 2)) if category in var_not_perc else f"{round(100 * tvalue, 2)}%"
            for category, tvalue in zip(categories, tvalues)
        ]
        ticks = [
            category.replace("_", " ") + f"\n({printed})"
            for category, printed in zip(categories, values_printed)
        ]
        current_ax.set_xticklabels(ticks, color="black", size=20)

        current_ax.spines['polar'].set_visible(False)

        current_ax.set_title(f'Segment {index+1}\n', size=24)

    fig.suptitle(f"Characteristics of marketing personae of {type_of_activity} companies", size=32)

    plt.tight_layout()
# plt.show()
def known_sociodemo_caracteristics(df, type_of_activity):
    """
    Build a latex table giving, per segment, the mean (in %) of four sociodemographic
    indicator columns, and save it as a text file on S3 storage.

    The indicators are "is_profession_known", "is_zipcode_known",
    "categorie_age_inconnue" and "gender_other"; they are reported as share of known
    profession, known zipcode, unknown age and unknown gender respectively.
    The table is saved through save_txt_file_s3 under the name
    "table_known_socio_demo_caracteristics".

    Args:
    - df (DataFrame): must contain the column "segment" and the four indicator columns.
    - type_of_activity (str): forwarded to save_txt_file_s3.
    Returns:
    None
    """
    indicator_columns = ["is_profession_known", "is_zipcode_known", "categorie_age_inconnue", "gender_other"]

    # Mean of each indicator within each segment, expressed in percent
    shares = df.groupby("segment")[indicator_columns].mean().mul(100).reset_index()
    shares.columns = ['Segment', 'Share of Known Profession (%)', 'Share of Known Zipcode (%)', 'Share of Unknown Age (%)', 'Share of Unknown Gender (%)']

    # Reshape so that each segment becomes a column of the table
    shares = shares.pivot_table(index=None, columns='Segment')

    # Round the values of the DataFrame to one decimal place
    shares_rounded = shares.round(1)

    # Convert the DataFrame to LaTeX with the rounded values, then escape
    # the '%' symbol, which would otherwise start a LaTeX comment
    latex_table = tabulate(shares_rounded, headers='keys', tablefmt='latex_raw', floatfmt=".1f")
    latex_table = latex_table.replace('%', '\\%')

    save_txt_file_s3("table_known_socio_demo_caracteristics", type_of_activity, latex_table)