BDC-team-1/utils_segmentation.py

### importations ###


### functions for segmentation and graphics associated ###

def load_model(type_of_activity, model):
    """Read a pickled model from the project storage and unpickle it.

    The file is opened through the module-level ``fs`` object (defined
    outside this section; presumably an S3 filesystem handle - TODO confirm).

    Parameters
    ----------
    type_of_activity : str
        Name of the sub-folder of ``2_1_Modeling_results/standard`` to read from.
    model : str
        Model name; used both as folder name and as base name of the
        ``.pkl`` file.

    Returns
    -------
    object
        Whatever object was stored in the pickle file.
    """
    model_dir = f"projet-bdc2324-team1/2_Output/2_1_Modeling_results/standard/{type_of_activity}/{model}/"
    pickle_path = model_dir + model + '.pkl'

    with fs.open(pickle_path, mode="rb") as stream:
        serialized = stream.read()

    # NOTE(review): unpickling executes arbitrary code contained in the file;
    # this is only safe as long as the bucket content is trusted.
    return pickle.loads(serialized)


def load_test_file(type_of_activity):
    """Read the ``Test_set.csv`` file of an activity into a DataFrame.

    The file is opened in binary mode through the module-level ``fs`` object
    (defined outside this section) and parsed as a comma-separated CSV.

    Parameters
    ----------
    type_of_activity : str
        Name of the sub-folder of ``1_0_Modelling_Datasets`` to read from.

    Returns
    -------
    pandas.DataFrame
        Content of the CSV file.
    """
    test_csv_path = f"projet-bdc2324-team1/1_Temp/1_0_Modelling_Datasets/{type_of_activity}/Test_set.csv"
    with fs.open(test_csv_path, mode="rb") as stream:
        return pd.read_csv(stream, sep=",")


def save_file_s3_mp(File_name, type_of_activity):
    """Save the current matplotlib figure as a PNG and close it.

    The figure is rendered into an in-memory buffer (110 dpi) and the bytes
    are written through the module-level ``fs`` object to
    ``<output folder>/<type_of_activity>/<File_name><type_of_activity>.png``.

    Parameters
    ----------
    File_name : str
        Prefix of the output file name.
    type_of_activity : str
        Used both as sub-folder name and as suffix of the file name.
    """
    # Render the current figure in memory rather than on local disk
    png_buffer = io.BytesIO()
    plt.savefig(png_buffer, format='png', dpi=110)

    target_dir = f"projet-bdc2324-team1/2_Output/2_2_Segmentation_and_Marketing_Personae/{type_of_activity}/"
    target_path = target_dir + File_name + type_of_activity + '.png'

    with fs.open(target_path, 'wb') as s3_file:
        # getvalue() returns the whole buffer, whatever the cursor position
        s3_file.write(png_buffer.getvalue())

    plt.close()

def save_txt_file_s3(file_name, type_of_activity, content):
    """Write a text content to a ``.txt`` file of the segmentation output folder.

    The file is written through the module-level ``fs`` object to
    ``<output folder>/<type_of_activity>/<file_name><type_of_activity>.txt``.

    Parameters
    ----------
    file_name : str
        Prefix of the output file name.
    type_of_activity : str
        Used both as sub-folder name and as suffix of the file name.
    content : str
        Text written to the file.
    """
    target_dir = f"projet-bdc2324-team1/2_Output/2_2_Segmentation_and_Marketing_Personae/{type_of_activity}/"
    target_path = target_dir + file_name + type_of_activity + '.txt'
    with fs.open(target_path, 'w') as txt_file:
        txt_file.write(content)
        
def df_business_fig(df, segment, list_var):
    """Compute the share (in %) of each segment in several aggregated variables.

    For every group of ``segment``, the variables of ``list_var`` are summed
    and the number of rows is counted (column ``size``). Each of these columns
    is then divided by its total over all groups and multiplied by 100.

    Parameters
    ----------
    df : pandas.DataFrame
        Table containing the ``segment`` column and the ``list_var`` columns.
    segment : str
        Name of the grouping column.
    list_var : list of str
        Names of the columns to sum within each group.

    Returns
    -------
    pandas.DataFrame
        One row per group, with columns ``segment``, ``size`` and ``list_var``
        (in this order), the last ones expressed as percentages.
    """
    per_segment = df.groupby(segment)

    shares = per_segment[list_var].sum().reset_index()
    # group sizes come out in the same (sorted) order as the sums above
    shares.insert(1, "size", per_segment.size().values)

    share_columns = ["size"] + list_var
    column_totals = shares[share_columns].sum()
    shares[share_columns] = 100 * shares[share_columns] / column_totals

    return shares


def hist_segment_business_KPIs(df, segment, size, nb_tickets, nb_purchases, total_amount, nb_campaigns, type_of_activity) :
    """Draw a stacked bar chart of the weight of each segment in 5 business KPIs.

    A new matplotlib figure is created with one bar per KPI; each bar is the
    stack of the values of all the rows of ``df`` (one row per segment, one
    shade of blue per segment). Nothing is returned or shown: the figure is
    left as the current matplotlib figure.

    The number of segments is taken from the number of rows of ``df`` and the
    rows are read by position, so the function is not limited to exactly four
    segments indexed 0..3.

    Parameters
    ----------
    df : pandas.DataFrame
        One row per segment (presumably the output of ``df_business_fig``,
        with values in % - TODO confirm against callers).
    segment : str
        Name of the column holding the segment label (used in the legend).
    size, nb_tickets, nb_purchases, total_amount, nb_campaigns : str
        Names of the 5 KPI columns, plotted in this order.
    type_of_activity : str
        Inserted in the title of the figure.
    """
    plt.figure()

    kpi_columns = [size, nb_tickets, nb_purchases, total_amount, nb_campaigns]
    df_plot = df[[segment] + kpi_columns]

    x = ["number of\ncustomers", "number of\ntickets", "number of\npurchases", "total\namount",
         "number of\ncampaigns"]

    # one row of heights per segment, read by position rather than by label
    heights = df_plot[kpi_columns].to_numpy(dtype=float)
    nb_segments = len(df_plot)

    # one shade of blue per segment
    colors = plt.cm.Blues(np.linspace(0.1, 0.9, nb_segments))

    # running top of the stack for each of the KPI bars
    bottom = np.zeros(len(kpi_columns))

    for seg_label, height, color in zip(df_plot[segment], heights, colors) :
        plt.bar(x=x, height=height, label=str(seg_label), bottom=bottom, color=color)
        bottom = bottom + height

    # Adjust margins to leave room for the legend on the right
    plt.subplots_adjust(left = 0.125, right = 0.8, bottom = 0.1, top = 0.9)

    plt.legend(title = "segment", loc = "upper right", bbox_to_anchor=(1.2, 1))
    plt.ylabel("Fraction represented by the segment (%)")
    plt.title(f"Relative weight of each segment regarding business KPIs\nfor {type_of_activity} companies", size=12)
    # plt.show()


# def df_segment_mp(df) :
#     df_mp = df.groupby("segment")[["gender_female", "gender_male", "gender_other", "country_fr"]].mean().reset_index()
#     df_mp.insert(3, "share_known_gender", df_mp["gender_female"]+df_mp["gender_male"])
#     df_mp.insert(4, "share_of_women", df_mp["gender_female"]/(df_mp["share_known_gender"]))
#     return df_mp


# def df_segment_pb (df) :
#     df_pb = df.groupby("segment")[["prop_purchases_internet", "taux_ouverture_mail", "opt_in"]].mean().reset_index()
#     return df_pb


def radar_mp_plot(df, categories, index, var_not_perc=()) :
    """Draw the radar (spider) chart of the characteristics of one segment.

    A new 6x6 polar figure is created. Nothing is returned or shown: the
    figure is left as the current matplotlib figure.

    Parameters
    ----------
    df : pandas.DataFrame
        One row per segment; the row with label ``index`` is plotted and the
        other rows are only used to compute the maximum of each category.
    categories : list of str
        Names of the columns of ``df`` displayed around the circle.
    index : int
        Label of the row to plot; the title shows ``index + 1``.
    var_not_perc : collection of str, optional
        Categories whose value is printed as is (rounded to 2 decimals)
        instead of being multiplied by 100 and suffixed with ``%``.
        Empty by default, i.e. every value is printed as a percentage.
    """
    segment_row = df.loc[index, categories]

    # true values are used to print the true value in parentheses
    tvalues = list(segment_row)

    max_values = df[categories].max()

    # values are true values / max among the segments: this puts the values
    # in relation with the values of the other segments. A ratio of 1 means
    # that the value is maximal for the segment considered, even if the
    # true value is not equal to 1
    values = list(segment_row / max_values)

    # values normalized are used to adjust the value around the circle:
    # for instance if the maximum of values is equal to 0.8, we want the point
    # to be at 8/10th of the circle radius, not at the edge
    largest_ratio = max(values, default=0.0)
    values_normalized = [largest_ratio * elt for elt in values]

    angles = np.linspace(0, 2 * np.pi, len(categories), endpoint=False).tolist()
    # repeat the first angle to close the polygon
    closed_angles = angles + angles[:1]

    # Initialize graphic
    _, ax = plt.subplots(figsize=(6, 6), subplot_kw=dict(polar=True))

    # we have to draw first a transparent line (alpha=0) of values to adjust
    # the radius of the circle, which is based on max(values)
    ax.plot(closed_angles, values + values[:1], color='skyblue', alpha=0, linewidth=1.5)
    ax.plot(closed_angles, values_normalized + values_normalized[:1], color='black', alpha = 0.5, linewidth=1.2)

    # fill the sector
    ax.fill(angles, values_normalized, color='orange', alpha=0.4)

    # labels: category name followed by the true value in parentheses
    ticks = []
    for category, tvalue in zip(categories, tvalues) :
        if category in var_not_perc :
            value_printed = str(round(tvalue, 2))
        else :
            value_printed = f"{round(100 * tvalue,2)}%"
        ticks.append(category.replace("_"," ") + f"\n({value_printed})")

    ax.set_yticklabels([])
    ax.set_xticks(angles)
    ax.set_xticklabels(ticks, color="black")

    ax.spines['polar'].set_visible(False)

    plt.title(f'Characteristics of the segment {index+1}\n')

    # plt.show()


def radar_mp_plot_all(df, type_of_activity) :
    """Draw one radar (spider) chart per segment on a single figure.

    The per-segment means of the purchasing-behaviour variables
    (``prop_purchases_internet``, ``taux_ouverture_mail``, ``opt_in``), the
    share of women among customers of known gender and the mean ``age`` are
    computed from ``df`` and displayed on a grid of polar axes with two
    columns. The grid has at least two rows (historical 2x2 layout) and gets
    more rows when there are more than four segments. Nothing is returned or
    shown: the figure is left as the current matplotlib figure.

    Parameters
    ----------
    df : pandas.DataFrame
        Customer-level table with a ``segment`` column and the columns
        ``gender_female``, ``gender_male``, ``gender_other``, ``age``,
        ``prop_purchases_internet``, ``taux_ouverture_mail`` and ``opt_in``.
    type_of_activity : str
        Inserted in the title of the figure.
    """
    # table summarizing variables relative to marketing personae
    df_mp = df.groupby("segment")[["gender_female", "gender_male", "gender_other", "age"]].mean().reset_index()
    df_mp.insert(4, "share_of_women", df_mp["gender_female"]/(df_mp["gender_female"]+df_mp["gender_male"]))

    # table relative to purchasing behaviour
    df_pb = df.groupby("segment")[["prop_purchases_internet", "taux_ouverture_mail", "opt_in"]].mean().reset_index()

    # concatenation of tables to prepare the plot
    df_used = pd.concat([df_pb, df_mp[[ 'share_of_women', 'age']]], axis=1)

    # rename columns for the plot
    df_used = df_used.rename(columns={'taux_ouverture_mail': 'mails_opened', 'prop_purchases_internet': 'purchases_internet'})

    # visualization
    nb_segments = df_used.shape[0]
    categories = list(df_used.drop("segment", axis=1).columns)

    # variables printed as raw values instead of percentages
    var_not_perc = ["age"]

    # quantities shared by all the segments, computed once
    max_values = df_used[categories].max()
    angles = np.linspace(0, 2 * np.pi, len(categories), endpoint=False).tolist()
    # repeat the first angle to close the polygon
    closed_angles = angles + angles[:1]

    # Initialize graphic: 2 columns, at least 2 rows (so that ax is always
    # a 2-D array), more rows if there are more than 4 segments
    nb_cols = 2
    nb_rows = max(2, (nb_segments + nb_cols - 1) // nb_cols)
    fig, ax = plt.subplots(nb_rows, nb_cols, figsize=(20, 11 * nb_rows), subplot_kw=dict(polar=True))

    for index in range(nb_segments) :
        # integer division gives the row number, the remainder the column
        row, col = divmod(index, nb_cols)
        current_ax = ax[row, col]

        segment_row = df_used.loc[index, categories]

        # true values are used to print the true value in parentheses
        tvalues = list(segment_row)

        # values are true values / max among the segments: this puts the
        # values in relation with the values of the other segments. A ratio
        # of 1 means that the value is maximal for the segment considered,
        # even if the true value is not equal to 1
        values = list(segment_row / max_values)

        # values normalized are used to adjust the value around the circle:
        # for instance if the maximum of values is equal to 0.8, we want the
        # point to be at 8/10th of the circle radius, not at the edge
        largest_ratio = max(values, default=0.0)
        values_normalized = [largest_ratio * elt for elt in values]

        # we have to draw first a transparent line (alpha=0) of values to
        # adjust the radius of the circle, which is based on max(values)
        current_ax.plot(closed_angles, values + values[:1], color='skyblue', alpha=0, linewidth=1.5)
        current_ax.plot(closed_angles, values_normalized + values_normalized[:1], color='black', alpha = 0.5,
                        linewidth=1.2)

        # fill the sector
        current_ax.fill(angles, values_normalized, color='orange', alpha=0.4, label = index)

        # labels
        current_ax.set_yticklabels([])
        current_ax.set_xticks(angles)

        # define the ticks: category name followed by the true value
        ticks = []
        for category, tvalue in zip(categories, tvalues) :
            if category in var_not_perc :
                value_printed = str(round(tvalue,2))
            else :
                value_printed = f"{round(100 * tvalue,2)}%"
            ticks.append(category.replace("_"," ") + f"\n({value_printed})")
        current_ax.set_xticklabels(ticks, color="black", size = 20)

        current_ax.spines['polar'].set_visible(False)

        current_ax.set_title(f'Segment {index+1}\n', size = 24)

    fig.suptitle(f"Characteristics of marketing personae of {type_of_activity} companies", size=32)

    plt.tight_layout()
    # plt.show()

def known_sociodemo_caracteristics(df, type_of_activity) :
    """Export a LaTeX table of the share of known socio-demographic data per segment.

    For every segment, the mean of four indicator columns is computed and
    multiplied by 100 (presumably 0/1 flags, so that the result is a
    percentage - TODO confirm). The table is pivoted, rounded to one decimal,
    converted to LaTeX with ``tabulate`` and written with ``save_txt_file_s3``
    under the name ``table_known_socio_demo_caracteristics``. Nothing is
    returned.

    Parameters
    ----------
    df : pandas.DataFrame
        Table with a ``segment`` column and the columns
        ``is_profession_known``, ``is_zipcode_known``,
        ``categorie_age_inconnue`` and ``gender_other``.
    type_of_activity : str
        Passed to ``save_txt_file_s3`` to build the output path.
    """
    indicator_columns = ["is_profession_known", "is_zipcode_known", "categorie_age_inconnue", "gender_other"]
    displayed_labels = ['Segment', 'Share of Known Profession (%)', 'Share of Known Zipcode (%)', 'Share of Unknown Age (%)', 'Share of Unknown Gender (%)']

    # mean of each indicator within each segment, scaled by 100
    shares = df.groupby("segment")[indicator_columns].mean().mul(100).reset_index()
    shares.columns = displayed_labels

    # presumably yields one row per indicator and one column per segment
    shares_by_segment = shares.pivot_table(index=None, columns='Segment')

    # Round the values of the DataFrame to one decimal
    shares_rounded = shares_by_segment.round(1)

    # Convert the DataFrame to LaTeX with the rounded values; the '%' signs
    # of the labels are escaped afterwards
    latex_table = tabulate(shares_rounded, headers='keys', tablefmt='latex_raw', floatfmt=".1f")
    latex_table = latex_table.replace('%', '\\%')

    save_txt_file_s3("table_known_socio_demo_caracteristics", type_of_activity, latex_table)
Ajout description marketing personae 2024-03-31 18:35:58 +02:00			`### importations ###`
commit segmentation 2024-03-20 13:07:24 +01:00

Ajout description marketing personae 2024-03-31 18:35:58 +02:00

			`### functions for segmentation and graphics associated ###`

commit segmentation 2024-03-20 13:07:24 +01:00			`def load_model(type_of_activity, model):`
Ajout description marketing personae 2024-03-31 18:35:58 +02:00			`BUCKET = f"projet-bdc2324-team1/2_Output/2_1_Modeling_results/standard/{type_of_activity}/{model}/"`
commit segmentation 2024-03-20 13:07:24 +01:00			`filename = model + '.pkl'`
			`file_path = BUCKET + filename`
			`with fs.open(file_path, mode="rb") as f:`
			`model_bytes = f.read()`

			`model = pickle.loads(model_bytes)`
			`return model`

add probability 2024-03-20 14:07:33 +01:00
			`def load_test_file(type_of_activity):`
Ajout description marketing personae 2024-03-31 18:35:58 +02:00			`file_path_test = f"projet-bdc2324-team1/1_Temp/1_0_Modelling_Datasets/{type_of_activity}/Test_set.csv"`
add probability 2024-03-20 14:07:33 +01:00			`with fs.open(file_path_test, mode="rb") as file_in:`
			`dataset_test = pd.read_csv(file_in, sep=",")`
			`return dataset_test`

Ajout description marketing personae 2024-03-31 18:35:58 +02:00
			`def save_file_s3_mp(File_name, type_of_activity):`
			`image_buffer = io.BytesIO()`
			`plt.savefig(image_buffer, format='png', dpi=110)`
			`image_buffer.seek(0)`
			`PATH = f"projet-bdc2324-team1/2_Output/2_2_Segmentation_and_Marketing_Personae/{type_of_activity}/"`
			`FILE_PATH_OUT_S3 = PATH + File_name + type_of_activity + '.png'`
			`with fs.open(FILE_PATH_OUT_S3, 'wb') as s3_file:`
			`s3_file.write(image_buffer.read())`
			`plt.close()`

			`def save_txt_file_s3(file_name, type_of_activity, content):`
			`FILE_PATH = f"projet-bdc2324-team1/2_Output/2_2_Segmentation_and_Marketing_Personae/{type_of_activity}/"`
			`FILE_PATH_OUT_S3 = FILE_PATH + file_name + type_of_activity + '.txt'`
			`with fs.open(FILE_PATH_OUT_S3, 'w') as s3_file:`
			`s3_file.write(content)`

			`def df_business_fig(df, segment, list_var) :`
			`df_business_kpi = df.groupby(segment)[list_var].sum().reset_index()`
			`df_business_kpi.insert(1, "size", df.groupby(segment).size().values)`
			`all_var = ["size"] + list_var`
			`df_business_kpi[all_var] = 100 * df_business_kpi[all_var] / df_business_kpi[all_var].sum()`

			`return df_business_kpi`


			`def hist_segment_business_KPIs(df, segment, size, nb_tickets, nb_purchases, total_amount, nb_campaigns, type_of_activity) :`

			`plt.figure()`

			`df_plot = df[[segment, size, nb_tickets, nb_purchases, total_amount, nb_campaigns]]`

			`x = ["number of\ncustomers", "number of\ntickets", "number of\npurchases", "total\namount",`
			`"number of\ncampaigns"]`

			`bottom = np.zeros(5)`

			`# types of blue color`
			`colors = plt.cm.Blues(np.linspace(0.1, 0.9, 4))`

			`for i in range(4) :`
			`height = list(df_plot.loc[i,size:].values)`
			`plt.bar(x=x, height=height, label = str(df_plot[segment][i]), bottom=bottom, color=colors[i])`
			`bottom+=height`

			`# Ajust margins`
			`plt.subplots_adjust(left = 0.125, right = 0.8, bottom = 0.1, top = 0.9)`

			`plt.legend(title = "segment", loc = "upper right", bbox_to_anchor=(1.2, 1))`
			`plt.ylabel("Fraction represented by the segment (%)")`
			`plt.title(f"Relative weight of each segment regarding business KPIs\nfor {type_of_activity} companies", size=12)`
			`# plt.show()`


			`# def df_segment_mp(df) :`
			`# df_mp = df.groupby("segment")[["gender_female", "gender_male", "gender_other", "country_fr"]].mean().reset_index()`
			`# df_mp.insert(3, "share_known_gender", df_mp["gender_female"]+df_mp["gender_male"])`
			`# df_mp.insert(4, "share_of_women", df_mp["gender_female"]/(df_mp["share_known_gender"]))`
			`# return df_mp`


			`# def df_segment_pb (df) :`
			`# df_pb = df.groupby("segment")[["prop_purchases_internet", "taux_ouverture_mail", "opt_in"]].mean().reset_index()`
			`# return df_pb`


			`def radar_mp_plot(df, categories, index) :`
			`categories = categories`

			`# true values are used to print the true value in parenthesis`
			`tvalues = list(df.loc[index,categories])`

			`max_values = df[categories].max()`

			`# values are true values / max among the 4 segments, allows to`
			`# put values in relation with the values for other segments`
			`# if the point has a maximal abscisse it means that value is maximal for the segment considered`
			`# , event if not equal to 1`

			`values = list(df.loc[index,categories]/max_values)`

			`# values normalized are used to adjust the value around the circle`
			`# for instance if the maximum of values is equal to 0.8, we want the point to be`
			`# at 8/10th of the circle radius, not at the edge`
			`values_normalized = [ max(values) * elt for elt in values]`

			`# Nb of categories`
			`num_categories = len(categories)`

			`angles = np.linspace(0, 2 * np.pi, num_categories, endpoint=False).tolist()`

			`# Initialize graphic`
			`fig, ax = plt.subplots(figsize=(6, 6), subplot_kw=dict(polar=True))`

			`# we have to draw first a transparent line (alpha=0) of values to adjust the radius of the circle`
			`# which is based on max(value)`
			`ax.plot(angles + angles[:1], values + values[:1], color='skyblue', alpha=0, linewidth=1.5)`
			`ax.plot(angles + angles[:1], values_normalized + values_normalized[:1], color='black', alpha = 0.5, linewidth=1.2)`

			`# fill the sector`
			`ax.fill(angles, values_normalized, color='orange', alpha=0.4)`

			`# labels`
			`ax.set_yticklabels([])`
			`ax.set_xticks(angles)`
			`ticks = [categories[i].replace("_"," ") + f"\n({round(100 * tvalues[i],2)}%)" for i in range(len(categories))]`
			`ax.set_xticklabels(ticks, color="black")`

			`ax.spines['polar'].set_visible(False)`

			`plt.title(f'Characteristics of the segment {index+1}\n')`

			`# plt.show()`


			`def radar_mp_plot_all(df, type_of_activity) :`

			`# table summarizing variables relative to marketing personae`
final changes for spider charts 2024-03-31 23:59:52 +02:00			`df_mp = df.groupby("segment")[["gender_female", "gender_male", "gender_other", "age"]].mean().reset_index()`
			`#df_mp.insert(3, "share_known_gender", df_mp["gender_female"]+df_mp["gender_male"])`
			`df_mp.insert(4, "share_of_women", df_mp["gender_female"]/(df_mp["gender_female"]+df_mp["gender_male"]))`
Ajout description marketing personae 2024-03-31 18:35:58 +02:00
			`# table relative to purchasing behaviour`
			`df_pb = df.groupby("segment")[["prop_purchases_internet", "taux_ouverture_mail", "opt_in"]].mean().reset_index()`

			`# concatenation of tables to prepare the plot`
final changes for spider charts 2024-03-31 23:59:52 +02:00			`df_used = pd.concat([df_pb, df_mp[[ 'share_of_women', 'age']]], axis=1)`

			`# rename columns for the plot`
			`df_used = df_used.rename(columns={'taux_ouverture_mail': 'mails_opened', 'prop_purchases_internet': 'purchases_internet'})`
Ajout description marketing personae 2024-03-31 18:35:58 +02:00
			`# visualization`
			`nb_segments = df_used.shape[0]`
			`categories = list(df_used.drop("segment", axis=1).columns)`

final changes for spider charts 2024-03-31 23:59:52 +02:00			`var_not_perc = ["age"]`

Ajout description marketing personae 2024-03-31 18:35:58 +02:00			`# Initialize graphic`
minor change : adjusted size of spider chart 2024-04-02 13:36:34 +02:00			`fig, ax = plt.subplots(2,2, figsize=(20, 22), subplot_kw=dict(polar=True))`
Ajout description marketing personae 2024-03-31 18:35:58 +02:00
			`for index in range(nb_segments) :`
			`row = index // 2 # Division entière pour obtenir le numéro de ligne`
			`col = index % 2`

			`# true values are used to print the true value in parenthesis`
			`tvalues = list(df_used.loc[index,categories])`

			`max_values = df_used[categories].max()`

			`# values are true values / max among the 4 segments, allows to`
			`# put values in relation with the values for other segments`
			`# if the point has a maximal abscisse it means that value is maximal for the segment considered`
			`# , event if not equal to 1`

			`values = list(df_used.loc[index,categories]/max_values)`

			`# values normalized are used to adjust the value around the circle`
			`# for instance if the maximum of values is equal to 0.8, we want the point to be`
			`# at 8/10th of the circle radius, not at the edge`
			`values_normalized = [ max(values) * elt for elt in values]`

			`# Nb of categories`
			`num_categories = len(categories)`

			`angles = np.linspace(0, 2 * np.pi, num_categories, endpoint=False).tolist()`

			`# we have to draw first a transparent line (alpha=0) of values to adjust the radius of the circle`
			`# which is based on max(value)`
			`ax[row, col].plot(angles + angles[:1], values + values[:1], color='skyblue', alpha=0, linewidth=1.5)`
			`ax[row, col].plot(angles + angles[:1], values_normalized + values_normalized[:1], color='black', alpha = 0.5,`
			`linewidth=1.2)`

			`# fill the sector`
			`ax[row, col].fill(angles, values_normalized, color='orange', alpha=0.4, label = index)`

			`# labels`
			`ax[row, col].set_yticklabels([])`
			`ax[row, col].set_xticks(angles)`
final changes for spider charts 2024-03-31 23:59:52 +02:00
			`# define the ticks`
			`values_printed = [str(round(tvalues[i],2)) if categories[i] in var_not_perc else f"{round(100 * tvalues[i],2)}%" for i in range(len(categories))]`
			`ticks = [categories[i].replace("_"," ") + f"\n({values_printed[i]})" for i in range(len(categories))]`
Ajout description marketing personae 2024-03-31 18:35:58 +02:00			`ax[row, col].set_xticklabels(ticks, color="black", size = 20)`

			`ax[row, col].spines['polar'].set_visible(False)`

			`ax[row, col].set_title(f'Segment {index+1}\n', size = 24)`

			`fig.suptitle(f"Characteristics of marketing personae of {type_of_activity} companies", size=32)`
final changes for spider charts 2024-03-31 23:59:52 +02:00
			`plt.tight_layout()`
Ajout description marketing personae 2024-03-31 18:35:58 +02:00			`# plt.show()`

			`def known_sociodemo_caracteristics(df, type_of_activity) :`

			`table_share_known = df.groupby("segment")[["is_profession_known", "is_zipcode_known", "categorie_age_inconnue", "gender_other"]].mean().mul(100).reset_index()`
			`table_share_known.columns = ['Segment', 'Share of Known Profession (%)', 'Share of Known Zipcode (%)', 'Share of Unknown Age (%)', 'Share of Unknown Gender (%)']`
			`table_share_known= table_share_known.pivot_table(index=None, columns='Segment')`

			`# Arrondir les valeurs du DataFrame à une décimale`
			`table_share_known_rounded = table_share_known.round(1)`

			`# Convertir le DataFrame en format LaTeX avec les valeurs arrondies et le symbole '%'`
			`latex_table = tabulate(table_share_known_rounded, headers='keys', tablefmt='latex_raw', floatfmt=".1f")`
			`latex_table = latex_table.replace('%', '\\%')`

			`save_txt_file_s3("table_known_socio_demo_caracteristics", type_of_activity, latex_table)`