# BDC-team-1/utils_segmentation_V2TP.py

### imports ###

import pandas as pd
import numpy as np
import os
import io
import s3fs
import re
import pickle
import warnings
import matplotlib.pyplot as plt


### functions for segmentation and the associated graphics ###

def load_model(type_of_activity, model):
    """Load a pickled model stored under the project's ``Output_model`` folder.

    The file read is
    ``projet-bdc2324-team1/Output_model/<type_of_activity>/<model>/<model>.pkl``.

    Parameters
    ----------
    type_of_activity : str
        Sub-folder name inserted in the path.
    model : str
        Model name, used both as the folder name and as the file stem.

    Returns
    -------
    object
        Whatever object the pickle file contains.

    Notes
    -----
    Relies on a module-level filesystem object ``fs`` that is not defined in
    the visible part of this file (presumably an ``s3fs.S3FileSystem`` --
    TODO confirm).
    """
    model_dir = f"projet-bdc2324-team1/Output_model/{type_of_activity}/{model}/"
    pickle_path = model_dir + model + '.pkl'

    with fs.open(pickle_path, mode="rb") as handle:
        payload = handle.read()

    # NOTE: unpickling executes code embedded in the payload; only load
    # files that come from a trusted location.
    return pickle.loads(payload)


def load_test_file(type_of_activity):
    """Read the ``Test_set.csv`` file of an activity into a DataFrame.

    The file read is
    ``projet-bdc2324-team1/Generalization/<type_of_activity>/Test_set.csv``,
    parsed as comma-separated values.

    Relies on a module-level filesystem object ``fs`` that is not defined in
    the visible part of this file.
    """
    test_set_path = f"projet-bdc2324-team1/Generalization/{type_of_activity}/Test_set.csv"
    with fs.open(test_set_path, mode="rb") as csv_stream:
        return pd.read_csv(csv_stream, sep=",")


def save_file_s3_mp(File_name, type_of_activity):
    """Save the current matplotlib figure as a PNG, then close it.

    The figure is rendered into an in-memory buffer and written to
    ``projet-bdc2324-team1/Output_marketing_personae_analysis/<type_of_activity>/``
    under the name ``<File_name><type_of_activity>.png``.

    Relies on a module-level filesystem object ``fs`` that is not defined in
    the visible part of this file.
    """
    # Render the current figure in memory first.
    png_buffer = io.BytesIO()
    plt.savefig(png_buffer, format='png')

    output_dir = f"projet-bdc2324-team1/Output_marketing_personae_analysis/{type_of_activity}/"
    destination = output_dir + File_name + type_of_activity + '.png'

    with fs.open(destination, 'wb') as s3_file:
        s3_file.write(png_buffer.getvalue())

    plt.close()


def df_business_fig(df, segment, list_var):
    """Compute, per segment, its percentage share of several totals.

    Parameters
    ----------
    df : pandas.DataFrame
        One row per observation.
    segment : str
        Column used to group the rows.
    list_var : list of str
        Columns summed within each segment.

    Returns
    -------
    pandas.DataFrame
        One row per segment with the columns ``segment``, ``"size"`` (share
        of the rows falling in the segment) and every column of ``list_var``
        (share of that column's total), all expressed in percent.
    """
    by_segment = df.groupby(segment)

    kpi_table = by_segment[list_var].sum().reset_index()
    # The number of rows per segment goes right after the segment column.
    kpi_table.insert(1, "size", by_segment.size().values)

    share_columns = ["size"] + list_var
    column_totals = kpi_table[share_columns].sum()
    kpi_table[share_columns] = 100 * kpi_table[share_columns] / column_totals

    return kpi_table


def hist_segment_business_KPIs(df, segment, size, nb_tickets, nb_purchases, total_amount, nb_campaigns):
    """Draw a stacked bar chart of the weight of each segment on five KPIs.

    A new matplotlib figure is created and left open (nothing is shown or
    saved here). Each row of ``df`` is one segment and becomes one layer of
    the stacked bars.

    Parameters
    ----------
    df : pandas.DataFrame
        One row per segment, e.g. the output of ``df_business_fig``.
    segment : str
        Column holding the segment label (used for the legend).
    size, nb_tickets, nb_purchases, total_amount, nb_campaigns : str
        Columns holding the values stacked in each of the five bars, in that
        order.

    Notes
    -----
    The number of segments is taken from ``df`` instead of being fixed to 4,
    and rows are read by position so the index of ``df`` does not matter.
    """
    plt.figure()

    kpi_columns = [size, nb_tickets, nb_purchases, total_amount, nb_campaigns]
    df_plot = df[[segment] + kpi_columns]

    x = ["number of\ncustomers", "number of\ntickets", "number of\npurchases", "total\namount",
         "number of\ncampaigns"]

    nb_segments = df_plot.shape[0]

    # Running top of the stack for each of the bars
    bottom = np.zeros(len(x))

    # One shade of blue per segment
    colors = plt.cm.Blues(np.linspace(0.1, 0.9, nb_segments))

    for i in range(nb_segments):
        height = df_plot[kpi_columns].iloc[i].to_numpy(dtype=float)
        plt.bar(x=x, height=height, label=str(df_plot[segment].iloc[i]), bottom=bottom, color=colors[i])
        # Build a new array rather than mutating the one handed to plt.bar
        bottom = bottom + height

    # Adjust margins to leave room for the legend on the right
    plt.subplots_adjust(left=0.125, right=0.8, bottom=0.1, top=0.9)

    plt.legend(title="segment", loc="upper right", bbox_to_anchor=(1.2, 1))
    plt.ylabel("Fraction represented by the segment (%)")
    plt.title("Relative weight of each segment regarding business KPIs")


def df_segment_mp(df, segment, gender_female, gender_male, gender_other, country_fr):
    """Average the gender and country indicators per segment.

    Parameters
    ----------
    df : pandas.DataFrame
        One row per observation.
    segment : str
        Column used to group the rows.
    gender_female, gender_male, gender_other, country_fr : str
        Columns averaged within each segment.

    Returns
    -------
    pandas.DataFrame
        One row per segment with the columns, in order: ``segment``,
        ``gender_female``, ``gender_male``, ``"share_known_gender"``
        (female mean + male mean), ``"share_of_women"`` (female mean divided
        by ``share_known_gender``), ``gender_other``, ``country_fr``.
    """
    df_mp = df.groupby(segment)[[gender_female, gender_male, gender_other, country_fr]].mean().reset_index()

    # Derived columns are computed from the aggregated table itself; the
    # previous code read them from a global that this module does not define.
    df_mp.insert(3, "share_known_gender", df_mp[gender_female] + df_mp[gender_male])
    df_mp.insert(4, "share_of_women", df_mp[gender_female] / df_mp["share_known_gender"])
    return df_mp


def df_segment_pb(df, segment, nb_tickets_internet, nb_tickets, nb_campaigns_opened, nb_campaigns, opt_in):
    """Average three purchasing-behaviour indicators per segment.

    For every row two ratios are computed, ``nb_tickets_internet / nb_tickets``
    and ``nb_campaigns_opened / nb_campaigns``; these ratios and the ``opt_in``
    column are then averaged within each segment.

    Parameters
    ----------
    df : pandas.DataFrame
        One row per observation. It is left untouched.
    segment : str
        Column used to group the rows.
    nb_tickets_internet, nb_tickets, nb_campaigns_opened, nb_campaigns, opt_in : str
        Names of the columns used as described above.

    Returns
    -------
    pandas.DataFrame
        One row per segment with the columns ``segment``,
        ``"share_tickets_internet"``, ``"share_campaigns_opened"`` and
        ``opt_in``.
    """
    # assign() returns a new frame, so the helper columns are not added to
    # the caller's DataFrame.
    df_used = df.assign(
        share_tickets_internet=df[nb_tickets_internet] / df[nb_tickets],
        share_campaigns_opened=df[nb_campaigns_opened] / df[nb_campaigns],
    )
    df_pb = df_used.groupby(segment)[["share_tickets_internet", "share_campaigns_opened", opt_in]].mean().reset_index()
    return df_pb


def radar_mp_plot(df, categories, index) :
    """Draw the radar chart of one segment in a new polar figure.

    Parameters
    ----------
    df : pandas.DataFrame
        One row per segment; ``index`` is looked up with ``df.loc``.
    categories : sequence of str
        Columns of ``df`` drawn as the axes of the radar.
    index : int
        Row label of the segment to draw; the title shows ``index + 1``.

    The figure is created and left open (nothing is shown or saved here).
    """
    segment_row = df.loc[index, categories]

    # Raw values, printed in parentheses (as percentages) next to each label.
    raw_shares = list(segment_row)

    # Each value is divided by the largest value of its column over all rows
    # of df, so a ratio of 1 means this segment holds the column maximum.
    column_maxima = df[categories].max()
    relative = list(segment_row / column_maxima)

    # The ratios are then multiplied by the largest ratio of this segment;
    # these are the values actually drawn.
    peak = max(relative)
    scaled = [peak * ratio for ratio in relative]

    # One evenly spaced angle per category.
    nb_axes = len(categories)
    angles = np.linspace(0, 2 * np.pi, nb_axes, endpoint=False).tolist()
    closed_angles = angles + angles[:1]

    fig, ax = plt.subplots(figsize=(6, 6), subplot_kw=dict(polar=True))

    # Invisible outline (alpha=0) of the unscaled ratios: it is drawn only so
    # that the radial extent of the chart accounts for them.
    ax.plot(closed_angles, relative + relative[:1], color='skyblue', alpha=0, linewidth=1.5)
    # Visible outline and filled area of the scaled values.
    ax.plot(closed_angles, scaled + scaled[:1], color='black', alpha = 0.5, linewidth=1.2)
    ax.fill(angles, scaled, color='orange', alpha=0.4)

    # Axis labels: category name followed by the raw value in percent.
    tick_labels = [
        name.replace("_"," ") + f"\n({round(100 * share,2)}%)"
        for name, share in zip(categories, raw_shares)
    ]
    ax.set_yticklabels([])
    ax.set_xticks(angles)
    ax.set_xticklabels(tick_labels, color="black")

    ax.spines['polar'].set_visible(False)

    plt.title(f'Characteristics of the segment {index+1}\n')


def radar_mp_plot_all(df, categories) :
    """Draw one radar chart per segment on a grid of polar subplots.

    Parameters
    ----------
    df : pandas.DataFrame
        One row per segment; rows are looked up with ``df.loc`` using the
        labels ``0 .. len(df) - 1``.
    categories : sequence of str
        Columns of ``df`` drawn as the axes of each radar.

    Notes
    -----
    The charts are laid out two per row. The grid has at least two rows
    (the historical 2x2 layout) and grows when there are more than four
    segments. The figure is created and left open (nothing is shown or saved
    here).
    """
    nb_segments = df.shape[0]

    # Two charts per row; never fewer than two rows so that the layout for
    # up to four segments is the same as before.
    nb_cols = 2
    nb_rows = max(2, (nb_segments + nb_cols - 1) // nb_cols)

    # Initialize graphic
    fig, ax = plt.subplots(nb_rows, nb_cols, figsize=(25, 10 * nb_rows), subplot_kw=dict(polar=True))

    # These do not depend on the segment, so they are computed once.
    max_values = df[categories].max()
    num_categories = len(categories)
    angles = np.linspace(0, 2 * np.pi, num_categories, endpoint=False).tolist()

    for index in range(nb_segments) :
        row, col = divmod(index, nb_cols)
        axis = ax[row, col]

        # true values are used to print the true value in parentheses
        tvalues = list(df.loc[index, categories])

        # values are true values / max over all segments: a ratio of 1 means
        # the value is maximal for the segment considered
        values = list(df.loc[index, categories] / max_values)

        # the ratios are multiplied by the largest ratio of the segment;
        # these are the values actually drawn
        values_normalized = [max(values) * elt for elt in values]

        # a transparent line (alpha=0) of the unscaled ratios is drawn first
        # so that the radial extent of the chart accounts for them
        axis.plot(angles + angles[:1], values + values[:1], color='skyblue', alpha=0, linewidth=1.5)
        axis.plot(angles + angles[:1], values_normalized + values_normalized[:1], color='black', alpha=0.5,
                  linewidth=1.2)

        # fill the sector
        axis.fill(angles, values_normalized, color='orange', alpha=0.4, label=index)

        # labels: category name followed by the true value in percent
        axis.set_yticklabels([])
        axis.set_xticks(angles)
        ticks = [categories[i].replace("_"," ") + f"\n({round(100 * tvalues[i],2)}%)" for i in range(len(categories))]
        axis.set_xticklabels(ticks, color="black", size=20)

        axis.spines['polar'].set_visible(False)

        axis.set_title(f'Characteristics of the segment {index+1}\n', size=24)