2024-03-31 18:35:58 +02:00
### importations ###
2024-03-20 13:07:24 +01:00
2024-03-31 18:35:58 +02:00
### functions for segmentation and graphics associated ###
2024-03-20 13:07:24 +01:00
def load_model(type_of_activity, model):
    """Load a pickled model stored in the project bucket.

    The file read is ``<bucket>/<model>.pkl`` where ``<bucket>`` is
    ``projet-bdc2324-team1/2_Output/2_1_Modeling_results/standard/<type_of_activity>/<model>/``.
    Access goes through the module-level ``fs`` object (defined outside
    this function).

    Args:
        type_of_activity: name of the activity sub-folder.
        model: model name (string); used both as folder name and file stem.

    Returns:
        The object obtained by unpickling the file content.
    """
    bucket = f"projet-bdc2324-team1/2_Output/2_1_Modeling_results/standard/{type_of_activity}/{model}/"
    file_path = bucket + model + '.pkl'

    with fs.open(file_path, mode="rb") as f:
        model_bytes = f.read()

    # SECURITY: unpickling can execute arbitrary code carried by the payload;
    # this is only acceptable because the bucket content is assumed trusted.
    loaded_model = pickle.loads(model_bytes)
    return loaded_model
2024-03-20 14:07:33 +01:00
def load_test_file(type_of_activity):
    """Read the test set of an activity type from the project bucket.

    The CSV read is
    ``projet-bdc2324-team1/1_Temp/1_0_Modelling_Datasets/<type_of_activity>/Test_set.csv``,
    opened through the module-level ``fs`` object (defined outside this
    function).

    Args:
        type_of_activity: name of the activity sub-folder.

    Returns:
        The DataFrame produced by ``pd.read_csv`` with a comma separator.
    """
    file_path_test = f"projet-bdc2324-team1/1_Temp/1_0_Modelling_Datasets/{type_of_activity}/Test_set.csv"

    with fs.open(file_path_test, mode="rb") as file_in:
        dataset_test = pd.read_csv(file_in, sep=",")

    return dataset_test
2024-03-31 18:35:58 +02:00
def save_file_s3_mp(File_name, type_of_activity):
    """Write the current pyplot figure as a PNG into the project bucket.

    The figure is rendered into an in-memory buffer (PNG, dpi=110) and the
    bytes are written to
    ``projet-bdc2324-team1/2_Output/2_2_Segmentation_and_Marketing_Personae/<type_of_activity>/<File_name><type_of_activity>.png``
    through the module-level ``fs`` object (defined outside this function).

    Args:
        File_name: prefix of the output file name.
        type_of_activity: activity name; used as sub-folder and file suffix.
    """
    path = f"projet-bdc2324-team1/2_Output/2_2_Segmentation_and_Marketing_Personae/{type_of_activity}/"
    file_path_out_s3 = path + File_name + type_of_activity + '.png'

    image_buffer = io.BytesIO()
    try:
        plt.savefig(image_buffer, format='png', dpi=110)
        with fs.open(file_path_out_s3, 'wb') as s3_file:
            # getvalue() returns the whole buffer regardless of the cursor
            # position, so no seek(0) is needed before writing.
            s3_file.write(image_buffer.getvalue())
    finally:
        # Close the figure even when rendering or the upload fails, so a
        # failed save does not leave an open figure behind.
        plt.close()
        image_buffer.close()
def save_txt_file_s3(file_name, type_of_activity, content):
    """Write a text payload into the project bucket.

    The destination is
    ``projet-bdc2324-team1/2_Output/2_2_Segmentation_and_Marketing_Personae/<type_of_activity>/<file_name><type_of_activity>.txt``,
    opened in text-write mode through the module-level ``fs`` object
    (defined outside this function).

    Args:
        file_name: prefix of the output file name.
        type_of_activity: activity name; used as sub-folder and file suffix.
        content: text passed as-is to ``write``.
    """
    folder = f"projet-bdc2324-team1/2_Output/2_2_Segmentation_and_Marketing_Personae/{type_of_activity}/"
    file_path_out_s3 = folder + file_name + type_of_activity + '.txt'

    with fs.open(file_path_out_s3, 'w') as s3_file:
        s3_file.write(content)
def df_business_fig(df, segment, list_var):
    """Compute each segment's percentage share of size and of summed KPIs.

    Args:
        df: DataFrame holding one row per observation.
        segment: name of the column used to group the rows.
        list_var: sequence (list, tuple, ...) of column names to sum per
            segment.

    Returns:
        A DataFrame with one row per segment and the columns
        ``[segment, "size", *list_var]``. ``"size"`` is the row count of the
        segment; every column except ``segment`` is expressed as a
        percentage of its column total (each of those columns sums to 100).
    """
    # Accept any sequence: the list concatenation below requires a list.
    list_var = list(list_var)

    grouped = df.groupby(segment)
    df_business_kpi = grouped[list_var].sum().reset_index()
    df_business_kpi.insert(1, "size", grouped.size().values)

    all_var = ["size"] + list_var
    # Column-wise normalisation: value / column total, in percent.
    df_business_kpi[all_var] = 100 * df_business_kpi[all_var] / df_business_kpi[all_var].sum()

    return df_business_kpi
def hist_segment_business_KPIs(df, segment, size, nb_tickets, nb_purchases, total_amount, nb_campaigns, type_of_activity):
    """Draw a stacked bar chart of each segment's weight on five business KPIs.

    A new pyplot figure is created; one bar is drawn per KPI and each bar is
    a stack with one layer per row of ``df`` (one row per segment is
    expected — TODO confirm against the caller). The figure is neither shown
    nor saved here.

    Args:
        df: DataFrame containing the segment column and the five KPI columns.
        segment: name of the column holding the segment label.
        size, nb_tickets, nb_purchases, total_amount, nb_campaigns: names of
            the KPI columns, plotted in this order.
        type_of_activity: activity name inserted in the title.
    """
    plt.figure()

    kpi_columns = [size, nb_tickets, nb_purchases, total_amount, nb_campaigns]
    df_plot = df[[segment] + kpi_columns]

    x = ["number of\ncustomers", "number of\ntickets", "number of\npurchases", "total\namount",
         "number of\ncampaigns"]

    # Iterate positionally over whatever rows are present instead of assuming
    # exactly four segments labelled 0..3 in the index.
    segment_labels = df_plot[segment].tolist()
    heights = df_plot[kpi_columns].to_numpy(dtype=float)
    nb_segments = len(segment_labels)

    bottom = np.zeros(len(x))

    # shades of blue, one per segment
    colors = plt.cm.Blues(np.linspace(0.1, 0.9, nb_segments))

    for label, height, color in zip(segment_labels, heights, colors):
        plt.bar(x=x, height=height, label=str(label), bottom=bottom, color=color)
        # the next layer starts where this one ends
        bottom += height

    # adjust margins so the legend placed outside the axes stays visible
    plt.subplots_adjust(left=0.125, right=0.8, bottom=0.1, top=0.9)

    plt.legend(title="segment", loc="upper right", bbox_to_anchor=(1.2, 1))
    plt.ylabel("Fraction represented by the segment (%)")
    plt.title(f"Relative weight of each segment regarding business KPIs\nfor {type_of_activity} companies", size=12)
    # plt.show()
# def df_segment_mp(df) :
# df_mp = df.groupby("segment")[["gender_female", "gender_male", "gender_other", "country_fr"]].mean().reset_index()
# df_mp.insert(3, "share_known_gender", df_mp["gender_female"]+df_mp["gender_male"])
# df_mp.insert(4, "share_of_women", df_mp["gender_female"]/(df_mp["share_known_gender"]))
# return df_mp
# def df_segment_pb (df) :
# df_pb = df.groupby("segment")[["prop_purchases_internet", "taux_ouverture_mail", "opt_in"]].mean().reset_index()
# return df_pb
def radar_mp_plot(df, categories, index):
    """Draw a radar (polar) chart of one segment's characteristics.

    A new 6x6 polar figure is created; it is neither shown nor saved here.

    Args:
        df: DataFrame with one row per segment; row ``index`` is plotted.
        categories: list of column names, one per radar axis.
        index: row label of the segment in ``df``; the title displays
            ``index + 1``.
    """
    # true values are only used to print the real figure in parentheses
    tvalues = list(df.loc[index, categories])

    # values are true values divided by the column maximum over all rows of
    # df, so each axis is read relative to the other segments
    max_values = df[categories].max()
    values = list(df.loc[index, categories] / max_values)

    # values are rescaled by the segment's own largest ratio: if that ratio
    # is 0.8, the furthest point sits at 8/10th of it rather than at the edge
    peak = max(values)
    values_normalized = [peak * elt for elt in values]

    # one angle per category, evenly spread around the circle
    num_categories = len(categories)
    angles = np.linspace(0, 2 * np.pi, num_categories, endpoint=False).tolist()
    closed_angles = angles + angles[:1]

    _, ax = plt.subplots(figsize=(6, 6), subplot_kw=dict(polar=True))

    # a fully transparent line (alpha=0) of the un-rescaled ratios is drawn
    # first; per the original author's note it sets the radius of the circle
    ax.plot(closed_angles, values + values[:1], color='skyblue', alpha=0, linewidth=1.5)
    ax.plot(closed_angles, values_normalized + values_normalized[:1], color='black', alpha=0.5, linewidth=1.2)

    # fill the sector
    ax.fill(angles, values_normalized, color='orange', alpha=0.4)

    # labels: category name plus the true value shown as a percentage
    ax.set_yticklabels([])
    ax.set_xticks(angles)
    ticks = [
        category.replace("_", " ") + f"\n({round(100 * tvalue, 2)}%)"
        for category, tvalue in zip(categories, tvalues)
    ]
    ax.set_xticklabels(ticks, color="black")

    ax.spines['polar'].set_visible(False)

    plt.title(f'Characteristics of the segment {index + 1}\n')
    # plt.show()
def radar_mp_plot_all(df, type_of_activity):
    """Draw one radar chart per segment on a shared two-column figure.

    Per-segment means are computed from ``df`` for the purchasing-behaviour
    columns (``prop_purchases_internet``, ``taux_ouverture_mail``,
    ``opt_in``), for ``age``, and for ``share_of_women`` defined as
    ``gender_female / (gender_female + gender_male)`` on the segment means.
    The figure is neither shown nor saved here.

    Args:
        df: DataFrame with a ``segment`` column and the columns listed above
            (plus ``gender_other``, which is aggregated but not plotted).
        type_of_activity: activity name inserted in the figure title.
    """
    # table summarizing variables relative to marketing personae
    df_mp = df.groupby("segment")[["gender_female", "gender_male", "gender_other", "age"]].mean().reset_index()
    df_mp.insert(4, "share_of_women", df_mp["gender_female"] / (df_mp["gender_female"] + df_mp["gender_male"]))

    # table relative to purchasing behaviour
    df_pb = df.groupby("segment")[["prop_purchases_internet", "taux_ouverture_mail", "opt_in"]].mean().reset_index()

    # concatenation of tables to prepare the plot
    df_used = pd.concat([df_pb, df_mp[['share_of_women', 'age']]], axis=1)

    # rename columns for the plot
    df_used = df_used.rename(columns={'taux_ouverture_mail': 'mails_opened', 'prop_purchases_internet': 'purchases_internet'})

    nb_segments = df_used.shape[0]
    categories = list(df_used.drop("segment", axis=1).columns)

    # variables printed as raw numbers rather than percentages
    var_not_perc = ["age"]

    # quantities that do not depend on the segment are computed once
    max_values = df_used[categories].max()
    angles = np.linspace(0, 2 * np.pi, len(categories), endpoint=False).tolist()
    closed_angles = angles + angles[:1]

    # Two charts per row. At least two rows are kept so that up to four
    # segments give the historical 2x2 / (20, 22) layout; more segments add
    # rows instead of indexing outside the grid.
    nb_rows = max(2, (nb_segments + 1) // 2)
    fig, ax = plt.subplots(nb_rows, 2, figsize=(20, 11 * nb_rows), subplot_kw=dict(polar=True))

    for index in range(nb_segments):
        # integer division gives the row, the remainder gives the column
        axis = ax[index // 2, index % 2]

        # true values are only used to print the real figure in parentheses
        tvalues = list(df_used.loc[index, categories])

        # values are true values divided by the maximum over the segments,
        # so each axis is read relative to the other segments
        values = list(df_used.loc[index, categories] / max_values)

        # values are rescaled by the segment's own largest ratio: if that
        # ratio is 0.8, the furthest point sits at 8/10th of it, not at the edge
        peak = max(values)
        values_normalized = [peak * elt for elt in values]

        # a fully transparent line (alpha=0) of the un-rescaled ratios is
        # drawn first; per the original author's note it sets the radius
        axis.plot(closed_angles, values + values[:1], color='skyblue', alpha=0, linewidth=1.5)
        axis.plot(closed_angles, values_normalized + values_normalized[:1], color='black', alpha=0.5,
                  linewidth=1.2)

        # fill the sector
        axis.fill(angles, values_normalized, color='orange', alpha=0.4, label=index)

        # labels
        axis.set_yticklabels([])
        axis.set_xticks(angles)

        # define the ticks: category name plus its true value, as a
        # percentage unless the variable is listed in var_not_perc
        ticks = []
        for category, tvalue in zip(categories, tvalues):
            if category in var_not_perc:
                value_printed = str(round(tvalue, 2))
            else:
                value_printed = f"{round(100 * tvalue, 2)}%"
            ticks.append(category.replace("_", " ") + f"\n({value_printed})")

        axis.set_xticklabels(ticks, color="black", size=20)

        axis.spines['polar'].set_visible(False)

        axis.set_title(f'Segment {index + 1}\n', size=24)

    fig.suptitle(f"Characteristics of marketing personae of {type_of_activity} companies", size=32)

    plt.tight_layout()
    # plt.show()
def known_sociodemo_caracteristics(df, type_of_activity):
    """Export, as a LaTeX table, per-segment shares of (un)known socio-demographics.

    For each segment the mean of ``is_profession_known``,
    ``is_zipcode_known``, ``categorie_age_inconnue`` and ``gender_other`` is
    computed and multiplied by 100. The table is rendered with ``tabulate``
    and written through ``save_txt_file_s3`` under the name prefix
    ``table_known_socio_demo_caracteristics``.

    Args:
        df: DataFrame with a ``segment`` column and the four columns above.
        type_of_activity: activity name forwarded to ``save_txt_file_s3``.
    """
    table_share_known = df.groupby("segment")[["is_profession_known", "is_zipcode_known", "categorie_age_inconnue", "gender_other"]].mean().mul(100).reset_index()
    table_share_known.columns = ['Segment', 'Share of Known Profession (%)', 'Share of Known Zipcode (%)', 'Share of Unknown Age (%)', 'Share of Unknown Gender (%)']

    # presumably yields one row per indicator and one column per segment —
    # TODO confirm against the generated table
    table_share_known = table_share_known.pivot_table(index=None, columns='Segment')

    # round the values of the DataFrame to one decimal place
    table_share_known_rounded = table_share_known.round(1)

    # convert the rounded DataFrame to LaTeX
    latex_table = tabulate(table_share_known_rounded, headers='keys', tablefmt='latex_raw', floatfmt=".1f")

    # escape the percent signs so LaTeX does not treat them as comments
    latex_table = latex_table.replace('%', '\\%')

    save_txt_file_s3("table_known_socio_demo_caracteristics", type_of_activity, latex_table)