Ajout description marketing personae
This commit is contained in:
		
							parent
							
								
									7341752be0
								
							
						
					
					
						commit
						8e61e9d2a4
					
				|  | @ -1,40 +0,0 @@ | |||
| import pandas as pd | ||||
| import numpy as np | ||||
| import os | ||||
| import io | ||||
| import s3fs | ||||
| import re | ||||
| import pickle | ||||
| import warnings | ||||
| 
 | ||||
| 
 | ||||
# Run the helper module inside this namespace so that load_test_file and
# load_model become available; the context manager closes the file handle
# (the bare exec(open(...).read()) form left it open).
with open('utils_segmentation.py') as utils_file:
    exec(utils_file.read())
warnings.filterwarnings('ignore')

# Create filesystem object (used as a global by the helper functions)
S3_ENDPOINT_URL = "https://" + os.environ["AWS_S3_ENDPOINT"]
fs = s3fs.S3FileSystem(client_kwargs={'endpoint_url': S3_ENDPOINT_URL})

# choose the type of companies for which you want to run the pipeline
type_of_activity = input('Choisissez le type de compagnie : sport ? musique ? musee ?')

# load test set
dataset_test = load_test_file(type_of_activity)

# Load Model
model = load_model(type_of_activity, 'LogisticRegression_Benchmark')

# Processing: feature columns handed to the model ('is_partner' is left out)
X_test = dataset_test[['nb_tickets', 'nb_purchases', 'total_amount', 'nb_suppliers', 'vente_internet_max', 'purchase_date_min', 'purchase_date_max',
            'time_between_purchase', 'nb_tickets_internet',  'is_email_true', 'opt_in',
            'gender_female', 'gender_male', 'gender_other', 'nb_campaigns', 'nb_campaigns_opened']]

y_test = dataset_test[['y_has_purchased']]

# Prediction: second column of predict_proba
y_pred_prob = model.predict_proba(X_test)[:, 1]

# Add probability to dataset_test
dataset_test['Probability_to_buy'] = y_pred_prob
print('probability added to dataset_test')
print(dataset_test.head())
|  | @ -1,99 +0,0 @@ | |||
| ### importations ### | ||||
| ### not necessary? We exec the associated utils .py file | ||||
| 
 | ||||
| """ | ||||
| import pandas as pd | ||||
| import numpy as np | ||||
| import os | ||||
| import io | ||||
| import s3fs | ||||
| import re | ||||
| import pickle | ||||
| import warnings | ||||
| import matplotlib.pyplot as plt | ||||
| """ | ||||
| 
 | ||||
| ### --- beginning of the code --- ### | ||||
| 
 | ||||
| 
 | ||||
| ### hyperparameters of the code ### | ||||
| 
 | ||||
| ################################### | ||||
| 
 | ||||
# choose the type of companies for which you want to run the pipeline
type_of_activity = input('Choisissez le type de compagnie : sport ? musique ? musee ?')

# choose the model we use for the segmentation
model_name = "LogisticRegression_Benchmark"

###################################


# execute file including functions we need; the context manager closes the
# file handle that exec(open(...).read()) used to leave open
with open('utils_segmentation_V2TP.py') as utils_file:
    exec(utils_file.read())

warnings.filterwarnings('ignore')

# Create filesystem object (used as a global by the helper functions)
S3_ENDPOINT_URL = "https://" + os.environ["AWS_S3_ENDPOINT"]
fs = s3fs.S3FileSystem(client_kwargs={'endpoint_url': S3_ENDPOINT_URL})

# load test set
dataset_test = load_test_file(type_of_activity)

# Load Model
model = load_model(type_of_activity, model_name)


### Preprocessing of data
X_test = dataset_test[['nb_tickets', 'nb_purchases', 'total_amount', 'nb_suppliers', 'vente_internet_max', 'purchase_date_min', 'purchase_date_max',
            'time_between_purchase', 'nb_tickets_internet',  'is_email_true', 'opt_in',
            'gender_female', 'gender_male', 'gender_other', 'nb_campaigns', 'nb_campaigns_opened', 'country_fr']]

y_test = dataset_test[['y_has_purchased']]

# Independent copy: the columns added below (label, prediction, score,
# segment) must not end up in X_test, which is what the model receives.
# A plain `X_test_segment = X_test` alias leaked them into the features.
X_test_segment = X_test.copy()

# add y_has_purchased to the segmentation table
X_test_segment["has_purchased"] = y_test

# Add prediction and probability to the segmentation table
y_pred = model.predict(X_test)
X_test_segment["has_purchased_estim"] = y_pred

y_pred_prob = model.predict_proba(X_test)[:, 1]
X_test_segment['score'] = y_pred_prob

# four segments from score quartile-width bins: <0.25, <0.5, <0.75, else
X_test_segment["segment"] = np.where(X_test_segment['score']<0.25, '1',
                   np.where(X_test_segment['score']<0.5, '2',
                   np.where(X_test_segment['score']<0.75, '3', '4')))

### 1. business KPIs

business_var = ["nb_tickets", "nb_purchases", "total_amount", "nb_campaigns"]
X_test_business_fig = df_business_fig(X_test_segment, "segment", business_var)

# save histogram to Minio
hist_segment_business_KPIs(X_test_business_fig, "segment", "size", "nb_tickets",
                           "nb_purchases", "total_amount", "nb_campaigns")
save_file_s3_mp(File_name = "segments_business_KPI_", type_of_activity = type_of_activity)


### 2. description of marketing personae (spider chart)

# table summarizing variables relative to marketing personae
X_test_segment_mp = df_segment_mp(X_test_segment, "segment", "gender_female",
                                  "gender_male", "gender_other", "country_fr")

# table relative to purchasing behaviour
X_test_segment_pb = df_segment_pb(X_test_segment, "segment", "nb_tickets_internet", "nb_tickets",
                                  "nb_campaigns_opened", "nb_campaigns", "opt_in")

# concatenation of tables to prepare the plot
X_test_segment_caract = pd.concat([X_test_segment_pb, X_test_segment_mp[['share_known_gender', 'share_of_women', 'country_fr']]], axis=1)

# visualization and save the graphic to the MinIo
categories = list(X_test_segment_caract.drop("segment", axis=1).columns)
radar_mp_plot_all(df=X_test_segment_caract, categories=categories)
save_file_s3_mp(File_name = "spider_chart_all_", type_of_activity = type_of_activity)
| 
 | ||||
							
								
								
									
										82
									
								
								6_Segmentation_and_Marketing_Personae.py
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										82
									
								
								6_Segmentation_and_Marketing_Personae.py
									
									
									
									
									
										Normal file
									
								
							|  | @ -0,0 +1,82 @@ | |||
| 
 | ||||
| # Packages | ||||
| import pandas as pd | ||||
| import numpy as np | ||||
| import os | ||||
| import io | ||||
| import s3fs | ||||
| import re | ||||
| import pickle | ||||
| import warnings | ||||
| import matplotlib.pyplot as plt | ||||
| from tabulate import tabulate | ||||
| 
 | ||||
| ################################### | ||||
| 
 | ||||
| # choose the model we use for the segmentation | ||||
model_name = "LogisticRegression_Benchmark"

###################################


# execute file including functions we need; the context manager closes the
# file handle that exec(open(...).read()) used to leave open
with open('utils_segmentation.py') as utils_file:
    exec(utils_file.read())

warnings.filterwarnings('ignore')

# Create filesystem object (used as a global by the helper functions)
S3_ENDPOINT_URL = "https://" + os.environ["AWS_S3_ENDPOINT"]
fs = s3fs.S3FileSystem(client_kwargs={'endpoint_url': S3_ENDPOINT_URL})


# Run the pipeline for every type of company. To run a single one
# interactively instead, use:
# type_of_activity = input('Choisissez le type de compagnie : sport ? musique ? musee ?')
for type_of_activity in ['musee', 'sport', 'musique'] :

    # load test set
    dataset_test = load_test_file(type_of_activity)

    # Load Model
    model = load_model(type_of_activity, model_name)

    ### Preprocessing of data
    X_test = dataset_test.drop(columns = 'y_has_purchased')

    y_test = dataset_test[['y_has_purchased']]

    # Independent copy: the columns added below (label, prediction, score,
    # segment) must not end up in X_test, which is what the model receives.
    # A plain `X_test_segment = X_test` alias leaked them into the features.
    X_test_segment = X_test.copy()

    # add y_has_purchased to the segmentation table
    X_test_segment["has_purchased"] = y_test

    # Add prediction and probability to the segmentation table
    y_pred = model.predict(X_test)
    X_test_segment["has_purchased_estim"] = y_pred

    y_pred_prob = model.predict_proba(X_test)[:, 1]
    X_test_segment['score'] = y_pred_prob

    # four segments from score bins: <0.25, <0.5, <0.75, else
    X_test_segment["segment"] = np.where(X_test_segment['score']<0.25, '1',
                       np.where(X_test_segment['score']<0.5, '2',
                       np.where(X_test_segment['score']<0.75, '3', '4')))

    ### 1. business KPIs

    business_var = ["nb_tickets", "nb_purchases", "total_amount", "nb_campaigns"]
    X_test_business_fig = df_business_fig(X_test_segment, "segment", business_var)

    # save histogram to Minio
    hist_segment_business_KPIs(X_test_business_fig, "segment", "size", "nb_tickets",
                               "nb_purchases", "total_amount", "nb_campaigns", type_of_activity)
    save_file_s3_mp(File_name = "segments_business_KPI_", type_of_activity = type_of_activity)


    ### 2. description of marketing personae
    ## A. Spider chart
    radar_mp_plot_all(df = X_test_segment, type_of_activity = type_of_activity)
    save_file_s3_mp(File_name = "spider_chart_all_", type_of_activity = type_of_activity)

    ## B. Latex table
    known_sociodemo_caracteristics(df = X_test_segment, type_of_activity = type_of_activity)
|  | @ -1,15 +1,12 @@ | |||
| import pandas as pd | ||||
| import numpy as np | ||||
| import os | ||||
| import io | ||||
| import s3fs | ||||
| import re | ||||
| import pickle | ||||
| import warnings | ||||
| ### importations ### | ||||
| 
 | ||||
| 
 | ||||
| 
 | ||||
| 
 | ||||
| ### functions for segmentation and graphics associated ### | ||||
| 
 | ||||
| def load_model(type_of_activity, model): | ||||
|     BUCKET = f"projet-bdc2324-team1/Output_model/{type_of_activity}/{model}/" | ||||
|     BUCKET = f"projet-bdc2324-team1/2_Output/2_1_Modeling_results/standard/{type_of_activity}/{model}/" | ||||
|     filename = model + '.pkl' | ||||
|     file_path = BUCKET + filename | ||||
|     with fs.open(file_path, mode="rb") as f: | ||||
|  | @ -20,8 +17,207 @@ def load_model(type_of_activity, model): | |||
| 
 | ||||
| 
 | ||||
def load_test_file(type_of_activity):
    """Read the test set CSV of one type of activity from the bucket.

    Relies on the module-level filesystem object ``fs``.

    Args:
        type_of_activity: folder name of the activity, inserted in the path.

    Returns:
        The content of ``Test_set.csv`` as a pandas DataFrame.
    """
    # Only the current location is kept; the former "Generalization" path
    # was assigned and immediately overwritten, so it was never used.
    file_path_test = f"projet-bdc2324-team1/1_Temp/1_0_Modelling_Datasets/{type_of_activity}/Test_set.csv"
    with fs.open(file_path_test, mode="rb") as file_in:
        dataset_test = pd.read_csv(file_in, sep=",")
    return dataset_test
| 
 | ||||
| 
 | ||||
def save_file_s3_mp(File_name, type_of_activity):
    """Save the current matplotlib figure as a PNG in the bucket, then close it.

    Relies on the module-level filesystem object ``fs`` and on ``plt``.

    Args:
        File_name: prefix of the output file name.
        type_of_activity: used both as output folder and file name suffix.
    """
    try:
        # Render into memory first so the PNG bytes can be written through fs
        image_buffer = io.BytesIO()
        plt.savefig(image_buffer, format='png', dpi=110)
        PATH = f"projet-bdc2324-team1/2_Output/2_2_Segmentation_and_Marketing_Personae/{type_of_activity}/"
        FILE_PATH_OUT_S3 = PATH + File_name + type_of_activity + '.png'
        with fs.open(FILE_PATH_OUT_S3, 'wb') as s3_file:
            s3_file.write(image_buffer.getvalue())
    finally:
        # Close the figure even when rendering or the upload fails, so that a
        # failed save does not leave the figure open for the next plot
        plt.close()
| 
 | ||||
def save_txt_file_s3(file_name, type_of_activity, content):
    """Write ``content`` to a .txt file in the bucket.

    Relies on the module-level filesystem object ``fs``.

    Args:
        file_name: prefix of the output file name.
        type_of_activity: used both as output folder and file name suffix.
        content: text written to the file.
    """
    output_dir = f"projet-bdc2324-team1/2_Output/2_2_Segmentation_and_Marketing_Personae/{type_of_activity}/"
    output_path = output_dir + file_name + type_of_activity + '.txt'
    with fs.open(output_path, 'w') as text_file:
        text_file.write(content)
|          | ||||
def df_business_fig(df, segment, list_var) :
    """Compute the percentage share of each segment for size and KPI totals.

    Args:
        df: DataFrame holding the segment column and the KPI columns.
        segment: name of the column to group by.
        list_var: list of column names summed per segment.

    Returns:
        DataFrame with one row per segment: the segment column, a "size"
        column (row count per segment) and the columns of ``list_var``, each
        expressed as a percentage of its column total.
    """
    per_segment = df.groupby(segment)
    kpi_table = per_segment[list_var].sum().reset_index()
    # row count of each segment, placed right after the segment column
    kpi_table.insert(1, "size", per_segment.size().values)

    share_columns = ["size"] + list_var
    column_totals = kpi_table[share_columns].sum()
    kpi_table[share_columns] = 100 * kpi_table[share_columns] / column_totals

    return kpi_table
| 
 | ||||
| 
 | ||||
def hist_segment_business_KPIs(df, segment, size, nb_tickets, nb_purchases, total_amount, nb_campaigns, type_of_activity) :
    """Draw a stacked bar chart of the weight of each segment per business KPI.

    The figure is created with ``plt.figure()`` and left open so the caller
    can save it.

    Args:
        df: table with one row per segment (values stacked as bar heights).
        segment: name of the column holding the segment label.
        size, nb_tickets, nb_purchases, total_amount, nb_campaigns: names of
            the five columns plotted, in that order.
        type_of_activity: name inserted in the figure title.
    """
    plt.figure()

    kpi_columns = [size, nb_tickets, nb_purchases, total_amount, nb_campaigns]
    df_plot = df[[segment] + kpi_columns]

    x = ["number of\ncustomers", "number of\ntickets", "number of\npurchases", "total\namount",
         "number of\ncampaigns"]

    bottom = np.zeros(len(kpi_columns))

    # Iterate over the rows actually present instead of assuming exactly four
    # segments labelled 0..3: an empty segment used to raise a KeyError.
    nb_segments = len(df_plot)

    # types of blue color (at least four shades so the usual palette is kept)
    colors = plt.cm.Blues(np.linspace(0.1, 0.9, max(nb_segments, 4)))

    for position in range(nb_segments) :
        height = df_plot[kpi_columns].iloc[position].to_numpy(dtype=float)
        plt.bar(x=x, height=height, label = str(df_plot[segment].iloc[position]), bottom=bottom, color=colors[position])
        bottom = bottom + height

    # Adjust margins to leave room for the legend on the right
    plt.subplots_adjust(left = 0.125, right = 0.8, bottom = 0.1, top = 0.9)

    plt.legend(title = "segment", loc = "upper right", bbox_to_anchor=(1.2, 1))
    plt.ylabel("Fraction represented by the segment (%)")
    plt.title(f"Relative weight of each segment regarding business KPIs\nfor {type_of_activity} companies", size=12)
    # plt.show()
| 
 | ||||
| 
 | ||||
| # def df_segment_mp(df) : | ||||
| #     df_mp = df.groupby("segment")[["gender_female", "gender_male", "gender_other", "country_fr"]].mean().reset_index() | ||||
| #     df_mp.insert(3, "share_known_gender", df_mp["gender_female"]+df_mp["gender_male"]) | ||||
| #     df_mp.insert(4, "share_of_women", df_mp["gender_female"]/(df_mp["share_known_gender"])) | ||||
| #     return df_mp | ||||
| 
 | ||||
| 
 | ||||
| # def df_segment_pb (df) : | ||||
| #     df_pb = df.groupby("segment")[["prop_purchases_internet", "taux_ouverture_mail", "opt_in"]].mean().reset_index() | ||||
| #     return df_pb | ||||
| 
 | ||||
| 
 | ||||
def radar_mp_plot(df, categories, index) :
    """Draw the radar chart of a single row of ``df``.

    A new 6x6 polar figure is created and left open.

    Args:
        df: table whose row ``index`` is plotted.
        categories: names of the columns drawn around the circle.
        index: row label of the segment to plot; the title shows index + 1.
    """
    segment_row = df.loc[index, categories]

    # raw values, printed in parentheses in the tick labels
    raw_values = list(segment_row)

    # Each value is divided by the maximum of its column over all rows, so a
    # point reaching the maximal abscissa means the value is maximal for this
    # segment, even if it is not equal to 1.
    ratios = list(segment_row / df[categories].max())

    # Rescale by the largest ratio of the row: if that maximum is 0.8, the
    # points lie at 8/10th of the radius rather than at the edge.
    largest_ratio = max(ratios, default=0)
    scaled_ratios = [largest_ratio * ratio for ratio in ratios]

    angles = np.linspace(0, 2 * np.pi, len(categories), endpoint=False).tolist()
    closed_angles = angles + angles[:1]

    # Initialize graphic
    _, axis = plt.subplots(figsize=(6, 6), subplot_kw=dict(polar=True))

    # A transparent line (alpha=0) of the unscaled ratios is drawn first to
    # set the radius of the circle, which is based on max(ratios)
    axis.plot(closed_angles, ratios + ratios[:1], color='skyblue', alpha=0, linewidth=1.5)
    axis.plot(closed_angles, scaled_ratios + scaled_ratios[:1], color='black', alpha = 0.5, linewidth=1.2)

    # fill the sector
    axis.fill(angles, scaled_ratios, color='orange', alpha=0.4)

    # labels: category name plus the raw value as a percentage
    tick_labels = [
        name.replace("_"," ") + f"\n({round(100 * raw_value,2)}%)"
        for name, raw_value in zip(categories, raw_values)
    ]
    axis.set_yticklabels([])
    axis.set_xticks(angles)
    axis.set_xticklabels(tick_labels, color="black")

    axis.spines['polar'].set_visible(False)

    plt.title(f'Characteristics of the segment {index+1}\n')

    # plt.show()
| 
 | ||||
| 
 | ||||
def radar_mp_plot_all(df, type_of_activity) :
    """Draw one radar chart per segment on a shared figure.

    The figure is left open so the caller can save it.

    Args:
        df: customer-level table containing the columns "segment",
            "gender_female", "gender_male", "gender_other", "country_fr",
            "prop_purchases_internet", "taux_ouverture_mail" and "opt_in".
        type_of_activity: name inserted in the figure title.
    """
    # table summarizing variables relative to marketing personae
    df_mp = df.groupby("segment")[["gender_female", "gender_male", "gender_other", "country_fr"]].mean().reset_index()
    df_mp.insert(3, "share_known_gender", df_mp["gender_female"]+df_mp["gender_male"])
    df_mp.insert(4, "share_of_women", df_mp["gender_female"]/(df_mp["share_known_gender"]))

    # table relative to purchasing behaviour
    df_pb = df.groupby("segment")[["prop_purchases_internet", "taux_ouverture_mail", "opt_in"]].mean().reset_index()

    # concatenation of tables to prepare the plot
    df_used = pd.concat([df_pb, df_mp[['share_known_gender', 'share_of_women', 'country_fr']]], axis=1)

    # visualization
    nb_segments = df_used.shape[0]
    categories = list(df_used.drop("segment", axis=1).columns)

    # Quantities that do not depend on the segment are computed once
    max_values = df_used[categories].max()
    angles = np.linspace(0, 2 * np.pi, len(categories), endpoint=False).tolist()
    closed_angles = angles + angles[:1]

    # Initialize graphic: two charts per row, at least the usual 2x2 grid,
    # with extra rows if there are more than four segments
    nb_rows = max(2, -(-nb_segments // 2))
    fig, ax = plt.subplots(nb_rows, 2, figsize=(25, 10 * nb_rows), subplot_kw=dict(polar=True))

    for index in range(nb_segments) :
        panel = ax[index // 2, index % 2]

        # Real label of the segment: index + 1 is wrong as soon as one
        # segment is empty and therefore missing from the grouped table
        segment_label = df_used.loc[index, "segment"]

        # true values are used to print the true value in parenthesis
        tvalues = list(df_used.loc[index,categories])

        # values are true values / max among the segments, which puts them in
        # relation with the other segments: a point at the maximal abscissa
        # means the value is maximal for this segment, even if not equal to 1
        values = list(df_used.loc[index,categories]/max_values)

        # values normalized adjust the points around the circle: if the
        # maximum of values is 0.8, the point is at 8/10th of the radius
        largest_value = max(values, default=0)
        values_normalized = [largest_value * elt for elt in values]

        # a transparent line (alpha=0) of values is drawn first to set the
        # radius of the circle, which is based on max(value)
        panel.plot(closed_angles, values + values[:1], color='skyblue', alpha=0, linewidth=1.5)
        panel.plot(closed_angles, values_normalized + values_normalized[:1], color='black', alpha = 0.5,
                   linewidth=1.2)

        # fill the sector
        panel.fill(angles, values_normalized, color='orange', alpha=0.4, label = segment_label)

        # labels
        panel.set_yticklabels([])
        panel.set_xticks(angles)
        ticks = [name.replace("_"," ") + f"\n({round(100 * tvalue,2)}%)" for name, tvalue in zip(categories, tvalues)]
        panel.set_xticklabels(ticks, color="black", size = 20)

        panel.spines['polar'].set_visible(False)

        panel.set_title(f'Segment {segment_label}\n', size = 24)

    fig.suptitle(f"Characteristics of marketing personae of {type_of_activity} companies", size=32)
    # plt.show()
| 
 | ||||
def known_sociodemo_caracteristics(df, type_of_activity) :
    """Build a LaTeX table of four indicator shares per segment and save it.

    The per-segment means of four indicator columns are multiplied by 100,
    pivoted so that segments become columns, formatted with ``tabulate`` and
    written through ``save_txt_file_s3``.

    Args:
        df: customer-level table containing "segment" and the four indicator
            columns listed below.
        type_of_activity: forwarded to ``save_txt_file_s3``.
    """
    indicator_columns = ["is_profession_known", "is_zipcode_known", "categorie_age_inconnue", "gender_other"]
    display_names = ['Segment', 'Share of Known Profession (%)', 'Share of Known Zipcode (%)', 'Share of Unknown Age (%)', 'Share of Unknown Gender (%)']

    shares = df.groupby("segment")[indicator_columns].mean().mul(100).reset_index()
    shares.columns = display_names
    shares = shares.pivot_table(index=None, columns='Segment')

    # Round the values of the DataFrame to one decimal place
    shares_rounded = shares.round(1)

    # Convert the DataFrame to LaTeX with the rounded values, then escape the
    # '%' symbol
    latex_source = tabulate(shares_rounded, headers='keys', tablefmt='latex_raw', floatfmt=".1f")
    latex_source = latex_source.replace('%', '\\%')

    save_txt_file_s3("table_known_socio_demo_caracteristics", type_of_activity, latex_source)
|      | ||||
| 
 | ||||
|  |  | |||
|  | @ -1,201 +0,0 @@ | |||
| ### importations ### | ||||
| 
 | ||||
| import pandas as pd | ||||
| import numpy as np | ||||
| import os | ||||
| import io | ||||
| import s3fs | ||||
| import re | ||||
| import pickle | ||||
| import warnings | ||||
| import matplotlib.pyplot as plt | ||||
| 
 | ||||
| 
 | ||||
| ### functions for segmentation and graphics associated ### | ||||
| 
 | ||||
def load_model(type_of_activity, model):
    """Load a pickled model from the bucket.

    Relies on the module-level filesystem object ``fs``.

    Args:
        type_of_activity: folder name of the activity, inserted in the path.
        model: model name, used both as folder and as file name (+ '.pkl').

    Returns:
        The object obtained by unpickling the file.
    """
    model_dir = f"projet-bdc2324-team1/Output_model/{type_of_activity}/{model}/"
    pickle_path = model_dir + model + '.pkl'
    with fs.open(pickle_path, mode="rb") as pickle_file:
        serialized_model = pickle_file.read()

    # NOTE(review): pickle.loads can execute arbitrary code; only safe if the
    # bucket content is trusted - confirm.
    return pickle.loads(serialized_model)
| 
 | ||||
| 
 | ||||
def load_test_file(type_of_activity):
    """Read the test set CSV of one type of activity from the bucket.

    Relies on the module-level filesystem object ``fs``.

    Args:
        type_of_activity: folder name of the activity, inserted in the path.

    Returns:
        The content of ``Test_set.csv`` as a pandas DataFrame.
    """
    csv_path = f"projet-bdc2324-team1/Generalization/{type_of_activity}/Test_set.csv"
    with fs.open(csv_path, mode="rb") as csv_file:
        return pd.read_csv(csv_file, sep=",")
| 
 | ||||
| 
 | ||||
def save_file_s3_mp(File_name, type_of_activity):
    """Save the current matplotlib figure as a PNG in the bucket, then close it.

    Relies on the module-level filesystem object ``fs`` and on ``plt``.

    Args:
        File_name: prefix of the output file name.
        type_of_activity: used both as output folder and file name suffix.
    """
    try:
        # Render into memory first so the PNG bytes can be written through fs
        image_buffer = io.BytesIO()
        plt.savefig(image_buffer, format='png', dpi=110)
        PATH = f"projet-bdc2324-team1/Output_marketing_personae_analysis/{type_of_activity}/"
        FILE_PATH_OUT_S3 = PATH + File_name + type_of_activity + '.png'
        with fs.open(FILE_PATH_OUT_S3, 'wb') as s3_file:
            s3_file.write(image_buffer.getvalue())
    finally:
        # Close the figure even when rendering or the upload fails, so that a
        # failed save does not leave the figure open for the next plot
        plt.close()
| 
 | ||||
| 
 | ||||
def df_business_fig(df, segment, list_var) :
    """Compute the percentage share of each segment for size and KPI totals.

    Args:
        df: DataFrame holding the segment column and the KPI columns.
        segment: name of the column to group by.
        list_var: list of column names summed per segment.

    Returns:
        DataFrame with one row per segment: the segment column, a "size"
        column (row count per segment) and the columns of ``list_var``, each
        expressed as a percentage of its column total.
    """
    groups = df.groupby(segment)
    shares = groups[list_var].sum().reset_index()
    # number of rows of each segment, inserted right after the segment column
    shares.insert(1, "size", groups.size().values)

    percent_columns = ["size"] + list_var
    totals = shares[percent_columns].sum()
    shares[percent_columns] = 100 * shares[percent_columns] / totals

    return shares
| 
 | ||||
| 
 | ||||
def hist_segment_business_KPIs(df, segment, size, nb_tickets, nb_purchases, total_amount, nb_campaigns, type_of_activity=None) :
    """Draw a stacked bar chart of the weight of each segment per business KPI.

    The figure is created with ``plt.figure()`` and left open so the caller
    can save it.

    Args:
        df: table with one row per segment (values stacked as bar heights).
        segment: name of the column holding the segment label.
        size, nb_tickets, nb_purchases, total_amount, nb_campaigns: names of
            the five columns plotted, in that order.
        type_of_activity: name inserted in the figure title. When omitted,
            the module-level variable of the same name is used, which is
            what this function implicitly relied on before.
    """
    if type_of_activity is None :
        # Backward compatibility with callers that set the global only
        type_of_activity = globals()["type_of_activity"]

    plt.figure()

    kpi_columns = [size, nb_tickets, nb_purchases, total_amount, nb_campaigns]
    df_plot = df[[segment] + kpi_columns]

    x = ["number of\ncustomers", "number of\ntickets", "number of\npurchases", "total\namount",
         "number of\ncampaigns"]

    bottom = np.zeros(len(kpi_columns))

    # Iterate over the rows actually present instead of assuming exactly four
    # segments labelled 0..3: an empty segment used to raise a KeyError.
    nb_segments = len(df_plot)

    # types of blue color (at least four shades so the usual palette is kept)
    colors = plt.cm.Blues(np.linspace(0.1, 0.9, max(nb_segments, 4)))

    for position in range(nb_segments) :
        height = df_plot[kpi_columns].iloc[position].to_numpy(dtype=float)
        plt.bar(x=x, height=height, label = str(df_plot[segment].iloc[position]), bottom=bottom, color=colors[position])
        bottom = bottom + height

    # Adjust margins to leave room for the legend on the right
    plt.subplots_adjust(left = 0.125, right = 0.8, bottom = 0.1, top = 0.9)

    plt.legend(title = "segment", loc = "upper right", bbox_to_anchor=(1.2, 1))
    plt.ylabel("Fraction represented by the segment (%)")
    plt.title(f"Relative weight of each segment regarding business KPIs\nfor {type_of_activity} companies", size=12)
    # plt.show()
| 
 | ||||
| 
 | ||||
def df_segment_mp(df, segment, gender_female, gender_male, gender_other, country_fr) :
    """Summarize gender and country indicators per segment.

    Args:
        df: customer-level table.
        segment: name of the column to group by.
        gender_female, gender_male, gender_other, country_fr: names of the
            columns averaged per segment.

    Returns:
        DataFrame with one row per segment holding the four means plus
        "share_known_gender" (female mean + male mean) at position 3 and
        "share_of_women" (female mean / share_known_gender) at position 4.
    """
    averaged_columns = [gender_female, gender_male, gender_other, country_fr]
    persona_table = df.groupby(segment)[averaged_columns].mean().reset_index()

    known_gender_share = persona_table[gender_female] + persona_table[gender_male]
    persona_table.insert(3, "share_known_gender", known_gender_share)

    women_share = persona_table[gender_female] / persona_table["share_known_gender"]
    persona_table.insert(4, "share_of_women", women_share)

    return persona_table
| 
 | ||||
| 
 | ||||
def df_segment_pb (df, segment, nb_tickets_internet, nb_tickets, nb_campaigns_opened, nb_campaigns, opt_in) :
    """Summarize purchasing-behaviour ratios per segment.

    The input table is left untouched: the two ratio columns are computed on
    a copy, whereas they used to be added to the caller's DataFrame.

    Args:
        df: customer-level table.
        segment: name of the column to group by.
        nb_tickets_internet, nb_tickets: numerator and denominator columns of
            "share_tickets_internet".
        nb_campaigns_opened, nb_campaigns: numerator and denominator columns
            of "share_campaigns_opened".
        opt_in: name of a column averaged as is.

    Returns:
        DataFrame with one row per segment holding the per-segment means of
        "share_tickets_internet", "share_campaigns_opened" and ``opt_in``.
    """
    # assign returns a new DataFrame, so df keeps its original columns
    df_used = df.assign(
        share_tickets_internet=df[nb_tickets_internet] / df[nb_tickets],
        share_campaigns_opened=df[nb_campaigns_opened] / df[nb_campaigns],
    )
    df_pb = df_used.groupby(segment)[["share_tickets_internet", "share_campaigns_opened", opt_in]].mean().reset_index()
    return df_pb
| 
 | ||||
| 
 | ||||
def radar_mp_plot(df, categories, index) :
    """Draw the radar chart of a single row of ``df``.

    A new 6x6 polar figure is created and left open.

    Args:
        df: table whose row ``index`` is plotted.
        categories: names of the columns drawn around the circle.
        index: row label of the segment to plot; the title shows index + 1.
    """
    row = df.loc[index, categories]

    # true values, shown in parentheses next to each category name
    true_values = list(row)

    # Ratio of each value to the maximum of its column over all rows: a point
    # at the maximal abscissa means the value is maximal for this segment,
    # even if it is not equal to 1.
    relative_values = list(row / df[categories].max())

    # Shrink by the largest ratio of the row, so that a maximum of 0.8 puts
    # the point at 8/10th of the radius instead of at the edge.
    top_ratio = max(relative_values, default=0)
    plotted_values = [top_ratio * value for value in relative_values]

    nb_categories = len(categories)
    angles = np.linspace(0, 2 * np.pi, nb_categories, endpoint=False).tolist()
    loop_angles = angles + angles[:1]

    # Initialize graphic
    _, polar_ax = plt.subplots(figsize=(6, 6), subplot_kw=dict(polar=True))

    # The first line is transparent (alpha=0): it only sets the radius of the
    # circle, which is based on the largest relative value
    polar_ax.plot(loop_angles, relative_values + relative_values[:1], color='skyblue', alpha=0, linewidth=1.5)
    polar_ax.plot(loop_angles, plotted_values + plotted_values[:1], color='black', alpha = 0.5, linewidth=1.2)

    # fill the sector
    polar_ax.fill(angles, plotted_values, color='orange', alpha=0.4)

    # labels
    polar_ax.set_yticklabels([])
    polar_ax.set_xticks(angles)
    labels = []
    for category, true_value in zip(categories, true_values) :
        labels.append(category.replace("_"," ") + f"\n({round(100 * true_value,2)}%)")
    polar_ax.set_xticklabels(labels, color="black")

    polar_ax.spines['polar'].set_visible(False)

    plt.title(f'Characteristics of the segment {index+1}\n')

    # plt.show()
| 
 | ||||
| 
 | ||||
def radar_mp_plot_all(df, categories, type_of_activity=None) :
    """Draw one radar chart per row of ``df`` on a shared figure.

    The figure is left open so the caller can save it.

    Args:
        df: table with one row per segment, indexed 0..n-1.
        categories: names of the columns drawn around each circle.
        type_of_activity: name inserted in the figure title. When omitted,
            the module-level variable of the same name is used, which is
            what this function implicitly relied on before.
    """
    if type_of_activity is None :
        # Backward compatibility with callers that set the global only
        type_of_activity = globals()["type_of_activity"]

    nb_segments = df.shape[0]

    # Quantities that do not depend on the segment are computed once
    max_values = df[categories].max()
    angles = np.linspace(0, 2 * np.pi, len(categories), endpoint=False).tolist()
    closed_angles = angles + angles[:1]

    # Initialize graphic: two charts per row, at least the usual 2x2 grid,
    # with extra rows if there are more than four segments
    nb_rows = max(2, -(-nb_segments // 2))
    fig, ax = plt.subplots(nb_rows, 2, figsize=(25, 10 * nb_rows), subplot_kw=dict(polar=True))

    for index in range(nb_segments) :
        panel = ax[index // 2, index % 2]

        # Use the real segment label when the table carries one: index + 1 is
        # wrong as soon as a segment is missing from the table
        segment_label = df.loc[index, "segment"] if "segment" in df.columns else index + 1

        # true values are used to print the true value in parenthesis
        tvalues = list(df.loc[index,categories])

        # values are true values / max among the segments, which puts them in
        # relation with the other segments: a point at the maximal abscissa
        # means the value is maximal for this segment, even if not equal to 1
        values = list(df.loc[index,categories]/max_values)

        # values normalized adjust the points around the circle: if the
        # maximum of values is 0.8, the point is at 8/10th of the radius
        largest_value = max(values, default=0)
        values_normalized = [largest_value * elt for elt in values]

        # a transparent line (alpha=0) of values is drawn first to set the
        # radius of the circle, which is based on max(value)
        panel.plot(closed_angles, values + values[:1], color='skyblue', alpha=0, linewidth=1.5)
        panel.plot(closed_angles, values_normalized + values_normalized[:1], color='black', alpha = 0.5,
                   linewidth=1.2)

        # fill the sector
        panel.fill(angles, values_normalized, color='orange', alpha=0.4, label = segment_label)

        # labels
        panel.set_yticklabels([])
        panel.set_xticks(angles)
        ticks = [name.replace("_"," ") + f"\n({round(100 * tvalue,2)}%)" for name, tvalue in zip(categories, tvalues)]
        panel.set_xticklabels(ticks, color="black", size = 20)

        panel.spines['polar'].set_visible(False)

        panel.set_title(f'Segment {segment_label}\n', size = 24)

    fig.suptitle(f"Characteristics of marketing personae of {type_of_activity} companies", size=32)
    # plt.show()
| 
 | ||||
		Loading…
	
		Reference in New Issue
	
	Block a user