From 68b68ed3daa94bbadf60aebf0e7d31dd9771666e Mon Sep 17 00:00:00 2001 From: tpique-ensae Date: Thu, 4 Apr 2024 14:29:16 +0000 Subject: [PATCH] added functions documentation --- utils_sales_forecast.py | 94 +++++++++++++++++++++++++-------- utils_segmentation.py | 114 +++++++++++++++++++++++++++++++++++++--- 2 files changed, 179 insertions(+), 29 deletions(-) diff --git a/utils_sales_forecast.py b/utils_sales_forecast.py index dd8d2c8..2b949cd 100644 --- a/utils_sales_forecast.py +++ b/utils_sales_forecast.py @@ -13,6 +13,17 @@ import io # functions def load_train_test(type_of_activity): + """ + Loads the training and test datasets from S3 storage for the type of activity specified. + + Args: + - type_of_activity (str) + + Returns: + DataFrame: Training dataset. + DataFrame: Test dataset. + """ + # BUCKET = f"projet-bdc2324-team1/Generalization/{type_of_activity}" BUCKET = f"projet-bdc2324-team1/1_Temp/1_0_Modelling_Datasets/{type_of_activity}" File_path_train = BUCKET + "/Train_set.csv" @@ -30,17 +41,23 @@ def load_train_test(type_of_activity): def features_target_split(dataset_train, dataset_test): + """ + Splits the dataset into features and target variables for training and testing. + + Args: + - dataset_train (DataFrame): Training dataset. + - dataset_test (DataFrame): Test dataset. + + Returns: + DataFrame: Features of the training dataset. + DataFrame: Features of the test dataset. + DataFrame: Target variable of the training dataset. + DataFrame: Target variable of the test dataset. 
+ """ features_l = ['nb_tickets', 'nb_purchases', 'total_amount', 'nb_suppliers', 'vente_internet_max', 'purchase_date_min', 'purchase_date_max', 'time_between_purchase', 'fidelity', 'is_email_true', 'opt_in', #'is_partner', 'nb_tickets_internet', 'gender_female', 'gender_male', 'gender_other', 'nb_campaigns', 'nb_campaigns_opened'] - - # we suppress fidelity, time between purchase, and gender other (colinearity issue) - """ - features_l = ['nb_tickets', 'nb_purchases', 'total_amount', 'nb_suppliers', 'vente_internet_max', - 'purchase_date_min', 'purchase_date_max', 'nb_tickets_internet', 'is_email_true', - 'opt_in', 'gender_female', 'gender_male', 'nb_campaigns', 'nb_campaigns_opened'] - """ X_train = dataset_train # [features_l] y_train = dataset_train[['y_has_purchased']] @@ -52,6 +69,17 @@ def features_target_split(dataset_train, dataset_test): def load_model(type_of_activity, model): + """ + Loads from S3 storage the optimal parameters of the chosen ML model saved in a pickle file. + + Args: + - type_of_activity (str) + - model (str) + + Returns: + Model: machine learning model pre-trained with a scikit learn pipeline. + """ + # BUCKET = f"projet-bdc2324-team1/Output_model/{type_of_activity}/{model}/" BUCKET = f"projet-bdc2324-team1/2_Output/2_1_Modeling_results/standard/{type_of_activity}/{model}/" filename = model + '.pkl' @@ -64,6 +92,17 @@ def load_model(type_of_activity, model): def df_segment(df, y, model) : + """ + Segments customers into 4 groups based on the propensity scores given by a previously-loaded ML model. + + Args: + - df (DataFrame): DataFrame to be segmented. + - y (Series): True target variable. + - model (Model): Pre-trained machine learning model for prediction. + + Returns: + DataFrame: Segmented DataFrame with predicted values and true values for y. 
+ """ y_pred = model.predict(df) y_pred_prob = model.predict_proba(df)[:, 1] @@ -83,7 +122,7 @@ def df_segment(df, y, model) : def odd_ratio(score) : """ Args: - - score (Union[float, int]): Score value. + - score (Union[float, int]) Returns: float: Odd ratio value. @@ -98,7 +137,7 @@ def adjust_score_1(score) : Allows to compute odd ratios then. Args: - - score (List[Union[float, int]]): List of score values. + - score (List[Union[float, int]]) Returns: np.ndarray: Adjusted score values. @@ -114,8 +153,8 @@ def adjusted_score(odd_ratio, bias) : Adjust the score based on the odd ratio and bias. Args: - - odd_ratio (Union[float, int]): Odd ratio value. - - bias (Union[float, int]): Bias value. + - odd_ratio (Union[float, int]) + - bias (Union[float, int]) Returns: float: Adjusted score value. @@ -127,12 +166,12 @@ def adjusted_score(odd_ratio, bias) : def find_bias(odd_ratios, y_objective, initial_guess=10) : """ - Find the bias needed to adjust scores according to the purchases observed + Find the bias needed to adjust scores so that their sum is equal to the total number of purchases observed. Args: - - odd_ratios (List[float]): List of odd ratios. - - y_objective (Union[float, int]): Objective value to achieve. - - initial_guess (Union[float, int], optional): Initial guess for the bias. Default is 6. + - odd_ratios (List[float]): List of odd ratios associated to the scores that have be adjusted. + - y_objective (Union[float, int]): Objective value => total number of purchases. + - initial_guess (Union[float, int], optional): Initial guess for the bias. Default is 10 (bias is approximately 6 for sports, 10 for music and 22 for museums) Returns: float: Estimated bias value. 
@@ -168,22 +207,20 @@ def plot_hist_scores(df, score, score_adjusted, type_of_activity) : def project_tickets_CA (df, nb_purchases, nb_tickets, total_amount, score_adjusted, duration_ref, duration_projection) : - """ - Project ticket counts and total amount for a given duration and adjust based on a score. + Project tickets sold and total amount based on the adjusted scores and the duration of periods of study / projection. Args: - - df (DataFrame): DataFrame containing ticket data. + - df (DataFrame): DataFrame containing information about past sales. - nb_purchases (str) : Name of the column in df representing the number of purchases. - nb_tickets (str): Name of the column in df representing the number of tickets. - total_amount (str): Name of the column in df representing the total amount. - score_adjusted (str): Name of the column in df representing the adjusted score. - - duration_ref (int or float): duration of the period of reference for the construction of the variables X. + - duration_ref (int or float): Duration of the period of reference for the construction of the variables X. - duration_projection (int or float): Duration of the period of projection of sales / revenue. Returns: - DataFrame: DataFrame with projected ticket counts and total amount adjusted based on the score. - duration_ratio = duration_ref/duration_projection + DataFrame: DataFrame completed with sales and total amount projections. """ duration_ratio = duration_ref/duration_projection @@ -229,7 +266,7 @@ def project_tickets_CA (df, nb_purchases, nb_tickets, total_amount, score_adjust def summary_expected_CA(df, segment, nb_tickets_expected, total_amount_expected, total_amount, pace_purchase, duration_ref=17, duration_projection=12) : """ - Generate a summary of expected customer acquisition based on segments. + Generate a summary of expected customer sales based on segments. Args: - df (DataFrame): DataFrame containing customer data. 
@@ -237,9 +274,12 @@ def summary_expected_CA(df, segment, nb_tickets_expected, total_amount_expected, - nb_tickets_expected (str): Name of the column in df representing the expected number of tickets. - total_amount_expected (str): Name of the column in df representing the expected total amount. - total_amount (str): Name of the column in df representing the total amount. + - pace_purchase (str) : Name of the column in df representing the average time between 2 purchases in months. + - duration_ref (int or float): Duration of the period of reference for the construction of the variables X. + - duration_projection (int or float): Duration of the period of projection of sales / revenue. Returns: - DataFrame: Summary DataFrame containing expected customer acquisition metrics. + DataFrame: Summary DataFrame containing expected customer sales metrics. """ # compute nb tickets estimated and total amount expected @@ -267,6 +307,14 @@ def summary_expected_CA(df, segment, nb_tickets_expected, total_amount_expected, def save_file_s3_ca(File_name, type_of_activity): + """ + Saves a file in S3 storage. + + Args: + - File_name (str) + - type_of_activity (str) + """ + image_buffer = io.BytesIO() plt.savefig(image_buffer, format='png', dpi=120) image_buffer.seek(0) diff --git a/utils_segmentation.py b/utils_segmentation.py index fbad427..9a545be 100644 --- a/utils_segmentation.py +++ b/utils_segmentation.py @@ -1,11 +1,17 @@ -### importations ### - - - - -### functions for segmentation and graphics associated ### +# functions for segmentation and graphics associated def load_model(type_of_activity, model): + """ + Loads from S3 storage the optimal parameters of the chosen ML model saved in a pickle file. + + Args: + - type_of_activity (str) + - model (str) + + Returns: + Model: machine learning model pre-trained with a scikit learn pipeline. 
+ """ + BUCKET = f"projet-bdc2324-team1/2_Output/2_1_Modeling_results/standard/{type_of_activity}/{model}/" filename = model + '.pkl' file_path = BUCKET + filename @@ -17,6 +23,16 @@ def load_model(type_of_activity, model): def load_test_file(type_of_activity): + """ + Load the test dataset from S3 storage for the type of activity specified. + + Args: + - type_of_activity (str) + + Returns: + DataFrame: Test dataset. + """ + file_path_test = f"projet-bdc2324-team1/1_Temp/1_0_Modelling_Datasets/{type_of_activity}/Test_set.csv" with fs.open(file_path_test, mode="rb") as file_in: dataset_test = pd.read_csv(file_in, sep=",") @@ -24,6 +40,17 @@ def load_test_file(type_of_activity): def save_file_s3_mp(File_name, type_of_activity): + """ + Save a matplotlib figure to S3 storage to the location assigned for the type of activity specified. + + Args: + - File_name (str) + - type_of_activity (str) + + Returns: + None + """ + image_buffer = io.BytesIO() plt.savefig(image_buffer, format='png', dpi=110) image_buffer.seek(0) @@ -34,12 +61,36 @@ def save_file_s3_mp(File_name, type_of_activity): plt.close() def save_txt_file_s3(file_name, type_of_activity, content): + """ + Save a text file to S3 storage to the location assigned for the type of activity specified. + + Args: + - file_name (str) + - type_of_activity (str) + - content (str) + + Returns: + None + """ + FILE_PATH = f"projet-bdc2324-team1/2_Output/2_2_Segmentation_and_Marketing_Personae/{type_of_activity}/" FILE_PATH_OUT_S3 = FILE_PATH + file_name + type_of_activity + '.txt' with fs.open(FILE_PATH_OUT_S3, 'w') as s3_file: s3_file.write(content) def df_business_fig(df, segment, list_var) : + """ + Compute business key performance indicators (KPIs) based on segment-wise aggregation of variables. + + Args: + - df (DataFrame): The DataFrame containing data. + - segment (str): The column name representing segments. + - list_var (list of str): The list of variable names to be aggregated. 
+ + Returns: + DataFrame: The DataFrame containing business KPIs. + """ + df_business_kpi = df.groupby(segment)[list_var].sum().reset_index() df_business_kpi.insert(1, "size", df.groupby(segment).size().values) all_var = ["size"] + list_var @@ -49,6 +100,22 @@ def df_business_fig(df, segment, list_var) : def hist_segment_business_KPIs(df, segment, size, nb_tickets, nb_purchases, total_amount, nb_campaigns, type_of_activity) : + """ + Plot a histogram stacking the relative weight of each segment regarding some key business indicators. + + Args: + - df (DataFrame): The DataFrame containing pre-aggregated data about some key business indicators. + - segment (str): The column name representing segments. + - size (str): The column name representing the size. + - nb_tickets (str): The column name representing the number of tickets. + - nb_purchases (str): The column name representing the number of purchases. + - total_amount (str): The column name representing the total amount. + - nb_campaigns (str): The column name representing the number of campaigns. + - type_of_activity (str) + + Returns: + None + """ plt.figure() @@ -89,6 +156,18 @@ def hist_segment_business_KPIs(df, segment, size, nb_tickets, nb_purchases, tota def radar_mp_plot(df, categories, index) : + """ + Plot a radar chart describing marketing personae of the segment associated with index for the given categories, for the type of activity specified. + + Args: + - df (DataFrame): The DataFrame containing data about categories describing the marketing personae associated with each segment. + - categories (list of str): The list of variable names used as the axes of the radar chart. + - index (int): The index (between 0 and 3) identifying the segment. 
Here, index = number of the segment - 1 + + Returns: + None + """ + categories = categories # true values are used to print the true value in parenthesis @@ -118,6 +197,7 @@ def radar_mp_plot(df, categories, index) : # we have to draw first a transparent line (alpha=0) of values to adjust the radius of the circle # which is based on max(value) + # if we don't plot this transparent line, the radius of the circle will be too small ax.plot(angles + angles[:1], values + values[:1], color='skyblue', alpha=0, linewidth=1.5) ax.plot(angles + angles[:1], values_normalized + values_normalized[:1], color='black', alpha = 0.5, linewidth=1.2) @@ -138,6 +218,16 @@ def radar_mp_plot(df, categories, index) : def radar_mp_plot_all(df, type_of_activity) : + """ + Plot exactly the same radar charts as radar_mp_plot, but for all segments. + + Args: + - df (DataFrame) + - type_of_activity (str) + + Returns: + None + """ # table summarizing variables relative to marketing personae df_mp = df.groupby("segment")[["gender_female", "gender_male", "gender_other", "age"]].mean().reset_index() @@ -190,6 +280,7 @@ def radar_mp_plot_all(df, type_of_activity) : # we have to draw first a transparent line (alpha=0) of values to adjust the radius of the circle # which is based on max(value) + # if we don't plot this transparent line, the radius of the circle will be too small ax[row, col].plot(angles + angles[:1], values + values[:1], color='skyblue', alpha=0, linewidth=1.5) ax[row, col].plot(angles + angles[:1], values_normalized + values_normalized[:1], color='black', alpha = 0.5, linewidth=1.2) @@ -215,7 +306,18 @@ def radar_mp_plot_all(df, type_of_activity) : plt.tight_layout() # plt.show() + def known_sociodemo_caracteristics(df, type_of_activity) : + """ + Compute the share of non-NaN values for some sociodemographic characteristics features and save the result in a LaTeX table. 
+ + Args: + - df (DataFrame) + - type_of_activity (str) + + Returns: + None + """ table_share_known = df.groupby("segment")[["is_profession_known", "is_zipcode_known", "categorie_age_inconnue", "gender_other"]].mean().mul(100).reset_index() table_share_known.columns = ['Segment', 'Share of Known Profession (%)', 'Share of Known Zipcode (%)', 'Share of Unknown Age (%)', 'Share of Unknown Gender (%)']