From 68b68ed3daa94bbadf60aebf0e7d31dd9771666e Mon Sep 17 00:00:00 2001 From: tpique-ensae Date: Thu, 4 Apr 2024 14:29:16 +0000 Subject: [PATCH] added functions documentation --- utils_sales_forecast.py | 94 +++++++++++++++++++++++++-------- utils_segmentation.py | 114 +++++++++++++++++++++++++++++++++++++--- 2 files changed, 179 insertions(+), 29 deletions(-) diff --git a/utils_sales_forecast.py b/utils_sales_forecast.py index dd8d2c8..2b949cd 100644 --- a/utils_sales_forecast.py +++ b/utils_sales_forecast.py @@ -13,6 +13,17 @@ import io # functions def load_train_test(type_of_activity): + """ + Loads the training and test datasets from S3 storage for the type of activity specified. + + Args: + - type_of_activity (str) + + Returns: + DataFrame: Training dataset. + DataFrame: Test dataset. + """ + # BUCKET = f"projet-bdc2324-team1/Generalization/{type_of_activity}" BUCKET = f"projet-bdc2324-team1/1_Temp/1_0_Modelling_Datasets/{type_of_activity}" File_path_train = BUCKET + "/Train_set.csv" @@ -30,17 +41,23 @@ def load_train_test(type_of_activity): def features_target_split(dataset_train, dataset_test): + """ + Splits the dataset into features and target variables for training and testing. + + Args: + - dataset_train (DataFrame): Training dataset. + - dataset_test (DataFrame): Test dataset. + + Returns: + DataFrame: Features of the training dataset. + DataFrame: Features of the test dataset. + DataFrame: Target variable of the training dataset. + DataFrame: Target variable of the test dataset. 
+ """ features_l = ['nb_tickets', 'nb_purchases', 'total_amount', 'nb_suppliers', 'vente_internet_max', 'purchase_date_min', 'purchase_date_max', 'time_between_purchase', 'fidelity', 'is_email_true', 'opt_in', #'is_partner', 'nb_tickets_internet', 'gender_female', 'gender_male', 'gender_other', 'nb_campaigns', 'nb_campaigns_opened'] - - # we suppress fidelity, time between purchase, and gender other (colinearity issue) - """ - features_l = ['nb_tickets', 'nb_purchases', 'total_amount', 'nb_suppliers', 'vente_internet_max', - 'purchase_date_min', 'purchase_date_max', 'nb_tickets_internet', 'is_email_true', - 'opt_in', 'gender_female', 'gender_male', 'nb_campaigns', 'nb_campaigns_opened'] - """ X_train = dataset_train # [features_l] y_train = dataset_train[['y_has_purchased']] @@ -52,6 +69,17 @@ def features_target_split(dataset_train, dataset_test): def load_model(type_of_activity, model): + """ + Loads from S3 storage the optimal parameters of the chosen ML model saved in a pickle file. + + Args: + - type_of_activity (str) + - model (str) + + Returns: + Model: machine learning model pre-trained with a scikit learn pipeline. + """ + # BUCKET = f"projet-bdc2324-team1/Output_model/{type_of_activity}/{model}/" BUCKET = f"projet-bdc2324-team1/2_Output/2_1_Modeling_results/standard/{type_of_activity}/{model}/" filename = model + '.pkl' @@ -64,6 +92,17 @@ def load_model(type_of_activity, model): def df_segment(df, y, model) : + """ + Segments customers into 4 groups based on the propensity scores given by a previously-loaded ML model. + + Args: + - df (DataFrame): DataFrame to be segmented. + - y (Series): True target variable. + - model (Model): Pre-trained machine learning model for prediction. + + Returns: + DataFrame: Segmented DataFrame with predicted values and true values for y. 
+ """ y_pred = model.predict(df) y_pred_prob = model.predict_proba(df)[:, 1] @@ -83,7 +122,7 @@ def df_segment(df, y, model) : def odd_ratio(score) : """ Args: - - score (Union[float, int]): Score value. + - score (Union[float, int]) Returns: float: Odd ratio value. @@ -98,7 +137,7 @@ def adjust_score_1(score) : Allows to compute odd ratios then. Args: - - score (List[Union[float, int]]): List of score values. + - score (List[Union[float, int]]) Returns: np.ndarray: Adjusted score values. @@ -114,8 +153,8 @@ def adjusted_score(odd_ratio, bias) : Adjust the score based on the odd ratio and bias. Args: - - odd_ratio (Union[float, int]): Odd ratio value. - - bias (Union[float, int]): Bias value. + - odd_ratio (Union[float, int]) + - bias (Union[float, int]) Returns: float: Adjusted score value. @@ -127,12 +166,12 @@ def adjusted_score(odd_ratio, bias) : def find_bias(odd_ratios, y_objective, initial_guess=10) : """ - Find the bias needed to adjust scores according to the purchases observed + Find the bias needed to adjust scores so that their sum is equal to the total number of purchases observed. Args: - - odd_ratios (List[float]): List of odd ratios. - - y_objective (Union[float, int]): Objective value to achieve. - - initial_guess (Union[float, int], optional): Initial guess for the bias. Default is 6. + - odd_ratios (List[float]): List of odd ratios associated to the scores that have be adjusted. + - y_objective (Union[float, int]): Objective value => total number of purchases. + - initial_guess (Union[float, int], optional): Initial guess for the bias. Default is 10 (bias is approximately 6 for sports, 10 for music and 22 for museums) Returns: float: Estimated bias value. 
@@ -168,22 +207,20 @@ def plot_hist_scores(df, score, score_adjusted, type_of_activity) : def project_tickets_CA (df, nb_purchases, nb_tickets, total_amount, score_adjusted, duration_ref, duration_projection) : - """ - Project ticket counts and total amount for a given duration and adjust based on a score. + Project tickets sold and total amount based on the adjusted scores and the duration of periods of study / projection. Args: - - df (DataFrame): DataFrame containing ticket data. + - df (DataFrame): DataFrame containing information about past sales. - nb_purchases (str) : Name of the column in df representing the number of purchases. - nb_tickets (str): Name of the column in df representing the number of tickets. - total_amount (str): Name of the column in df representing the total amount. - score_adjusted (str): Name of the column in df representing the adjusted score. - - duration_ref (int or float): duration of the period of reference for the construction of the variables X. + - duration_ref (int or float): Duration of the period of reference for the construction of the variables X. - duration_projection (int or float): Duration of the period of projection of sales / revenue. Returns: - DataFrame: DataFrame with projected ticket counts and total amount adjusted based on the score. - duration_ratio = duration_ref/duration_projection + DataFrame: DataFrame completed with sales and total amount projections. """ duration_ratio = duration_ref/duration_projection @@ -229,7 +266,7 @@ def project_tickets_CA (df, nb_purchases, nb_tickets, total_amount, score_adjust def summary_expected_CA(df, segment, nb_tickets_expected, total_amount_expected, total_amount, pace_purchase, duration_ref=17, duration_projection=12) : """ - Generate a summary of expected customer acquisition based on segments. + Generate a summary of expected customer sales based on segments. Args: - df (DataFrame): DataFrame containing customer data. 
@@ -237,9 +274,12 @@ def summary_expected_CA(df, segment, nb_tickets_expected, total_amount_expected, - nb_tickets_expected (str): Name of the column in df representing the expected number of tickets. - total_amount_expected (str): Name of the column in df representing the expected total amount. - total_amount (str): Name of the column in df representing the total amount. + - pace_purchase (str) : Name of the column in df representing the average time between 2 purchases in months. + - duration_ref (int or float): Duration of the period of reference for the construction of the variables X. + - duration_projection (int or float): Duration of the period of projection of sales / revenue. Returns: - DataFrame: Summary DataFrame containing expected customer acquisition metrics. + DataFrame: Summary DataFrame containing expected customer sales metrics. """ # compute nb tickets estimated and total amount expected @@ -267,6 +307,14 @@ def summary_expected_CA(df, segment, nb_tickets_expected, total_amount_expected, def save_file_s3_ca(File_name, type_of_activity): + """ + Saves a file in S3 storage. + + Args: + - File_name (str) + - type_of_activity (str) + """ + image_buffer = io.BytesIO() plt.savefig(image_buffer, format='png', dpi=120) image_buffer.seek(0) diff --git a/utils_segmentation.py b/utils_segmentation.py index fbad427..9a545be 100644 --- a/utils_segmentation.py +++ b/utils_segmentation.py @@ -1,11 +1,17 @@ -### importations ### - - - - -### functions for segmentation and graphics associated ### +# functions for segmentation and graphics associated def load_model(type_of_activity, model): + """ + Loads from S3 storage the optimal parameters of the chosen ML model saved in a pickle file. + + Args: + - type_of_activity (str) + - model (str) + + Returns: + Model: machine learning model pre-trained with a scikit learn pipeline. 
+ """ + BUCKET = f"projet-bdc2324-team1/2_Output/2_1_Modeling_results/standard/{type_of_activity}/{model}/" filename = model + '.pkl' file_path = BUCKET + filename @@ -17,6 +23,16 @@ def load_model(type_of_activity, model): def load_test_file(type_of_activity): + """ + Load the test dataset from S3 storage for the type of activity specified. + + Args: + - type_of_activity (str) + + Returns: + DataFrame: Test dataset. + """ + file_path_test = f"projet-bdc2324-team1/1_Temp/1_0_Modelling_Datasets/{type_of_activity}/Test_set.csv" with fs.open(file_path_test, mode="rb") as file_in: dataset_test = pd.read_csv(file_in, sep=",") @@ -24,6 +40,17 @@ def load_test_file(type_of_activity): def save_file_s3_mp(File_name, type_of_activity): + """ + Save a matplotlib figure to S3 storage to the location assigned for the type of activity specified. + + Args: + - File_name (str) + - type_of_activity (str) + + Returns: + None + """ + image_buffer = io.BytesIO() plt.savefig(image_buffer, format='png', dpi=110) image_buffer.seek(0) @@ -34,12 +61,36 @@ def save_file_s3_mp(File_name, type_of_activity): plt.close() def save_txt_file_s3(file_name, type_of_activity, content): + """ + Save a text file to S3 storage to the location assigned for the type of activity specified. + + Args: + - file_name (str) + - type_of_activity (str) + - content (str) + + Returns: + None + """ + FILE_PATH = f"projet-bdc2324-team1/2_Output/2_2_Segmentation_and_Marketing_Personae/{type_of_activity}/" FILE_PATH_OUT_S3 = FILE_PATH + file_name + type_of_activity + '.txt' with fs.open(FILE_PATH_OUT_S3, 'w') as s3_file: s3_file.write(content) def df_business_fig(df, segment, list_var) : + """ + Compute business key performance indicators (KPIs) based on segment-wise aggregation of variables. + + Args: + - df (DataFrame): The DataFrame containing data. + - segment (str): The column name representing segments. + - list_var (list of str): The list of variable names to be aggregated. 
+ + Returns: + DataFrame: The DataFrame containing business KPIs. + """ + df_business_kpi = df.groupby(segment)[list_var].sum().reset_index() df_business_kpi.insert(1, "size", df.groupby(segment).size().values) all_var = ["size"] + list_var @@ -49,6 +100,22 @@ def df_business_fig(df, segment, list_var) : def hist_segment_business_KPIs(df, segment, size, nb_tickets, nb_purchases, total_amount, nb_campaigns, type_of_activity) : + """ + Plot a histogram stacking the relative weight of each segment regarding some key business indicators. + + Args: + - df (DataFrame): The DataFrame containing pre-aggregated data about some key business indicators. + - segment (str): The column name representing segments. + - size (str): The column name representing the size. + - nb_tickets (str): The column name representing the number of tickets. + - nb_purchases (str): The column name representing the number of purchases. + - total_amount (str): The column name representing the total amount. + - nb_campaigns (str): The column name representing the number of campaigns. + - type_of_activity (str) + + Returns: + None + """ plt.figure() @@ -89,6 +156,18 @@ def hist_segment_business_KPIs(df, segment, size, nb_tickets, nb_purchases, tota def radar_mp_plot(df, categories, index) : + """ + Plot a radar chart describing marketing personae of the segment associated with index for the given categories, for the type of activity specified. + + Args: + - df (DataFrame): The DataFrame containing data about categories describing the marketing personae associated with each segment. + - categories (list of str): The list of variable names used as the axes of the radar chart. + - index (int): The index (between 0 and 3) identifying the segment. 
Here, index = number of the segment - 1 + + Returns: + None + """ + categories = categories # true values are used to print the true value in parenthesis @@ -118,6 +197,7 @@ def radar_mp_plot(df, categories, index) : # we have to draw first a transparent line (alpha=0) of values to adjust the radius of the circle # which is based on max(value) + # if we don't plot this transparent line, the radius of the circle will be too small ax.plot(angles + angles[:1], values + values[:1], color='skyblue', alpha=0, linewidth=1.5) ax.plot(angles + angles[:1], values_normalized + values_normalized[:1], color='black', alpha = 0.5, linewidth=1.2) @@ -138,6 +218,16 @@ def radar_mp_plot(df, categories, index) : def radar_mp_plot_all(df, type_of_activity) : + """ + Plot exactly the same radar charts as radar_mp_plot, but for all segments. + + Args: + - df (DataFrame) + - type_of_activity (str) + + Returns: + None + """ # table summarizing variables relative to marketing personae df_mp = df.groupby("segment")[["gender_female", "gender_male", "gender_other", "age"]].mean().reset_index() @@ -190,6 +280,7 @@ def radar_mp_plot_all(df, type_of_activity) : # we have to draw first a transparent line (alpha=0) of values to adjust the radius of the circle # which is based on max(value) + # if we don't plot this transparent line, the radius of the circle will be too small ax[row, col].plot(angles + angles[:1], values + values[:1], color='skyblue', alpha=0, linewidth=1.5) ax[row, col].plot(angles + angles[:1], values_normalized + values_normalized[:1], color='black', alpha = 0.5, linewidth=1.2) @@ -215,7 +306,18 @@ def radar_mp_plot_all(df, type_of_activity) : plt.tight_layout() # plt.show() + def known_sociodemo_caracteristics(df, type_of_activity) : + """ + Compute the share of non-NaN values for some sociodemographic characteristics features and save the result in a LaTeX table. 
+ + Args: + - df (DataFrame) + - type_of_activity (str) + + Returns: + None + """ table_share_known = df.groupby("segment")[["is_profession_known", "is_zipcode_known", "categorie_age_inconnue", "gender_other"]].mean().mul(100).reset_index() table_share_known.columns = ['Segment', 'Share of Known Profession (%)', 'Share of Known Zipcode (%)', 'Share of Unknown Age (%)', 'Share of Unknown Gender (%)']