added functions documentation

Thomas PIQUE 2024-04-04 14:29:16 +00:00
parent f5b6075431
commit 68b68ed3da
2 changed files with 179 additions and 29 deletions


@ -13,6 +13,17 @@ import io
# functions
def load_train_test(type_of_activity):
"""
Loads the training and test datasets from S3 storage for the type of activity specified.
Args:
- type_of_activity (str)
Returns:
DataFrame: Training dataset.
DataFrame: Test dataset.
"""
# BUCKET = f"projet-bdc2324-team1/Generalization/{type_of_activity}"
BUCKET = f"projet-bdc2324-team1/1_Temp/1_0_Modelling_Datasets/{type_of_activity}"
File_path_train = BUCKET + "/Train_set.csv"
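# sketch: the rest of load_train_test is cut off by this hunk; it presumably mirrors the
# fs.open / pd.read_csv pattern of load_test_file (fs is assumed to be the s3fs filesystem and
# pd the pandas module already imported at module level)
File_path_test = BUCKET + "/Test_set.csv"
with fs.open(File_path_train, mode="rb") as file_in:
    dataset_train = pd.read_csv(file_in, sep=",")
with fs.open(File_path_test, mode="rb") as file_in:
    dataset_test = pd.read_csv(file_in, sep=",")
return dataset_train, dataset_test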
@ -30,17 +41,23 @@ def load_train_test(type_of_activity):
def features_target_split(dataset_train, dataset_test):
"""
Splits the dataset into features and target variables for training and testing.
Args:
- dataset_train (DataFrame): Training dataset.
- dataset_test (DataFrame): Test dataset.
Returns:
DataFrame: Features of the training dataset.
DataFrame: Features of the test dataset.
DataFrame: Target variable of the training dataset.
DataFrame: Target variable of the test dataset.
"""
features_l = ['nb_tickets', 'nb_purchases', 'total_amount', 'nb_suppliers', 'vente_internet_max', 'purchase_date_min', 'purchase_date_max',
'time_between_purchase', 'fidelity', 'is_email_true', 'opt_in', #'is_partner', 'nb_tickets_internet',
'gender_female', 'gender_male', 'gender_other', 'nb_campaigns', 'nb_campaigns_opened']
# we drop fidelity, time_between_purchase, and gender_other (collinearity issue)
"""
features_l = ['nb_tickets', 'nb_purchases', 'total_amount', 'nb_suppliers', 'vente_internet_max',
'purchase_date_min', 'purchase_date_max', 'nb_tickets_internet', 'is_email_true',
'opt_in', 'gender_female', 'gender_male', 'nb_campaigns', 'nb_campaigns_opened']
"""
X_train = dataset_train # [features_l]
y_train = dataset_train[['y_has_purchased']]
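# sketch: the symmetric split for the test set presumably follows, along the lines of
# (return order taken from the docstring above)
X_test = dataset_test # [features_l]
y_test = dataset_test[['y_has_purchased']]
return X_train, X_test, y_train, y_test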
@ -52,6 +69,17 @@ def features_target_split(dataset_train, dataset_test):
def load_model(type_of_activity, model):
"""
Loads the optimal parameters of the chosen ML model, saved as a pickle file, from S3 storage.
Args:
- type_of_activity (str)
- model (str)
Returns:
Model: pre-trained machine learning model wrapped in a scikit-learn pipeline.
"""
# BUCKET = f"projet-bdc2324-team1/Output_model/{type_of_activity}/{model}/"
BUCKET = f"projet-bdc2324-team1/2_Output/2_1_Modeling_results/standard/{type_of_activity}/{model}/"
filename = model + '.pkl'
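# sketch: the loading step is not shown in this hunk; presumably the pickle is read back through
# the s3fs filesystem, e.g. (variable names illustrative, pickle assumed imported at module level)
file_path = BUCKET + filename
with fs.open(file_path, mode="rb") as model_file:
    fitted_pipeline = pickle.load(model_file)
return fitted_pipeline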
@ -64,6 +92,17 @@ def load_model(type_of_activity, model):
def df_segment(df, y, model) :
"""
Segments customers into 4 groups based on the propensity scores given by a previously-loaded ML model.
Args:
- df (DataFrame): DataFrame to be segmented.
- y (Series): True target variable.
- model (Model): Pre-trained machine learning model for prediction.
Returns:
DataFrame: Segmented DataFrame with predicted values and true values for y.
"""
y_pred = model.predict(df)
y_pred_prob = model.predict_proba(df)[:, 1]
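# sketch: the binning rule itself is below the cut; one plausible reading of "4 groups based on
# the propensity scores" is a cut on fixed probability bands ("segment" matches the column name
# used by the plotting functions below; the other column names and the exact thresholds used in
# the repository are illustrative)
df_segmented = df.copy()
df_segmented["has_purchased"] = y
df_segmented["has_purchased_estim"] = y_pred
df_segmented["score"] = y_pred_prob
df_segmented["segment"] = pd.cut(y_pred_prob, bins=[0.0, 0.25, 0.5, 0.75, 1.0],
                                 labels=[1, 2, 3, 4], include_lowest=True)
return df_segmented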
@ -83,7 +122,7 @@ def df_segment(df, y, model) :
def odd_ratio(score) :
"""
Args:
- score (Union[float, int]): Propensity score (probability of purchase).
Returns:
float: Odd ratio value.
@ -98,7 +137,7 @@ def adjust_score_1(score) :
Odd ratios can then be computed.
Args:
- score (List[Union[float, int]]): List of score values to adjust.
Returns:
np.ndarray: Adjusted score values.
@ -114,8 +153,8 @@ def adjusted_score(odd_ratio, bias) :
Adjust the score based on the odd ratio and bias.
Args:
- odd_ratio (Union[float, int]): Odd ratio derived from the score.
- bias (Union[float, int]): Bias used to adjust the score.
Returns:
float: Adjusted score value.
@ -127,12 +166,12 @@ def adjusted_score(odd_ratio, bias) :
def find_bias(odd_ratios, y_objective, initial_guess=10) :
"""
Find the bias needed to adjust scores so that their sum is equal to the total number of purchases observed.
Args:
- odd_ratios (List[float]): List of odd ratios associated with the scores to be adjusted.
- y_objective (Union[float, int]): Objective value, i.e. the total number of purchases observed.
- initial_guess (Union[float, int], optional): Initial guess for the bias. Default is 10 (the bias is approximately 6 for sports, 10 for music and 22 for museums).
Returns:
float: Estimated bias value.
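# sketch: the bodies of odd_ratio, adjusted_score and find_bias are not shown in this diff; their
# docstrings match the standard odds-based recalibration, which would look roughly like this
# (the formulas and the use of scipy's fsolve are assumptions, not taken from the repository)
import numpy as np
from scipy.optimize import fsolve

def _odd_ratio(score):
    return score / (1 - score)                        # odds associated with a probability score

def _adjusted_score(odd_ratio, bias):
    return bias * odd_ratio / (1 + bias * odd_ratio)  # back to a probability, rescaled by the bias

def _find_bias(odd_ratios, y_objective, initial_guess=10):
    # bias such that the adjusted scores sum to the number of purchases actually observed
    gap = lambda bias: np.sum(_adjusted_score(np.asarray(odd_ratios), bias)) - y_objective
    return fsolve(gap, x0=initial_guess)[0]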
@ -168,22 +207,20 @@ def plot_hist_scores(df, score, score_adjusted, type_of_activity) :
def project_tickets_CA (df, nb_purchases, nb_tickets, total_amount, score_adjusted, duration_ref, duration_projection) :
"""
Project tickets sold and total amount based on the adjusted scores and on the durations of the reference and projection periods.
Args:
- df (DataFrame): DataFrame containing information about past sales.
- nb_purchases (str) : Name of the column in df representing the number of purchases.
- nb_tickets (str): Name of the column in df representing the number of tickets.
- total_amount (str): Name of the column in df representing the total amount.
- score_adjusted (str): Name of the column in df representing the adjusted score.
- duration_ref (int or float): Duration of the period of reference for the construction of the variables X.
- duration_projection (int or float): Duration of the period of projection of sales / revenue.
Returns:
DataFrame: DataFrame completed with sales and total amount projections.
"""
duration_ratio = duration_ref/duration_projection
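# sketch: the projection formulas are below the cut; a natural reading of the docstring is that
# past quantities are rescaled to the projection period and weighted by the adjusted score, e.g.
# (the output column names reuse those expected by summary_expected_CA; the exact formula may differ)
df_output = df.copy()
df_output["nb_tickets_expected"] = df_output[score_adjusted] * df_output[nb_tickets] / duration_ratio
df_output["total_amount_expected"] = df_output[score_adjusted] * df_output[total_amount] / duration_ratio
return df_output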
@ -229,7 +266,7 @@ def project_tickets_CA (df, nb_purchases, nb_tickets, total_amount, score_adjust
def summary_expected_CA(df, segment, nb_tickets_expected, total_amount_expected, total_amount, pace_purchase,
duration_ref=17, duration_projection=12) :
"""
Generate a summary of expected customer sales based on segments.
Args:
- df (DataFrame): DataFrame containing customer data.
@ -237,9 +274,12 @@ def summary_expected_CA(df, segment, nb_tickets_expected, total_amount_expected,
- nb_tickets_expected (str): Name of the column in df representing the expected number of tickets.
- total_amount_expected (str): Name of the column in df representing the expected total amount.
- total_amount (str): Name of the column in df representing the total amount.
- pace_purchase (str) : Name of the column in df representing the average time between 2 purchases in months.
- duration_ref (int or float): Duration of the period of reference for the construction of the variables X.
- duration_projection (int or float): Duration of the period of projection of sales / revenue.
Returns:
DataFrame: Summary DataFrame containing expected customer sales metrics.
"""
# compute the estimated number of tickets and the expected total amount
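# sketch: a segment-level aggregation consistent with the docstring, e.g.
# (the exact KPIs and column names kept in the repository's summary may differ)
df_summary = df.groupby(segment).agg(
    size=(total_amount, "size"),
    nb_tickets_expected=(nb_tickets_expected, "sum"),
    total_amount_expected=(total_amount_expected, "sum"),
    total_amount=(total_amount, "sum"),
    pace_purchase=(pace_purchase, "mean"),
).reset_index()
df_summary["share_total_amount_expected"] = df_summary["total_amount_expected"] / df_summary["total_amount_expected"].sum()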
@ -267,6 +307,14 @@ def summary_expected_CA(df, segment, nb_tickets_expected, total_amount_expected,
def save_file_s3_ca(File_name, type_of_activity):
"""
Saves the current matplotlib figure as a PNG file in S3 storage.
Args:
- File_name (str)
- type_of_activity (str)
"""
image_buffer = io.BytesIO()
plt.savefig(image_buffer, format='png', dpi=120)
image_buffer.seek(0)


@ -1,11 +1,17 @@
# functions for segmentation and associated graphics
def load_model(type_of_activity, model):
"""
Loads the optimal parameters of the chosen ML model, saved as a pickle file, from S3 storage.
Args:
- type_of_activity (str)
- model (str)
Returns:
Model: pre-trained machine learning model wrapped in a scikit-learn pipeline.
"""
BUCKET = f"projet-bdc2324-team1/2_Output/2_1_Modeling_results/standard/{type_of_activity}/{model}/"
filename = model + '.pkl'
file_path = BUCKET + filename
@ -17,6 +23,16 @@ def load_model(type_of_activity, model):
def load_test_file(type_of_activity):
"""
Load the test dataset from S3 storage for the type of activity specified.
Args:
- type_of_activity (str)
Returns:
DataFrame: Test dataset.
"""
file_path_test = f"projet-bdc2324-team1/1_Temp/1_0_Modelling_Datasets/{type_of_activity}/Test_set.csv"
with fs.open(file_path_test, mode="rb") as file_in:
dataset_test = pd.read_csv(file_in, sep=",")
@ -24,6 +40,17 @@ def load_test_file(type_of_activity):
def save_file_s3_mp(File_name, type_of_activity):
"""
Save a matplotlib figure to S3 storage, at the location assigned for the type of activity specified.
Args:
- File_name (str)
- type_of_activity (str)
Returns:
None
"""
image_buffer = io.BytesIO()
plt.savefig(image_buffer, format='png', dpi=110)
image_buffer.seek(0)
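# sketch: the upload step is below the cut; presumably the PNG buffer is written back through the
# s3fs filesystem, with a destination path built from type_of_activity and File_name in the same
# spirit as FILE_PATH_OUT_S3 in save_txt_file_s3 below:
# with fs.open(FILE_PATH_OUT_S3, mode="wb") as s3_file:
#     s3_file.write(image_buffer.read())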
@ -34,12 +61,36 @@ def save_file_s3_mp(File_name, type_of_activity):
plt.close()
def save_txt_file_s3(file_name, type_of_activity, content):
"""
Save a text file to S3 storage, at the location assigned for the type of activity specified.
Args:
- file_name (str)
- type_of_activity (str)
- content (str)
Returns:
None
"""
FILE_PATH = f"projet-bdc2324-team1/2_Output/2_2_Segmentation_and_Marketing_Personae/{type_of_activity}/"
FILE_PATH_OUT_S3 = FILE_PATH + file_name + type_of_activity + '.txt'
with fs.open(FILE_PATH_OUT_S3, 'w') as s3_file:
s3_file.write(content)
def df_business_fig(df, segment, list_var) :
"""
Compute business key performance indicators (KPIs) based on segment-wise aggregation of variables.
Args:
- df (DataFrame): The DataFrame containing data.
- segment (str): The column name representing segments.
- list_var (list of str): The list of variable names to be aggregated.
Returns:
DataFrame: The DataFrame containing business KPIs.
"""
df_business_kpi = df.groupby(segment)[list_var].sum().reset_index()
df_business_kpi.insert(1, "size", df.groupby(segment).size().values)
all_var = ["size"] + list_var
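# usage sketch ("segment" and the KPI column names follow the rest of this module; df_segmented is
# assumed to be a customer-level DataFrame that already carries a "segment" column):
# df_kpi = df_business_fig(df_segmented, "segment", ["nb_tickets", "nb_purchases", "total_amount", "nb_campaigns"])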
@ -49,6 +100,22 @@ def df_business_fig(df, segment, list_var) :
def hist_segment_business_KPIs(df, segment, size, nb_tickets, nb_purchases, total_amount, nb_campaigns, type_of_activity) :
"""
Plot a stacked histogram showing the relative weight of each segment for some key business indicators.
Args:
- df (DataFrame): The DataFrame containing pre-aggregated data about some key business indicators.
- segment (str): The column name representing segments.
- size (str): The column name representing the size.
- nb_tickets (str): The column name representing the number of tickets.
- nb_purchases (str): The column name representing the number of purchases.
- total_amount (str): The column name representing the total amount.
- nb_campaigns (str): The column name representing the number of campaigns.
- type_of_activity (str)
Returns:
None
"""
plt.figure()
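# sketch: the stacking logic is below the cut; a simplified version of the idea (not the
# repository's exact code, np assumed to be numpy imported at module level) converts each
# indicator to segment shares and piles the bars on top of each other
all_var = [size, nb_tickets, nb_purchases, total_amount, nb_campaigns]
pct = df[all_var].div(df[all_var].sum(axis=0), axis=1) * 100   # share of each segment per indicator
bottom = np.zeros(len(all_var))
for i, seg in enumerate(df[segment]):
    plt.bar(all_var, pct.iloc[i], bottom=bottom, label=f"segment {seg}")
    bottom += pct.iloc[i].values
plt.ylabel("share of the indicator (%)")
plt.legend()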
@ -89,6 +156,18 @@ def hist_segment_business_KPIs(df, segment, size, nb_tickets, nb_purchases, tota
def radar_mp_plot(df, categories, index) :
"""
Plot a radar chart describing the marketing persona of the segment associated with index, for the given categories and the type of activity specified.
Args:
- df (DataFrame): The DataFrame containing data about the categories describing the marketing personae associated with each segment.
- categories (list of str): The list of categories (columns of df) displayed on the radar chart.
- index (int): The index (between 0 and 3) identifying the segment. Here, index = number of the segment - 1.
Returns:
None
"""
categories = categories
# true values are used to print the true value in parentheses
@ -118,6 +197,7 @@ def radar_mp_plot(df, categories, index) :
# we first have to draw a transparent line (alpha=0) of the raw values to adjust the radius of the circle,
# which is based on max(value)
# without this transparent line, the radius of the circle would be too small
ax.plot(angles + angles[:1], values + values[:1], color='skyblue', alpha=0, linewidth=1.5)
ax.plot(angles + angles[:1], values_normalized + values_normalized[:1], color='black', alpha = 0.5, linewidth=1.2)
@ -138,6 +218,16 @@ def radar_mp_plot(df, categories, index) :
def radar_mp_plot_all(df, type_of_activity) :
"""
Plot exactly the same radar charts as radar_mp_plot, but for all segments.
Args:
- df (DataFrame)
- type_of_activity (str)
Returns:
None
"""
# table summarizing variables relative to marketing personae
df_mp = df.groupby("segment")[["gender_female", "gender_male", "gender_other", "age"]].mean().reset_index()
@ -190,6 +280,7 @@ def radar_mp_plot_all(df, type_of_activity) :
# we first have to draw a transparent line (alpha=0) of the raw values to adjust the radius of the circle,
# which is based on max(value)
# without this transparent line, the radius of the circle would be too small
ax[row, col].plot(angles + angles[:1], values + values[:1], color='skyblue', alpha=0, linewidth=1.5)
ax[row, col].plot(angles + angles[:1], values_normalized + values_normalized[:1], color='black', alpha = 0.5,
linewidth=1.2)
@ -215,7 +306,18 @@ def radar_mp_plot_all(df, type_of_activity) :
plt.tight_layout()
# plt.show()
def known_sociodemo_caracteristics(df, type_of_activity) :
"""
Compute the share of non-NaN values for some sociodemographic characteristics and save the result in a LaTeX table.
Args:
- df (DataFrame)
- type_of_activity (str)
Returns:
None
"""
table_share_known = df.groupby("segment")[["is_profession_known", "is_zipcode_known", "categorie_age_inconnue", "gender_other"]].mean().mul(100).reset_index()
table_share_known.columns = ['Segment', 'Share of Known Profession (%)', 'Share of Known Zipcode (%)', 'Share of Unknown Age (%)', 'Share of Unknown Gender (%)']
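# sketch: the export step is not shown here; presumably the table is rendered with to_latex and
# written through save_txt_file_s3 (the file name below is illustrative, not taken from the repository)
content = table_share_known.to_latex(index=False, float_format="%.1f")
save_txt_file_s3("table_known_sociodemo_caracteristics_", type_of_activity, content)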