added functions documentation

Thomas PIQUE 2024-04-04 14:29:16 +00:00
parent f5b6075431
commit 68b68ed3da
2 changed files with 179 additions and 29 deletions


@ -13,6 +13,17 @@ import io
# functions
def load_train_test(type_of_activity):
"""
Loads the training and test datasets from S3 storage for the type of activity specified.
Args:
- type_of_activity (str)
Returns:
DataFrame: Training dataset.
DataFrame: Test dataset.
"""
# BUCKET = f"projet-bdc2324-team1/Generalization/{type_of_activity}"
BUCKET = f"projet-bdc2324-team1/1_Temp/1_0_Modelling_Datasets/{type_of_activity}"
File_path_train = BUCKET + "/Train_set.csv"
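# sketch: the rest of load_train_test is cut off by this hunk; it presumably mirrors the
# fs.open / pd.read_csv pattern of load_test_file (fs is assumed to be the s3fs filesystem and
# pd the pandas module already imported at module level)
File_path_test = BUCKET + "/Test_set.csv"
with fs.open(File_path_train, mode="rb") as file_in:
    dataset_train = pd.read_csv(file_in, sep=",")
with fs.open(File_path_test, mode="rb") as file_in:
    dataset_test = pd.read_csv(file_in, sep=",")
return dataset_train, dataset_test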
@ -30,17 +41,23 @@ def load_train_test(type_of_activity):
def features_target_split(dataset_train, dataset_test):
"""
Splits the dataset into features and target variables for training and testing.
Args:
- dataset_train (DataFrame): Training dataset.
- dataset_test (DataFrame): Test dataset.
Returns:
DataFrame: Features of the training dataset.
DataFrame: Features of the test dataset.
DataFrame: Target variable of the training dataset.
DataFrame: Target variable of the test dataset.
"""
features_l = ['nb_tickets', 'nb_purchases', 'total_amount', 'nb_suppliers', 'vente_internet_max', 'purchase_date_min', 'purchase_date_max',
'time_between_purchase', 'fidelity', 'is_email_true', 'opt_in', #'is_partner', 'nb_tickets_internet',
'gender_female', 'gender_male', 'gender_other', 'nb_campaigns', 'nb_campaigns_opened']
# we drop fidelity, time_between_purchase, and gender_other (collinearity issue)
"""
features_l = ['nb_tickets', 'nb_purchases', 'total_amount', 'nb_suppliers', 'vente_internet_max',
'purchase_date_min', 'purchase_date_max', 'nb_tickets_internet', 'is_email_true',
'opt_in', 'gender_female', 'gender_male', 'nb_campaigns', 'nb_campaigns_opened']
"""
X_train = dataset_train # [features_l]
y_train = dataset_train[['y_has_purchased']]
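# sketch: the symmetric split for the test set presumably follows, along the lines of
# (return order taken from the docstring above)
X_test = dataset_test # [features_l]
y_test = dataset_test[['y_has_purchased']]
return X_train, X_test, y_train, y_test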
@ -52,6 +69,17 @@ def features_target_split(dataset_train, dataset_test):
def load_model(type_of_activity, model):
"""
Loads the optimal parameters of the chosen ML model, saved as a pickle file, from S3 storage.
Args:
- type_of_activity (str)
- model (str)
Returns:
Model: pre-trained machine learning model wrapped in a scikit-learn pipeline.
"""
# BUCKET = f"projet-bdc2324-team1/Output_model/{type_of_activity}/{model}/"
BUCKET = f"projet-bdc2324-team1/2_Output/2_1_Modeling_results/standard/{type_of_activity}/{model}/"
filename = model + '.pkl'
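# sketch: the loading step is not shown in this hunk; presumably the pickle is read back through
# the s3fs filesystem, e.g. (variable names illustrative, pickle assumed imported at module level)
file_path = BUCKET + filename
with fs.open(file_path, mode="rb") as model_file:
    fitted_pipeline = pickle.load(model_file)
return fitted_pipeline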
@ -64,6 +92,17 @@ def load_model(type_of_activity, model):
def df_segment(df, y, model) :
"""
Segments customers into 4 groups based on the propensity scores given by a previously-loaded ML model.
Args:
- df (DataFrame): DataFrame to be segmented.
- y (Series): True target variable.
- model (Model): Pre-trained machine learning model for prediction.
Returns:
DataFrame: Segmented DataFrame with predicted values and true values for y.
"""
y_pred = model.predict(df)
y_pred_prob = model.predict_proba(df)[:, 1]
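# sketch: the binning rule itself is below the cut; one plausible reading of "4 groups based on
# the propensity scores" is a cut on fixed probability bands ("segment" matches the column name
# used by the plotting functions below; the other column names and the exact thresholds used in
# the repository are illustrative)
df_segmented = df.copy()
df_segmented["has_purchased"] = y
df_segmented["has_purchased_estim"] = y_pred
df_segmented["score"] = y_pred_prob
df_segmented["segment"] = pd.cut(y_pred_prob, bins=[0.0, 0.25, 0.5, 0.75, 1.0],
                                 labels=[1, 2, 3, 4], include_lowest=True)
return df_segmented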
@ -83,7 +122,7 @@ def df_segment(df, y, model) :
def odd_ratio(score) :
"""
Args:
- score (Union[float, int]): Propensity score (probability of purchase).
Returns:
float: Odd ratio value.
@ -98,7 +137,7 @@ def adjust_score_1(score) :
Odd ratios can then be computed.
Args:
- score (List[Union[float, int]]): List of score values to adjust.
Returns:
np.ndarray: Adjusted score values.
@ -114,8 +153,8 @@ def adjusted_score(odd_ratio, bias) :
Adjust the score based on the odd ratio and bias.
Args:
- odd_ratio (Union[float, int]): Odd ratio derived from the score.
- bias (Union[float, int]): Bias used to adjust the score.
Returns:
float: Adjusted score value.
@ -127,12 +166,12 @@ def adjusted_score(odd_ratio, bias) :
def find_bias(odd_ratios, y_objective, initial_guess=10) :
"""
Find the bias needed to adjust scores so that their sum is equal to the total number of purchases observed.
Args:
- odd_ratios (List[float]): List of odd ratios associated with the scores to be adjusted.
- y_objective (Union[float, int]): Objective value, i.e. the total number of purchases observed.
- initial_guess (Union[float, int], optional): Initial guess for the bias. Default is 10 (the bias is approximately 6 for sports, 10 for music and 22 for museums).
Returns:
float: Estimated bias value.
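# sketch: the bodies of odd_ratio, adjusted_score and find_bias are not shown in this diff; their
# docstrings match the standard odds-based recalibration, which would look roughly like this
# (the formulas and the use of scipy's fsolve are assumptions, not taken from the repository)
import numpy as np
from scipy.optimize import fsolve

def _odd_ratio(score):
    return score / (1 - score)                        # odds associated with a probability score

def _adjusted_score(odd_ratio, bias):
    return bias * odd_ratio / (1 + bias * odd_ratio)  # back to a probability, rescaled by the bias

def _find_bias(odd_ratios, y_objective, initial_guess=10):
    # bias such that the adjusted scores sum to the number of purchases actually observed
    gap = lambda bias: np.sum(_adjusted_score(np.asarray(odd_ratios), bias)) - y_objective
    return fsolve(gap, x0=initial_guess)[0]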
@ -168,22 +207,20 @@ def plot_hist_scores(df, score, score_adjusted, type_of_activity) :
def project_tickets_CA (df, nb_purchases, nb_tickets, total_amount, score_adjusted, duration_ref, duration_projection) :
"""
Project tickets sold and total amount based on the adjusted scores and on the durations of the reference and projection periods.
Args:
- df (DataFrame): DataFrame containing information about past sales.
- nb_purchases (str) : Name of the column in df representing the number of purchases.
- nb_tickets (str): Name of the column in df representing the number of tickets.
- total_amount (str): Name of the column in df representing the total amount.
- score_adjusted (str): Name of the column in df representing the adjusted score.
- duration_ref (int or float): Duration of the period of reference for the construction of the variables X.
- duration_projection (int or float): Duration of the period of projection of sales / revenue.
Returns:
DataFrame: DataFrame completed with sales and total amount projections.
"""
duration_ratio = duration_ref/duration_projection
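# sketch: the projection formulas are below the cut; a natural reading of the docstring is that
# past quantities are rescaled to the projection period and weighted by the adjusted score, e.g.
# (the output column names reuse those expected by summary_expected_CA; the exact formula may differ)
df_output = df.copy()
df_output["nb_tickets_expected"] = df_output[score_adjusted] * df_output[nb_tickets] / duration_ratio
df_output["total_amount_expected"] = df_output[score_adjusted] * df_output[total_amount] / duration_ratio
return df_output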
@ -229,7 +266,7 @@ def project_tickets_CA (df, nb_purchases, nb_tickets, total_amount, score_adjust
def summary_expected_CA(df, segment, nb_tickets_expected, total_amount_expected, total_amount, pace_purchase,
duration_ref=17, duration_projection=12) :
"""
Generate a summary of expected customer sales based on segments.
Args:
- df (DataFrame): DataFrame containing customer data.
@ -237,9 +274,12 @@ def summary_expected_CA(df, segment, nb_tickets_expected, total_amount_expected,
- nb_tickets_expected (str): Name of the column in df representing the expected number of tickets.
- total_amount_expected (str): Name of the column in df representing the expected total amount.
- total_amount (str): Name of the column in df representing the total amount.
- pace_purchase (str) : Name of the column in df representing the average time between 2 purchases in months.
- duration_ref (int or float): Duration of the period of reference for the construction of the variables X.
- duration_projection (int or float): Duration of the period of projection of sales / revenue.
Returns:
DataFrame: Summary DataFrame containing expected customer sales metrics.
"""
# compute the estimated number of tickets and the expected total amount
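# sketch: a segment-level aggregation consistent with the docstring, e.g.
# (the exact KPIs and column names kept in the repository's summary may differ)
df_summary = df.groupby(segment).agg(
    size=(total_amount, "size"),
    nb_tickets_expected=(nb_tickets_expected, "sum"),
    total_amount_expected=(total_amount_expected, "sum"),
    total_amount=(total_amount, "sum"),
    pace_purchase=(pace_purchase, "mean"),
).reset_index()
df_summary["share_total_amount_expected"] = df_summary["total_amount_expected"] / df_summary["total_amount_expected"].sum()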
@ -267,6 +307,14 @@ def summary_expected_CA(df, segment, nb_tickets_expected, total_amount_expected,
def save_file_s3_ca(File_name, type_of_activity):
"""
Saves the current matplotlib figure as a PNG file in S3 storage.
Args:
- File_name (str)
- type_of_activity (str)
"""
image_buffer = io.BytesIO()
plt.savefig(image_buffer, format='png', dpi=120)
image_buffer.seek(0)


@ -1,11 +1,17 @@
# functions for segmentation and associated graphics
def load_model(type_of_activity, model):
"""
Loads the optimal parameters of the chosen ML model, saved as a pickle file, from S3 storage.
Args:
- type_of_activity (str)
- model (str)
Returns:
Model: pre-trained machine learning model wrapped in a scikit-learn pipeline.
"""
BUCKET = f"projet-bdc2324-team1/2_Output/2_1_Modeling_results/standard/{type_of_activity}/{model}/"
filename = model + '.pkl'
file_path = BUCKET + filename
@ -17,6 +23,16 @@ def load_model(type_of_activity, model):
def load_test_file(type_of_activity):
"""
Load the test dataset from S3 storage for the type of activity specified.
Args:
- type_of_activity (str)
Returns:
DataFrame: Test dataset.
"""
file_path_test = f"projet-bdc2324-team1/1_Temp/1_0_Modelling_Datasets/{type_of_activity}/Test_set.csv"
with fs.open(file_path_test, mode="rb") as file_in:
dataset_test = pd.read_csv(file_in, sep=",")
@ -24,6 +40,17 @@ def load_test_file(type_of_activity):
def save_file_s3_mp(File_name, type_of_activity):
"""
Save a matplotlib figure to S3 storage, at the location assigned for the type of activity specified.
Args:
- File_name (str)
- type_of_activity (str)
Returns:
None
"""
image_buffer = io.BytesIO()
plt.savefig(image_buffer, format='png', dpi=110)
image_buffer.seek(0)
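# sketch: the upload step is below the cut; presumably the PNG buffer is written back through the
# s3fs filesystem, with a destination path built from type_of_activity and File_name in the same
# spirit as FILE_PATH_OUT_S3 in save_txt_file_s3 below:
# with fs.open(FILE_PATH_OUT_S3, mode="wb") as s3_file:
#     s3_file.write(image_buffer.read())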
@ -34,12 +61,36 @@ def save_file_s3_mp(File_name, type_of_activity):
plt.close()
def save_txt_file_s3(file_name, type_of_activity, content):
"""
Save a text file to S3 storage, at the location assigned for the type of activity specified.
Args:
- file_name (str)
- type_of_activity (str)
- content (str)
Returns:
None
"""
FILE_PATH = f"projet-bdc2324-team1/2_Output/2_2_Segmentation_and_Marketing_Personae/{type_of_activity}/"
FILE_PATH_OUT_S3 = FILE_PATH + file_name + type_of_activity + '.txt'
with fs.open(FILE_PATH_OUT_S3, 'w') as s3_file:
s3_file.write(content)
def df_business_fig(df, segment, list_var) :
"""
Compute business key performance indicators (KPIs) based on segment-wise aggregation of variables.
Args:
- df (DataFrame): The DataFrame containing data.
- segment (str): The column name representing segments.
- list_var (list of str): The list of variable names to be aggregated.
Returns:
DataFrame: The DataFrame containing business KPIs.
"""
df_business_kpi = df.groupby(segment)[list_var].sum().reset_index()
df_business_kpi.insert(1, "size", df.groupby(segment).size().values)
all_var = ["size"] + list_var
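# usage sketch ("segment" and the KPI column names follow the rest of this module; df_segmented is
# assumed to be a customer-level DataFrame that already carries a "segment" column):
# df_kpi = df_business_fig(df_segmented, "segment", ["nb_tickets", "nb_purchases", "total_amount", "nb_campaigns"])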
@ -49,6 +100,22 @@ def df_business_fig(df, segment, list_var) :
def hist_segment_business_KPIs(df, segment, size, nb_tickets, nb_purchases, total_amount, nb_campaigns, type_of_activity) :
"""
Plot a stacked histogram showing the relative weight of each segment for some key business indicators.
Args:
- df (DataFrame): The DataFrame containing pre-aggregated data about some key business indicators.
- segment (str): The column name representing segments.
- size (str): The column name representing the size.
- nb_tickets (str): The column name representing the number of tickets.
- nb_purchases (str): The column name representing the number of purchases.
- total_amount (str): The column name representing the total amount.
- nb_campaigns (str): The column name representing the number of campaigns.
- type_of_activity (str)
Returns:
None
"""
plt.figure()
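# sketch: the stacking logic is below the cut; a simplified version of the idea (not the
# repository's exact code, np assumed to be numpy imported at module level) converts each
# indicator to segment shares and piles the bars on top of each other
all_var = [size, nb_tickets, nb_purchases, total_amount, nb_campaigns]
pct = df[all_var].div(df[all_var].sum(axis=0), axis=1) * 100   # share of each segment per indicator
bottom = np.zeros(len(all_var))
for i, seg in enumerate(df[segment]):
    plt.bar(all_var, pct.iloc[i], bottom=bottom, label=f"segment {seg}")
    bottom += pct.iloc[i].values
plt.ylabel("share of the indicator (%)")
plt.legend()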
@ -89,6 +156,18 @@ def hist_segment_business_KPIs(df, segment, size, nb_tickets, nb_purchases, tota
def radar_mp_plot(df, categories, index) :
"""
Plot a radar chart describing the marketing persona of the segment associated with index, for the given categories and the type of activity specified.
Args:
- df (DataFrame): The DataFrame containing data about the categories describing the marketing personae associated with each segment.
- categories (list of str): The list of categories (columns of df) displayed on the radar chart.
- index (int): The index (between 0 and 3) identifying the segment. Here, index = number of the segment - 1.
Returns:
None
"""
categories = categories
# true values are used to print the true value in parentheses
@ -118,6 +197,7 @@ def radar_mp_plot(df, categories, index) :
# we first have to draw a transparent line (alpha=0) of the raw values to adjust the radius of the circle,
# which is based on max(value)
# without this transparent line, the radius of the circle would be too small
ax.plot(angles + angles[:1], values + values[:1], color='skyblue', alpha=0, linewidth=1.5)
ax.plot(angles + angles[:1], values_normalized + values_normalized[:1], color='black', alpha = 0.5, linewidth=1.2)
@ -138,6 +218,16 @@ def radar_mp_plot(df, categories, index) :
def radar_mp_plot_all(df, type_of_activity) :
"""
Plot exactly the same radar charts as radar_mp_plot, but for all segments.
Args:
- df (DataFrame)
- type_of_activity (str)
Returns:
None
"""
# table summarizing variables relative to marketing personae
df_mp = df.groupby("segment")[["gender_female", "gender_male", "gender_other", "age"]].mean().reset_index()
@ -190,6 +280,7 @@ def radar_mp_plot_all(df, type_of_activity) :
# we first have to draw a transparent line (alpha=0) of the raw values to adjust the radius of the circle,
# which is based on max(value)
# without this transparent line, the radius of the circle would be too small
ax[row, col].plot(angles + angles[:1], values + values[:1], color='skyblue', alpha=0, linewidth=1.5)
ax[row, col].plot(angles + angles[:1], values_normalized + values_normalized[:1], color='black', alpha = 0.5,
linewidth=1.2)
@ -215,7 +306,18 @@ def radar_mp_plot_all(df, type_of_activity) :
plt.tight_layout()
# plt.show()
def known_sociodemo_caracteristics(df, type_of_activity) :
"""
Compute the share of non-NaN values for some sociodemographic characteristics and save the result in a LaTeX table.
Args:
- df (DataFrame)
- type_of_activity (str)
Returns:
None
"""
table_share_known = df.groupby("segment")[["is_profession_known", "is_zipcode_known", "categorie_age_inconnue", "gender_other"]].mean().mul(100).reset_index()
table_share_known.columns = ['Segment', 'Share of Known Profession (%)', 'Share of Known Zipcode (%)', 'Share of Unknown Age (%)', 'Share of Unknown Gender (%)']
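# sketch: the export step is not shown here; presumably the table is rendered with to_latex and
# written through save_txt_file_s3 (the file name below is illustrative, not taken from the repository)
content = table_share_known.to_latex(index=False, float_format="%.1f")
save_txt_file_s3("table_known_sociodemo_caracteristics_", type_of_activity, content)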