added functions documentation
This commit is contained in:
parent
f5b6075431
commit
68b68ed3da
|
@ -13,6 +13,17 @@ import io
|
|||
# functions
|
||||
|
||||
def load_train_test(type_of_activity):
|
||||
"""
|
||||
Loads the training and test datasets from S3 storage for the type of activity specified.
|
||||
|
||||
Args:
|
||||
- type_of_activity (str)
|
||||
|
||||
Returns:
|
||||
DataFrame: Training dataset.
|
||||
DataFrame: Test dataset.
|
||||
"""
|
||||
|
||||
# BUCKET = f"projet-bdc2324-team1/Generalization/{type_of_activity}"
|
||||
BUCKET = f"projet-bdc2324-team1/1_Temp/1_0_Modelling_Datasets/{type_of_activity}"
|
||||
File_path_train = BUCKET + "/Train_set.csv"
|
||||
|
@ -30,17 +41,23 @@ def load_train_test(type_of_activity):
|
|||
|
||||
|
||||
def features_target_split(dataset_train, dataset_test):
|
||||
"""
|
||||
Splits the dataset into features and target variables for training and testing.
|
||||
|
||||
Args:
|
||||
- dataset_train (DataFrame): Training dataset.
|
||||
- dataset_test (DataFrame): Test dataset.
|
||||
|
||||
Returns:
|
||||
DataFrame: Features of the training dataset.
|
||||
DataFrame: Features of the test dataset.
|
||||
DataFrame: Target variable of the training dataset.
|
||||
DataFrame: Target variable of the test dataset.
|
||||
"""
|
||||
|
||||
features_l = ['nb_tickets', 'nb_purchases', 'total_amount', 'nb_suppliers', 'vente_internet_max', 'purchase_date_min', 'purchase_date_max',
|
||||
'time_between_purchase', 'fidelity', 'is_email_true', 'opt_in', #'is_partner', 'nb_tickets_internet',
|
||||
'gender_female', 'gender_male', 'gender_other', 'nb_campaigns', 'nb_campaigns_opened']
|
||||
|
||||
# we remove fidelity, time between purchase, and gender other (collinearity issue)
|
||||
"""
|
||||
features_l = ['nb_tickets', 'nb_purchases', 'total_amount', 'nb_suppliers', 'vente_internet_max',
|
||||
'purchase_date_min', 'purchase_date_max', 'nb_tickets_internet', 'is_email_true',
|
||||
'opt_in', 'gender_female', 'gender_male', 'nb_campaigns', 'nb_campaigns_opened']
|
||||
"""
|
||||
|
||||
X_train = dataset_train # [features_l]
|
||||
y_train = dataset_train[['y_has_purchased']]
|
||||
|
@ -52,6 +69,17 @@ def features_target_split(dataset_train, dataset_test):
|
|||
|
||||
|
||||
def load_model(type_of_activity, model):
|
||||
"""
|
||||
Loads from S3 storage the optimal parameters of the chosen ML model saved in a pickle file.
|
||||
|
||||
Args:
|
||||
- type_of_activity (str)
|
||||
- model (str)
|
||||
|
||||
Returns:
|
||||
Model: machine learning model pre-trained with a scikit learn pipeline.
|
||||
"""
|
||||
|
||||
# BUCKET = f"projet-bdc2324-team1/Output_model/{type_of_activity}/{model}/"
|
||||
BUCKET = f"projet-bdc2324-team1/2_Output/2_1_Modeling_results/standard/{type_of_activity}/{model}/"
|
||||
filename = model + '.pkl'
|
||||
|
@ -64,6 +92,17 @@ def load_model(type_of_activity, model):
|
|||
|
||||
|
||||
def df_segment(df, y, model) :
|
||||
"""
|
||||
Segments customers into 4 groups based on the propensity scores given by a previously-loaded ML model.
|
||||
|
||||
Args:
|
||||
- df (DataFrame): DataFrame to be segmented.
|
||||
- y (Series): True target variable.
|
||||
- model (Model): Pre-trained machine learning model for prediction.
|
||||
|
||||
Returns:
|
||||
DataFrame: Segmented DataFrame with predicted values and true values for y.
|
||||
"""
|
||||
|
||||
y_pred = model.predict(df)
|
||||
y_pred_prob = model.predict_proba(df)[:, 1]
|
||||
|
@ -83,7 +122,7 @@ def df_segment(df, y, model) :
|
|||
def odd_ratio(score) :
|
||||
"""
|
||||
Args:
|
||||
- score (Union[float, int]): Score value.
|
||||
- score (Union[float, int])
|
||||
|
||||
Returns:
|
||||
float: Odd ratio value.
|
||||
|
@ -98,7 +137,7 @@ def adjust_score_1(score) :
|
|||
This then allows odd ratios to be computed.
|
||||
|
||||
Args:
|
||||
- score (List[Union[float, int]]): List of score values.
|
||||
- score (List[Union[float, int]])
|
||||
|
||||
Returns:
|
||||
np.ndarray: Adjusted score values.
|
||||
|
@ -114,8 +153,8 @@ def adjusted_score(odd_ratio, bias) :
|
|||
Adjust the score based on the odd ratio and bias.
|
||||
|
||||
Args:
|
||||
- odd_ratio (Union[float, int]): Odd ratio value.
|
||||
- bias (Union[float, int]): Bias value.
|
||||
- odd_ratio (Union[float, int])
|
||||
- bias (Union[float, int])
|
||||
|
||||
Returns:
|
||||
float: Adjusted score value.
|
||||
|
@ -127,12 +166,12 @@ def adjusted_score(odd_ratio, bias) :
|
|||
|
||||
def find_bias(odd_ratios, y_objective, initial_guess=10) :
|
||||
"""
|
||||
Find the bias needed to adjust scores according to the purchases observed
|
||||
Find the bias needed to adjust scores so that their sum is equal to the total number of purchases observed.
|
||||
|
||||
Args:
|
||||
- odd_ratios (List[float]): List of odd ratios.
|
||||
- y_objective (Union[float, int]): Objective value to achieve.
|
||||
- initial_guess (Union[float, int], optional): Initial guess for the bias. Default is 6.
|
||||
- odd_ratios (List[float]): List of odd ratios associated with the scores that have to be adjusted.
|
||||
- y_objective (Union[float, int]): Objective value => total number of purchases.
|
||||
- initial_guess (Union[float, int], optional): Initial guess for the bias. Default is 10 (bias is approximately 6 for sports, 10 for music and 22 for museums)
|
||||
|
||||
Returns:
|
||||
float: Estimated bias value.
|
||||
|
@ -168,22 +207,20 @@ def plot_hist_scores(df, score, score_adjusted, type_of_activity) :
|
|||
|
||||
|
||||
def project_tickets_CA (df, nb_purchases, nb_tickets, total_amount, score_adjusted, duration_ref, duration_projection) :
|
||||
|
||||
"""
|
||||
Project ticket counts and total amount for a given duration and adjust based on a score.
|
||||
Project tickets sold and total amount based on the adjusted scores and the duration of periods of study / projection.
|
||||
|
||||
Args:
|
||||
- df (DataFrame): DataFrame containing ticket data.
|
||||
- df (DataFrame): DataFrame containing information about past sales.
|
||||
- nb_purchases (str) : Name of the column in df representing the number of purchases.
|
||||
- nb_tickets (str): Name of the column in df representing the number of tickets.
|
||||
- total_amount (str): Name of the column in df representing the total amount.
|
||||
- score_adjusted (str): Name of the column in df representing the adjusted score.
|
||||
- duration_ref (int or float): duration of the period of reference for the construction of the variables X.
|
||||
- duration_ref (int or float): Duration of the period of reference for the construction of the variables X.
|
||||
- duration_projection (int or float): Duration of the period of projection of sales / revenue.
|
||||
|
||||
Returns:
|
||||
DataFrame: DataFrame with projected ticket counts and total amount adjusted based on the score.
|
||||
duration_ratio = duration_ref/duration_projection
|
||||
DataFrame: DataFrame completed with sales and total amount projections.
|
||||
"""
|
||||
|
||||
duration_ratio = duration_ref/duration_projection
|
||||
|
@ -229,7 +266,7 @@ def project_tickets_CA (df, nb_purchases, nb_tickets, total_amount, score_adjust
|
|||
def summary_expected_CA(df, segment, nb_tickets_expected, total_amount_expected, total_amount, pace_purchase,
|
||||
duration_ref=17, duration_projection=12) :
|
||||
"""
|
||||
Generate a summary of expected customer acquisition based on segments.
|
||||
Generate a summary of expected customer sales based on segments.
|
||||
|
||||
Args:
|
||||
- df (DataFrame): DataFrame containing customer data.
|
||||
|
@ -237,9 +274,12 @@ def summary_expected_CA(df, segment, nb_tickets_expected, total_amount_expected,
|
|||
- nb_tickets_expected (str): Name of the column in df representing the expected number of tickets.
|
||||
- total_amount_expected (str): Name of the column in df representing the expected total amount.
|
||||
- total_amount (str): Name of the column in df representing the total amount.
|
||||
- pace_purchase (str) : Name of the column in df representing the average time between 2 purchases in months.
|
||||
- duration_ref (int or float): Duration of the period of reference for the construction of the variables X.
|
||||
- duration_projection (int or float): Duration of the period of projection of sales / revenue.
|
||||
|
||||
Returns:
|
||||
DataFrame: Summary DataFrame containing expected customer acquisition metrics.
|
||||
DataFrame: Summary DataFrame containing expected customer sales metrics.
|
||||
"""
|
||||
|
||||
# compute nb tickets estimated and total amount expected
|
||||
|
@ -267,6 +307,14 @@ def summary_expected_CA(df, segment, nb_tickets_expected, total_amount_expected,
|
|||
|
||||
|
||||
def save_file_s3_ca(File_name, type_of_activity):
|
||||
"""
|
||||
Saves a file in S3 storage.
|
||||
|
||||
Args:
|
||||
- File_name (str)
|
||||
- type_of_activity (str)
|
||||
"""
|
||||
|
||||
image_buffer = io.BytesIO()
|
||||
plt.savefig(image_buffer, format='png', dpi=120)
|
||||
image_buffer.seek(0)
|
||||
|
|
|
@ -1,11 +1,17 @@
|
|||
### importations ###
|
||||
|
||||
|
||||
|
||||
|
||||
### functions for segmentation and graphics associated ###
|
||||
# functions for segmentation and graphics associated
|
||||
|
||||
def load_model(type_of_activity, model):
|
||||
"""
|
||||
Loads from S3 storage the optimal parameters of the chosen ML model saved in a pickle file.
|
||||
|
||||
Args:
|
||||
- type_of_activity (str)
|
||||
- model (str)
|
||||
|
||||
Returns:
|
||||
Model: machine learning model pre-trained with a scikit learn pipeline.
|
||||
"""
|
||||
|
||||
BUCKET = f"projet-bdc2324-team1/2_Output/2_1_Modeling_results/standard/{type_of_activity}/{model}/"
|
||||
filename = model + '.pkl'
|
||||
file_path = BUCKET + filename
|
||||
|
@ -17,6 +23,16 @@ def load_model(type_of_activity, model):
|
|||
|
||||
|
||||
def load_test_file(type_of_activity):
|
||||
"""
|
||||
Load the test dataset from S3 storage for the type of activity specified.
|
||||
|
||||
Args:
|
||||
- type_of_activity (str)
|
||||
|
||||
Returns:
|
||||
DataFrame: Test dataset.
|
||||
"""
|
||||
|
||||
file_path_test = f"projet-bdc2324-team1/1_Temp/1_0_Modelling_Datasets/{type_of_activity}/Test_set.csv"
|
||||
with fs.open(file_path_test, mode="rb") as file_in:
|
||||
dataset_test = pd.read_csv(file_in, sep=",")
|
||||
|
@ -24,6 +40,17 @@ def load_test_file(type_of_activity):
|
|||
|
||||
|
||||
def save_file_s3_mp(File_name, type_of_activity):
|
||||
"""
|
||||
Save a matplotlib figure to S3 storage to the location assigned for the type of activity specified.
|
||||
|
||||
Args:
|
||||
- File_name (str)
|
||||
- type_of_activity (str)
|
||||
|
||||
Returns:
|
||||
None
|
||||
"""
|
||||
|
||||
image_buffer = io.BytesIO()
|
||||
plt.savefig(image_buffer, format='png', dpi=110)
|
||||
image_buffer.seek(0)
|
||||
|
@ -34,12 +61,36 @@ def save_file_s3_mp(File_name, type_of_activity):
|
|||
plt.close()
|
||||
|
||||
def save_txt_file_s3(file_name, type_of_activity, content):
|
||||
"""
|
||||
Save a text file to S3 storage to the location assigned for the type of activity specified.
|
||||
|
||||
Args:
|
||||
- file_name (str)
|
||||
- type_of_activity (str)
|
||||
- content (str)
|
||||
|
||||
Returns:
|
||||
None
|
||||
"""
|
||||
|
||||
FILE_PATH = f"projet-bdc2324-team1/2_Output/2_2_Segmentation_and_Marketing_Personae/{type_of_activity}/"
|
||||
FILE_PATH_OUT_S3 = FILE_PATH + file_name + type_of_activity + '.txt'
|
||||
with fs.open(FILE_PATH_OUT_S3, 'w') as s3_file:
|
||||
s3_file.write(content)
|
||||
|
||||
def df_business_fig(df, segment, list_var) :
|
||||
"""
|
||||
Compute business key performance indicators (KPIs) based on segment-wise aggregation of variables.
|
||||
|
||||
Args:
|
||||
- df (DataFrame): The DataFrame containing data.
|
||||
- segment (str): The column name representing segments.
|
||||
- list_var (list of str): The list of variable names to be aggregated.
|
||||
|
||||
Returns:
|
||||
DataFrame: The DataFrame containing business KPIs.
|
||||
"""
|
||||
|
||||
df_business_kpi = df.groupby(segment)[list_var].sum().reset_index()
|
||||
df_business_kpi.insert(1, "size", df.groupby(segment).size().values)
|
||||
all_var = ["size"] + list_var
|
||||
|
@ -49,6 +100,22 @@ def df_business_fig(df, segment, list_var) :
|
|||
|
||||
|
||||
def hist_segment_business_KPIs(df, segment, size, nb_tickets, nb_purchases, total_amount, nb_campaigns, type_of_activity) :
|
||||
"""
|
||||
Plot a histogram stacking the relative weight of each segment regarding some key business indicators.
|
||||
|
||||
Args:
|
||||
- df (DataFrame): The DataFrame containing pre aggregated data about some key business indicators
|
||||
- segment (str): The column name representing segments.
|
||||
- size (str): The column name representing the size.
|
||||
- nb_tickets (str): The column name representing the number of tickets.
|
||||
- nb_purchases (str): The column name representing the number of purchases.
|
||||
- total_amount (str): The column name representing the total amount.
|
||||
- nb_campaigns (str): The column name representing the number of campaigns.
|
||||
- type_of_activity (str)
|
||||
|
||||
Returns:
|
||||
None
|
||||
"""
|
||||
|
||||
plt.figure()
|
||||
|
||||
|
@ -89,6 +156,18 @@ def hist_segment_business_KPIs(df, segment, size, nb_tickets, nb_purchases, tota
|
|||
|
||||
|
||||
def radar_mp_plot(df, categories, index) :
|
||||
"""
|
||||
Plot a radar chart describing marketing personae of the segment associated to index for the given categories, for the type of activity specified.
|
||||
|
||||
Args:
|
||||
- df (DataFrame): The DataFrame containing data about categories describing the marketing personae associated to each segment
|
||||
- categories (list of str): Names of the categories displayed on the radar chart.
|
||||
- index (int): The index (between 0 and 3) identifying the segment. Here, index = number of the segment - 1
|
||||
|
||||
Returns:
|
||||
None
|
||||
"""
|
||||
|
||||
categories = categories
|
||||
|
||||
# true values are used to print the true value in parenthesis
|
||||
|
@ -118,6 +197,7 @@ def radar_mp_plot(df, categories, index) :
|
|||
|
||||
# we have to draw first a transparent line (alpha=0) of values to adjust the radius of the circle
|
||||
# which is based on max(value)
|
||||
# if we don't plot this transparent line, the radius of the circle will be too small
|
||||
ax.plot(angles + angles[:1], values + values[:1], color='skyblue', alpha=0, linewidth=1.5)
|
||||
ax.plot(angles + angles[:1], values_normalized + values_normalized[:1], color='black', alpha = 0.5, linewidth=1.2)
|
||||
|
||||
|
@ -138,6 +218,16 @@ def radar_mp_plot(df, categories, index) :
|
|||
|
||||
|
||||
def radar_mp_plot_all(df, type_of_activity) :
|
||||
"""
|
||||
Plot exactly the same radar charts as radar_mp_plot, but for all segments.
|
||||
|
||||
Args:
|
||||
- df (DataFrame)
|
||||
- type_of_activity (str)
|
||||
|
||||
Returns:
|
||||
None
|
||||
"""
|
||||
|
||||
# table summarizing variables relative to marketing personae
|
||||
df_mp = df.groupby("segment")[["gender_female", "gender_male", "gender_other", "age"]].mean().reset_index()
|
||||
|
@ -190,6 +280,7 @@ def radar_mp_plot_all(df, type_of_activity) :
|
|||
|
||||
# we have to draw first a transparent line (alpha=0) of values to adjust the radius of the circle
|
||||
# which is based on max(value)
|
||||
# if we don't plot this transparent line, the radius of the circle will be too small
|
||||
ax[row, col].plot(angles + angles[:1], values + values[:1], color='skyblue', alpha=0, linewidth=1.5)
|
||||
ax[row, col].plot(angles + angles[:1], values_normalized + values_normalized[:1], color='black', alpha = 0.5,
|
||||
linewidth=1.2)
|
||||
|
@ -215,7 +306,18 @@ def radar_mp_plot_all(df, type_of_activity) :
|
|||
plt.tight_layout()
|
||||
# plt.show()
|
||||
|
||||
|
||||
def known_sociodemo_caracteristics(df, type_of_activity) :
|
||||
"""
|
||||
Compute the share of non-NaN values for some sociodemographic characteristics and save the result in a LaTeX table.
|
||||
|
||||
Args:
|
||||
- df (DataFrame)
|
||||
- type_of_activity (str)
|
||||
|
||||
Returns:
|
||||
None
|
||||
"""
|
||||
|
||||
table_share_known = df.groupby("segment")[["is_profession_known", "is_zipcode_known", "categorie_age_inconnue", "gender_other"]].mean().mul(100).reset_index()
|
||||
table_share_known.columns = ['Segment', 'Share of Known Profession (%)', 'Share of Known Zipcode (%)', 'Share of Unknown Age (%)', 'Share of Unknown Gender (%)']
|
||||
|
|
Loading…
Reference in New Issue
Block a user