# importations
import io
import os
import pickle
import warnings

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import s3fs
from pandas import DataFrame
from scipy.optimize import fsolve

# NOTE(review): `fs` is used by the S3 helpers below but is never defined in
# this file. It is presumably an `s3fs.S3FileSystem` instance created by the
# notebook/script that loads these helpers -- confirm against the caller.


# functions
def load_train_test(type_of_activity):
    """
    Load the train and test modelling datasets of one type of activity from S3.

    Args:
    - type_of_activity (str): Name of the sub-folder holding the datasets.

    Returns:
    Tuple[DataFrame, DataFrame]: (dataset_train, dataset_test), read from
    "Train_set.csv" and "Test_set.csv".
    """
    # Former location:
    # BUCKET = f"projet-bdc2324-team1/Generalization/{type_of_activity}"
    BUCKET = f"projet-bdc2324-team1/1_Temp/1_0_Modelling_Datasets/{type_of_activity}"
    File_path_train = BUCKET + "/Train_set.csv"
    File_path_test = BUCKET + "/Test_set.csv"

    with fs.open(File_path_train, mode="rb") as file_in:
        dataset_train = pd.read_csv(file_in, sep=",")
    # dataset_train['y_has_purchased'] = dataset_train['y_has_purchased'].fillna(0)

    with fs.open(File_path_test, mode="rb") as file_in:
        dataset_test = pd.read_csv(file_in, sep=",")
    # dataset_test['y_has_purchased'] = dataset_test['y_has_purchased'].fillna(0)

    return dataset_train, dataset_test


def features_target_split(dataset_train, dataset_test):
    """
    Split the train and test datasets into features and target.

    Column selection is currently disabled: the features returned are the
    input frames themselves (same objects, target column included). Only the
    targets are extracted, as one-column DataFrames.

    Args:
    - dataset_train (DataFrame): Train set with a 'y_has_purchased' column.
    - dataset_test (DataFrame): Test set with a 'y_has_purchased' column.

    Returns:
    Tuple[DataFrame, DataFrame, DataFrame, DataFrame]:
    (X_train, X_test, y_train, y_test).
    """
    # The `[features_l]` selection was commented out in the original code.
    # The list it referred to was:
    #   nb_tickets, nb_purchases, total_amount, nb_suppliers,
    #   vente_internet_max, purchase_date_min, purchase_date_max,
    #   time_between_purchase, fidelity, is_email_true, opt_in,
    #   (is_partner -- disabled), nb_tickets_internet, gender_female,
    #   gender_male, gender_other, nb_campaigns, nb_campaigns_opened
    # A reduced variant dropped fidelity, time_between_purchase and
    # gender_other because of a collinearity issue.
    X_train = dataset_train  # [features_l]
    y_train = dataset_train[['y_has_purchased']]

    X_test = dataset_test  # [features_l]
    y_test = dataset_test[['y_has_purchased']]

    return X_train, X_test, y_train, y_test


def load_model(type_of_activity, model):
    """
    Load a pickled model from the S3 modelling-results folder.

    Args:
    - type_of_activity (str): Name of the sub-folder of the activity type.
    - model (str): Model name; used both as folder name and as the stem of
      the "<model>.pkl" file.

    Returns:
    object: The unpickled model.
    """
    # Former location:
    # BUCKET = f"projet-bdc2324-team1/Output_model/{type_of_activity}/{model}/"
    BUCKET = f"projet-bdc2324-team1/2_Output/2_1_Modeling_results/standard/{type_of_activity}/{model}/"
    filename = model + '.pkl'
    file_path = BUCKET + filename

    with fs.open(file_path, mode="rb") as f:
        model_bytes = f.read()

    # SECURITY: unpickling executes arbitrary code embedded in the payload;
    # only load files from a bucket whose writers are trusted.
    return pickle.loads(model_bytes)


def df_segment(df, y, model):
    """
    Score customers with a fitted model and assign them to score segments.

    The input frame is left untouched; the result is a copy with four extra
    columns: "has_purchased", "has_purchased_estim", "score" and "quartile".
    Despite its name, "quartile" is a fixed-threshold bucket of the score
    ('1': < 0.25, '2': < 0.5, '3': < 0.75, '4': otherwise, including NaN).

    Args:
    - df (DataFrame): Features passed to the model.
    - y: Observed outcome, stored as-is in "has_purchased".
    - model: Fitted estimator exposing `predict` and `predict_proba`.

    Returns:
    DataFrame: Copy of df with the four columns described above.
    """
    # Predict on the untouched input, before any column is added.
    y_pred = model.predict(df)
    y_pred_prob = model.predict_proba(df)[:, 1]

    # Work on a copy so the caller's frame does not silently gain columns.
    segmented = df.copy()
    segmented["has_purchased"] = y
    segmented["has_purchased_estim"] = y_pred
    segmented["score"] = y_pred_prob

    score = segmented["score"].to_numpy()
    # np.select keeps the first matching condition, like the nested np.where.
    segmented["quartile"] = np.select(
        [score < 0.25, score < 0.5, score < 0.75],
        ['1', '2', '3'],
        default='4',
    )

    return segmented


def odd_ratio(score):
    """
    Compute the odd ratio score / (1 - score).

    A score equal to 1 leads to a division by zero; `adjust_score_1` can be
    applied beforehand to avoid it.

    Args:
    - score (Union[float, int]): Score value.

    Returns:
    float: Odd ratio value.
    """
    return score / (1 - score)


def adjust_score_1(score):
    """
    Adjust scores by replacing ones with the highest value different from 1.

    Allows to compute odd ratios then.

    Args:
    - score (List[Union[float, int]]): List of score values.

    Returns:
    np.ndarray: Adjusted score values.

    Raises:
    ValueError: If no score differs from 1 (this includes an empty input),
    since there is then no replacement value.
    """
    scores = np.asarray(score)
    is_one = scores == 1
    if is_one.all():
        raise ValueError("adjust_score_1 requires at least one score different from 1")

    second_best_score = scores[~is_one].max()
    return np.where(is_one, second_best_score, scores)


def adjusted_score(odd_ratio, bias):
    """
    Adjust the score based on the odd ratio and bias.

    Args:
    - odd_ratio (Union[float, int]): Odd ratio value.
    - bias (Union[float, int]): Bias value.

    Returns:
    float: Adjusted score value, odd_ratio / (bias + odd_ratio).
    """
    return odd_ratio / (bias + odd_ratio)


def find_bias(odd_ratios, y_objective, initial_guess=10):
    """
    Find the bias needed to adjust scores according to the purchases observed.

    Solves, with `scipy.optimize.fsolve`, for the bias such that the sum of
    the adjusted scores equals y_objective.

    Args:
    - odd_ratios (List[float]): List of odd ratios.
    - y_objective (Union[float, int]): Objective value to achieve.
    - initial_guess (Union[float, int], optional): Initial guess for the bias.
      Default is 10.

    Returns:
    float: Estimated bias value.
    """
    odds = np.asarray(odd_ratios, dtype=float)

    def excess_over_objective(bias):
        # Vectorised sum of the adjusted scores; zero at the sought bias.
        return np.sum(adjusted_score(odds, bias)) - y_objective

    bias_estimated = fsolve(excess_over_objective, x0=initial_guess)

    return bias_estimated[0]


def plot_hist_scores(df, score, score_adjusted, type_of_activity):
    """
    Plot a histogram comparing scores and adjusted scores.

    The plot is drawn on a new figure, which is neither shown nor closed
    here.

    Args:
    - df (DataFrame): DataFrame containing score data.
    - score (str): Name of the column in df representing the original scores.
    - score_adjusted (str): Name of the column in df representing the adjusted scores.
    - type_of_activity (str) : type of activity of the companies considered.

    Returns:
    None
    """
    plt.figure()
    plt.hist(df[score], label="score", alpha=0.6)
    plt.hist(df[score_adjusted], label="adjusted score", alpha=0.6)
    plt.legend()
    plt.xlabel("probability of a future purchase")
    plt.ylabel("count")
    plt.title(f"Comparison between score and adjusted score for {type_of_activity} companies")
    # plt.show()


def project_tickets_CA(df, nb_purchases, nb_tickets, total_amount, score_adjusted, duration_ref, duration_projection):
    """
    Project ticket counts and total amount for a given duration and adjust based on a score.

    With duration_ratio = duration_ref / duration_projection, the output holds:
    - "nb_tickets_projected" / "total_amount_projected": the reference values
      divided by duration_ratio;
    - "nb_tickets_expected" / "total_amount_expected": the projected values
      multiplied by the adjusted score;
    - "pace_purchase": duration_ref / number of purchases, NaN when that
      ratio is infinite (no purchase).

    The input frame is left untouched; a copy is returned.

    Args:
    - df (DataFrame): DataFrame containing ticket data.
    - nb_purchases (str) : Name of the column in df representing the number of purchases.
    - nb_tickets (str): Name of the column in df representing the number of tickets.
    - total_amount (str): Name of the column in df representing the total amount.
    - score_adjusted (str): Name of the column in df representing the adjusted score.
    - duration_ref (int or float): duration of the period of reference for the construction of the variables X.
    - duration_projection (int or float): Duration of the period of projection of sales / revenue.

    Returns:
    DataFrame: DataFrame with projected ticket counts and total amount adjusted based on the score.
    """
    duration_ratio = duration_ref / duration_projection

    # Copy so that the caller's frame is not modified through an alias.
    df_output = df.copy()

    df_output["nb_tickets_projected"] = df_output[nb_tickets] / duration_ratio
    df_output["total_amount_projected"] = df_output[total_amount] / duration_ratio

    df_output["nb_tickets_expected"] = df_output[score_adjusted] * df_output["nb_tickets_projected"]
    df_output["total_amount_expected"] = df_output[score_adjusted] * df_output["total_amount_projected"]

    # A customer without purchase gives an infinite pace; store NaN instead.
    df_output["pace_purchase"] = (duration_ref / df_output[nb_purchases]).replace(np.inf, np.nan)

    return df_output


def summary_expected_CA(df, segment, nb_tickets_expected, total_amount_expected, total_amount, pace_purchase,
                        duration_ref=17, duration_projection=12):
    """
    Generate a per-segment summary of expected tickets and revenue (CA).

    Output columns, in order: the segment, "size" (number of rows),
    "size_perct" (share of all rows, in percent), the summed expected
    tickets, the summed expected amount, "revenue_recovered_perct"
    (100 * duration_ref / duration_projection * expected amount / observed
    amount) and "pace_purchase" (mean of the non-null paces; NaN for a
    segment whose paces are all null).

    Args:
    - df (DataFrame): DataFrame containing customer data.
    - segment (str): Name of the column in df representing customer segments.
    - nb_tickets_expected (str): Name of the column in df representing the expected number of tickets.
    - total_amount_expected (str): Name of the column in df representing the expected total amount.
    - total_amount (str): Name of the column in df representing the total amount.
    - pace_purchase (str): Name of the column in df representing the pace of purchase.
    - duration_ref (int or float, optional): Duration of the reference period. Default is 17.
    - duration_projection (int or float, optional): Duration of the projection period. Default is 12.

    Returns:
    DataFrame: Summary DataFrame with one row per segment.
    """
    # A single groupby keeps every statistic aligned on the segment index
    # instead of relying on the positional order of separate `.values` arrays.
    by_segment = df.groupby(segment)

    # compute nb tickets estimated and total amount expected
    df_expected_CA = by_segment[[nb_tickets_expected, total_amount_expected]].sum()

    # number of customers by segment
    df_expected_CA.insert(0, "size", by_segment.size())

    # size in percent of all customers
    df_expected_CA.insert(1, "size_perct", 100 * df_expected_CA["size"] / df_expected_CA["size"].sum())

    # compute share of CA recovered
    duration_ratio = duration_ref / duration_projection
    df_expected_CA["revenue_recovered_perct"] = (
        100 * duration_ratio * df_expected_CA[total_amount_expected] / by_segment[total_amount].sum()
    )

    # The group mean skips null paces, so no prior dropna is needed and a
    # segment with only null paces yields NaN instead of a length mismatch.
    df_expected_CA["pace_purchase"] = by_segment[pace_purchase].mean()

    return df_expected_CA.reset_index()


def save_file_s3_ca(File_name, type_of_activity):
    """
    Save the current matplotlib figure as a PNG on S3, then close it.

    The object is written to
    "projet-bdc2324-team1/2_Output/2_3_Sales_Forecast/<type_of_activity>/"
    under the name File_name + type_of_activity + ".png".

    Args:
    - File_name (str): Prefix of the output file name.
    - type_of_activity (str): Type of activity, used as folder and as file-name suffix.

    Returns:
    None
    """
    PATH = f"projet-bdc2324-team1/2_Output/2_3_Sales_Forecast/{type_of_activity}/"
    FILE_PATH_OUT_S3 = PATH + File_name + type_of_activity + '.png'

    image_buffer = io.BytesIO()
    try:
        plt.savefig(image_buffer, format='png', dpi=120)
        with fs.open(FILE_PATH_OUT_S3, 'wb') as s3_file:
            s3_file.write(image_buffer.getvalue())
    finally:
        # Close the figure even when rendering or the upload fails.
        plt.close()