fixed forecasting issues

2024-04-03 10:36:47 +00:00 · 2024-04-03 10:36:47 +00:00 · acf7621d9a
commit acf7621d9a
parent 14953b031a
3 changed files with 1435 additions and 750 deletions
--- a/7_Sales_Forecast.py
+++ b/7_Sales_Forecast.py
@ -69,7 +69,9 @@ save_file_s3_ca("hist_score_adjusted_", type_of_activity)
 X_test_table_adjusted_scores = (100 * X_test_segment.groupby("quartile")[["score","score_adjusted", "has_purchased"]].mean()).round(2).reset_index()
 X_test_table_adjusted_scores = X_test_table_adjusted_scores.rename(columns = {col : f"{col} (%)" for col in X_test_table_adjusted_scores.columns if col in ["score","score_adjusted", "has_purchased"]})

+print("Table of scores :\n")
 print(X_test_table_adjusted_scores)
+print("\n")

 # save table
 file_name = "table_adjusted_score_"
@ -84,14 +86,24 @@ X_test_segment = project_tickets_CA (X_test_segment, "nb_purchases", "nb_tickets


 ### 3. table summarizing projections (nb tickets, revenue)
+"""
 X_test_expected_CA = round(summary_expected_CA(df=X_test_segment, segment="quartile", 
                    nb_tickets_expected="nb_tickets_expected", total_amount_expected="total_amount_expected", 
                    total_amount="total_amount", pace_purchase="pace_purchase"),2)
+                    """
+
+X_test_expected_CA = round(summary_expected_CA(df=X_test_segment, segment="quartile", 
+                    nb_tickets_expected="nb_tickets_expected", total_amount_expected="total_amount_expected", 
+                    total_amount="total_amount_corrected", pace_purchase="pace_purchase"),2)

 # rename columns
 mapping_dict = {col: col.replace("perct", "(%)").replace("_", " ") for col in X_test_expected_CA.columns}
 X_test_expected_CA = X_test_expected_CA.rename(columns=mapping_dict)

+print("Summary of forecast :\n")
+print(X_test_expected_CA)
+print("\n")
+
 # save table
 file_name = "table_expected_CA_"
 FILE_PATH_OUT_S3 = PATH + file_name +  type_of_activity + ".csv"
--- a/Sport/Modelization/CA_segment_sport.ipynb
+++ b/Sport/Modelization/CA_segment_sport.ipynb
--- a/utils_sales_forecast.py
+++ b/utils_sales_forecast.py
@ -167,7 +167,8 @@ def plot_hist_scores(df, score, score_adjusted, type_of_activity) :
    # plt.show()


-def project_tickets_CA (df, nb_purchases, nb_tickets, total_amount, score_adjusted, duration_ref, duration_projection) : 
+def project_tickets_CA (df, nb_purchases, nb_tickets, total_amount, score_adjusted, duration_ref, duration_projection) :
+
    """
    Project ticket counts and total amount for a given duration and adjust based on a score.

@ -184,13 +185,38 @@ def project_tickets_CA (df, nb_purchases, nb_tickets, total_amount, score_adjust
    DataFrame: DataFrame with projected ticket counts and total amount adjusted based on the score.
    duration_ratio = duration_ref/duration_projection
    """
-
+    
    duration_ratio = duration_ref/duration_projection

    df_output = df
+    
+    # project number of tickets : at least 1 ticket purchased if the customer purchased
+    df_output.loc[:,"nb_tickets_projected"] = df_output.loc[:,nb_tickets].apply(lambda x : max(1, x /duration_ratio))

-    df_output.loc[:,"nb_tickets_projected"] = df_output.loc[:,nb_tickets] / duration_ratio
-    df_output.loc[:,"total_amount_projected"] = df_output.loc[:,total_amount] / duration_ratio
+    # project amount : if the customer buys a ticket, we expect the amount to be at least the average price of tickets 
+    # for customers purchasing exactly one ticket
+    if df_output.loc[df_output[nb_tickets]==1].shape[0] > 0 :
+        avg_price = df_output.loc[df_output[nb_tickets]==1][total_amount].mean()
+    else :
+        avg_price = df_output[total_amount].mean()
+
+    # we compute the avg price of ticket for each customer
+    df_output["avg_ticket_price"] = df_output[total_amount]/df_output[nb_tickets]
+
+    # correct negative total amounts
+    df_output.loc[:,"total_amount_corrected"] = np.where(df_output[total_amount] < 0, 
+                                                         avg_price * df_output[nb_tickets],
+                                                         df_output[total_amount])
+
+    df_output.loc[:,"total_amount_projected"] = np.where(
+        # if no ticket bought in the past, we take the average price
+        df_output[nb_tickets]==0, avg_price,
+        # if the customer's avg ticket price is negative, we recompute the expected amount based on the avg price of a ticket
+        # observed on the whole population (NOTE(review): the test below reads X_test_segment, not df_output - confirm)
+        np.where(X_test_segment["avg_ticket_price"] < 0, avg_price * df_output.loc[:,"nb_tickets_projected"],
+        # else, the amount projected is the average price of tickets bought by the customer * nb tickets projected
+                 df_output["avg_ticket_price"] * df_output.loc[:,"nb_tickets_projected"])
+        )
    
    df_output.loc[:,"nb_tickets_expected"] = df_output.loc[:,score_adjusted] * df_output.loc[:,"nb_tickets_projected"]
    df_output.loc[:,"total_amount_expected"] = df_output.loc[:,score_adjusted] * df_output.loc[:,"total_amount_projected"]
@ -198,7 +224,7 @@ def project_tickets_CA (df, nb_purchases, nb_tickets, total_amount, score_adjust
    df_output.loc[:,"pace_purchase"] = (duration_ref/df_output.loc[:,nb_purchases]).apply(lambda x : np.nan if x==np.inf else x)
    
    return df_output
-    
+

 def summary_expected_CA(df, segment, nb_tickets_expected, total_amount_expected, total_amount, pace_purchase,
                       duration_ref=17, duration_projection=12) :  
@ -231,6 +257,9 @@ def summary_expected_CA(df, segment, nb_tickets_expected, total_amount_expected,
    df_expected_CA["revenue_recovered_perct"] = 100 * duration_ratio * df_expected_CA[total_amount_expected] / \
    df.groupby(segment)[total_amount].sum().values

+    df_expected_CA["share_future_revenue_perct"] = 100 * duration_ratio * df_expected_CA[total_amount_expected] / \
+    df[total_amount].sum()
+
    df_drop_null_pace = df.dropna(subset=[pace_purchase])
    df_expected_CA["pace_purchase"] = df_drop_null_pace.groupby(segment)[pace_purchase].mean().values