CA estimation by segment works well

Thomas PIQUE 2024-03-27 18:59:05 +00:00
parent d3e13f4c56
commit 10824e5e24
3 changed files with 29 additions and 60 deletions

View File

@@ -10,15 +10,20 @@ import pickle
import warnings
import io
# import functions defined in utils_CA_segment.py
from utils_CA_segment import *
# ignore warnings
warnings.filterwarnings('ignore')
# Create filesystem object
S3_ENDPOINT_URL = "https://" + os.environ["AWS_S3_ENDPOINT"]
fs = s3fs.S3FileSystem(client_kwargs={'endpoint_url': S3_ENDPOINT_URL})
# import functions defined in utils_CA_segment.py
exec(open('utils_CA_segment.py').read())
# from utils_CA_segment import *
# define type of activity
type_of_activity = "sport"
type_of_activity = input('Choose the company type: sport? musique? musee?')
PATH = f"projet-bdc2324-team1/Output_expected_CA/{type_of_activity}/"
# type of model for the score
@@ -41,27 +46,23 @@ X_train_score = model.predict_proba(X_train)[:, 1]
bias_train_set = find_bias(odd_ratios = odd_ratio(adjust_score_1(X_train_score)),
y_objective = y_train["y_has_purchased"].sum(),
initial_guess=6)
initial_guess=10)
print("Bias estimated :", np.log(bias_train_set))
# create a score adjusted with the bias computed
score_adjusted_train = adjusted_score(odd_ratio(adjust_score_1(X_test_segment["score"])), bias = bias_train_set)
X_test_segment["score_adjusted"] = score_adjusted_train
print("The score was successfully adjusted")
MAE_score = abs(X_test_segment["score"]-X_test_segment["has_purchased"]).mean()
MAE_adjusted_score = abs(X_test_segment["score_adjusted"]-X_test_segment["has_purchased"]).mean()
print(f"MAE for score : {MAE_score}")
print(f"MAE for adjusted score : {MAE_adjusted_score}")
### 1. plot adjusted scores and save (to be tested)
plot_hist_scores(X_test_segment, score = "score", score_adjusted = "score_adjusted", type_of_activity = type_of_activity)
save_file_s3_ca("hist_score_adjusted_", type_of_activity)
"""
image_buffer = io.BytesIO()
plt.savefig(image_buffer, format='png')
image_buffer.seek(0)
file_name = "hist_score_adjusted"
FILE_PATH_OUT_S3 = PATH + file_name + type_of_activity + ".png"
with fs.open(FILE_PATH_OUT_S3, 'wb') as s3_file:
s3_file.write(image_buffer.read())
plt.close()
"""
### 2. comparison between score and adjusted score
X_test_table_adjusted_scores = (100 * X_test_segment.groupby("quartile")[["score","score_adjusted", "has_purchased"]].mean()).round(2).reset_index()
@@ -75,11 +76,14 @@ with fs.open(FILE_PATH_OUT_S3, 'w') as file_out:
# project revenue
X_test_segment = project_tickets_CA (X_test_segment, "nb_tickets", "total_amount", "score_adjusted", duration_ref=17, duration_projection=12)
X_test_segment = project_tickets_CA (X_test_segment, "nb_purchases", "nb_tickets", "total_amount", "score_adjusted",
duration_ref=17, duration_projection=12)
### 3. table summarizing projections (nb tickets, revenue)
X_test_expected_CA = round(summary_expected_CA(df=X_test_segment, segment="quartile", nb_tickets_expected="nb_tickets_expected", total_amount_expected="total_amount_expected", total_amount="total_amount"),2)
X_test_expected_CA = round(summary_expected_CA(df=X_test_segment, segment="quartile",
nb_tickets_expected="nb_tickets_expected", total_amount_expected="total_amount_expected",
total_amount="total_amount", pace_purchase="pace_purchase"),2)
# rename columns
mapping_dict = {col: col.replace("perct", "(%)").replace("_", " ") for col in X_test_expected_CA.columns}
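For reference, the column logic of project_tickets_CA can be read off the SettingWithCopyWarning traces in the notebook diff further down: volumes observed over the 17-month reference window are rescaled to the 12-month projection horizon and weighted by the adjusted purchase probability. A sketch consistent with those traces (duration_ratio and the initial .copy() are assumptions; the column assignments mirror the traced code):

import numpy as np

def project_tickets_CA(df, nb_purchases, nb_tickets, total_amount, score_adjusted,
                       duration_ref=17, duration_projection=12):
    # assumed: ratio used to rescale the reference window to the projection window
    duration_ratio = duration_ref / duration_projection
    # working on a copy would avoid the SettingWithCopyWarning shown in the notebook output
    df_output = df.copy()
    df_output["nb_tickets_projected"] = df_output[nb_tickets] / duration_ratio
    df_output["total_amount_projected"] = df_output[total_amount] / duration_ratio
    # expected volumes = adjusted purchase probability * projected volumes
    df_output["nb_tickets_expected"] = df_output[score_adjusted] * df_output["nb_tickets_projected"]
    df_output["total_amount_expected"] = df_output[score_adjusted] * df_output["total_amount_projected"]
    # average number of months between two purchases over the reference window
    df_output["pace_purchase"] = (duration_ref / df_output[nb_purchases]).apply(lambda x: np.nan if x == np.inf else x)
    return df_output

summary_expected_CA then aggregates nb_tickets_expected and total_amount_expected per quartile with a groupby(segment)[...].sum(), as shown in the notebook hunk below, and, given its new pace_purchase argument, presumably also reports an average purchase pace per segment.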

View File

@@ -2558,46 +2558,10 @@
},
{
"cell_type": "code",
"execution_count": 60,
"execution_count": 67,
"id": "dd8a52e1-d06e-4790-8687-8e58e3e6b84e",
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"/tmp/ipykernel_1080/3982240549.py:7: SettingWithCopyWarning: \n",
"A value is trying to be set on a copy of a slice from a DataFrame.\n",
"Try using .loc[row_indexer,col_indexer] = value instead\n",
"\n",
"See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n",
" df_output.loc[:,\"nb_tickets_projected\"] = df_output.loc[:,nb_tickets] / duration_ratio\n",
"/tmp/ipykernel_1080/3982240549.py:8: SettingWithCopyWarning: \n",
"A value is trying to be set on a copy of a slice from a DataFrame.\n",
"Try using .loc[row_indexer,col_indexer] = value instead\n",
"\n",
"See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n",
" df_output.loc[:,\"total_amount_projected\"] = df_output.loc[:,total_amount] / duration_ratio\n",
"/tmp/ipykernel_1080/3982240549.py:10: SettingWithCopyWarning: \n",
"A value is trying to be set on a copy of a slice from a DataFrame.\n",
"Try using .loc[row_indexer,col_indexer] = value instead\n",
"\n",
"See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n",
" df_output.loc[:,\"nb_tickets_expected\"] = df_output.loc[:,score_adjusted] * df_output.loc[:,\"nb_tickets_projected\"]\n",
"/tmp/ipykernel_1080/3982240549.py:11: SettingWithCopyWarning: \n",
"A value is trying to be set on a copy of a slice from a DataFrame.\n",
"Try using .loc[row_indexer,col_indexer] = value instead\n",
"\n",
"See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n",
" df_output.loc[:,\"total_amount_expected\"] = df_output.loc[:,score_adjusted] * df_output.loc[:,\"total_amount_projected\"]\n",
"/tmp/ipykernel_1080/3982240549.py:13: SettingWithCopyWarning: \n",
"A value is trying to be set on a copy of a slice from a DataFrame.\n",
"Try using .loc[row_indexer,col_indexer] = value instead\n",
"\n",
"See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n",
" df_output.loc[:,\"pace_purchase\"] = (duration_ref/df_output.loc[:,nb_purchases]).apply(lambda x : np.nan if x==np.inf else x)\n"
]
},
{
"data": {
"text/html": [
@@ -2994,7 +2958,7 @@
"[151874 rows x 27 columns]"
]
},
"execution_count": 60,
"execution_count": 67,
"metadata": {},
"output_type": "execute_result"
}
@@ -3041,7 +3005,7 @@
"# generalization with a function\n",
"\n",
"def summary_expected_CA(df, segment, nb_tickets_expected, total_amount_expected, total_amount, pace_purchase,\n",
" duration_ref=1.5, duration_projection=1) :\n",
" duration_ref=17, duration_projection=12) :\n",
" \n",
" # compute nb tickets estimated and total amount expected\n",
" df_expected_CA = df.groupby(segment)[[nb_tickets_expected, total_amount_expected]].sum().reset_index()\n",
@@ -3066,7 +3030,7 @@
},
{
"cell_type": "code",
"execution_count": 65,
"execution_count": 68,
"id": "c8df6c80-43e8-4f00-9cd3-eb9022744313",
"metadata": {},
"outputs": [
@@ -3159,7 +3123,7 @@
"3 75.38 11.48 "
]
},
"execution_count": 65,
"execution_count": 68,
"metadata": {},
"output_type": "execute_result"
}

View File

@@ -123,7 +123,7 @@ def adjusted_score(odd_ratio, bias) :
return adjusted_score
def find_bias(odd_ratios, y_objective, initial_guess=6) :
def find_bias(odd_ratios, y_objective, initial_guess=10) :
"""
Find the bias needed to adjust scores according to the purchases observed
@@ -136,7 +136,7 @@ def find_bias(odd_ratios, y_objective, initial_guess=6) :
float: Estimated bias value.
"""
bias_estimated = fsolve(lambda bias : sum([adjusted_score(element, bias) for element in list(odd_ratios)]) - y_objective, x0=6)
bias_estimated = fsolve(lambda bias : sum([adjusted_score(element, bias) for element in list(odd_ratios)]) - y_objective, x0=initial_guess)
return bias_estimated[0]
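The change above fixes an actual bug: find_bias accepted an initial_guess argument but started fsolve from a hard-coded x0=6, so the caller's value was silently ignored. A usage example mirroring the call in the main script, with synthetic stand-ins for the training data (names and numbers below are illustrative; adjust_score_1 and odd_ratio are assumed to be defined alongside find_bias, as the script's star-import suggests):

import numpy as np

# synthetic stand-ins for model.predict_proba(X_train)[:, 1] and the observed purchase count
rng = np.random.default_rng(0)
train_scores = rng.uniform(0.05, 0.95, size=1000)
observed_purchases = 180

bias_train_set = find_bias(odd_ratios=odd_ratio(adjust_score_1(train_scores)),
                           y_objective=observed_purchases,
                           initial_guess=10)  # now actually forwarded to fsolve's x0
print("Bias estimated :", np.log(bias_train_set))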
@@ -198,7 +198,8 @@ def project_tickets_CA (df, nb_purchases, nb_tickets, total_amount, score_adjust
return df_output
def summary_expected_CA(df, segment, nb_tickets_expected, total_amount_expected, total_amount) :
def summary_expected_CA(df, segment, nb_tickets_expected, total_amount_expected, total_amount, pace_purchase,
duration_ref=17, duration_projection=12) :
"""
Generate a summary of the expected revenue (CA, chiffre d'affaires) by segment.