CA estimation by segment works well

Thomas PIQUE 2024-03-27 18:59:05 +00:00
parent d3e13f4c56
commit 10824e5e24
3 changed files with 29 additions and 60 deletions

View File

@@ -10,15 +10,20 @@ import pickle
import warnings
import io
# import functions defined in utils_CA_segment.py
from utils_CA_segment import *
# ignore warnings
warnings.filterwarnings('ignore')
# Create filesystem object
S3_ENDPOINT_URL = "https://" + os.environ["AWS_S3_ENDPOINT"]
fs = s3fs.S3FileSystem(client_kwargs={'endpoint_url': S3_ENDPOINT_URL})
# import functions defined in utils_CA_segment.py
exec(open('utils_CA_segment.py').read())
# from utils_CA_segment import *
# define type of activity
type_of_activity = "sport"
type_of_activity = input('Choose the company type: sport? musique? musee?')
PATH = f"projet-bdc2324-team1/Output_expected_CA/{type_of_activity}/"
# type of model for the score
@@ -41,27 +46,23 @@ X_train_score = model.predict_proba(X_train)[:, 1]
bias_train_set = find_bias(odd_ratios = odd_ratio(adjust_score_1(X_train_score)),
y_objective = y_train["y_has_purchased"].sum(),
initial_guess=6)
initial_guess=10)
print("Bias estimated :", np.log(bias_train_set))
# create a score adjusted with the bias computed
score_adjusted_train = adjusted_score(odd_ratio(adjust_score_1(X_test_segment["score"])), bias = bias_train_set)
X_test_segment["score_adjusted"] = score_adjusted_train
print("The score was successfully adjusted")
MAE_score = abs(X_test_segment["score"]-X_test_segment["has_purchased"]).mean()
MAE_adjusted_score = abs(X_test_segment["score_adjusted"]-X_test_segment["has_purchased"]).mean()
print(f"MAE for score : {MAE_score}")
print(f"MAE for adjusted score : {MAE_adjusted_score}")
### 1. plot adjusted scores and save (to be tested)
plot_hist_scores(X_test_segment, score = "score", score_adjusted = "score_adjusted", type_of_activity = type_of_activity)
save_file_s3_ca("hist_score_adjusted_", type_of_activity)
"""
image_buffer = io.BytesIO()
plt.savefig(image_buffer, format='png')
image_buffer.seek(0)
file_name = "hist_score_adjusted"
FILE_PATH_OUT_S3 = PATH + file_name + type_of_activity + ".png"
with fs.open(FILE_PATH_OUT_S3, 'wb') as s3_file:
s3_file.write(image_buffer.read())
plt.close()
"""
### 2. comparison between score and adjusted score
X_test_table_adjusted_scores = (100 * X_test_segment.groupby("quartile")[["score","score_adjusted", "has_purchased"]].mean()).round(2).reset_index()
@@ -75,11 +76,14 @@ with fs.open(FILE_PATH_OUT_S3, 'w') as file_out:
# project revenue
X_test_segment = project_tickets_CA (X_test_segment, "nb_tickets", "total_amount", "score_adjusted", duration_ref=17, duration_projection=12)
X_test_segment = project_tickets_CA (X_test_segment, "nb_purchases", "nb_tickets", "total_amount", "score_adjusted",
duration_ref=17, duration_projection=12)
### 3. table summarizing projections (nb tickets, revenue)
X_test_expected_CA = round(summary_expected_CA(df=X_test_segment, segment="quartile", nb_tickets_expected="nb_tickets_expected", total_amount_expected="total_amount_expected", total_amount="total_amount"),2)
X_test_expected_CA = round(summary_expected_CA(df=X_test_segment, segment="quartile",
nb_tickets_expected="nb_tickets_expected", total_amount_expected="total_amount_expected",
total_amount="total_amount", pace_purchase="pace_purchase"),2)
# rename columns
mapping_dict = {col: col.replace("perct", "(%)").replace("_", " ") for col in X_test_expected_CA.columns}
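For reference, the column logic of project_tickets_CA can be read off the SettingWithCopyWarning traces in the notebook diff further down: volumes observed over the 17-month reference window are rescaled to the 12-month projection horizon and weighted by the adjusted purchase probability. A sketch consistent with those traces (duration_ratio and the initial .copy() are assumptions; the column assignments mirror the traced code):

import numpy as np

def project_tickets_CA(df, nb_purchases, nb_tickets, total_amount, score_adjusted,
                       duration_ref=17, duration_projection=12):
    # assumed: ratio used to rescale the reference window to the projection window
    duration_ratio = duration_ref / duration_projection
    # working on a copy would avoid the SettingWithCopyWarning shown in the notebook output
    df_output = df.copy()
    df_output["nb_tickets_projected"] = df_output[nb_tickets] / duration_ratio
    df_output["total_amount_projected"] = df_output[total_amount] / duration_ratio
    # expected volumes = adjusted purchase probability * projected volumes
    df_output["nb_tickets_expected"] = df_output[score_adjusted] * df_output["nb_tickets_projected"]
    df_output["total_amount_expected"] = df_output[score_adjusted] * df_output["total_amount_projected"]
    # average number of months between two purchases over the reference window
    df_output["pace_purchase"] = (duration_ref / df_output[nb_purchases]).apply(lambda x: np.nan if x == np.inf else x)
    return df_output

summary_expected_CA then aggregates nb_tickets_expected and total_amount_expected per quartile with a groupby(segment)[...].sum(), as shown in the notebook hunk below, and, given its new pace_purchase argument, presumably also reports an average purchase pace per segment.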

View File

@@ -2558,46 +2558,10 @@
},
{
"cell_type": "code",
"execution_count": 60,
"execution_count": 67,
"id": "dd8a52e1-d06e-4790-8687-8e58e3e6b84e",
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"/tmp/ipykernel_1080/3982240549.py:7: SettingWithCopyWarning: \n",
"A value is trying to be set on a copy of a slice from a DataFrame.\n",
"Try using .loc[row_indexer,col_indexer] = value instead\n",
"\n",
"See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n",
" df_output.loc[:,\"nb_tickets_projected\"] = df_output.loc[:,nb_tickets] / duration_ratio\n",
"/tmp/ipykernel_1080/3982240549.py:8: SettingWithCopyWarning: \n",
"A value is trying to be set on a copy of a slice from a DataFrame.\n",
"Try using .loc[row_indexer,col_indexer] = value instead\n",
"\n",
"See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n",
" df_output.loc[:,\"total_amount_projected\"] = df_output.loc[:,total_amount] / duration_ratio\n",
"/tmp/ipykernel_1080/3982240549.py:10: SettingWithCopyWarning: \n",
"A value is trying to be set on a copy of a slice from a DataFrame.\n",
"Try using .loc[row_indexer,col_indexer] = value instead\n",
"\n",
"See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n",
" df_output.loc[:,\"nb_tickets_expected\"] = df_output.loc[:,score_adjusted] * df_output.loc[:,\"nb_tickets_projected\"]\n",
"/tmp/ipykernel_1080/3982240549.py:11: SettingWithCopyWarning: \n",
"A value is trying to be set on a copy of a slice from a DataFrame.\n",
"Try using .loc[row_indexer,col_indexer] = value instead\n",
"\n",
"See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n",
" df_output.loc[:,\"total_amount_expected\"] = df_output.loc[:,score_adjusted] * df_output.loc[:,\"total_amount_projected\"]\n",
"/tmp/ipykernel_1080/3982240549.py:13: SettingWithCopyWarning: \n",
"A value is trying to be set on a copy of a slice from a DataFrame.\n",
"Try using .loc[row_indexer,col_indexer] = value instead\n",
"\n",
"See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n",
" df_output.loc[:,\"pace_purchase\"] = (duration_ref/df_output.loc[:,nb_purchases]).apply(lambda x : np.nan if x==np.inf else x)\n"
]
},
{
"data": {
"text/html": [
@@ -2994,7 +2958,7 @@
"[151874 rows x 27 columns]"
]
},
"execution_count": 60,
"execution_count": 67,
"metadata": {},
"output_type": "execute_result"
}
@@ -3041,7 +3005,7 @@
"# generalization with a function\n",
"\n",
"def summary_expected_CA(df, segment, nb_tickets_expected, total_amount_expected, total_amount, pace_purchase,\n",
" duration_ref=1.5, duration_projection=1) :\n",
" duration_ref=17, duration_projection=12) :\n",
" \n",
" # compute nb tickets estimated and total amount expected\n",
" df_expected_CA = df.groupby(segment)[[nb_tickets_expected, total_amount_expected]].sum().reset_index()\n",
@@ -3066,7 +3030,7 @@
},
{
"cell_type": "code",
"execution_count": 65,
"execution_count": 68,
"id": "c8df6c80-43e8-4f00-9cd3-eb9022744313",
"metadata": {},
"outputs": [
@@ -3159,7 +3123,7 @@
"3 75.38 11.48 "
]
},
"execution_count": 65,
"execution_count": 68,
"metadata": {},
"output_type": "execute_result"
}

View File

@@ -123,7 +123,7 @@ def adjusted_score(odd_ratio, bias) :
return adjusted_score
def find_bias(odd_ratios, y_objective, initial_guess=6) :
def find_bias(odd_ratios, y_objective, initial_guess=10) :
"""
Find the bias needed to adjust scores according to the purchases observed
@@ -136,7 +136,7 @@ def find_bias(odd_ratios, y_objective, initial_guess=6) :
float: Estimated bias value.
"""
bias_estimated = fsolve(lambda bias : sum([adjusted_score(element, bias) for element in list(odd_ratios)]) - y_objective, x0=6)
bias_estimated = fsolve(lambda bias : sum([adjusted_score(element, bias) for element in list(odd_ratios)]) - y_objective, x0=initial_guess)
return bias_estimated[0]
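The change above fixes an actual bug: find_bias accepted an initial_guess argument but started fsolve from a hard-coded x0=6, so the caller's value was silently ignored. A usage example mirroring the call in the main script, with synthetic stand-ins for the training data (names and numbers below are illustrative; adjust_score_1 and odd_ratio are assumed to be defined alongside find_bias, as the script's star-import suggests):

import numpy as np

# synthetic stand-ins for model.predict_proba(X_train)[:, 1] and the observed purchase count
rng = np.random.default_rng(0)
train_scores = rng.uniform(0.05, 0.95, size=1000)
observed_purchases = 180

bias_train_set = find_bias(odd_ratios=odd_ratio(adjust_score_1(train_scores)),
                           y_objective=observed_purchases,
                           initial_guess=10)  # now actually forwarded to fsolve's x0
print("Bias estimated :", np.log(bias_train_set))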
@@ -198,7 +198,8 @@ def project_tickets_CA (df, nb_purchases, nb_tickets, total_amount, score_adjust
return df_output
def summary_expected_CA(df, segment, nb_tickets_expected, total_amount_expected, total_amount) :
def summary_expected_CA(df, segment, nb_tickets_expected, total_amount_expected, total_amount, pace_purchase,
duration_ref=17, duration_projection=12) :
"""
Generate a summary of the expected revenue (CA, chiffre d'affaires) by segment.