diff --git a/0_7_CA_segment.py b/0_7_CA_segment.py index a446a26..f69ab53 100644 --- a/0_7_CA_segment.py +++ b/0_7_CA_segment.py @@ -10,15 +10,20 @@ import pickle import warnings import io -# importation of functions defined -from utils_CA_segment import * + +# ignore warnings +warnings.filterwarnings('ignore') # Create filesystem object S3_ENDPOINT_URL = "https://" + os.environ["AWS_S3_ENDPOINT"] fs = s3fs.S3FileSystem(client_kwargs={'endpoint_url': S3_ENDPOINT_URL}) +# importation of functions defined +exec(open('utils_CA_segment.py').read()) +# from utils_CA_segment import * + # define type of activity -type_of_activity = "sport" +type_of_activity = input('Choisissez le type de compagnie : sport ? musique ? musee ?') PATH = f"projet-bdc2324-team1/Output_expected_CA/{type_of_activity}/" # type of model for the score @@ -41,27 +46,23 @@ X_train_score = model.predict_proba(X_train)[:, 1] bias_train_set = find_bias(odd_ratios = odd_ratio(adjust_score_1(X_train_score)), y_objective = y_train["y_has_purchased"].sum(), - initial_guess=6) + initial_guess=10) +print("Bias estimated :", np.log(bias_train_set)) # create a score adjusted with the bias computed score_adjusted_train = adjusted_score(odd_ratio(adjust_score_1(X_test_segment["score"])), bias = bias_train_set) X_test_segment["score_adjusted"] = score_adjusted_train +print("The score was successfully adjusted") +MAE_score = abs(X_test_segment["score"]-X_test_segment["has_purchased"]).mean() +MAE_ajusted_score = abs(X_test_segment["score_adjusted"]-X_test_segment["has_purchased"]).mean() +print(f"MAE for score : {MAE_score}") +print(f"MAE for adjusted score : {MAE_ajusted_score}") ### 1. plot adjusted scores and save (to be tested) plot_hist_scores(X_test_segment, score = "score", score_adjusted = "score_adjusted", type_of_activity = type_of_activity) save_file_s3_ca("hist_score_adjusted_", type_of_activity) -""" -image_buffer = io.BytesIO() -plt.savefig(image_buffer, format='png') -image_buffer.seek(0) -file_name = "hist_score_adjusted" -FILE_PATH_OUT_S3 = PATH + file_name + type_of_activity + ".png" -with fs.open(FILE_PATH_OUT_S3, 'wb') as s3_file: - s3_file.write(image_buffer.read()) -plt.close() -""" ### 2. comparison between score and adjusted score X_test_table_adjusted_scores = (100 * X_test_segment.groupby("quartile")[["score","score_adjusted", "has_purchased"]].mean()).round(2).reset_index() @@ -75,11 +76,14 @@ with fs.open(FILE_PATH_OUT_S3, 'w') as file_out: # project revenue -X_test_segment = project_tickets_CA (X_test_segment, "nb_tickets", "total_amount", "score_adjusted", duration_ref=17, duration_projection=12) +X_test_segment = project_tickets_CA (X_test_segment, "nb_purchases", "nb_tickets", "total_amount", "score_adjusted", + duration_ref=17, duration_projection=12) ### 3. table summarizing projections (nb tickets, revenue) -X_test_expected_CA = round(summary_expected_CA(df=X_test_segment, segment="quartile", nb_tickets_expected="nb_tickets_expected", total_amount_expected="total_amount_expected", total_amount="total_amount"),2) +X_test_expected_CA = round(summary_expected_CA(df=X_test_segment, segment="quartile", + nb_tickets_expected="nb_tickets_expected", total_amount_expected="total_amount_expected", + total_amount="total_amount", pace_purchase="pace_purchase"),2) # rename columns mapping_dict = {col: col.replace("perct", "(%)").replace("_", " ") for col in X_test_expected_CA.columns} diff --git a/Sport/Modelization/CA_segment_sport.ipynb b/Sport/Modelization/CA_segment_sport.ipynb index 47786a6..af23af7 100644 --- a/Sport/Modelization/CA_segment_sport.ipynb +++ b/Sport/Modelization/CA_segment_sport.ipynb @@ -2558,46 +2558,10 @@ }, { "cell_type": "code", - "execution_count": 60, + "execution_count": 67, "id": "dd8a52e1-d06e-4790-8687-8e58e3e6b84e", "metadata": {}, "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "/tmp/ipykernel_1080/3982240549.py:7: SettingWithCopyWarning: \n", - "A value is trying to be set on a copy of a slice from a DataFrame.\n", - "Try using .loc[row_indexer,col_indexer] = value instead\n", - "\n", - "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n", - " df_output.loc[:,\"nb_tickets_projected\"] = df_output.loc[:,nb_tickets] / duration_ratio\n", - "/tmp/ipykernel_1080/3982240549.py:8: SettingWithCopyWarning: \n", - "A value is trying to be set on a copy of a slice from a DataFrame.\n", - "Try using .loc[row_indexer,col_indexer] = value instead\n", - "\n", - "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n", - " df_output.loc[:,\"total_amount_projected\"] = df_output.loc[:,total_amount] / duration_ratio\n", - "/tmp/ipykernel_1080/3982240549.py:10: SettingWithCopyWarning: \n", - "A value is trying to be set on a copy of a slice from a DataFrame.\n", - "Try using .loc[row_indexer,col_indexer] = value instead\n", - "\n", - "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n", - " df_output.loc[:,\"nb_tickets_expected\"] = df_output.loc[:,score_adjusted] * df_output.loc[:,\"nb_tickets_projected\"]\n", - "/tmp/ipykernel_1080/3982240549.py:11: SettingWithCopyWarning: \n", - "A value is trying to be set on a copy of a slice from a DataFrame.\n", - "Try using .loc[row_indexer,col_indexer] = value instead\n", - "\n", - "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n", - " df_output.loc[:,\"total_amount_expected\"] = df_output.loc[:,score_adjusted] * df_output.loc[:,\"total_amount_projected\"]\n", - "/tmp/ipykernel_1080/3982240549.py:13: SettingWithCopyWarning: \n", - "A value is trying to be set on a copy of a slice from a DataFrame.\n", - "Try using .loc[row_indexer,col_indexer] = value instead\n", - "\n", - "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n", - " df_output.loc[:,\"pace_purchase\"] = (duration_ref/df_output.loc[:,nb_purchases]).apply(lambda x : np.nan if x==np.inf else x)\n" - ] - }, { "data": { "text/html": [ @@ -2994,7 +2958,7 @@ "[151874 rows x 27 columns]" ] }, - "execution_count": 60, + "execution_count": 67, "metadata": {}, "output_type": "execute_result" } @@ -3041,7 +3005,7 @@ "# generalization with a function\n", "\n", "def summary_expected_CA(df, segment, nb_tickets_expected, total_amount_expected, total_amount, pace_purchase,\n", - " duration_ref=1.5, duration_projection=1) :\n", + " duration_ref=17, duration_projection=12) :\n", " \n", " # compute nb tickets estimated and total amount expected\n", " df_expected_CA = df.groupby(segment)[[nb_tickets_expected, total_amount_expected]].sum().reset_index()\n", @@ -3066,7 +3030,7 @@ }, { "cell_type": "code", - "execution_count": 65, + "execution_count": 68, "id": "c8df6c80-43e8-4f00-9cd3-eb9022744313", "metadata": {}, "outputs": [ @@ -3159,7 +3123,7 @@ "3 75.38 11.48 " ] }, - "execution_count": 65, + "execution_count": 68, "metadata": {}, "output_type": "execute_result" } diff --git a/utils_CA_segment.py b/utils_CA_segment.py index 4e20816..28b5d26 100644 --- a/utils_CA_segment.py +++ b/utils_CA_segment.py @@ -123,7 +123,7 @@ def adjusted_score(odd_ratio, bias) : return adjusted_score -def find_bias(odd_ratios, y_objective, initial_guess=6) : +def find_bias(odd_ratios, y_objective, initial_guess=10) : """ Find the bias needed to adjust scores according to the purchases observed @@ -136,7 +136,7 @@ def find_bias(odd_ratios, y_objective, initial_guess=6) : float: Estimated bias value. """ - bias_estimated = fsolve(lambda bias : sum([adjusted_score(element, bias) for element in list(odd_ratios)]) - y_objective, x0=6) + bias_estimated = fsolve(lambda bias : sum([adjusted_score(element, bias) for element in list(odd_ratios)]) - y_objective, x0=initial_guess) return bias_estimated[0] @@ -198,7 +198,8 @@ def project_tickets_CA (df, nb_purchases, nb_tickets, total_amount, score_adjust return df_output -def summary_expected_CA(df, segment, nb_tickets_expected, total_amount_expected, total_amount) : +def summary_expected_CA(df, segment, nb_tickets_expected, total_amount_expected, total_amount, pace_purchase, + duration_ref=17, duration_projection=12) : """ Generate a summary of expected customer acquisition based on segments.