CA estimation by segment works well
This commit is contained in:
		
							parent
							
								
									d3e13f4c56
								
							
						
					
					
						commit
						10824e5e24
					
				|  | @ -10,15 +10,20 @@ import pickle | ||||||
| import warnings | import warnings | ||||||
| import io | import io | ||||||
| 
 | 
 | ||||||
| # importation of functions defined | 
 | ||||||
| from utils_CA_segment import * | # ignore warnings | ||||||
|  | warnings.filterwarnings('ignore') | ||||||
| 
 | 
 | ||||||
| # Create filesystem object | # Create filesystem object | ||||||
| S3_ENDPOINT_URL = "https://" + os.environ["AWS_S3_ENDPOINT"] | S3_ENDPOINT_URL = "https://" + os.environ["AWS_S3_ENDPOINT"] | ||||||
| fs = s3fs.S3FileSystem(client_kwargs={'endpoint_url': S3_ENDPOINT_URL}) | fs = s3fs.S3FileSystem(client_kwargs={'endpoint_url': S3_ENDPOINT_URL}) | ||||||
| 
 | 
 | ||||||
|  | # importation of functions defined | ||||||
|  | exec(open('utils_CA_segment.py').read()) | ||||||
|  | # from utils_CA_segment import * | ||||||
|  | 
 | ||||||
| # define type of activity  | # define type of activity  | ||||||
| type_of_activity = "sport" | type_of_activity = input('Choisissez le type de compagnie : sport ? musique ? musee ?') | ||||||
| PATH = f"projet-bdc2324-team1/Output_expected_CA/{type_of_activity}/" | PATH = f"projet-bdc2324-team1/Output_expected_CA/{type_of_activity}/" | ||||||
| 
 | 
 | ||||||
| # type of model for the score | # type of model for the score | ||||||
|  | @ -41,27 +46,23 @@ X_train_score = model.predict_proba(X_train)[:, 1] | ||||||
| 
 | 
 | ||||||
| bias_train_set = find_bias(odd_ratios = odd_ratio(adjust_score_1(X_train_score)),  | bias_train_set = find_bias(odd_ratios = odd_ratio(adjust_score_1(X_train_score)),  | ||||||
|                            y_objective = y_train["y_has_purchased"].sum(), |                            y_objective = y_train["y_has_purchased"].sum(), | ||||||
|                            initial_guess=6) |                            initial_guess=10) | ||||||
|  | print("Bias estimated :", np.log(bias_train_set)) | ||||||
| 
 | 
 | ||||||
| # create a score adjusted with the bias computed | # create a score adjusted with the bias computed | ||||||
| score_adjusted_train = adjusted_score(odd_ratio(adjust_score_1(X_test_segment["score"])), bias = bias_train_set) | score_adjusted_train = adjusted_score(odd_ratio(adjust_score_1(X_test_segment["score"])), bias = bias_train_set) | ||||||
| X_test_segment["score_adjusted"] = score_adjusted_train | X_test_segment["score_adjusted"] = score_adjusted_train | ||||||
| 
 | 
 | ||||||
|  | print("The score was successfully adjusted") | ||||||
|  | MAE_score = abs(X_test_segment["score"]-X_test_segment["has_purchased"]).mean() | ||||||
|  | MAE_ajusted_score = abs(X_test_segment["score_adjusted"]-X_test_segment["has_purchased"]).mean() | ||||||
|  | print(f"MAE for score : {MAE_score}") | ||||||
|  | print(f"MAE for adjusted score : {MAE_ajusted_score}") | ||||||
| 
 | 
 | ||||||
| ### 1. plot adjusted scores and save (to be tested) | ### 1. plot adjusted scores and save (to be tested) | ||||||
| plot_hist_scores(X_test_segment, score = "score", score_adjusted = "score_adjusted", type_of_activity = type_of_activity) | plot_hist_scores(X_test_segment, score = "score", score_adjusted = "score_adjusted", type_of_activity = type_of_activity) | ||||||
| save_file_s3_ca("hist_score_adjusted_", type_of_activity) | save_file_s3_ca("hist_score_adjusted_", type_of_activity) | ||||||
| 
 | 
 | ||||||
| """ |  | ||||||
| image_buffer = io.BytesIO() |  | ||||||
| plt.savefig(image_buffer, format='png') |  | ||||||
| image_buffer.seek(0) |  | ||||||
| file_name = "hist_score_adjusted" |  | ||||||
| FILE_PATH_OUT_S3 = PATH + file_name + type_of_activity + ".png" |  | ||||||
| with fs.open(FILE_PATH_OUT_S3, 'wb') as s3_file: |  | ||||||
|     s3_file.write(image_buffer.read()) |  | ||||||
| plt.close() |  | ||||||
| """ |  | ||||||
| 
 | 
 | ||||||
| ### 2. comparison between score and adjusted score | ### 2. comparison between score and adjusted score | ||||||
| X_test_table_adjusted_scores = (100 * X_test_segment.groupby("quartile")[["score","score_adjusted", "has_purchased"]].mean()).round(2).reset_index() | X_test_table_adjusted_scores = (100 * X_test_segment.groupby("quartile")[["score","score_adjusted", "has_purchased"]].mean()).round(2).reset_index() | ||||||
|  | @ -75,11 +76,14 @@ with fs.open(FILE_PATH_OUT_S3, 'w') as file_out: | ||||||
| 
 | 
 | ||||||
| 
 | 
 | ||||||
| # project revenue | # project revenue | ||||||
| X_test_segment = project_tickets_CA (X_test_segment, "nb_tickets", "total_amount", "score_adjusted", duration_ref=17, duration_projection=12) | X_test_segment = project_tickets_CA (X_test_segment, "nb_purchases", "nb_tickets", "total_amount", "score_adjusted",  | ||||||
|  |                                      duration_ref=17, duration_projection=12) | ||||||
| 
 | 
 | ||||||
| 
 | 
 | ||||||
| ### 3. table summarizing projections (nb tickets, revenue) | ### 3. table summarizing projections (nb tickets, revenue) | ||||||
| X_test_expected_CA = round(summary_expected_CA(df=X_test_segment, segment="quartile", nb_tickets_expected="nb_tickets_expected", total_amount_expected="total_amount_expected", total_amount="total_amount"),2) | X_test_expected_CA = round(summary_expected_CA(df=X_test_segment, segment="quartile",  | ||||||
|  |                     nb_tickets_expected="nb_tickets_expected", total_amount_expected="total_amount_expected",  | ||||||
|  |                     total_amount="total_amount", pace_purchase="pace_purchase"),2) | ||||||
| 
 | 
 | ||||||
| # rename columns | # rename columns | ||||||
| mapping_dict = {col: col.replace("perct", "(%)").replace("_", " ") for col in X_test_expected_CA.columns} | mapping_dict = {col: col.replace("perct", "(%)").replace("_", " ") for col in X_test_expected_CA.columns} | ||||||
|  |  | ||||||
|  | @ -2558,46 +2558,10 @@ | ||||||
|   }, |   }, | ||||||
|   { |   { | ||||||
|    "cell_type": "code", |    "cell_type": "code", | ||||||
|    "execution_count": 60, |    "execution_count": 67, | ||||||
|    "id": "dd8a52e1-d06e-4790-8687-8e58e3e6b84e", |    "id": "dd8a52e1-d06e-4790-8687-8e58e3e6b84e", | ||||||
|    "metadata": {}, |    "metadata": {}, | ||||||
|    "outputs": [ |    "outputs": [ | ||||||
|     { |  | ||||||
|      "name": "stderr", |  | ||||||
|      "output_type": "stream", |  | ||||||
|      "text": [ |  | ||||||
|       "/tmp/ipykernel_1080/3982240549.py:7: SettingWithCopyWarning: \n", |  | ||||||
|       "A value is trying to be set on a copy of a slice from a DataFrame.\n", |  | ||||||
|       "Try using .loc[row_indexer,col_indexer] = value instead\n", |  | ||||||
|       "\n", |  | ||||||
|       "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n", |  | ||||||
|       "  df_output.loc[:,\"nb_tickets_projected\"] = df_output.loc[:,nb_tickets] / duration_ratio\n", |  | ||||||
|       "/tmp/ipykernel_1080/3982240549.py:8: SettingWithCopyWarning: \n", |  | ||||||
|       "A value is trying to be set on a copy of a slice from a DataFrame.\n", |  | ||||||
|       "Try using .loc[row_indexer,col_indexer] = value instead\n", |  | ||||||
|       "\n", |  | ||||||
|       "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n", |  | ||||||
|       "  df_output.loc[:,\"total_amount_projected\"] = df_output.loc[:,total_amount] / duration_ratio\n", |  | ||||||
|       "/tmp/ipykernel_1080/3982240549.py:10: SettingWithCopyWarning: \n", |  | ||||||
|       "A value is trying to be set on a copy of a slice from a DataFrame.\n", |  | ||||||
|       "Try using .loc[row_indexer,col_indexer] = value instead\n", |  | ||||||
|       "\n", |  | ||||||
|       "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n", |  | ||||||
|       "  df_output.loc[:,\"nb_tickets_expected\"] = df_output.loc[:,score_adjusted] * df_output.loc[:,\"nb_tickets_projected\"]\n", |  | ||||||
|       "/tmp/ipykernel_1080/3982240549.py:11: SettingWithCopyWarning: \n", |  | ||||||
|       "A value is trying to be set on a copy of a slice from a DataFrame.\n", |  | ||||||
|       "Try using .loc[row_indexer,col_indexer] = value instead\n", |  | ||||||
|       "\n", |  | ||||||
|       "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n", |  | ||||||
|       "  df_output.loc[:,\"total_amount_expected\"] = df_output.loc[:,score_adjusted] * df_output.loc[:,\"total_amount_projected\"]\n", |  | ||||||
|       "/tmp/ipykernel_1080/3982240549.py:13: SettingWithCopyWarning: \n", |  | ||||||
|       "A value is trying to be set on a copy of a slice from a DataFrame.\n", |  | ||||||
|       "Try using .loc[row_indexer,col_indexer] = value instead\n", |  | ||||||
|       "\n", |  | ||||||
|       "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n", |  | ||||||
|       "  df_output.loc[:,\"pace_purchase\"] = (duration_ref/df_output.loc[:,nb_purchases]).apply(lambda x : np.nan if x==np.inf else x)\n" |  | ||||||
|      ] |  | ||||||
|     }, |  | ||||||
|     { |     { | ||||||
|      "data": { |      "data": { | ||||||
|       "text/html": [ |       "text/html": [ | ||||||
|  | @ -2994,7 +2958,7 @@ | ||||||
|        "[151874 rows x 27 columns]" |        "[151874 rows x 27 columns]" | ||||||
|       ] |       ] | ||||||
|      }, |      }, | ||||||
|      "execution_count": 60, |      "execution_count": 67, | ||||||
|      "metadata": {}, |      "metadata": {}, | ||||||
|      "output_type": "execute_result" |      "output_type": "execute_result" | ||||||
|     } |     } | ||||||
|  | @ -3041,7 +3005,7 @@ | ||||||
|     "# generalization with a function\n", |     "# generalization with a function\n", | ||||||
|     "\n", |     "\n", | ||||||
|     "def summary_expected_CA(df, segment, nb_tickets_expected, total_amount_expected, total_amount, pace_purchase,\n", |     "def summary_expected_CA(df, segment, nb_tickets_expected, total_amount_expected, total_amount, pace_purchase,\n", | ||||||
|     "                       duration_ref=1.5, duration_projection=1) :\n", |     "                       duration_ref=17, duration_projection=12) :\n", | ||||||
|     "    \n", |     "    \n", | ||||||
|     "    # compute nb tickets estimated and total amount expected\n", |     "    # compute nb tickets estimated and total amount expected\n", | ||||||
|     "    df_expected_CA = df.groupby(segment)[[nb_tickets_expected, total_amount_expected]].sum().reset_index()\n", |     "    df_expected_CA = df.groupby(segment)[[nb_tickets_expected, total_amount_expected]].sum().reset_index()\n", | ||||||
|  | @ -3066,7 +3030,7 @@ | ||||||
|   }, |   }, | ||||||
|   { |   { | ||||||
|    "cell_type": "code", |    "cell_type": "code", | ||||||
|    "execution_count": 65, |    "execution_count": 68, | ||||||
|    "id": "c8df6c80-43e8-4f00-9cd3-eb9022744313", |    "id": "c8df6c80-43e8-4f00-9cd3-eb9022744313", | ||||||
|    "metadata": {}, |    "metadata": {}, | ||||||
|    "outputs": [ |    "outputs": [ | ||||||
|  | @ -3159,7 +3123,7 @@ | ||||||
|        "3                    75.38          11.48  " |        "3                    75.38          11.48  " | ||||||
|       ] |       ] | ||||||
|      }, |      }, | ||||||
|      "execution_count": 65, |      "execution_count": 68, | ||||||
|      "metadata": {}, |      "metadata": {}, | ||||||
|      "output_type": "execute_result" |      "output_type": "execute_result" | ||||||
|     } |     } | ||||||
|  |  | ||||||
|  | @ -123,7 +123,7 @@ def adjusted_score(odd_ratio, bias) : | ||||||
|     return adjusted_score |     return adjusted_score | ||||||
| 
 | 
 | ||||||
| 
 | 
 | ||||||
| def find_bias(odd_ratios, y_objective, initial_guess=6) : | def find_bias(odd_ratios, y_objective, initial_guess=10) : | ||||||
|     """ |     """ | ||||||
|     Find the bias needed to adjust scores according to the purchases observed |     Find the bias needed to adjust scores according to the purchases observed | ||||||
| 
 | 
 | ||||||
|  | @ -136,7 +136,7 @@ def find_bias(odd_ratios, y_objective, initial_guess=6) : | ||||||
|     float: Estimated bias value. |     float: Estimated bias value. | ||||||
|     """ |     """ | ||||||
| 
 | 
 | ||||||
|     bias_estimated = fsolve(lambda bias : sum([adjusted_score(element, bias) for element in list(odd_ratios)]) - y_objective, x0=6) |     bias_estimated = fsolve(lambda bias : sum([adjusted_score(element, bias) for element in list(odd_ratios)]) - y_objective, x0=initial_guess) | ||||||
|   |   | ||||||
|     return bias_estimated[0] |     return bias_estimated[0] | ||||||
|      |      | ||||||
|  | @ -198,7 +198,8 @@ def project_tickets_CA (df, nb_purchases, nb_tickets, total_amount, score_adjust | ||||||
|     return df_output |     return df_output | ||||||
|      |      | ||||||
| 
 | 
 | ||||||
| def summary_expected_CA(df, segment, nb_tickets_expected, total_amount_expected, total_amount) :   | def summary_expected_CA(df, segment, nb_tickets_expected, total_amount_expected, total_amount, pace_purchase, | ||||||
|  |                        duration_ref=17, duration_projection=12) :   | ||||||
|     """ |     """ | ||||||
|     Generate a summary of expected customer acquisition based on segments. |     Generate a summary of expected customer acquisition based on segments. | ||||||
| 
 | 
 | ||||||
|  |  | ||||||
		Loading…
	
		Reference in New Issue
	
	Block a user