CA estimation by segment works well
This commit is contained in:
		
							parent
							
								
									d3e13f4c56
								
							
						
					
					
						commit
						10824e5e24
					
				|  | @ -10,15 +10,20 @@ import pickle | |||
| import warnings | ||||
| import io | ||||
| 
 | ||||
| # importation of functions defined | ||||
| from utils_CA_segment import * | ||||
| 
 | ||||
| # ignore warnings | ||||
| warnings.filterwarnings('ignore') | ||||
| 
 | ||||
| # Create filesystem object | ||||
| S3_ENDPOINT_URL = "https://" + os.environ["AWS_S3_ENDPOINT"] | ||||
| fs = s3fs.S3FileSystem(client_kwargs={'endpoint_url': S3_ENDPOINT_URL}) | ||||
| 
 | ||||
| # importation of functions defined | ||||
| exec(open('utils_CA_segment.py').read()) | ||||
| # from utils_CA_segment import * | ||||
| 
 | ||||
| # define type of activity  | ||||
| type_of_activity = "sport" | ||||
| type_of_activity = input('Choisissez le type de compagnie : sport ? musique ? musee ?') | ||||
| PATH = f"projet-bdc2324-team1/Output_expected_CA/{type_of_activity}/" | ||||
| 
 | ||||
| # type of model for the score | ||||
|  | @ -41,27 +46,23 @@ X_train_score = model.predict_proba(X_train)[:, 1] | |||
| 
 | ||||
| bias_train_set = find_bias(odd_ratios = odd_ratio(adjust_score_1(X_train_score)),  | ||||
|                            y_objective = y_train["y_has_purchased"].sum(), | ||||
|                            initial_guess=6) | ||||
|                            initial_guess=10) | ||||
| print("Bias estimated :", np.log(bias_train_set)) | ||||
| 
 | ||||
| # create a score adjusted with the bias computed | ||||
| score_adjusted_train = adjusted_score(odd_ratio(adjust_score_1(X_test_segment["score"])), bias = bias_train_set) | ||||
| X_test_segment["score_adjusted"] = score_adjusted_train | ||||
| 
 | ||||
| print("The score was successfully adjusted") | ||||
| MAE_score = abs(X_test_segment["score"]-X_test_segment["has_purchased"]).mean() | ||||
| MAE_ajusted_score = abs(X_test_segment["score_adjusted"]-X_test_segment["has_purchased"]).mean() | ||||
| print(f"MAE for score : {MAE_score}") | ||||
| print(f"MAE for adjusted score : {MAE_ajusted_score}") | ||||
| 
 | ||||
| ### 1. plot adjusted scores and save (to be tested) | ||||
| plot_hist_scores(X_test_segment, score = "score", score_adjusted = "score_adjusted", type_of_activity = type_of_activity) | ||||
| save_file_s3_ca("hist_score_adjusted_", type_of_activity) | ||||
| 
 | ||||
| """ | ||||
| image_buffer = io.BytesIO() | ||||
| plt.savefig(image_buffer, format='png') | ||||
| image_buffer.seek(0) | ||||
| file_name = "hist_score_adjusted" | ||||
| FILE_PATH_OUT_S3 = PATH + file_name + type_of_activity + ".png" | ||||
| with fs.open(FILE_PATH_OUT_S3, 'wb') as s3_file: | ||||
|     s3_file.write(image_buffer.read()) | ||||
| plt.close() | ||||
| """ | ||||
| 
 | ||||
| ### 2. comparison between score and adjusted score | ||||
| X_test_table_adjusted_scores = (100 * X_test_segment.groupby("quartile")[["score","score_adjusted", "has_purchased"]].mean()).round(2).reset_index() | ||||
|  | @ -75,11 +76,14 @@ with fs.open(FILE_PATH_OUT_S3, 'w') as file_out: | |||
| 
 | ||||
| 
 | ||||
| # project revenue | ||||
| X_test_segment = project_tickets_CA (X_test_segment, "nb_tickets", "total_amount", "score_adjusted", duration_ref=17, duration_projection=12) | ||||
| X_test_segment = project_tickets_CA (X_test_segment, "nb_purchases", "nb_tickets", "total_amount", "score_adjusted",  | ||||
|                                      duration_ref=17, duration_projection=12) | ||||
| 
 | ||||
| 
 | ||||
| ### 3. table summarizing projections (nb tickets, revenue) | ||||
| X_test_expected_CA = round(summary_expected_CA(df=X_test_segment, segment="quartile", nb_tickets_expected="nb_tickets_expected", total_amount_expected="total_amount_expected", total_amount="total_amount"),2) | ||||
| X_test_expected_CA = round(summary_expected_CA(df=X_test_segment, segment="quartile",  | ||||
|                     nb_tickets_expected="nb_tickets_expected", total_amount_expected="total_amount_expected",  | ||||
|                     total_amount="total_amount", pace_purchase="pace_purchase"),2) | ||||
| 
 | ||||
| # rename columns | ||||
| mapping_dict = {col: col.replace("perct", "(%)").replace("_", " ") for col in X_test_expected_CA.columns} | ||||
|  |  | |||
|  | @ -2558,46 +2558,10 @@ | |||
|   }, | ||||
|   { | ||||
|    "cell_type": "code", | ||||
|    "execution_count": 60, | ||||
|    "execution_count": 67, | ||||
|    "id": "dd8a52e1-d06e-4790-8687-8e58e3e6b84e", | ||||
|    "metadata": {}, | ||||
|    "outputs": [ | ||||
|     { | ||||
|      "name": "stderr", | ||||
|      "output_type": "stream", | ||||
|      "text": [ | ||||
|       "/tmp/ipykernel_1080/3982240549.py:7: SettingWithCopyWarning: \n", | ||||
|       "A value is trying to be set on a copy of a slice from a DataFrame.\n", | ||||
|       "Try using .loc[row_indexer,col_indexer] = value instead\n", | ||||
|       "\n", | ||||
|       "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n", | ||||
|       "  df_output.loc[:,\"nb_tickets_projected\"] = df_output.loc[:,nb_tickets] / duration_ratio\n", | ||||
|       "/tmp/ipykernel_1080/3982240549.py:8: SettingWithCopyWarning: \n", | ||||
|       "A value is trying to be set on a copy of a slice from a DataFrame.\n", | ||||
|       "Try using .loc[row_indexer,col_indexer] = value instead\n", | ||||
|       "\n", | ||||
|       "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n", | ||||
|       "  df_output.loc[:,\"total_amount_projected\"] = df_output.loc[:,total_amount] / duration_ratio\n", | ||||
|       "/tmp/ipykernel_1080/3982240549.py:10: SettingWithCopyWarning: \n", | ||||
|       "A value is trying to be set on a copy of a slice from a DataFrame.\n", | ||||
|       "Try using .loc[row_indexer,col_indexer] = value instead\n", | ||||
|       "\n", | ||||
|       "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n", | ||||
|       "  df_output.loc[:,\"nb_tickets_expected\"] = df_output.loc[:,score_adjusted] * df_output.loc[:,\"nb_tickets_projected\"]\n", | ||||
|       "/tmp/ipykernel_1080/3982240549.py:11: SettingWithCopyWarning: \n", | ||||
|       "A value is trying to be set on a copy of a slice from a DataFrame.\n", | ||||
|       "Try using .loc[row_indexer,col_indexer] = value instead\n", | ||||
|       "\n", | ||||
|       "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n", | ||||
|       "  df_output.loc[:,\"total_amount_expected\"] = df_output.loc[:,score_adjusted] * df_output.loc[:,\"total_amount_projected\"]\n", | ||||
|       "/tmp/ipykernel_1080/3982240549.py:13: SettingWithCopyWarning: \n", | ||||
|       "A value is trying to be set on a copy of a slice from a DataFrame.\n", | ||||
|       "Try using .loc[row_indexer,col_indexer] = value instead\n", | ||||
|       "\n", | ||||
|       "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n", | ||||
|       "  df_output.loc[:,\"pace_purchase\"] = (duration_ref/df_output.loc[:,nb_purchases]).apply(lambda x : np.nan if x==np.inf else x)\n" | ||||
|      ] | ||||
|     }, | ||||
|     { | ||||
|      "data": { | ||||
|       "text/html": [ | ||||
|  | @ -2994,7 +2958,7 @@ | |||
|        "[151874 rows x 27 columns]" | ||||
|       ] | ||||
|      }, | ||||
|      "execution_count": 60, | ||||
|      "execution_count": 67, | ||||
|      "metadata": {}, | ||||
|      "output_type": "execute_result" | ||||
|     } | ||||
|  | @ -3041,7 +3005,7 @@ | |||
|     "# generalization with a function\n", | ||||
|     "\n", | ||||
|     "def summary_expected_CA(df, segment, nb_tickets_expected, total_amount_expected, total_amount, pace_purchase,\n", | ||||
|     "                       duration_ref=1.5, duration_projection=1) :\n", | ||||
|     "                       duration_ref=17, duration_projection=12) :\n", | ||||
|     "    \n", | ||||
|     "    # compute nb tickets estimated and total amount expected\n", | ||||
|     "    df_expected_CA = df.groupby(segment)[[nb_tickets_expected, total_amount_expected]].sum().reset_index()\n", | ||||
|  | @ -3066,7 +3030,7 @@ | |||
|   }, | ||||
|   { | ||||
|    "cell_type": "code", | ||||
|    "execution_count": 65, | ||||
|    "execution_count": 68, | ||||
|    "id": "c8df6c80-43e8-4f00-9cd3-eb9022744313", | ||||
|    "metadata": {}, | ||||
|    "outputs": [ | ||||
|  | @ -3159,7 +3123,7 @@ | |||
|        "3                    75.38          11.48  " | ||||
|       ] | ||||
|      }, | ||||
|      "execution_count": 65, | ||||
|      "execution_count": 68, | ||||
|      "metadata": {}, | ||||
|      "output_type": "execute_result" | ||||
|     } | ||||
|  |  | |||
|  | @ -123,7 +123,7 @@ def adjusted_score(odd_ratio, bias) : | |||
|     return adjusted_score | ||||
| 
 | ||||
| 
 | ||||
| def find_bias(odd_ratios, y_objective, initial_guess=6) : | ||||
| def find_bias(odd_ratios, y_objective, initial_guess=10) : | ||||
|     """ | ||||
|     Find the bias needed to adjust scores according to the purchases observed | ||||
| 
 | ||||
|  | @ -136,7 +136,7 @@ def find_bias(odd_ratios, y_objective, initial_guess=6) : | |||
|     float: Estimated bias value. | ||||
|     """ | ||||
| 
 | ||||
|     bias_estimated = fsolve(lambda bias : sum([adjusted_score(element, bias) for element in list(odd_ratios)]) - y_objective, x0=6) | ||||
|     bias_estimated = fsolve(lambda bias : sum([adjusted_score(element, bias) for element in list(odd_ratios)]) - y_objective, x0=initial_guess) | ||||
|   | ||||
|     return bias_estimated[0] | ||||
|      | ||||
|  | @ -198,7 +198,8 @@ def project_tickets_CA (df, nb_purchases, nb_tickets, total_amount, score_adjust | |||
|     return df_output | ||||
|      | ||||
| 
 | ||||
| def summary_expected_CA(df, segment, nb_tickets_expected, total_amount_expected, total_amount) :   | ||||
| def summary_expected_CA(df, segment, nb_tickets_expected, total_amount_expected, total_amount, pace_purchase, | ||||
|                        duration_ref=17, duration_projection=12) :   | ||||
|     """ | ||||
|     Generate a summary of expected revenue (CA) by segment. | ||||
| 
 | ||||
|  |  | |||
		Loading…
	
		Reference in New Issue
	
	Block a user