completed segment mp analysis sport

This commit is contained in:
Thomas PIQUE 2024-03-26 11:20:03 +00:00
parent dbd87dadd9
commit 2165c7c16e
3 changed files with 1893 additions and 152 deletions

View File

@ -2049,7 +2049,7 @@
"source": [ "source": [
"# comparison between score and adjusted score - export csv associated\n", "# comparison between score and adjusted score - export csv associated\n",
"\n", "\n",
"file_name = \"table_adjusted_score\"\n", "file_name = \"table_adjusted_score_\"\n",
"FILE_PATH_OUT_S3 = PATH + file_name + type_of_activity + \".csv\"\n", "FILE_PATH_OUT_S3 = PATH + file_name + type_of_activity + \".csv\"\n",
"with fs.open(FILE_PATH_OUT_S3, 'w') as file_out:\n", "with fs.open(FILE_PATH_OUT_S3, 'w') as file_out:\n",
" X_test_table_adjusted_scores.to_csv(file_out, index = False)" " X_test_table_adjusted_scores.to_csv(file_out, index = False)"
@ -2057,12 +2057,12 @@
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": 44, "execution_count": 77,
"id": "a974589f-7952-4db2-bebf-7b69c6b09372", "id": "a974589f-7952-4db2-bebf-7b69c6b09372",
"metadata": {}, "metadata": {},
"outputs": [], "outputs": [],
"source": [ "source": [
"def project_tickets_CA (df, nb_tickets, total_amount, score_adjusted, duration_ref, duration_projection) :\n", "def project_tickets_CA (df, nb_purchases, nb_tickets, total_amount, score_adjusted, duration_ref, duration_projection) :\n",
" \n", " \n",
" duration_ratio = duration_ref/duration_projection\n", " duration_ratio = duration_ref/duration_projection\n",
"\n", "\n",
@ -2074,45 +2074,17 @@
" df_output.loc[:,\"nb_tickets_expected\"] = df_output.loc[:,score_adjusted] * df_output.loc[:,\"nb_tickets_projected\"]\n", " df_output.loc[:,\"nb_tickets_expected\"] = df_output.loc[:,score_adjusted] * df_output.loc[:,\"nb_tickets_projected\"]\n",
" df_output.loc[:,\"total_amount_expected\"] = df_output.loc[:,score_adjusted] * df_output.loc[:,\"total_amount_projected\"]\n", " df_output.loc[:,\"total_amount_expected\"] = df_output.loc[:,score_adjusted] * df_output.loc[:,\"total_amount_projected\"]\n",
"\n", "\n",
" df_output.loc[:,\"pace_purchase\"] = (duration_ref/df_output.loc[:,nb_purchases]).apply(lambda x : np.nan if x==np.inf else x)\n",
" \n",
" return df_output\n" " return df_output\n"
] ]
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": 45, "execution_count": 79,
"id": "dd8a52e1-d06e-4790-8687-8e58e3e6b84e", "id": "dd8a52e1-d06e-4790-8687-8e58e3e6b84e",
"metadata": {}, "metadata": {},
"outputs": [ "outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"/tmp/ipykernel_521/3689439025.py:7: SettingWithCopyWarning: \n",
"A value is trying to be set on a copy of a slice from a DataFrame.\n",
"Try using .loc[row_indexer,col_indexer] = value instead\n",
"\n",
"See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n",
" df_output.loc[:,\"nb_tickets_projected\"] = df_output.loc[:,nb_tickets] / duration_ratio\n",
"/tmp/ipykernel_521/3689439025.py:8: SettingWithCopyWarning: \n",
"A value is trying to be set on a copy of a slice from a DataFrame.\n",
"Try using .loc[row_indexer,col_indexer] = value instead\n",
"\n",
"See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n",
" df_output.loc[:,\"total_amount_projected\"] = df_output.loc[:,total_amount] / duration_ratio\n",
"/tmp/ipykernel_521/3689439025.py:10: SettingWithCopyWarning: \n",
"A value is trying to be set on a copy of a slice from a DataFrame.\n",
"Try using .loc[row_indexer,col_indexer] = value instead\n",
"\n",
"See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n",
" df_output.loc[:,\"nb_tickets_expected\"] = df_output.loc[:,score_adjusted] * df_output.loc[:,\"nb_tickets_projected\"]\n",
"/tmp/ipykernel_521/3689439025.py:11: SettingWithCopyWarning: \n",
"A value is trying to be set on a copy of a slice from a DataFrame.\n",
"Try using .loc[row_indexer,col_indexer] = value instead\n",
"\n",
"See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n",
" df_output.loc[:,\"total_amount_expected\"] = df_output.loc[:,score_adjusted] * df_output.loc[:,\"total_amount_projected\"]\n"
]
},
{ {
"data": { "data": {
"text/html": [ "text/html": [
@ -2145,7 +2117,6 @@
" <th>nb_tickets_internet</th>\n", " <th>nb_tickets_internet</th>\n",
" <th>fidelity</th>\n", " <th>fidelity</th>\n",
" <th>...</th>\n", " <th>...</th>\n",
" <th>nb_campaigns_opened</th>\n",
" <th>has_purchased</th>\n", " <th>has_purchased</th>\n",
" <th>has_purchased_estim</th>\n", " <th>has_purchased_estim</th>\n",
" <th>score</th>\n", " <th>score</th>\n",
@ -2155,6 +2126,7 @@
" <th>total_amount_projected</th>\n", " <th>total_amount_projected</th>\n",
" <th>nb_tickets_expected</th>\n", " <th>nb_tickets_expected</th>\n",
" <th>total_amount_expected</th>\n", " <th>total_amount_expected</th>\n",
" <th>pace_purchase</th>\n",
" </tr>\n", " </tr>\n",
" </thead>\n", " </thead>\n",
" <tbody>\n", " <tbody>\n",
@ -2172,15 +2144,15 @@
" <td>1</td>\n", " <td>1</td>\n",
" <td>...</td>\n", " <td>...</td>\n",
" <td>0.0</td>\n", " <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>1.0</td>\n", " <td>1.0</td>\n",
" <td>0.657671</td>\n", " <td>0.657671</td>\n",
" <td>3</td>\n", " <td>3</td>\n",
" <td>0.240397</td>\n", " <td>0.240397</td>\n",
" <td>2.666667</td>\n", " <td>2.823529</td>\n",
" <td>66.666667</td>\n", " <td>70.588235</td>\n",
" <td>0.641059</td>\n", " <td>0.678768</td>\n",
" <td>16.026472</td>\n", " <td>16.969205</td>\n",
" <td>17.0</td>\n",
" </tr>\n", " </tr>\n",
" <tr>\n", " <tr>\n",
" <th>1</th>\n", " <th>1</th>\n",
@ -2195,16 +2167,16 @@
" <td>0.0</td>\n", " <td>0.0</td>\n",
" <td>2</td>\n", " <td>2</td>\n",
" <td>...</td>\n", " <td>...</td>\n",
" <td>0.0</td>\n",
" <td>1.0</td>\n", " <td>1.0</td>\n",
" <td>0.0</td>\n", " <td>0.0</td>\n",
" <td>0.266538</td>\n", " <td>0.266538</td>\n",
" <td>2</td>\n", " <td>2</td>\n",
" <td>0.056482</td>\n", " <td>0.056482</td>\n",
" <td>0.666667</td>\n", " <td>0.705882</td>\n",
" <td>36.666667</td>\n", " <td>38.823529</td>\n",
" <td>0.037655</td>\n", " <td>0.039870</td>\n",
" <td>2.071006</td>\n", " <td>2.192830</td>\n",
" <td>17.0</td>\n",
" </tr>\n", " </tr>\n",
" <tr>\n", " <tr>\n",
" <th>2</th>\n", " <th>2</th>\n",
@ -2221,14 +2193,14 @@
" <td>...</td>\n", " <td>...</td>\n",
" <td>0.0</td>\n", " <td>0.0</td>\n",
" <td>0.0</td>\n", " <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.214668</td>\n", " <td>0.214668</td>\n",
" <td>1</td>\n", " <td>1</td>\n",
" <td>0.043089</td>\n", " <td>0.043089</td>\n",
" <td>11.333333</td>\n", " <td>12.000000</td>\n",
" <td>53.333333</td>\n", " <td>56.470588</td>\n",
" <td>0.488340</td>\n", " <td>0.517065</td>\n",
" <td>2.298068</td>\n", " <td>2.433249</td>\n",
" <td>17.0</td>\n",
" </tr>\n", " </tr>\n",
" <tr>\n", " <tr>\n",
" <th>3</th>\n", " <th>3</th>\n",
@ -2244,15 +2216,15 @@
" <td>1</td>\n", " <td>1</td>\n",
" <td>...</td>\n", " <td>...</td>\n",
" <td>0.0</td>\n", " <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>1.0</td>\n", " <td>1.0</td>\n",
" <td>0.657770</td>\n", " <td>0.657770</td>\n",
" <td>3</td>\n", " <td>3</td>\n",
" <td>0.240478</td>\n", " <td>0.240478</td>\n",
" <td>2.666667</td>\n", " <td>2.823529</td>\n",
" <td>80.000000</td>\n", " <td>84.705882</td>\n",
" <td>0.641273</td>\n", " <td>0.678995</td>\n",
" <td>19.238202</td>\n", " <td>20.369861</td>\n",
" <td>17.0</td>\n",
" </tr>\n", " </tr>\n",
" <tr>\n", " <tr>\n",
" <th>4</th>\n", " <th>4</th>\n",
@ -2267,16 +2239,16 @@
" <td>0.0</td>\n", " <td>0.0</td>\n",
" <td>4</td>\n", " <td>4</td>\n",
" <td>...</td>\n", " <td>...</td>\n",
" <td>0.0</td>\n",
" <td>1.0</td>\n", " <td>1.0</td>\n",
" <td>1.0</td>\n", " <td>1.0</td>\n",
" <td>0.894173</td>\n", " <td>0.894173</td>\n",
" <td>4</td>\n", " <td>4</td>\n",
" <td>0.581920</td>\n", " <td>0.581920</td>\n",
" <td>22.666667</td>\n", " <td>24.000000</td>\n",
" <td>277.333333</td>\n", " <td>293.647059</td>\n",
" <td>13.190183</td>\n", " <td>13.966076</td>\n",
" <td>161.385771</td>\n", " <td>170.879052</td>\n",
" <td>8.5</td>\n",
" </tr>\n", " </tr>\n",
" <tr>\n", " <tr>\n",
" <th>...</th>\n", " <th>...</th>\n",
@ -2315,16 +2287,16 @@
" <td>1.0</td>\n", " <td>1.0</td>\n",
" <td>2</td>\n", " <td>2</td>\n",
" <td>...</td>\n", " <td>...</td>\n",
" <td>5.0</td>\n",
" <td>1.0</td>\n", " <td>1.0</td>\n",
" <td>1.0</td>\n", " <td>1.0</td>\n",
" <td>0.623551</td>\n", " <td>0.623551</td>\n",
" <td>3</td>\n", " <td>3</td>\n",
" <td>0.214369</td>\n", " <td>0.214369</td>\n",
" <td>0.666667</td>\n", " <td>0.705882</td>\n",
" <td>44.873333</td>\n", " <td>47.512941</td>\n",
" <td>0.142913</td>\n", " <td>0.151320</td>\n",
" <td>9.619467</td>\n", " <td>10.185318</td>\n",
" <td>17.0</td>\n",
" </tr>\n", " </tr>\n",
" <tr>\n", " <tr>\n",
" <th>96092</th>\n", " <th>96092</th>\n",
@ -2339,16 +2311,16 @@
" <td>1.0</td>\n", " <td>1.0</td>\n",
" <td>1</td>\n", " <td>1</td>\n",
" <td>...</td>\n", " <td>...</td>\n",
" <td>9.0</td>\n",
" <td>0.0</td>\n", " <td>0.0</td>\n",
" <td>1.0</td>\n", " <td>1.0</td>\n",
" <td>0.682521</td>\n", " <td>0.682521</td>\n",
" <td>3</td>\n", " <td>3</td>\n",
" <td>0.261526</td>\n", " <td>0.261526</td>\n",
" <td>0.666667</td>\n", " <td>0.705882</td>\n",
" <td>40.940000</td>\n", " <td>43.348235</td>\n",
" <td>0.174351</td>\n", " <td>0.184607</td>\n",
" <td>10.706885</td>\n", " <td>11.336701</td>\n",
" <td>17.0</td>\n",
" </tr>\n", " </tr>\n",
" <tr>\n", " <tr>\n",
" <th>96093</th>\n", " <th>96093</th>\n",
@ -2363,7 +2335,6 @@
" <td>0.0</td>\n", " <td>0.0</td>\n",
" <td>1</td>\n", " <td>1</td>\n",
" <td>...</td>\n", " <td>...</td>\n",
" <td>3.0</td>\n",
" <td>0.0</td>\n", " <td>0.0</td>\n",
" <td>0.0</td>\n", " <td>0.0</td>\n",
" <td>0.117192</td>\n", " <td>0.117192</td>\n",
@ -2373,6 +2344,7 @@
" <td>0.000000</td>\n", " <td>0.000000</td>\n",
" <td>0.000000</td>\n", " <td>0.000000</td>\n",
" <td>0.000000</td>\n", " <td>0.000000</td>\n",
" <td>NaN</td>\n",
" </tr>\n", " </tr>\n",
" <tr>\n", " <tr>\n",
" <th>96094</th>\n", " <th>96094</th>\n",
@ -2387,16 +2359,16 @@
" <td>1.0</td>\n", " <td>1.0</td>\n",
" <td>1</td>\n", " <td>1</td>\n",
" <td>...</td>\n", " <td>...</td>\n",
" <td>4.0</td>\n",
" <td>0.0</td>\n", " <td>0.0</td>\n",
" <td>1.0</td>\n", " <td>1.0</td>\n",
" <td>0.625185</td>\n", " <td>0.625185</td>\n",
" <td>3</td>\n", " <td>3</td>\n",
" <td>0.215545</td>\n", " <td>0.215545</td>\n",
" <td>0.666667</td>\n", " <td>0.705882</td>\n",
" <td>52.953333</td>\n", " <td>56.068235</td>\n",
" <td>0.143697</td>\n", " <td>0.152150</td>\n",
" <td>11.413840</td>\n", " <td>12.085242</td>\n",
" <td>17.0</td>\n",
" </tr>\n", " </tr>\n",
" <tr>\n", " <tr>\n",
" <th>96095</th>\n", " <th>96095</th>\n",
@ -2411,7 +2383,6 @@
" <td>0.0</td>\n", " <td>0.0</td>\n",
" <td>2</td>\n", " <td>2</td>\n",
" <td>...</td>\n", " <td>...</td>\n",
" <td>4.0</td>\n",
" <td>0.0</td>\n", " <td>0.0</td>\n",
" <td>0.0</td>\n", " <td>0.0</td>\n",
" <td>0.319585</td>\n", " <td>0.319585</td>\n",
@ -2421,10 +2392,11 @@
" <td>0.000000</td>\n", " <td>0.000000</td>\n",
" <td>0.000000</td>\n", " <td>0.000000</td>\n",
" <td>0.000000</td>\n", " <td>0.000000</td>\n",
" <td>NaN</td>\n",
" </tr>\n", " </tr>\n",
" </tbody>\n", " </tbody>\n",
"</table>\n", "</table>\n",
"<p>96096 rows × 26 columns</p>\n", "<p>96096 rows × 27 columns</p>\n",
"</div>" "</div>"
], ],
"text/plain": [ "text/plain": [
@ -2467,68 +2439,95 @@
"96094 0.000000 1.0 1 ... \n", "96094 0.000000 1.0 1 ... \n",
"96095 -1.000000 0.0 2 ... \n", "96095 -1.000000 0.0 2 ... \n",
"\n", "\n",
" nb_campaigns_opened has_purchased has_purchased_estim score \\\n", " has_purchased has_purchased_estim score quartile score_adjusted \\\n",
"0 0.0 0.0 1.0 0.657671 \n", "0 0.0 1.0 0.657671 3 0.240397 \n",
"1 0.0 1.0 0.0 0.266538 \n", "1 1.0 0.0 0.266538 2 0.056482 \n",
"2 0.0 0.0 0.0 0.214668 \n", "2 0.0 0.0 0.214668 1 0.043089 \n",
"3 0.0 0.0 1.0 0.657770 \n", "3 0.0 1.0 0.657770 3 0.240478 \n",
"4 0.0 1.0 1.0 0.894173 \n", "4 1.0 1.0 0.894173 4 0.581920 \n",
"... ... ... ... ... \n", "... ... ... ... ... ... \n",
"96091 5.0 1.0 1.0 0.623551 \n", "96091 1.0 1.0 0.623551 3 0.214369 \n",
"96092 9.0 0.0 1.0 0.682521 \n", "96092 0.0 1.0 0.682521 3 0.261526 \n",
"96093 3.0 0.0 0.0 0.117192 \n", "96093 0.0 0.0 0.117192 1 0.021400 \n",
"96094 4.0 0.0 1.0 0.625185 \n", "96094 0.0 1.0 0.625185 3 0.215545 \n",
"96095 4.0 0.0 0.0 0.319585 \n", "96095 0.0 0.0 0.319585 2 0.071817 \n",
"\n", "\n",
" quartile score_adjusted nb_tickets_projected total_amount_projected \\\n", " nb_tickets_projected total_amount_projected nb_tickets_expected \\\n",
"0 3 0.240397 2.666667 66.666667 \n", "0 2.823529 70.588235 0.678768 \n",
"1 2 0.056482 0.666667 36.666667 \n", "1 0.705882 38.823529 0.039870 \n",
"2 1 0.043089 11.333333 53.333333 \n", "2 12.000000 56.470588 0.517065 \n",
"3 3 0.240478 2.666667 80.000000 \n", "3 2.823529 84.705882 0.678995 \n",
"4 4 0.581920 22.666667 277.333333 \n", "4 24.000000 293.647059 13.966076 \n",
"... ... ... ... ... \n", "... ... ... ... \n",
"96091 3 0.214369 0.666667 44.873333 \n", "96091 0.705882 47.512941 0.151320 \n",
"96092 3 0.261526 0.666667 40.940000 \n", "96092 0.705882 43.348235 0.184607 \n",
"96093 1 0.021400 0.000000 0.000000 \n", "96093 0.000000 0.000000 0.000000 \n",
"96094 3 0.215545 0.666667 52.953333 \n", "96094 0.705882 56.068235 0.152150 \n",
"96095 2 0.071817 0.000000 0.000000 \n", "96095 0.000000 0.000000 0.000000 \n",
"\n", "\n",
" nb_tickets_expected total_amount_expected \n", " total_amount_expected pace_purchase \n",
"0 0.641059 16.026472 \n", "0 16.969205 17.0 \n",
"1 0.037655 2.071006 \n", "1 2.192830 17.0 \n",
"2 0.488340 2.298068 \n", "2 2.433249 17.0 \n",
"3 0.641273 19.238202 \n", "3 20.369861 17.0 \n",
"4 13.190183 161.385771 \n", "4 170.879052 8.5 \n",
"... ... ... \n", "... ... ... \n",
"96091 0.142913 9.619467 \n", "96091 10.185318 17.0 \n",
"96092 0.174351 10.706885 \n", "96092 11.336701 17.0 \n",
"96093 0.000000 0.000000 \n", "96093 0.000000 NaN \n",
"96094 0.143697 11.413840 \n", "96094 12.085242 17.0 \n",
"96095 0.000000 0.000000 \n", "96095 0.000000 NaN \n",
"\n", "\n",
"[96096 rows x 26 columns]" "[96096 rows x 27 columns]"
] ]
}, },
"execution_count": 45, "execution_count": 79,
"metadata": {}, "metadata": {},
"output_type": "execute_result" "output_type": "execute_result"
} }
], ],
"source": [ "source": [
"X_test_segment = project_tickets_CA (X_test_segment, \"nb_tickets\", \"total_amount\", \"score_adjusted\", duration_ref=1.5, duration_projection=1)\n", "X_test_segment = project_tickets_CA (X_test_segment, \"nb_purchases\", \"nb_tickets\", \"total_amount\", \"score_adjusted\", \n",
" duration_ref=17, duration_projection=12)\n",
"X_test_segment" "X_test_segment"
] ]
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": 46, "execution_count": 82,
"id": "cb66a8ea-65f7-460f-b3fc-ba76a3b91faa",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"quartile\n",
"1 16.581057\n",
"2 15.840818\n",
"3 14.888091\n",
"4 4.830480\n",
"Name: pace_purchase, dtype: float64"
]
},
"execution_count": 82,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"X_test_segment.groupby(\"quartile\")[\"pace_purchase\"].mean()"
]
},
{
"cell_type": "code",
"execution_count": 118,
"id": "f58f9151-2f91-45df-abb7-1ddcf0652adc", "id": "f58f9151-2f91-45df-abb7-1ddcf0652adc",
"metadata": {}, "metadata": {},
"outputs": [], "outputs": [],
"source": [ "source": [
"# generalization with a function\n", "# generalization with a function\n",
"\n", "\n",
"def summary_expected_CA(df, segment, nb_tickets_expected, total_amount_expected, total_amount,\n", "def summary_expected_CA(df, segment, nb_tickets_expected, total_amount_expected, total_amount, pace_purchase,\n",
" duration_ref=1.5, duration_projection=1) :\n", " duration_ref=1.5, duration_projection=1) :\n",
" \n", " \n",
" # compute nb tickets estimated and total amount expected\n", " # compute nb tickets estimated and total amount expected\n",
@ -2545,13 +2544,16 @@
" \n", " \n",
" df_expected_CA[\"revenue_recovered_perct\"] = 100 * duration_ratio * df_expected_CA[total_amount_expected] / \\\n", " df_expected_CA[\"revenue_recovered_perct\"] = 100 * duration_ratio * df_expected_CA[total_amount_expected] / \\\n",
" df.groupby(segment)[total_amount].sum().values\n", " df.groupby(segment)[total_amount].sum().values\n",
"\n",
" df_drop_null_pace = df.dropna(subset=[pace_purchase])\n",
" df_expected_CA[\"pace_purchase\"] = df_drop_null_pace.groupby(segment)[pace_purchase].mean().values\n",
" \n", " \n",
" return df_expected_CA" " return df_expected_CA"
] ]
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": 47, "execution_count": 119,
"id": "c8df6c80-43e8-4f00-9cd3-eb9022744313", "id": "c8df6c80-43e8-4f00-9cd3-eb9022744313",
"metadata": {}, "metadata": {},
"outputs": [ "outputs": [
@ -2582,6 +2584,7 @@
" <th>nb_tickets_expected</th>\n", " <th>nb_tickets_expected</th>\n",
" <th>total_amount_expected</th>\n", " <th>total_amount_expected</th>\n",
" <th>revenue_recovered_perct</th>\n", " <th>revenue_recovered_perct</th>\n",
" <th>pace_purchase</th>\n",
" </tr>\n", " </tr>\n",
" </thead>\n", " </thead>\n",
" <tbody>\n", " <tbody>\n",
@ -2590,36 +2593,40 @@
" <td>1</td>\n", " <td>1</td>\n",
" <td>37410</td>\n", " <td>37410</td>\n",
" <td>38.93</td>\n", " <td>38.93</td>\n",
" <td>84.76</td>\n", " <td>89.75</td>\n",
" <td>1867.19</td>\n", " <td>1977.02</td>\n",
" <td>4.38</td>\n", " <td>4.64</td>\n",
" <td>16.58</td>\n",
" </tr>\n", " </tr>\n",
" <tr>\n", " <tr>\n",
" <th>1</th>\n", " <th>1</th>\n",
" <td>2</td>\n", " <td>2</td>\n",
" <td>29517</td>\n", " <td>29517</td>\n",
" <td>30.72</td>\n", " <td>30.72</td>\n",
" <td>2899.29</td>\n", " <td>3069.83</td>\n",
" <td>74461.02</td>\n", " <td>78841.08</td>\n",
" <td>9.85</td>\n", " <td>10.43</td>\n",
" <td>15.84</td>\n",
" </tr>\n", " </tr>\n",
" <tr>\n", " <tr>\n",
" <th>2</th>\n", " <th>2</th>\n",
" <td>3</td>\n", " <td>3</td>\n",
" <td>20137</td>\n", " <td>20137</td>\n",
" <td>20.96</td>\n", " <td>20.96</td>\n",
" <td>10876.79</td>\n", " <td>11516.60</td>\n",
" <td>344286.66</td>\n", " <td>364538.82</td>\n",
" <td>22.84</td>\n", " <td>24.19</td>\n",
" <td>14.89</td>\n",
" </tr>\n", " </tr>\n",
" <tr>\n", " <tr>\n",
" <th>3</th>\n", " <th>3</th>\n",
" <td>4</td>\n", " <td>4</td>\n",
" <td>9032</td>\n", " <td>9032</td>\n",
" <td>9.40</td>\n", " <td>9.40</td>\n",
" <td>215194.83</td>\n", " <td>227853.35</td>\n",
" <td>9899417.81</td>\n", " <td>10481736.51</td>\n",
" <td>90.11</td>\n", " <td>95.41</td>\n",
" <td>4.83</td>\n",
" </tr>\n", " </tr>\n",
" </tbody>\n", " </tbody>\n",
"</table>\n", "</table>\n",
@ -2627,43 +2634,44 @@
], ],
"text/plain": [ "text/plain": [
" quartile size size_perct nb_tickets_expected total_amount_expected \\\n", " quartile size size_perct nb_tickets_expected total_amount_expected \\\n",
"0 1 37410 38.93 84.76 1867.19 \n", "0 1 37410 38.93 89.75 1977.02 \n",
"1 2 29517 30.72 2899.29 74461.02 \n", "1 2 29517 30.72 3069.83 78841.08 \n",
"2 3 20137 20.96 10876.79 344286.66 \n", "2 3 20137 20.96 11516.60 364538.82 \n",
"3 4 9032 9.40 215194.83 9899417.81 \n", "3 4 9032 9.40 227853.35 10481736.51 \n",
"\n", "\n",
" revenue_recovered_perct \n", " revenue_recovered_perct pace_purchase \n",
"0 4.38 \n", "0 4.64 16.58 \n",
"1 9.85 \n", "1 10.43 15.84 \n",
"2 22.84 \n", "2 24.19 14.89 \n",
"3 90.11 " "3 95.41 4.83 "
] ]
}, },
"execution_count": 47, "execution_count": 119,
"metadata": {}, "metadata": {},
"output_type": "execute_result" "output_type": "execute_result"
} }
], ],
"source": [ "source": [
"X_test_expected_CA = round(summary_expected_CA(df=X_test_segment, segment=\"quartile\", nb_tickets_expected=\"nb_tickets_expected\", \n", "X_test_expected_CA = round(summary_expected_CA(df=X_test_segment, segment=\"quartile\", \n",
" total_amount_expected=\"total_amount_expected\", total_amount=\"total_amount\"),2)\n", " nb_tickets_expected=\"nb_tickets_expected\", total_amount_expected=\"total_amount_expected\", \n",
" total_amount=\"total_amount\", pace_purchase=\"pace_purchase\"),2)\n",
"\n", "\n",
"X_test_expected_CA" "X_test_expected_CA"
] ]
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": 48, "execution_count": 120,
"id": "ac706ed7-defa-4df1-82e1-06f12fc1b6ad", "id": "ac706ed7-defa-4df1-82e1-06f12fc1b6ad",
"metadata": {}, "metadata": {},
"outputs": [ "outputs": [
{ {
"data": { "data": {
"text/plain": [ "text/plain": [
"'\\\\begin{tabular}{lrrrrr}\\n\\\\toprule\\nquartile & size & size (%) & nb tickets expected & total amount expected & revenue recovered (%) \\\\\\\\\\n\\\\midrule\\n1 & 37410 & 38.930000 & 84.760000 & 1867.190000 & 4.380000 \\\\\\\\\\n2 & 29517 & 30.720000 & 2899.290000 & 74461.020000 & 9.850000 \\\\\\\\\\n3 & 20137 & 20.960000 & 10876.790000 & 344286.660000 & 22.840000 \\\\\\\\\\n4 & 9032 & 9.400000 & 215194.830000 & 9899417.810000 & 90.110000 \\\\\\\\\\n\\\\bottomrule\\n\\\\end{tabular}\\n'" "'\\\\begin{tabular}{lrrrrrr}\\n\\\\toprule\\nquartile & size & size (%) & nb tickets expected & total amount expected & revenue recovered (%) & pace purchase \\\\\\\\\\n\\\\midrule\\n1 & 37410 & 38.930000 & 89.750000 & 1977.020000 & 4.640000 & 16.580000 \\\\\\\\\\n2 & 29517 & 30.720000 & 3069.830000 & 78841.080000 & 10.430000 & 15.840000 \\\\\\\\\\n3 & 20137 & 20.960000 & 11516.600000 & 364538.820000 & 24.190000 & 14.890000 \\\\\\\\\\n4 & 9032 & 9.400000 & 227853.350000 & 10481736.510000 & 95.410000 & 4.830000 \\\\\\\\\\n\\\\bottomrule\\n\\\\end{tabular}\\n'"
] ]
}, },
"execution_count": 48, "execution_count": 120,
"metadata": {}, "metadata": {},
"output_type": "execute_result" "output_type": "execute_result"
} }
@ -2677,14 +2685,14 @@
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": 49, "execution_count": 122,
"id": "771da0cf-c49f-4e7e-b52f-ebcfb0fb2df3", "id": "771da0cf-c49f-4e7e-b52f-ebcfb0fb2df3",
"metadata": {}, "metadata": {},
"outputs": [], "outputs": [],
"source": [ "source": [
"# export summary table to the MinIO storage\n", "# export summary table to the MinIO storage\n",
"\n", "\n",
"file_name = \"table_expected_CA\"\n", "file_name = \"table_expected_CA_\"\n",
"FILE_PATH_OUT_S3 = PATH + file_name + type_of_activity + \".csv\"\n", "FILE_PATH_OUT_S3 = PATH + file_name + type_of_activity + \".csv\"\n",
"with fs.open(FILE_PATH_OUT_S3, 'w') as file_out:\n", "with fs.open(FILE_PATH_OUT_S3, 'w') as file_out:\n",
" X_test_expected_CA.to_csv(file_out, index = False)" " X_test_expected_CA.to_csv(file_out, index = False)"

File diff suppressed because one or more lines are too long

View File

@ -85,17 +85,18 @@ def plot_hist_scores(df, score, score_adjusted, type_of_activity) :
# plt.show() # plt.show()
def project_tickets_CA (df, nb_tickets, total_amount, score_adjusted, duration_ref, duration_projection) : def project_tickets_CA (df, nb_purchases, nb_tickets, total_amount, score_adjusted, duration_ref, duration_projection) :
""" """
Project ticket counts and total amount for a given duration and adjust based on a score. Project ticket counts and total amount for a given duration and adjust based on a score.
Args: Args:
- df (DataFrame): DataFrame containing ticket data. - df (DataFrame): DataFrame containing ticket data.
- nb_purchases (str) : Name of the column in df representing the number of purchases.
- nb_tickets (str): Name of the column in df representing the number of tickets. - nb_tickets (str): Name of the column in df representing the number of tickets.
- total_amount (str): Name of the column in df representing the total amount. - total_amount (str): Name of the column in df representing the total amount.
- score_adjusted (str): Name of the column in df representing the adjusted score. - score_adjusted (str): Name of the column in df representing the adjusted score.
- duration_ref (int or float): Reference duration for the project. - duration_ref (int or float): duration of the period of reference for the construction of the variables X.
- duration_projection (int or float): Duration for which the projection is made. - duration_projection (int or float): Duration of the period of projection of sales / revenue.
Returns: Returns:
DataFrame: DataFrame with projected ticket counts and total amount adjusted based on the score. DataFrame: DataFrame with projected ticket counts and total amount adjusted based on the score.
@ -112,6 +113,8 @@ def project_tickets_CA (df, nb_tickets, total_amount, score_adjusted, duration_r
df_output.loc[:,"nb_tickets_expected"] = df_output.loc[:,score_adjusted] * df_output.loc[:,"nb_tickets_projected"] df_output.loc[:,"nb_tickets_expected"] = df_output.loc[:,score_adjusted] * df_output.loc[:,"nb_tickets_projected"]
df_output.loc[:,"total_amount_expected"] = df_output.loc[:,score_adjusted] * df_output.loc[:,"total_amount_projected"] df_output.loc[:,"total_amount_expected"] = df_output.loc[:,score_adjusted] * df_output.loc[:,"total_amount_projected"]
df_output.loc[:,"pace_purchase"] = (duration_ref/df_output.loc[:,nb_purchases]).apply(lambda x : np.nan if x==np.inf else x)
return df_output return df_output
@ -144,5 +147,8 @@ def summary_expected_CA(df, segment, nb_tickets_expected, total_amount_expected,
df_expected_CA["revenue_recovered_perct"] = 100 * duration_ratio * df_expected_CA[total_amount_expected] / \ df_expected_CA["revenue_recovered_perct"] = 100 * duration_ratio * df_expected_CA[total_amount_expected] / \
df.groupby(segment)[total_amount].sum().values df.groupby(segment)[total_amount].sum().values
df_drop_null_pace = df.dropna(subset=[pace_purchase])
df_expected_CA["pace_purchase"] = df_drop_null_pace.groupby(segment)[pace_purchase].mean().values
return df_expected_CA return df_expected_CA