diff --git a/Sport/Modelization/CA_segment_sport.ipynb b/Sport/Modelization/CA_segment_sport.ipynb index c958e46..fad629d 100644 --- a/Sport/Modelization/CA_segment_sport.ipynb +++ b/Sport/Modelization/CA_segment_sport.ipynb @@ -18,12 +18,13 @@ }, { "cell_type": "code", - "execution_count": 97, + "execution_count": 48, "id": "9771bf29-d08e-4674-8c23-9a2672fbef8f", "metadata": {}, "outputs": [], "source": [ "import pandas as pd\n", + "from pandas import DataFrame\n", "import numpy as np\n", "import os\n", "import s3fs\n", @@ -61,7 +62,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 3, "id": "539ccbdf-f29f-4f04-99c1-8c88d0efe514", "metadata": {}, "outputs": [], @@ -73,7 +74,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 4, "id": "0c3a6ddc-9345-4a42-b6bf-a20a95de3028", "metadata": {}, "outputs": [], @@ -96,10 +97,69 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 5, "id": "2831d546-b365-498b-8248-c618bd9c3057", "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/tmp/ipykernel_519/2459610029.py:7: DtypeWarning: Columns (38) have mixed types. 
Specify dtype option on import or set low_memory=False.\n", + " dataset_train = pd.read_csv(file_in, sep=\",\")\n" + ] + }, + { + "data": { + "text/plain": [ + "customer_id 0\n", + "nb_tickets 0\n", + "nb_purchases 0\n", + "total_amount 0\n", + "nb_suppliers 0\n", + "vente_internet_max 0\n", + "purchase_date_min 0\n", + "purchase_date_max 0\n", + "time_between_purchase 0\n", + "nb_tickets_internet 0\n", + "street_id 0\n", + "structure_id 222825\n", + "mcp_contact_id 70874\n", + "fidelity 0\n", + "tenant_id 0\n", + "is_partner 0\n", + "deleted_at 224213\n", + "gender 0\n", + "is_email_true 0\n", + "opt_in 0\n", + "last_buying_date 66139\n", + "max_price 66139\n", + "ticket_sum 0\n", + "average_price 66023\n", + "average_purchase_delay 66139\n", + "average_price_basket 66139\n", + "average_ticket_basket 66139\n", + "total_price 116\n", + "purchase_count 0\n", + "first_buying_date 66139\n", + "country 23159\n", + "gender_label 0\n", + "gender_female 0\n", + "gender_male 0\n", + "gender_other 0\n", + "country_fr 23159\n", + "nb_campaigns 0\n", + "nb_campaigns_opened 0\n", + "time_to_open 123159\n", + "y_has_purchased 0\n", + "dtype: int64" + ] + }, + "execution_count": 5, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ "dataset_train, dataset_test = load_train_test()\n", "dataset_train.isna().sum()" @@ -107,7 +167,7 @@ }, { "cell_type": "code", - "execution_count": 85, + "execution_count": 6, "id": "b8827f7b-b304-4f51-9814-c7a98ed88cf0", "metadata": {}, "outputs": [], @@ -135,7 +195,7 @@ }, { "cell_type": "code", - "execution_count": 86, + "execution_count": 7, "id": "c18195fc-ed40-4e39-a59e-c9ecc5a8e6c3", "metadata": {}, "outputs": [ @@ -164,7 +224,7 @@ }, { "cell_type": "code", - "execution_count": 87, + "execution_count": 8, "id": "7c81390e-598c-4f02-bd56-dd03b00dcb33", "metadata": {}, "outputs": [ @@ -503,7 +563,7 @@ "[96096 rows x 17 columns]" ] }, - "execution_count": 87, + "execution_count": 8, "metadata": {}, "output_type": 
"execute_result" } @@ -514,7 +574,7 @@ }, { "cell_type": "code", - "execution_count": 75, + "execution_count": 9, "id": "c708f439-bb75-4688-bf4f-4c04e13deaae", "metadata": {}, "outputs": [], @@ -532,24 +592,544 @@ }, { "cell_type": "code", - "execution_count": 80, + "execution_count": 10, "id": "5261a803-05b8-41a0-968c-dc7bde48ddd3", "metadata": {}, "outputs": [ { - "name": "stderr", - "output_type": "stream", - "text": [ - "Exception ignored in: \n", - "Traceback (most recent call last):\n", - " File \"/opt/mamba/lib/python3.11/site-packages/fsspec/spec.py\", line 1952, in __del__\n", - " self.close()\n", - " File \"/opt/mamba/lib/python3.11/site-packages/fsspec/spec.py\", line 1929, in close\n", - " if not self.forced:\n", - " ^^^^^^^^^^^\n", - "AttributeError: 'S3File' object has no attribute 'forced'\n" - ] - }, + "data": { + "text/html": [ + "
GridSearchCV(cv=3, error_score='raise',\n",
+       "             estimator=Pipeline(steps=[('preprocessor',\n",
+       "                                        ColumnTransformer(transformers=[('num',\n",
+       "                                                                         Pipeline(steps=[('scaler',\n",
+       "                                                                                          StandardScaler())]),\n",
+       "                                                                         ['nb_tickets',\n",
+       "                                                                          'nb_purchases',\n",
+       "                                                                          'total_amount',\n",
+       "                                                                          'nb_suppliers',\n",
+       "                                                                          'vente_internet_max',\n",
+       "                                                                          'purchase_date_min',\n",
+       "                                                                          'purchase_date_max',\n",
+       "                                                                          'time_between_purchase',\n",
+       "                                                                          'nb_tickets_internet',\n",
+       "                                                                          'nb_campaigns',\n",
+       "                                                                          'nb_...\n",
+       "       1.562500e-02, 3.125000e-02, 6.250000e-02, 1.250000e-01,\n",
+       "       2.500000e-01, 5.000000e-01, 1.000000e+00, 2.000000e+00,\n",
+       "       4.000000e+00, 8.000000e+00, 1.600000e+01, 3.200000e+01,\n",
+       "       6.400000e+01]),\n",
+       "                         'LogisticRegression_cv__class_weight': ['balanced',\n",
+       "                                                                 {0.0: 0.5837086520288036,\n",
+       "                                                                  1.0: 3.486549107420539}],\n",
+       "                         'LogisticRegression_cv__penalty': ['l1', 'l2']},\n",
+       "             scoring=make_scorer(recall_score, response_method='predict'))
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
" + ], + "text/plain": [ + "GridSearchCV(cv=3, error_score='raise',\n", + " estimator=Pipeline(steps=[('preprocessor',\n", + " ColumnTransformer(transformers=[('num',\n", + " Pipeline(steps=[('scaler',\n", + " StandardScaler())]),\n", + " ['nb_tickets',\n", + " 'nb_purchases',\n", + " 'total_amount',\n", + " 'nb_suppliers',\n", + " 'vente_internet_max',\n", + " 'purchase_date_min',\n", + " 'purchase_date_max',\n", + " 'time_between_purchase',\n", + " 'nb_tickets_internet',\n", + " 'nb_campaigns',\n", + " 'nb_...\n", + " 1.562500e-02, 3.125000e-02, 6.250000e-02, 1.250000e-01,\n", + " 2.500000e-01, 5.000000e-01, 1.000000e+00, 2.000000e+00,\n", + " 4.000000e+00, 8.000000e+00, 1.600000e+01, 3.200000e+01,\n", + " 6.400000e+01]),\n", + " 'LogisticRegression_cv__class_weight': ['balanced',\n", + " {0.0: 0.5837086520288036,\n", + " 1.0: 3.486549107420539}],\n", + " 'LogisticRegression_cv__penalty': ['l1', 'l2']},\n", + " scoring=make_scorer(recall_score, response_method='predict'))" + ] + }, + "execution_count": 10, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "logit_cv = load_model(\"sport\", \"LogisticRegression_cv\")\n", + "logit_cv" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "id": "6f3e584d-c70d-4b45-b947-4414ff416e17", + "metadata": {}, + "outputs": [ { "data": { "text/html": [ @@ -1068,541 +1648,7 @@ " scoring=make_scorer(recall_score, response_method='predict'))" ] }, - "execution_count": 80, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "logit_cv = load_model(\"sport\", \"LogisticRegression_cv\")\n", - "logit_cv" - ] - }, - { - "cell_type": "code", - "execution_count": 81, - "id": "6f3e584d-c70d-4b45-b947-4414ff416e17", - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
GridSearchCV(cv=3, error_score='raise',\n",
-       "             estimator=Pipeline(steps=[('preprocessor',\n",
-       "                                        ColumnTransformer(transformers=[('num',\n",
-       "                                                                         Pipeline(steps=[('scaler',\n",
-       "                                                                                          StandardScaler())]),\n",
-       "                                                                         ['nb_tickets',\n",
-       "                                                                          'nb_purchases',\n",
-       "                                                                          'total_amount',\n",
-       "                                                                          'nb_suppliers',\n",
-       "                                                                          'vente_internet_max',\n",
-       "                                                                          'purchase_date_min',\n",
-       "                                                                          'purchase_date_max',\n",
-       "                                                                          'time_between_purchase',\n",
-       "                                                                          'nb_tickets_internet',\n",
-       "                                                                          'nb_campaigns',\n",
-       "                                                                          'nb_...\n",
-       "       1.562500e-02, 3.125000e-02, 6.250000e-02, 1.250000e-01,\n",
-       "       2.500000e-01, 5.000000e-01, 1.000000e+00, 2.000000e+00,\n",
-       "       4.000000e+00, 8.000000e+00, 1.600000e+01, 3.200000e+01,\n",
-       "       6.400000e+01]),\n",
-       "                         'LogisticRegression_cv__class_weight': ['balanced',\n",
-       "                                                                 {0.0: 0.5837086520288036,\n",
-       "                                                                  1.0: 3.486549107420539}],\n",
-       "                         'LogisticRegression_cv__penalty': ['l1', 'l2']},\n",
-       "             scoring=make_scorer(recall_score, response_method='predict'))
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
" - ], - "text/plain": [ - "GridSearchCV(cv=3, error_score='raise',\n", - " estimator=Pipeline(steps=[('preprocessor',\n", - " ColumnTransformer(transformers=[('num',\n", - " Pipeline(steps=[('scaler',\n", - " StandardScaler())]),\n", - " ['nb_tickets',\n", - " 'nb_purchases',\n", - " 'total_amount',\n", - " 'nb_suppliers',\n", - " 'vente_internet_max',\n", - " 'purchase_date_min',\n", - " 'purchase_date_max',\n", - " 'time_between_purchase',\n", - " 'nb_tickets_internet',\n", - " 'nb_campaigns',\n", - " 'nb_...\n", - " 1.562500e-02, 3.125000e-02, 6.250000e-02, 1.250000e-01,\n", - " 2.500000e-01, 5.000000e-01, 1.000000e+00, 2.000000e+00,\n", - " 4.000000e+00, 8.000000e+00, 1.600000e+01, 3.200000e+01,\n", - " 6.400000e+01]),\n", - " 'LogisticRegression_cv__class_weight': ['balanced',\n", - " {0.0: 0.5837086520288036,\n", - " 1.0: 3.486549107420539}],\n", - " 'LogisticRegression_cv__penalty': ['l1', 'l2']},\n", - " scoring=make_scorer(recall_score, response_method='predict'))" - ] - }, - "execution_count": 81, + "execution_count": 11, "metadata": {}, "output_type": "execute_result" } @@ -1621,7 +1667,7 @@ }, { "cell_type": "code", - "execution_count": 88, + "execution_count": 12, "id": "018d8ff4-3436-4eec-8507-d1a265cbabf1", "metadata": {}, "outputs": [], @@ -1632,7 +1678,7 @@ }, { "cell_type": "code", - "execution_count": 90, + "execution_count": 13, "id": "846f53b9-73c2-4a8b-9d9e-f11bf59ce9ba", "metadata": {}, "outputs": [ @@ -1640,25 +1686,25 @@ "name": "stderr", "output_type": "stream", "text": [ - "/tmp/ipykernel_620/375041546.py:3: SettingWithCopyWarning: \n", + "/tmp/ipykernel_519/375041546.py:3: SettingWithCopyWarning: \n", "A value is trying to be set on a copy of a slice from a DataFrame.\n", "Try using .loc[row_indexer,col_indexer] = value instead\n", "\n", "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n", " X_test_segment[\"has_purchased\"] = y_test\n", - 
"/tmp/ipykernel_620/375041546.py:4: SettingWithCopyWarning: \n", + "/tmp/ipykernel_519/375041546.py:4: SettingWithCopyWarning: \n", "A value is trying to be set on a copy of a slice from a DataFrame.\n", "Try using .loc[row_indexer,col_indexer] = value instead\n", "\n", "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n", " X_test_segment[\"has_purchased_estim\"] = y_pred\n", - "/tmp/ipykernel_620/375041546.py:5: SettingWithCopyWarning: \n", + "/tmp/ipykernel_519/375041546.py:5: SettingWithCopyWarning: \n", "A value is trying to be set on a copy of a slice from a DataFrame.\n", "Try using .loc[row_indexer,col_indexer] = value instead\n", "\n", "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n", " X_test_segment[\"score\"] = y_pred_prob\n", - "/tmp/ipykernel_620/375041546.py:6: SettingWithCopyWarning: \n", + "/tmp/ipykernel_519/375041546.py:6: SettingWithCopyWarning: \n", "A value is trying to be set on a copy of a slice from a DataFrame.\n", "Try using .loc[row_indexer,col_indexer] = value instead\n", "\n", @@ -2020,7 +2066,7 @@ "[10 rows x 21 columns]" ] }, - "execution_count": 90, + "execution_count": 13, "metadata": {}, "output_type": "execute_result" } @@ -2073,7 +2119,7 @@ }, { "cell_type": "code", - "execution_count": 91, + "execution_count": 14, "id": "f0379536-a6c5-4b16-bde5-d0319ec1b140", "metadata": {}, "outputs": [], @@ -2086,7 +2132,7 @@ }, { "cell_type": "code", - "execution_count": 92, + "execution_count": 15, "id": "32a0dfd0-f49d-4785-a56f-706d381bfe41", "metadata": {}, "outputs": [], @@ -2102,7 +2148,7 @@ }, { "cell_type": "code", - "execution_count": 93, + "execution_count": 16, "id": "2dff1def-02df-413e-afce-b4aeaf7752b6", "metadata": {}, "outputs": [], @@ -2113,7 +2159,7 @@ }, { "cell_type": "code", - "execution_count": 94, + "execution_count": 17, "id": 
"683d71fc-7442-4028-869c-49c57592d6e9", "metadata": {}, "outputs": [], @@ -2136,7 +2182,7 @@ }, { "cell_type": "code", - "execution_count": 98, + "execution_count": 18, "id": "781b0d40-c954-4c54-830a-e709c8667328", "metadata": {}, "outputs": [ @@ -2146,7 +2192,7 @@ "6.172331113516847" ] }, - "execution_count": 98, + "execution_count": 18, "metadata": {}, "output_type": "execute_result" } @@ -2162,7 +2208,7 @@ }, { "cell_type": "code", - "execution_count": 102, + "execution_count": 19, "id": "248cb862-418e-4767-9933-70c4885ecf40", "metadata": {}, "outputs": [ @@ -2172,7 +2218,7 @@ "6.070461139075353" ] }, - "execution_count": 102, + "execution_count": 19, "metadata": {}, "output_type": "execute_result" } @@ -2189,7 +2235,7 @@ }, { "cell_type": "code", - "execution_count": 103, + "execution_count": 20, "id": "fff6cbe6-7bb3-4732-9b81-b9ac5383bbcf", "metadata": {}, "outputs": [ @@ -2207,7 +2253,7 @@ }, { "cell_type": "code", - "execution_count": 116, + "execution_count": 21, "id": "f506870d-4a8a-4b2c-8f0b-e0789080b20c", "metadata": {}, "outputs": [ @@ -2230,7 +2276,7 @@ }, { "cell_type": "code", - "execution_count": 117, + "execution_count": 22, "id": "8213d0e4-063b-49fa-90b7-677fc34f4c01", "metadata": {}, "outputs": [ @@ -2238,7 +2284,7 @@ "name": "stderr", "output_type": "stream", "text": [ - "/tmp/ipykernel_620/1825363704.py:7: SettingWithCopyWarning: \n", + "/tmp/ipykernel_519/1825363704.py:7: SettingWithCopyWarning: \n", "A value is trying to be set on a copy of a slice from a DataFrame.\n", "Try using .loc[row_indexer,col_indexer] = value instead\n", "\n", @@ -2259,7 +2305,7 @@ }, { "cell_type": "code", - "execution_count": 118, + "execution_count": 23, "id": "834d3723-2e72-4c65-9c62-e2d595c69461", "metadata": {}, "outputs": [ @@ -2288,36 +2334,7 @@ }, { "cell_type": "code", - "execution_count": 130, - "id": "ed27a165-68d2-44f8-8cec-b12dad2cca5d", - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "29169.0" - ] - }, - "execution_count": 130, - 
"metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "X_test_segment[\"has_purchased_estim\"].sum()" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "761146b7-3d0d-44b1-8b91-87e6d54f1626", - "metadata": {}, - "outputs": [], - "source": [] - }, - { - "cell_type": "code", - "execution_count": 119, + "execution_count": 25, "id": "9f30a4dd-a9d8-405a-a7d5-5324ae88cf70", "metadata": {}, "outputs": [ @@ -2341,7 +2358,7 @@ }, { "cell_type": "code", - "execution_count": 208, + "execution_count": 26, "id": "6f9396db-e213-408c-a596-eaeec3bc79f3", "metadata": {}, "outputs": [ @@ -2361,7 +2378,7 @@ "\n", "# histogramme des probas et des probas ajustées\n", "\n", - "def plot_comp_scores(df, score, score_adjusted) :\n", + "def plot_hist_scores(df, score, score_adjusted) :\n", "\n", " plt.figure()\n", " plt.hist(df[score], label = \"score\", alpha=0.6)\n", @@ -2372,7 +2389,7 @@ " plt.title(\"Comparison between score and adjusted score\")\n", " plt.show()\n", "\n", - "plot_comp_scores(X_test_segment, score = \"score\", score_adjusted = \"score_adjusted\")" + "plot_hist_scores(X_test_segment, score = \"score\", score_adjusted = \"score_adjusted\")" ] }, { @@ -2385,7 +2402,7 @@ }, { "cell_type": "code", - "execution_count": 121, + "execution_count": 27, "id": "c618cebc-c295-47f7-bd76-b7e18778a17c", "metadata": {}, "outputs": [ @@ -2598,7 +2615,7 @@ "[5 rows x 22 columns]" ] }, - "execution_count": 121, + "execution_count": 27, "metadata": {}, "output_type": "execute_result" } @@ -2609,7 +2626,7 @@ }, { "cell_type": "code", - "execution_count": 156, + "execution_count": 28, "id": "29633dd2-8b4b-48dc-be02-52f4015e686d", "metadata": {}, "outputs": [ @@ -2683,7 +2700,7 @@ "4 0.905216 0.661997 0.650133" ] }, - "execution_count": 156, + "execution_count": 28, "metadata": {}, "output_type": "execute_result" } @@ -2694,31 +2711,7 @@ }, { "cell_type": "code", - "execution_count": null, - "id": "9c64085e-51f2-4bad-8a37-274905bbed2e", - "metadata": 
{}, - "outputs": [], - "source": [] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "e29be2a8-ef9f-4004-ae67-cab66eea0013", - "metadata": {}, - "outputs": [], - "source": [ - "X_test_segment[\"nb_tickets_projected\"] = X_test_segment[\"nb_tickets\"] / 1.5\n", - "X_test_segment[\"total_amount_projected\"] = X_test_segment[\"total_amount\"] / 1.5\n", - "\n", - "X_test_segment[\"nb_tickets_expected\"] = X_test_segment[\"score_adjusted\"] * X_test_segment[\"nb_tickets_projected\"]\n", - "X_test_segment[\"total_amount_expected\"] = X_test_segment[\"score_adjusted\"] * X_test_segment[\"total_amount_projected\"]\n", - "\n", - "X_test_segment" - ] - }, - { - "cell_type": "code", - "execution_count": 123, + "execution_count": 57, "id": "a974589f-7952-4db2-bebf-7b69c6b09372", "metadata": {}, "outputs": [], @@ -3151,7 +3144,7 @@ }, { "cell_type": "code", - "execution_count": 127, + "execution_count": 56, "id": "dd8a52e1-d06e-4790-8687-8e58e3e6b84e", "metadata": {}, "outputs": [ @@ -3159,25 +3152,25 @@ "name": "stderr", "output_type": "stream", "text": [ - "/tmp/ipykernel_620/3599949626.py:7: SettingWithCopyWarning: \n", + "/tmp/ipykernel_519/3509011500.py:7: SettingWithCopyWarning: \n", "A value is trying to be set on a copy of a slice from a DataFrame.\n", "Try using .loc[row_indexer,col_indexer] = value instead\n", "\n", "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n", " df_output[\"nb_tickets_projected\"] = df_output[nb_tickets] / duration_ratio\n", - "/tmp/ipykernel_620/3599949626.py:8: SettingWithCopyWarning: \n", + "/tmp/ipykernel_519/3509011500.py:8: SettingWithCopyWarning: \n", "A value is trying to be set on a copy of a slice from a DataFrame.\n", "Try using .loc[row_indexer,col_indexer] = value instead\n", "\n", "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n", 
" df_output[\"total_amount_projected\"] = df_output[total_amount] / duration_ratio\n", - "/tmp/ipykernel_620/3599949626.py:10: SettingWithCopyWarning: \n", + "/tmp/ipykernel_519/3509011500.py:10: SettingWithCopyWarning: \n", "A value is trying to be set on a copy of a slice from a DataFrame.\n", "Try using .loc[row_indexer,col_indexer] = value instead\n", "\n", "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n", " df_output[\"nb_tickets_expected\"] = df_output[score_adjusted] * df_output[\"nb_tickets_projected\"]\n", - "/tmp/ipykernel_620/3599949626.py:11: SettingWithCopyWarning: \n", + "/tmp/ipykernel_519/3509011500.py:11: SettingWithCopyWarning: \n", "A value is trying to be set on a copy of a slice from a DataFrame.\n", "Try using .loc[row_indexer,col_indexer] = value instead\n", "\n", @@ -3581,7 +3574,7 @@ "[96096 rows x 26 columns]" ] }, - "execution_count": 127, + "execution_count": 56, "metadata": {}, "output_type": "execute_result" } @@ -3591,417 +3584,6 @@ "X_test_segment" ] }, - { - "cell_type": "code", - "execution_count": 135, - "id": "5bf8def7-d6f3-4b5b-a656-d61f6dca9536", - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " 
\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
nb_ticketsnb_purchasestotal_amountnb_suppliersvente_internet_maxpurchase_date_minpurchase_date_maxtime_between_purchasenb_tickets_internetfidelity...nb_campaigns_openedhas_purchasedhas_purchased_estimscorequartilescore_adjustednb_tickets_projectedtotal_amount_projectednb_tickets_expectedtotal_amount_expected
04.01.0100.001.00.05.1771875.1771870.0000000.01...0.00.01.00.65767130.2403972.66666766.6666670.64105916.026472
11.01.055.001.00.0426.265613426.2656130.0000000.02...0.01.00.00.26653820.0564820.66666736.6666670.0376552.071006
217.01.080.001.00.0436.033437436.0334370.0000000.02...0.00.00.00.21466810.04308911.33333353.3333330.4883402.298068
34.01.0120.001.00.05.1964125.1964120.0000000.01...0.00.01.00.65777030.2404782.66666780.0000000.64127319.238202
434.02.0416.001.00.0478.693148115.631470363.0616780.04...0.01.01.00.89417340.58192022.666667277.33333313.190183161.385771
..................................................................
960911.01.067.311.01.0278.442257278.4422570.0000001.02...5.01.01.00.62355130.2143690.66666744.8733330.1429139.619467
960921.01.061.411.01.0189.207373189.2073730.0000001.01...9.00.01.00.68252130.2615260.66666740.9400000.17435110.706885
960930.00.00.000.00.0550.000000550.000000-1.0000000.01...3.00.00.00.11719210.0214000.0000000.0000000.0000000.000000
960941.01.079.431.01.0279.312905279.3129050.0000001.01...4.00.01.00.62518530.2155450.66666752.9533330.14369711.413840
960950.00.00.000.00.0550.000000550.000000-1.0000000.02...4.00.00.00.31958520.0718170.0000000.0000000.0000000.000000
\n", - "

96096 rows × 26 columns

\n", - "
" - ], - "text/plain": [ - " nb_tickets nb_purchases total_amount nb_suppliers \\\n", - "0 4.0 1.0 100.00 1.0 \n", - "1 1.0 1.0 55.00 1.0 \n", - "2 17.0 1.0 80.00 1.0 \n", - "3 4.0 1.0 120.00 1.0 \n", - "4 34.0 2.0 416.00 1.0 \n", - "... ... ... ... ... \n", - "96091 1.0 1.0 67.31 1.0 \n", - "96092 1.0 1.0 61.41 1.0 \n", - "96093 0.0 0.0 0.00 0.0 \n", - "96094 1.0 1.0 79.43 1.0 \n", - "96095 0.0 0.0 0.00 0.0 \n", - "\n", - " vente_internet_max purchase_date_min purchase_date_max \\\n", - "0 0.0 5.177187 5.177187 \n", - "1 0.0 426.265613 426.265613 \n", - "2 0.0 436.033437 436.033437 \n", - "3 0.0 5.196412 5.196412 \n", - "4 0.0 478.693148 115.631470 \n", - "... ... ... ... \n", - "96091 1.0 278.442257 278.442257 \n", - "96092 1.0 189.207373 189.207373 \n", - "96093 0.0 550.000000 550.000000 \n", - "96094 1.0 279.312905 279.312905 \n", - "96095 0.0 550.000000 550.000000 \n", - "\n", - " time_between_purchase nb_tickets_internet fidelity ... \\\n", - "0 0.000000 0.0 1 ... \n", - "1 0.000000 0.0 2 ... \n", - "2 0.000000 0.0 2 ... \n", - "3 0.000000 0.0 1 ... \n", - "4 363.061678 0.0 4 ... \n", - "... ... ... ... ... \n", - "96091 0.000000 1.0 2 ... \n", - "96092 0.000000 1.0 1 ... \n", - "96093 -1.000000 0.0 1 ... \n", - "96094 0.000000 1.0 1 ... \n", - "96095 -1.000000 0.0 2 ... \n", - "\n", - " nb_campaigns_opened has_purchased has_purchased_estim score \\\n", - "0 0.0 0.0 1.0 0.657671 \n", - "1 0.0 1.0 0.0 0.266538 \n", - "2 0.0 0.0 0.0 0.214668 \n", - "3 0.0 0.0 1.0 0.657770 \n", - "4 0.0 1.0 1.0 0.894173 \n", - "... ... ... ... ... 
\n", - "96091 5.0 1.0 1.0 0.623551 \n", - "96092 9.0 0.0 1.0 0.682521 \n", - "96093 3.0 0.0 0.0 0.117192 \n", - "96094 4.0 0.0 1.0 0.625185 \n", - "96095 4.0 0.0 0.0 0.319585 \n", - "\n", - " quartile score_adjusted nb_tickets_projected total_amount_projected \\\n", - "0 3 0.240397 2.666667 66.666667 \n", - "1 2 0.056482 0.666667 36.666667 \n", - "2 1 0.043089 11.333333 53.333333 \n", - "3 3 0.240478 2.666667 80.000000 \n", - "4 4 0.581920 22.666667 277.333333 \n", - "... ... ... ... ... \n", - "96091 3 0.214369 0.666667 44.873333 \n", - "96092 3 0.261526 0.666667 40.940000 \n", - "96093 1 0.021400 0.000000 0.000000 \n", - "96094 3 0.215545 0.666667 52.953333 \n", - "96095 2 0.071817 0.000000 0.000000 \n", - "\n", - " nb_tickets_expected total_amount_expected \n", - "0 0.641059 16.026472 \n", - "1 0.037655 2.071006 \n", - "2 0.488340 2.298068 \n", - "3 0.641273 19.238202 \n", - "4 13.190183 161.385771 \n", - "... ... ... \n", - "96091 0.142913 9.619467 \n", - "96092 0.174351 10.706885 \n", - "96093 0.000000 0.000000 \n", - "96094 0.143697 11.413840 \n", - "96095 0.000000 0.000000 \n", - "\n", - "[96096 rows x 26 columns]" - ] - }, - "execution_count": 135, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "X_test_segment" - ] - }, { "cell_type": "code", "execution_count": 169, @@ -4118,6 +3700,80 @@ "X_test_expected_CA" ] }, + { + "cell_type": "code", + "execution_count": 31, + "id": "f58f9151-2f91-45df-abb7-1ddcf0652adc", + "metadata": {}, + "outputs": [], + "source": [ + "# generalization with a function\n", + "\n", + "def summary_expected_CA(df, segment, nb_tickets_expected, total_amount_expected, total_amount) :\n", + " \n", + " # compute nb tickets estimated and total amount expected\n", + " df_expected_CA = df.groupby(segment)[[nb_tickets_expected, total_amount_expected]].sum().reset_index()\n", + " \n", + " # number of customers by segment\n", + " df_expected_CA.insert(1, \"size\", df.groupby(segment).size().values)\n", + " \n", + 
import numpy as np
from scipy.optimize import fsolve


def odd_ratio(score):
    """
    Calculate the odds ratio from a score.

    Args:
    - score (Union[float, int, np.ndarray]): Score value(s), expected to be
      strictly below 1 (see ``adjust_score_1``).

    Returns:
    Odds ratio ``score / (1 - score)``, same shape as the input.
    """

    return score / (1 - score)


def adjust_score_1(score):
    """
    Adjust scores by replacing ones with the second highest value.
    Allows to compute odd ratios then (a score of exactly 1 would divide by zero).

    Args:
    - score (Sequence[Union[float, int]]): Score values.

    Returns:
    np.ndarray: Adjusted score values.

    Raises:
    ValueError: If every score equals 1, so no replacement value exists.
    """

    scores = np.asarray(score)
    below_one = scores[scores != 1]
    if below_one.size == 0:
        raise ValueError("adjust_score_1 needs at least one score different from 1")

    return np.where(scores == 1, below_one.max(), scores)


def adjusted_score(odd_ratio, bias):
    """
    Adjust the score based on the odd ratio and bias.

    Args:
    - odd_ratio (Union[float, int, np.ndarray]): Odd ratio value(s).
    - bias (Union[float, int]): Bias value.

    Returns:
    Adjusted score ``odd_ratio / (bias + odd_ratio)``.
    """

    return odd_ratio / (bias + odd_ratio)


def find_bias(odd_ratios, y_objective, initial_guess=6):
    """
    Find the bias needed to adjust scores according to the purchases observed.

    Solves for the bias such that the sum of the adjusted scores equals
    ``y_objective``.

    Args:
    - odd_ratios (Sequence[float]): Odd ratios, one per customer.
    - y_objective (Union[float, int]): Objective value to achieve.
    - initial_guess (Union[float, int], optional): Starting point handed to the
      solver. Default is 6.

    Returns:
    float: Estimated bias value.
    """

    ratios = np.asarray(odd_ratios, dtype=float)

    def gap(bias):
        # fsolve passes a length-1 array; unwrap it so the division
        # broadcasts over the ratios only.
        return adjusted_score(ratios, np.ravel(bias)[0]).sum() - y_objective

    # The starting point is the caller's initial_guess, not a fixed constant.
    bias_estimated = fsolve(gap, x0=initial_guess)

    return bias_estimated[0]


def plot_hist_scores(df, score, score_adjusted):
    """
    Plot a histogram comparing scores and adjusted scores.

    Args:
    - df (DataFrame): DataFrame containing score data.
    - score (str): Name of the column in df representing the original scores.
    - score_adjusted (str): Name of the column in df representing the adjusted scores.

    Returns:
    None
    """

    # Imported here so the numeric helpers of this module stay usable
    # without a plotting backend installed.
    import matplotlib.pyplot as plt

    plt.figure()
    plt.hist(df[score], label = "score", alpha=0.6)
    plt.hist(df[score_adjusted], label="adjusted score", alpha=0.6)
    plt.legend()
    plt.xlabel("probability of a future purchase")
    plt.ylabel("count")
    plt.title("Comparison between score and adjusted score")
    plt.show()


def project_tickets_CA(df, nb_tickets, total_amount, score_adjusted, duration_ref, duration_projection):
    """
    Project ticket counts and total amount for a given duration and adjust based on a score.

    The observed quantities are divided by ``duration_ref / duration_projection``
    to obtain the "projected" columns, which are then multiplied by the adjusted
    score to obtain the "expected" columns.

    Args:
    - df (DataFrame): DataFrame containing ticket data. It is not modified.
    - nb_tickets (str): Name of the column in df representing the number of tickets.
    - total_amount (str): Name of the column in df representing the total amount.
    - score_adjusted (str): Name of the column in df representing the adjusted score.
    - duration_ref (int or float): Reference duration over which df was observed.
    - duration_projection (int or float): Duration for which the projection is made.

    Returns:
    DataFrame: Copy of df with the added columns nb_tickets_projected,
    total_amount_projected, nb_tickets_expected and total_amount_expected.

    Raises:
    ValueError: If either duration is not strictly positive.
    """

    if duration_ref <= 0 or duration_projection <= 0:
        raise ValueError("duration_ref and duration_projection must be strictly positive")

    duration_ratio = duration_ref / duration_projection

    # Work on a copy: assigning columns on the caller's frame mutates it and
    # triggers SettingWithCopyWarning when that frame is itself a slice.
    df_output = df.copy()

    df_output["nb_tickets_projected"] = df_output[nb_tickets] / duration_ratio
    df_output["total_amount_projected"] = df_output[total_amount] / duration_ratio

    df_output["nb_tickets_expected"] = df_output[score_adjusted] * df_output["nb_tickets_projected"]
    df_output["total_amount_expected"] = df_output[score_adjusted] * df_output["total_amount_projected"]

    return df_output


def summary_expected_CA(df, segment, nb_tickets_expected, total_amount_expected, total_amount,
                        duration_ref=1.5, duration_projection=1):
    """
    Generate a per-segment summary of expected tickets and expected revenue (CA).

    Args:
    - df (DataFrame): DataFrame containing customer data.
    - segment (str): Name of the column in df representing customer segments.
    - nb_tickets_expected (str): Name of the column in df representing the expected number of tickets.
    - total_amount_expected (str): Name of the column in df representing the expected total amount.
    - total_amount (str): Name of the column in df representing the total amount.
    - duration_ref (int or float, optional): Reference duration over which
      total_amount was observed. Default is 1.5.
    - duration_projection (int or float, optional): Duration of the projection.
      Default is 1.

    Returns:
    DataFrame: One row per segment with the columns segment, size, size_perct,
    the two summed expected columns, and perct_revenue_recovered.
    """

    by_segment = df.groupby(segment)

    # compute nb tickets estimated and total amount expected
    df_expected_CA = by_segment[[nb_tickets_expected, total_amount_expected]].sum().reset_index()

    # number of customers by segment
    df_expected_CA.insert(1, "size", by_segment.size().values)

    # size in percent of all customers
    df_expected_CA.insert(2, "size_perct", 100 * df_expected_CA["size"]/df_expected_CA["size"].sum())

    # compute share of CA recovered: the expected amount is rescaled to the
    # reference duration so it is comparable with the observed total amount
    duration_ratio = duration_ref / duration_projection

    df_expected_CA["perct_revenue_recovered"] = 100 * duration_ratio * df_expected_CA[total_amount_expected] / \
        by_segment[total_amount].sum().values

    return df_expected_CA