Finalisation de la fonction pour créer un dataset

This commit is contained in:
Antoine JOUBREL 2024-02-12 21:08:35 +00:00
parent 5f621c2352
commit 197a847085
3 changed files with 516 additions and 67 deletions

View File

@ -1529,7 +1529,7 @@
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": 26, "execution_count": 25,
"id": "98f78cd5-b694-4cc6-b033-20170aa13e8d", "id": "98f78cd5-b694-4cc6-b033-20170aa13e8d",
"metadata": {}, "metadata": {},
"outputs": [], "outputs": [],
@ -1559,7 +1559,7 @@
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": 27, "execution_count": 26,
"id": "e2c88552-b863-47a2-be23-8d2898fb28bc", "id": "e2c88552-b863-47a2-be23-8d2898fb28bc",
"metadata": {}, "metadata": {},
"outputs": [], "outputs": [],
@ -1593,7 +1593,7 @@
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": 28, "execution_count": 27,
"id": "24537647-bc29-4777-9848-ac4120a4aa60", "id": "24537647-bc29-4777-9848-ac4120a4aa60",
"metadata": {}, "metadata": {},
"outputs": [], "outputs": [],
@ -1603,7 +1603,7 @@
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": 29, "execution_count": 28,
"id": "6be2a9a6-056b-4e19-8c26-a18ba3df36b3", "id": "6be2a9a6-056b-4e19-8c26-a18ba3df36b3",
"metadata": {}, "metadata": {},
"outputs": [ "outputs": [
@ -1639,7 +1639,7 @@
" <th>0</th>\n", " <th>0</th>\n",
" <td>2</td>\n", " <td>2</td>\n",
" <td>4</td>\n", " <td>4</td>\n",
" <td>0.0</td>\n", " <td>NaN</td>\n",
" <td>NaT</td>\n", " <td>NaT</td>\n",
" </tr>\n", " </tr>\n",
" <tr>\n", " <tr>\n",
@ -1660,14 +1660,14 @@
" <th>3</th>\n", " <th>3</th>\n",
" <td>5</td>\n", " <td>5</td>\n",
" <td>4</td>\n", " <td>4</td>\n",
" <td>0.0</td>\n", " <td>NaN</td>\n",
" <td>NaT</td>\n", " <td>NaT</td>\n",
" </tr>\n", " </tr>\n",
" <tr>\n", " <tr>\n",
" <th>4</th>\n", " <th>4</th>\n",
" <td>6</td>\n", " <td>6</td>\n",
" <td>20</td>\n", " <td>20</td>\n",
" <td>0.0</td>\n", " <td>NaN</td>\n",
" <td>NaT</td>\n", " <td>NaT</td>\n",
" </tr>\n", " </tr>\n",
" </tbody>\n", " </tbody>\n",
@ -1676,14 +1676,14 @@
], ],
"text/plain": [ "text/plain": [
" customer_id nb_campaigns nb_campaigns_opened time_to_open\n", " customer_id nb_campaigns nb_campaigns_opened time_to_open\n",
"0 2 4 0.0 NaT\n", "0 2 4 NaN NaT\n",
"1 3 222 124.0 1 days 00:28:30.169354838\n", "1 3 222 124.0 1 days 00:28:30.169354838\n",
"2 4 7 7.0 1 days 04:31:01.428571428\n", "2 4 7 7.0 1 days 04:31:01.428571428\n",
"3 5 4 0.0 NaT\n", "3 5 4 NaN NaT\n",
"4 6 20 0.0 NaT" "4 6 20 NaN NaT"
] ]
}, },
"execution_count": 29, "execution_count": 28,
"metadata": {}, "metadata": {},
"output_type": "execute_result" "output_type": "execute_result"
} }
@ -1702,7 +1702,7 @@
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": 69, "execution_count": 29,
"id": "043303fe-e90f-4689-a2a9-5d690555a045", "id": "043303fe-e90f-4689-a2a9-5d690555a045",
"metadata": {}, "metadata": {},
"outputs": [], "outputs": [],
@ -1765,7 +1765,7 @@
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": 70, "execution_count": 30,
"id": "5882234a-1ed5-4269-87a6-0d75613476e3", "id": "5882234a-1ed5-4269-87a6-0d75613476e3",
"metadata": {}, "metadata": {},
"outputs": [], "outputs": [],
@ -1775,7 +1775,7 @@
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": 36, "execution_count": 31,
"id": "5f2046cf-ffde-4521-91e7-b727b8bc17f5", "id": "5f2046cf-ffde-4521-91e7-b727b8bc17f5",
"metadata": {}, "metadata": {},
"outputs": [ "outputs": [
@ -1811,6 +1811,8 @@
" <th>purchase_date_max</th>\n", " <th>purchase_date_max</th>\n",
" <th>time_between_purchase</th>\n", " <th>time_between_purchase</th>\n",
" <th>nb_tickets_internet</th>\n", " <th>nb_tickets_internet</th>\n",
" <th>name_event_types</th>\n",
" <th>avg_amount</th>\n",
" </tr>\n", " </tr>\n",
" </thead>\n", " </thead>\n",
" <tbody>\n", " <tbody>\n",
@ -1827,6 +1829,8 @@
" <td>4.179306</td>\n", " <td>4.179306</td>\n",
" <td>3258.011562</td>\n", " <td>3258.011562</td>\n",
" <td>51.0</td>\n", " <td>51.0</td>\n",
" <td>offre muséale individuel</td>\n",
" <td>6.150659</td>\n",
" </tr>\n", " </tr>\n",
" <tr>\n", " <tr>\n",
" <th>1</th>\n", " <th>1</th>\n",
@ -1841,6 +1845,8 @@
" <td>5.221840</td>\n", " <td>5.221840</td>\n",
" <td>3692.976389</td>\n", " <td>3692.976389</td>\n",
" <td>2988.0</td>\n", " <td>2988.0</td>\n",
" <td>spectacle vivant</td>\n",
" <td>7.762474</td>\n",
" </tr>\n", " </tr>\n",
" <tr>\n", " <tr>\n",
" <th>2</th>\n", " <th>2</th>\n",
@ -1855,6 +1861,8 @@
" <td>0.146331</td>\n", " <td>0.146331</td>\n",
" <td>3803.223461</td>\n", " <td>3803.223461</td>\n",
" <td>9.0</td>\n", " <td>9.0</td>\n",
" <td>offre muséale groupe</td>\n",
" <td>4.452618</td>\n",
" </tr>\n", " </tr>\n",
" <tr>\n", " <tr>\n",
" <th>3</th>\n", " <th>3</th>\n",
@ -1869,6 +1877,8 @@
" <td>1408.715532</td>\n", " <td>1408.715532</td>\n",
" <td>1093.999977</td>\n", " <td>1093.999977</td>\n",
" <td>5.0</td>\n", " <td>5.0</td>\n",
" <td>formule adhésion</td>\n",
" <td>6.439463</td>\n",
" </tr>\n", " </tr>\n",
" <tr>\n", " <tr>\n",
" <th>4</th>\n", " <th>4</th>\n",
@ -1883,6 +1893,8 @@
" <td>1340.308160</td>\n", " <td>1340.308160</td>\n",
" <td>700.966389</td>\n", " <td>700.966389</td>\n",
" <td>0.0</td>\n", " <td>0.0</td>\n",
" <td>offre muséale individuel</td>\n",
" <td>6.150659</td>\n",
" </tr>\n", " </tr>\n",
" </tbody>\n", " </tbody>\n",
"</table>\n", "</table>\n",
@ -1903,15 +1915,22 @@
"3 5 1 2502.715509 1408.715532 \n", "3 5 1 2502.715509 1408.715532 \n",
"4 1 0 2041.274549 1340.308160 \n", "4 1 0 2041.274549 1340.308160 \n",
"\n", "\n",
" time_between_purchase nb_tickets_internet \n", " time_between_purchase nb_tickets_internet name_event_types \\\n",
"0 3258.011562 51.0 \n", "0 3258.011562 51.0 offre muséale individuel \n",
"1 3692.976389 2988.0 \n", "1 3692.976389 2988.0 spectacle vivant \n",
"2 3803.223461 9.0 \n", "2 3803.223461 9.0 offre muséale groupe \n",
"3 1093.999977 5.0 \n", "3 1093.999977 5.0 formule adhésion \n",
"4 700.966389 0.0 " "4 700.966389 0.0 offre muséale individuel \n",
"\n",
" avg_amount \n",
"0 6.150659 \n",
"1 7.762474 \n",
"2 4.452618 \n",
"3 6.439463 \n",
"4 6.150659 "
] ]
}, },
"execution_count": 36, "execution_count": 31,
"metadata": {}, "metadata": {},
"output_type": "execute_result" "output_type": "execute_result"
} }
@ -1922,7 +1941,7 @@
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": 37, "execution_count": 32,
"id": "a4a2311d-8a72-4030-afd5-218004d5d2a5", "id": "a4a2311d-8a72-4030-afd5-218004d5d2a5",
"metadata": {}, "metadata": {},
"outputs": [], "outputs": [],
@ -1946,7 +1965,7 @@
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": 43, "execution_count": 33,
"id": "83230baa-9a8a-4614-b629-e99c2505c696", "id": "83230baa-9a8a-4614-b629-e99c2505c696",
"metadata": {}, "metadata": {},
"outputs": [ "outputs": [
@ -2159,7 +2178,7 @@
"[5 rows x 37 columns]" "[5 rows x 37 columns]"
] ]
}, },
"execution_count": 43, "execution_count": 33,
"metadata": {}, "metadata": {},
"output_type": "execute_result" "output_type": "execute_result"
} }
@ -2173,7 +2192,7 @@
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": 44, "execution_count": 34,
"id": "433921de-03ad-4024-9462-ecd267db1756", "id": "433921de-03ad-4024-9462-ecd267db1756",
"metadata": {}, "metadata": {},
"outputs": [ "outputs": [
@ -2339,7 +2358,7 @@
" <td>formule adhésion</td>\n", " <td>formule adhésion</td>\n",
" <td>6.439463</td>\n", " <td>6.439463</td>\n",
" <td>4.0</td>\n", " <td>4.0</td>\n",
" <td>0.0</td>\n", " <td>NaN</td>\n",
" <td>NaT</td>\n", " <td>NaT</td>\n",
" </tr>\n", " </tr>\n",
" </tbody>\n", " </tbody>\n",
@ -2381,12 +2400,12 @@
"1 NaN NaT \n", "1 NaN NaT \n",
"2 NaN NaT \n", "2 NaN NaT \n",
"3 NaN NaT \n", "3 NaN NaT \n",
"4 0.0 NaT \n", "4 NaN NaT \n",
"\n", "\n",
"[5 rows x 40 columns]" "[5 rows x 40 columns]"
] ]
}, },
"execution_count": 44, "execution_count": 34,
"metadata": {}, "metadata": {},
"output_type": "execute_result" "output_type": "execute_result"
} }
@ -2400,7 +2419,7 @@
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": 45, "execution_count": 35,
"id": "25e54131-6835-4e94-86d3-1a78520ed7bc", "id": "25e54131-6835-4e94-86d3-1a78520ed7bc",
"metadata": {}, "metadata": {},
"outputs": [], "outputs": [],
@ -2426,7 +2445,7 @@
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": 46, "execution_count": 36,
"id": "8710611c-7eb8-45ca-bdcc-009f4081f9e2", "id": "8710611c-7eb8-45ca-bdcc-009f4081f9e2",
"metadata": {}, "metadata": {},
"outputs": [], "outputs": [],
@ -2468,7 +2487,7 @@
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": 63, "execution_count": 38,
"id": "46de1912-4a66-46e5-8b9e-7768b2d2723b", "id": "46de1912-4a66-46e5-8b9e-7768b2d2723b",
"metadata": {}, "metadata": {},
"outputs": [], "outputs": [],
@ -2482,7 +2501,7 @@
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": 64, "execution_count": 39,
"id": "d53825e4-6453-45bc-94f2-7b2504ec4afb", "id": "d53825e4-6453-45bc-94f2-7b2504ec4afb",
"metadata": {}, "metadata": {},
"outputs": [ "outputs": [
@ -2688,7 +2707,7 @@
"[5 rows x 28 columns]" "[5 rows x 28 columns]"
] ]
}, },
"execution_count": 64, "execution_count": 39,
"metadata": {}, "metadata": {},
"output_type": "execute_result" "output_type": "execute_result"
} }
@ -2699,7 +2718,7 @@
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": 67, "execution_count": 40,
"id": "1e42a790-b215-4107-a969-85005da06ebd", "id": "1e42a790-b215-4107-a969-85005da06ebd",
"metadata": {}, "metadata": {},
"outputs": [], "outputs": [],
@ -2713,28 +2732,394 @@
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": 66, "execution_count": 41,
"id": "d950f24d-a5d1-4f1e-aeaa-ca826470365f", "id": "d950f24d-a5d1-4f1e-aeaa-ca826470365f",
"metadata": {}, "metadata": {},
"outputs": [ "outputs": [
{ {
"data": { "data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>customer_id</th>\n",
" <th>event_type_id</th>\n",
" <th>nb_tickets</th>\n",
" <th>nb_purchases</th>\n",
" <th>total_amount</th>\n",
" <th>nb_suppliers</th>\n",
" <th>vente_internet_max</th>\n",
" <th>purchase_date_min</th>\n",
" <th>purchase_date_max</th>\n",
" <th>time_between_purchase</th>\n",
" <th>...</th>\n",
" <th>average_ticket_basket</th>\n",
" <th>total_price</th>\n",
" <th>purchase_count</th>\n",
" <th>first_buying_date</th>\n",
" <th>country</th>\n",
" <th>age</th>\n",
" <th>tenant_id</th>\n",
" <th>nb_campaigns</th>\n",
" <th>nb_campaigns_opened</th>\n",
" <th>time_to_open</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>1</td>\n",
" <td>2.0</td>\n",
" <td>384226.0</td>\n",
" <td>194790.0</td>\n",
" <td>2686540.5</td>\n",
" <td>7.0</td>\n",
" <td>1.0</td>\n",
" <td>3262.190868</td>\n",
" <td>4.179306</td>\n",
" <td>3258.011562</td>\n",
" <td>...</td>\n",
" <td>1.956087</td>\n",
" <td>8821221.5</td>\n",
" <td>641472.0</td>\n",
" <td>2013-06-10 10:37:58+00:00</td>\n",
" <td>fr</td>\n",
" <td>NaN</td>\n",
" <td>1311.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>NaT</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>1</td>\n",
" <td>4.0</td>\n",
" <td>453242.0</td>\n",
" <td>228945.0</td>\n",
" <td>3248965.5</td>\n",
" <td>6.0</td>\n",
" <td>1.0</td>\n",
" <td>3698.198229</td>\n",
" <td>5.221840</td>\n",
" <td>3692.976389</td>\n",
" <td>...</td>\n",
" <td>1.956087</td>\n",
" <td>8821221.5</td>\n",
" <td>641472.0</td>\n",
" <td>2013-06-10 10:37:58+00:00</td>\n",
" <td>fr</td>\n",
" <td>NaN</td>\n",
" <td>1311.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>NaT</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>1</td>\n",
" <td>5.0</td>\n",
" <td>201750.0</td>\n",
" <td>107110.0</td>\n",
" <td>1459190.0</td>\n",
" <td>6.0</td>\n",
" <td>1.0</td>\n",
" <td>3803.369792</td>\n",
" <td>0.146331</td>\n",
" <td>3803.223461</td>\n",
" <td>...</td>\n",
" <td>1.956087</td>\n",
" <td>8821221.5</td>\n",
" <td>641472.0</td>\n",
" <td>2013-06-10 10:37:58+00:00</td>\n",
" <td>fr</td>\n",
" <td>NaN</td>\n",
" <td>1311.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>NaT</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>1</td>\n",
" <td>6.0</td>\n",
" <td>217356.0</td>\n",
" <td>111786.0</td>\n",
" <td>1435871.5</td>\n",
" <td>5.0</td>\n",
" <td>1.0</td>\n",
" <td>2502.715509</td>\n",
" <td>1408.715532</td>\n",
" <td>1093.999977</td>\n",
" <td>...</td>\n",
" <td>1.956087</td>\n",
" <td>8821221.5</td>\n",
" <td>641472.0</td>\n",
" <td>2013-06-10 10:37:58+00:00</td>\n",
" <td>fr</td>\n",
" <td>NaN</td>\n",
" <td>1311.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>NaT</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>2</td>\n",
" <td>2.0</td>\n",
" <td>143.0</td>\n",
" <td>143.0</td>\n",
" <td>0.0</td>\n",
" <td>1.0</td>\n",
" <td>0.0</td>\n",
" <td>2041.274549</td>\n",
" <td>1340.308160</td>\n",
" <td>700.966389</td>\n",
" <td>...</td>\n",
" <td>1.000000</td>\n",
" <td>0.0</td>\n",
" <td>307.0</td>\n",
" <td>2018-04-07 12:55:07+00:00</td>\n",
" <td>fr</td>\n",
" <td>NaN</td>\n",
" <td>1311.0</td>\n",
" <td>4.0</td>\n",
" <td>0.0</td>\n",
" <td>NaT</td>\n",
" </tr>\n",
" <tr>\n",
" <th>...</th>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>156291</th>\n",
" <td>1256133</td>\n",
" <td>5.0</td>\n",
" <td>3.0</td>\n",
" <td>1.0</td>\n",
" <td>33.0</td>\n",
" <td>1.0</td>\n",
" <td>1.0</td>\n",
" <td>0.110521</td>\n",
" <td>0.110521</td>\n",
" <td>0.000000</td>\n",
" <td>...</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaT</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaT</td>\n",
" </tr>\n",
" <tr>\n",
" <th>156292</th>\n",
" <td>1256134</td>\n",
" <td>5.0</td>\n",
" <td>4.0</td>\n",
" <td>1.0</td>\n",
" <td>44.0</td>\n",
" <td>1.0</td>\n",
" <td>1.0</td>\n",
" <td>0.092095</td>\n",
" <td>0.092095</td>\n",
" <td>0.000000</td>\n",
" <td>...</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaT</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaT</td>\n",
" </tr>\n",
" <tr>\n",
" <th>156293</th>\n",
" <td>1256135</td>\n",
" <td>5.0</td>\n",
" <td>1.0</td>\n",
" <td>1.0</td>\n",
" <td>11.0</td>\n",
" <td>1.0</td>\n",
" <td>1.0</td>\n",
" <td>0.087894</td>\n",
" <td>0.087894</td>\n",
" <td>0.000000</td>\n",
" <td>...</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaT</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaT</td>\n",
" </tr>\n",
" <tr>\n",
" <th>156294</th>\n",
" <td>1256136</td>\n",
" <td>5.0</td>\n",
" <td>2.0</td>\n",
" <td>1.0</td>\n",
" <td>22.0</td>\n",
" <td>1.0</td>\n",
" <td>1.0</td>\n",
" <td>0.040394</td>\n",
" <td>0.040394</td>\n",
" <td>0.000000</td>\n",
" <td>...</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaT</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaT</td>\n",
" </tr>\n",
" <tr>\n",
" <th>156295</th>\n",
" <td>1256137</td>\n",
" <td>5.0</td>\n",
" <td>2.0</td>\n",
" <td>1.0</td>\n",
" <td>22.0</td>\n",
" <td>1.0</td>\n",
" <td>1.0</td>\n",
" <td>0.000000</td>\n",
" <td>0.000000</td>\n",
" <td>0.000000</td>\n",
" <td>...</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaT</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaT</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"<p>156296 rows × 40 columns</p>\n",
"</div>"
],
"text/plain": [ "text/plain": [
"Index(['customer_id', 'event_type_id', 'nb_tickets', 'nb_purchases',\n", " customer_id event_type_id nb_tickets nb_purchases total_amount \\\n",
" 'total_amount', 'nb_suppliers', 'vente_internet_max',\n", "0 1 2.0 384226.0 194790.0 2686540.5 \n",
" 'purchase_date_min', 'purchase_date_max', 'time_between_purchase',\n", "1 1 4.0 453242.0 228945.0 3248965.5 \n",
" 'nb_tickets_internet', 'name_event_types', 'avg_amount', 'birthdate',\n", "2 1 5.0 201750.0 107110.0 1459190.0 \n",
" 'street_id', 'is_partner', 'gender', 'is_email_true', 'opt_in',\n", "3 1 6.0 217356.0 111786.0 1435871.5 \n",
" 'structure_id', 'profession', 'language', 'mcp_contact_id',\n", "4 2 2.0 143.0 143.0 0.0 \n",
" 'last_buying_date', 'max_price', 'ticket_sum', 'average_price',\n", "... ... ... ... ... ... \n",
" 'fidelity', 'average_purchase_delay', 'average_price_basket',\n", "156291 1256133 5.0 3.0 1.0 33.0 \n",
" 'average_ticket_basket', 'total_price', 'purchase_count',\n", "156292 1256134 5.0 4.0 1.0 44.0 \n",
" 'first_buying_date', 'country', 'age', 'tenant_id', 'nb_campaigns',\n", "156293 1256135 5.0 1.0 1.0 11.0 \n",
" 'nb_campaigns_opened', 'time_to_open'],\n", "156294 1256136 5.0 2.0 1.0 22.0 \n",
" dtype='object')" "156295 1256137 5.0 2.0 1.0 22.0 \n",
"\n",
" nb_suppliers vente_internet_max purchase_date_min \\\n",
"0 7.0 1.0 3262.190868 \n",
"1 6.0 1.0 3698.198229 \n",
"2 6.0 1.0 3803.369792 \n",
"3 5.0 1.0 2502.715509 \n",
"4 1.0 0.0 2041.274549 \n",
"... ... ... ... \n",
"156291 1.0 1.0 0.110521 \n",
"156292 1.0 1.0 0.092095 \n",
"156293 1.0 1.0 0.087894 \n",
"156294 1.0 1.0 0.040394 \n",
"156295 1.0 1.0 0.000000 \n",
"\n",
" purchase_date_max time_between_purchase ... average_ticket_basket \\\n",
"0 4.179306 3258.011562 ... 1.956087 \n",
"1 5.221840 3692.976389 ... 1.956087 \n",
"2 0.146331 3803.223461 ... 1.956087 \n",
"3 1408.715532 1093.999977 ... 1.956087 \n",
"4 1340.308160 700.966389 ... 1.000000 \n",
"... ... ... ... ... \n",
"156291 0.110521 0.000000 ... NaN \n",
"156292 0.092095 0.000000 ... NaN \n",
"156293 0.087894 0.000000 ... NaN \n",
"156294 0.040394 0.000000 ... NaN \n",
"156295 0.000000 0.000000 ... NaN \n",
"\n",
" total_price purchase_count first_buying_date country age \\\n",
"0 8821221.5 641472.0 2013-06-10 10:37:58+00:00 fr NaN \n",
"1 8821221.5 641472.0 2013-06-10 10:37:58+00:00 fr NaN \n",
"2 8821221.5 641472.0 2013-06-10 10:37:58+00:00 fr NaN \n",
"3 8821221.5 641472.0 2013-06-10 10:37:58+00:00 fr NaN \n",
"4 0.0 307.0 2018-04-07 12:55:07+00:00 fr NaN \n",
"... ... ... ... ... .. \n",
"156291 NaN NaN NaT NaN NaN \n",
"156292 NaN NaN NaT NaN NaN \n",
"156293 NaN NaN NaT NaN NaN \n",
"156294 NaN NaN NaT NaN NaN \n",
"156295 NaN NaN NaT NaN NaN \n",
"\n",
" tenant_id nb_campaigns nb_campaigns_opened time_to_open \n",
"0 1311.0 0.0 0.0 NaT \n",
"1 1311.0 0.0 0.0 NaT \n",
"2 1311.0 0.0 0.0 NaT \n",
"3 1311.0 0.0 0.0 NaT \n",
"4 1311.0 4.0 0.0 NaT \n",
"... ... ... ... ... \n",
"156291 NaN NaN NaN NaT \n",
"156292 NaN NaN NaN NaT \n",
"156293 NaN NaN NaN NaT \n",
"156294 NaN NaN NaN NaT \n",
"156295 NaN NaN NaN NaT \n",
"\n",
"[156296 rows x 40 columns]"
] ]
}, },
"execution_count": 66, "execution_count": 41,
"metadata": {}, "metadata": {},
"output_type": "execute_result" "output_type": "execute_result"
} }
@ -2745,7 +3130,7 @@
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": 68, "execution_count": 42,
"id": "ebf6d843-dcc0-4e83-b063-94806c0bac17", "id": "ebf6d843-dcc0-4e83-b063-94806c0bac17",
"metadata": {}, "metadata": {},
"outputs": [], "outputs": [],
@ -2778,7 +3163,7 @@
"name": "python", "name": "python",
"nbconvert_exporter": "python", "nbconvert_exporter": "python",
"pygments_lexer": "ipython3", "pygments_lexer": "ipython3",
"version": "3.10.13" "version": "3.11.6"
} }
}, },
"nbformat": 4, "nbformat": 4,

View File

@ -34,6 +34,8 @@ for i in range(len(liste_database)) :
nom_dataframe = df_prefix + re.search(r'\/(\d+)\/(\d+)([a-zA-Z_]+)\.csv$', current_path).group(3) nom_dataframe = df_prefix + re.search(r'\/(\d+)\/(\d+)([a-zA-Z_]+)\.csv$', current_path).group(3)
globals()[nom_dataframe] = df globals()[nom_dataframe] = df
## 1 - Cleaning of the datasets
# Cleaning customerplus # Cleaning customerplus
df1_customerplus_clean = preprocessing_customerplus(df1_customersplus) df1_customerplus_clean = preprocessing_customerplus(df1_customersplus)
@ -61,29 +63,91 @@ df1_products_purchased = pd.merge(df1_ticket_information, products_global, left_
# Selection des variables d'intérêts # Selection des variables d'intérêts
df1_products_purchased_reduced = df1_products_purchased[['ticket_id', 'customer_id', 'purchase_id' ,'event_type_id', 'supplier_name', 'purchase_date', 'type_of_ticket_name', 'amount', 'children', 'is_full_price', 'name_event_types', 'name_facilities', 'name_categories', 'name_events', 'name_seasons']] df1_products_purchased_reduced = df1_products_purchased[['ticket_id', 'customer_id', 'purchase_id' ,'event_type_id', 'supplier_name', 'purchase_date', 'type_of_ticket_name', 'amount', 'children', 'is_full_price', 'name_event_types', 'name_facilities', 'name_categories', 'name_events', 'name_seasons']]
# Fusion de l'ensemble et creation des KPI ## 2 - Construction of KPIs on a given period
df1_campaigns_kpi = campaigns_kpi_function(campaigns_information = df1_campaigns_information)
df1_tickets_kpi = tickets_kpi_function(tickets_information = df1_products_purchased_reduced) def explanatory_variables(min_date = "2021-09-01", max_date = "2023-09-01", df_campaigns_information = df1_campaigns_information, df_products_purchased_reduced = df1_products_purchased_reduced, df_customerplus_clean = df1_customerplus_clean):
# Fusion avec KPI liés au customer # Filtre de cohérence pour la mise en pratique de notre méthode
df1_customer = pd.merge(df1_customerplus_clean, df1_campaigns_kpi, on = 'customer_id', how = 'left') max_date = pd.to_datetime(max_date, utc = True, format = 'ISO8601')
min_date = pd.to_datetime(min_date, utc = True, format = 'ISO8601')
# Fill NaN values #Filtre de la base df_campaigns_information
df1_customer[['nb_campaigns', 'nb_campaigns_opened']] = df1_customer[['nb_campaigns', 'nb_campaigns_opened']].fillna(0) df_campaigns_information = df_campaigns_information[(df_campaigns_information['sent_at'] <= max_date) & (df_campaigns_information['sent_at'] >= min_date)]
# Opens recorded on or after max_date fall outside the observation window, so they are blanked out.
# assign/mask builds a new frame; the previous chained indexing (df[col][mask] = ...) wrote through an
# intermediate object, which raises SettingWithCopyWarning and has no effect under pandas copy-on-write.
df_campaigns_information = df_campaigns_information.assign(opened_at = df_campaigns_information['opened_at'].mask(df_campaigns_information['opened_at'] >= max_date))
# Fusion avec KPI liés au comportement d'achat #Filtre de la base df_products_purchased_reduced
df1_customer_product = pd.merge(df1_tickets_kpi, df1_customer, on = 'customer_id', how = 'outer') df_products_purchased_reduced = df_products_purchased_reduced[(df_products_purchased_reduced['purchase_date'] <= max_date) & (df_products_purchased_reduced['purchase_date'] >= min_date)]
# Fill NaN values print("Data filtering : SUCCESS")
df1_customer_product[['nb_tickets', 'nb_purchases', 'total_amount', 'nb_suppliers', 'vente_internet_max', 'nb_tickets_internet']] = df1_customer_product[['nb_tickets', 'nb_purchases', 'total_amount', 'nb_suppliers', 'vente_internet_max', 'nb_tickets_internet']].fillna(0)
# Fusion de l'ensemble et creation des KPI
df_campaigns_kpi = campaigns_kpi_function(campaigns_information = df_campaigns_information)
df_tickets_kpi = tickets_kpi_function(tickets_information = df_products_purchased_reduced)
print("KPIs construction : SUCCESS")
# Fusion avec KPI liés au customer
df_customer = pd.merge(df_customerplus_clean, df_campaigns_kpi, on = 'customer_id', how = 'left')
# Fill NaN values
df_customer[['nb_campaigns', 'nb_campaigns_opened']] = df_customer[['nb_campaigns', 'nb_campaigns_opened']].fillna(0)
# Fusion avec KPI liés au comportement d'achat
df_customer_product = pd.merge(df_tickets_kpi, df_customer, on = 'customer_id', how = 'outer')
# Fill NaN values
df_customer_product[['nb_tickets', 'nb_purchases', 'total_amount', 'nb_suppliers', 'vente_internet_max', 'nb_tickets_internet']] = df_customer_product[['nb_tickets', 'nb_purchases', 'total_amount', 'nb_suppliers', 'vente_internet_max', 'nb_tickets_internet']].fillna(0)
return df_customer_product
# Function building the explained (target) variable
def explained_variable(min_date = "2023-08-01", max_date = "2023-11-01", df_products_purchased_reduced = df1_products_purchased_reduced):
    """Return one row per (customer_id, event_type_id) that purchased in the window.

    Parameters
    ----------
    min_date : lower bound of the purchase window (exclusive).
    max_date : upper bound of the purchase window (inclusive).
    df_products_purchased_reduced : purchases table; must expose the columns
        'purchase_date', 'customer_id' and 'event_type_id'.

    Returns
    -------
    DataFrame with columns 'customer_id', 'event_type_id' and 'y_has_purchased'
    (always 1), without duplicated rows. Pairs with no purchase in the window
    are simply absent.
    """
    # NOTE(review): the bounds are compared to 'purchase_date' as given; presumably
    # the column is a datetime and pandas parses string bounds - confirm against caller.
    purchase_date = df_products_purchased_reduced['purchase_date']
    in_window = (purchase_date <= max_date) & (purchase_date > min_date)

    # Purchase indicator. .loc + .assign builds a fresh frame, so the flag is never
    # written onto a filtered slice of the input (avoids SettingWithCopyWarning).
    purchases = df_products_purchased_reduced.loc[in_window, ['customer_id', 'event_type_id']].assign(y_has_purchased = 1)

    y = purchases.drop_duplicates()

    return y
## Exportation ## Exportation
# Exportation vers 'projet-bdc2324-team1' # Dossier d'exportation
BUCKET_OUT = "projet-bdc2324-team1" BUCKET_OUT = "projet-bdc2324-team1/1_Output/Logistique Regression databases - First approach"
FILE_KEY_OUT_S3 = "1_Output/Company 1 - Segmentation base.csv"
X_test = explanatory_variables(min_date = "2021-08-01", max_date = "2023-08-01", df_campaigns_information = df1_campaigns_information, df_products_purchased_reduced = df1_products_purchased_reduced, df_customerplus_clean = df1_customerplus_clean)
y_test = explained_variable(min_date = "2023-08-01", max_date = "2023-11-01", df_products_purchased_reduced = df1_products_purchased_reduced)
dataset_test = pd.merge(X_test, y_test, on = ['customer_id', 'event_type_id'], how = 'left')
FILE_KEY_OUT_S3 = "dataset_test.csv"
FILE_PATH_OUT_S3 = BUCKET_OUT + "/" + FILE_KEY_OUT_S3 FILE_PATH_OUT_S3 = BUCKET_OUT + "/" + FILE_KEY_OUT_S3
with fs.open(FILE_PATH_OUT_S3, 'w') as file_out: with fs.open(FILE_PATH_OUT_S3, 'w') as file_out:
df1_customer_product.to_csv(file_out, index = False) dataset_test.to_csv(file_out, index = False)
print("Exportation dataset test : SUCCESS")
# Training set: explanatory variables observed from 2021-05-01 to 2023-05-01,
# target observed over the following window (2023-05-01 to 2023-08-01).
X_train = explanatory_variables(min_date = "2021-05-01", max_date = "2023-05-01", df_campaigns_information = df1_campaigns_information, df_products_purchased_reduced = df1_products_purchased_reduced, df_customerplus_clean = df1_customerplus_clean)

y_train = explained_variable(min_date = "2023-05-01", max_date = "2023-08-01", df_products_purchased_reduced = df1_products_purchased_reduced)

# Left merge keeps every row of X_train; rows without a match in y_train get a missing 'y_has_purchased'.
dataset_train = pd.merge(X_train, y_train, on = ['customer_id', 'event_type_id'], how = 'left')

FILE_KEY_OUT_S3 = "dataset_train.csv"
FILE_PATH_OUT_S3 = BUCKET_OUT + "/" + FILE_KEY_OUT_S3

with fs.open(FILE_PATH_OUT_S3, 'w') as file_out:
    # Export the training frame; the previous code wrote dataset_test into dataset_train.csv.
    dataset_train.to_csv(file_out, index = False)

print("Exportation dataset train : SUCCESS")
# # Exportation vers 'projet-bdc2324-team1'
print("Exportation base de la base X d'entraînement : SUCCESS")

View File

@ -5,7 +5,7 @@ def campaigns_kpi_function(campaigns_information = None):
nb_campaigns = campaigns_information[['customer_id', 'campaign_name']].groupby('customer_id').count().reset_index() nb_campaigns = campaigns_information[['customer_id', 'campaign_name']].groupby('customer_id').count().reset_index()
nb_campaigns.rename(columns = {'campaign_name' : 'nb_campaigns'}, inplace = True) nb_campaigns.rename(columns = {'campaign_name' : 'nb_campaigns'}, inplace = True)
# Temps d'ouverture en min moyen # Temps d'ouverture en min moyen
campaigns_information['time_to_open'] = campaigns_information['opened_at'] - campaigns_information['delivered_at'] campaigns_information['time_to_open'] = pd.to_datetime(campaigns_information['opened_at'], utc = True, format = 'ISO8601') - pd.to_datetime(campaigns_information['delivered_at'], utc = True, format = 'ISO8601')
time_to_open = campaigns_information[['customer_id', 'time_to_open']].groupby('customer_id').mean().reset_index() time_to_open = campaigns_information[['customer_id', 'time_to_open']].groupby('customer_id').mean().reset_index()
# Nombre de mail ouvert # Nombre de mail ouvert