diff --git a/0_Cleaning_and_merge.ipynb b/0_Cleaning_and_merge.ipynb
index 5077370..e77968c 100644
--- a/0_Cleaning_and_merge.ipynb
+++ b/0_Cleaning_and_merge.ipynb
@@ -1529,7 +1529,7 @@
},
{
"cell_type": "code",
- "execution_count": 26,
+ "execution_count": 25,
"id": "98f78cd5-b694-4cc6-b033-20170aa13e8d",
"metadata": {},
"outputs": [],
@@ -1559,7 +1559,7 @@
},
{
"cell_type": "code",
- "execution_count": 27,
+ "execution_count": 26,
"id": "e2c88552-b863-47a2-be23-8d2898fb28bc",
"metadata": {},
"outputs": [],
@@ -1593,7 +1593,7 @@
},
{
"cell_type": "code",
- "execution_count": 28,
+ "execution_count": 27,
"id": "24537647-bc29-4777-9848-ac4120a4aa60",
"metadata": {},
"outputs": [],
@@ -1603,7 +1603,7 @@
},
{
"cell_type": "code",
- "execution_count": 29,
+ "execution_count": 28,
"id": "6be2a9a6-056b-4e19-8c26-a18ba3df36b3",
"metadata": {},
"outputs": [
@@ -1639,7 +1639,7 @@
"
0 | \n",
" 2 | \n",
" 4 | \n",
- " 0.0 | \n",
+ " NaN | \n",
" NaT | \n",
" \n",
" \n",
@@ -1660,14 +1660,14 @@
" 3 | \n",
" 5 | \n",
" 4 | \n",
- " 0.0 | \n",
+ " NaN | \n",
" NaT | \n",
"
\n",
" \n",
" 4 | \n",
" 6 | \n",
" 20 | \n",
- " 0.0 | \n",
+ " NaN | \n",
" NaT | \n",
"
\n",
" \n",
@@ -1676,14 +1676,14 @@
],
"text/plain": [
" customer_id nb_campaigns nb_campaigns_opened time_to_open\n",
- "0 2 4 0.0 NaT\n",
+ "0 2 4 NaN NaT\n",
"1 3 222 124.0 1 days 00:28:30.169354838\n",
"2 4 7 7.0 1 days 04:31:01.428571428\n",
- "3 5 4 0.0 NaT\n",
- "4 6 20 0.0 NaT"
+ "3 5 4 NaN NaT\n",
+ "4 6 20 NaN NaT"
]
},
- "execution_count": 29,
+ "execution_count": 28,
"metadata": {},
"output_type": "execute_result"
}
@@ -1702,7 +1702,7 @@
},
{
"cell_type": "code",
- "execution_count": 69,
+ "execution_count": 29,
"id": "043303fe-e90f-4689-a2a9-5d690555a045",
"metadata": {},
"outputs": [],
@@ -1765,7 +1765,7 @@
},
{
"cell_type": "code",
- "execution_count": 70,
+ "execution_count": 30,
"id": "5882234a-1ed5-4269-87a6-0d75613476e3",
"metadata": {},
"outputs": [],
@@ -1775,7 +1775,7 @@
},
{
"cell_type": "code",
- "execution_count": 36,
+ "execution_count": 31,
"id": "5f2046cf-ffde-4521-91e7-b727b8bc17f5",
"metadata": {},
"outputs": [
@@ -1811,6 +1811,8 @@
" purchase_date_max | \n",
" time_between_purchase | \n",
" nb_tickets_internet | \n",
+ " name_event_types | \n",
+ " avg_amount | \n",
" \n",
" \n",
" \n",
@@ -1827,6 +1829,8 @@
" 4.179306 | \n",
" 3258.011562 | \n",
" 51.0 | \n",
+ " offre muséale individuel | \n",
+ " 6.150659 | \n",
" \n",
" \n",
" 1 | \n",
@@ -1841,6 +1845,8 @@
" 5.221840 | \n",
" 3692.976389 | \n",
" 2988.0 | \n",
+ " spectacle vivant | \n",
+ " 7.762474 | \n",
"
\n",
" \n",
" 2 | \n",
@@ -1855,6 +1861,8 @@
" 0.146331 | \n",
" 3803.223461 | \n",
" 9.0 | \n",
+ " offre muséale groupe | \n",
+ " 4.452618 | \n",
"
\n",
" \n",
" 3 | \n",
@@ -1869,6 +1877,8 @@
" 1408.715532 | \n",
" 1093.999977 | \n",
" 5.0 | \n",
+ " formule adhésion | \n",
+ " 6.439463 | \n",
"
\n",
" \n",
" 4 | \n",
@@ -1883,6 +1893,8 @@
" 1340.308160 | \n",
" 700.966389 | \n",
" 0.0 | \n",
+ " offre muséale individuel | \n",
+ " 6.150659 | \n",
"
\n",
" \n",
"\n",
@@ -1903,15 +1915,22 @@
"3 5 1 2502.715509 1408.715532 \n",
"4 1 0 2041.274549 1340.308160 \n",
"\n",
- " time_between_purchase nb_tickets_internet \n",
- "0 3258.011562 51.0 \n",
- "1 3692.976389 2988.0 \n",
- "2 3803.223461 9.0 \n",
- "3 1093.999977 5.0 \n",
- "4 700.966389 0.0 "
+ " time_between_purchase nb_tickets_internet name_event_types \\\n",
+ "0 3258.011562 51.0 offre muséale individuel \n",
+ "1 3692.976389 2988.0 spectacle vivant \n",
+ "2 3803.223461 9.0 offre muséale groupe \n",
+ "3 1093.999977 5.0 formule adhésion \n",
+ "4 700.966389 0.0 offre muséale individuel \n",
+ "\n",
+ " avg_amount \n",
+ "0 6.150659 \n",
+ "1 7.762474 \n",
+ "2 4.452618 \n",
+ "3 6.439463 \n",
+ "4 6.150659 "
]
},
- "execution_count": 36,
+ "execution_count": 31,
"metadata": {},
"output_type": "execute_result"
}
@@ -1922,7 +1941,7 @@
},
{
"cell_type": "code",
- "execution_count": 37,
+ "execution_count": 32,
"id": "a4a2311d-8a72-4030-afd5-218004d5d2a5",
"metadata": {},
"outputs": [],
@@ -1946,7 +1965,7 @@
},
{
"cell_type": "code",
- "execution_count": 43,
+ "execution_count": 33,
"id": "83230baa-9a8a-4614-b629-e99c2505c696",
"metadata": {},
"outputs": [
@@ -2159,7 +2178,7 @@
"[5 rows x 37 columns]"
]
},
- "execution_count": 43,
+ "execution_count": 33,
"metadata": {},
"output_type": "execute_result"
}
@@ -2173,7 +2192,7 @@
},
{
"cell_type": "code",
- "execution_count": 44,
+ "execution_count": 34,
"id": "433921de-03ad-4024-9462-ecd267db1756",
"metadata": {},
"outputs": [
@@ -2339,7 +2358,7 @@
" formule adhésion | \n",
" 6.439463 | \n",
" 4.0 | \n",
- " 0.0 | \n",
+ " NaN | \n",
" NaT | \n",
" \n",
" \n",
@@ -2381,12 +2400,12 @@
"1 NaN NaT \n",
"2 NaN NaT \n",
"3 NaN NaT \n",
- "4 0.0 NaT \n",
+ "4 NaN NaT \n",
"\n",
"[5 rows x 40 columns]"
]
},
- "execution_count": 44,
+ "execution_count": 34,
"metadata": {},
"output_type": "execute_result"
}
@@ -2400,7 +2419,7 @@
},
{
"cell_type": "code",
- "execution_count": 45,
+ "execution_count": 35,
"id": "25e54131-6835-4e94-86d3-1a78520ed7bc",
"metadata": {},
"outputs": [],
@@ -2426,7 +2445,7 @@
},
{
"cell_type": "code",
- "execution_count": 46,
+ "execution_count": 36,
"id": "8710611c-7eb8-45ca-bdcc-009f4081f9e2",
"metadata": {},
"outputs": [],
@@ -2468,7 +2487,7 @@
},
{
"cell_type": "code",
- "execution_count": 63,
+ "execution_count": 38,
"id": "46de1912-4a66-46e5-8b9e-7768b2d2723b",
"metadata": {},
"outputs": [],
@@ -2482,7 +2501,7 @@
},
{
"cell_type": "code",
- "execution_count": 64,
+ "execution_count": 39,
"id": "d53825e4-6453-45bc-94f2-7b2504ec4afb",
"metadata": {},
"outputs": [
@@ -2688,7 +2707,7 @@
"[5 rows x 28 columns]"
]
},
- "execution_count": 64,
+ "execution_count": 39,
"metadata": {},
"output_type": "execute_result"
}
@@ -2699,7 +2718,7 @@
},
{
"cell_type": "code",
- "execution_count": 67,
+ "execution_count": 40,
"id": "1e42a790-b215-4107-a969-85005da06ebd",
"metadata": {},
"outputs": [],
@@ -2713,28 +2732,394 @@
},
{
"cell_type": "code",
- "execution_count": 66,
+ "execution_count": 41,
"id": "d950f24d-a5d1-4f1e-aeaa-ca826470365f",
"metadata": {},
"outputs": [
{
"data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " customer_id | \n",
+ " event_type_id | \n",
+ " nb_tickets | \n",
+ " nb_purchases | \n",
+ " total_amount | \n",
+ " nb_suppliers | \n",
+ " vente_internet_max | \n",
+ " purchase_date_min | \n",
+ " purchase_date_max | \n",
+ " time_between_purchase | \n",
+ " ... | \n",
+ " average_ticket_basket | \n",
+ " total_price | \n",
+ " purchase_count | \n",
+ " first_buying_date | \n",
+ " country | \n",
+ " age | \n",
+ " tenant_id | \n",
+ " nb_campaigns | \n",
+ " nb_campaigns_opened | \n",
+ " time_to_open | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " 0 | \n",
+ " 1 | \n",
+ " 2.0 | \n",
+ " 384226.0 | \n",
+ " 194790.0 | \n",
+ " 2686540.5 | \n",
+ " 7.0 | \n",
+ " 1.0 | \n",
+ " 3262.190868 | \n",
+ " 4.179306 | \n",
+ " 3258.011562 | \n",
+ " ... | \n",
+ " 1.956087 | \n",
+ " 8821221.5 | \n",
+ " 641472.0 | \n",
+ " 2013-06-10 10:37:58+00:00 | \n",
+ " fr | \n",
+ " NaN | \n",
+ " 1311.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " NaT | \n",
+ "
\n",
+ " \n",
+ " 1 | \n",
+ " 1 | \n",
+ " 4.0 | \n",
+ " 453242.0 | \n",
+ " 228945.0 | \n",
+ " 3248965.5 | \n",
+ " 6.0 | \n",
+ " 1.0 | \n",
+ " 3698.198229 | \n",
+ " 5.221840 | \n",
+ " 3692.976389 | \n",
+ " ... | \n",
+ " 1.956087 | \n",
+ " 8821221.5 | \n",
+ " 641472.0 | \n",
+ " 2013-06-10 10:37:58+00:00 | \n",
+ " fr | \n",
+ " NaN | \n",
+ " 1311.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " NaT | \n",
+ "
\n",
+ " \n",
+ " 2 | \n",
+ " 1 | \n",
+ " 5.0 | \n",
+ " 201750.0 | \n",
+ " 107110.0 | \n",
+ " 1459190.0 | \n",
+ " 6.0 | \n",
+ " 1.0 | \n",
+ " 3803.369792 | \n",
+ " 0.146331 | \n",
+ " 3803.223461 | \n",
+ " ... | \n",
+ " 1.956087 | \n",
+ " 8821221.5 | \n",
+ " 641472.0 | \n",
+ " 2013-06-10 10:37:58+00:00 | \n",
+ " fr | \n",
+ " NaN | \n",
+ " 1311.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " NaT | \n",
+ "
\n",
+ " \n",
+ " 3 | \n",
+ " 1 | \n",
+ " 6.0 | \n",
+ " 217356.0 | \n",
+ " 111786.0 | \n",
+ " 1435871.5 | \n",
+ " 5.0 | \n",
+ " 1.0 | \n",
+ " 2502.715509 | \n",
+ " 1408.715532 | \n",
+ " 1093.999977 | \n",
+ " ... | \n",
+ " 1.956087 | \n",
+ " 8821221.5 | \n",
+ " 641472.0 | \n",
+ " 2013-06-10 10:37:58+00:00 | \n",
+ " fr | \n",
+ " NaN | \n",
+ " 1311.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " NaT | \n",
+ "
\n",
+ " \n",
+ " 4 | \n",
+ " 2 | \n",
+ " 2.0 | \n",
+ " 143.0 | \n",
+ " 143.0 | \n",
+ " 0.0 | \n",
+ " 1.0 | \n",
+ " 0.0 | \n",
+ " 2041.274549 | \n",
+ " 1340.308160 | \n",
+ " 700.966389 | \n",
+ " ... | \n",
+ " 1.000000 | \n",
+ " 0.0 | \n",
+ " 307.0 | \n",
+ " 2018-04-07 12:55:07+00:00 | \n",
+ " fr | \n",
+ " NaN | \n",
+ " 1311.0 | \n",
+ " 4.0 | \n",
+ " 0.0 | \n",
+ " NaT | \n",
+ "
\n",
+ " \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ "
\n",
+ " \n",
+ " 156291 | \n",
+ " 1256133 | \n",
+ " 5.0 | \n",
+ " 3.0 | \n",
+ " 1.0 | \n",
+ " 33.0 | \n",
+ " 1.0 | \n",
+ " 1.0 | \n",
+ " 0.110521 | \n",
+ " 0.110521 | \n",
+ " 0.000000 | \n",
+ " ... | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaT | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaT | \n",
+ "
\n",
+ " \n",
+ " 156292 | \n",
+ " 1256134 | \n",
+ " 5.0 | \n",
+ " 4.0 | \n",
+ " 1.0 | \n",
+ " 44.0 | \n",
+ " 1.0 | \n",
+ " 1.0 | \n",
+ " 0.092095 | \n",
+ " 0.092095 | \n",
+ " 0.000000 | \n",
+ " ... | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaT | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaT | \n",
+ "
\n",
+ " \n",
+ " 156293 | \n",
+ " 1256135 | \n",
+ " 5.0 | \n",
+ " 1.0 | \n",
+ " 1.0 | \n",
+ " 11.0 | \n",
+ " 1.0 | \n",
+ " 1.0 | \n",
+ " 0.087894 | \n",
+ " 0.087894 | \n",
+ " 0.000000 | \n",
+ " ... | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaT | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaT | \n",
+ "
\n",
+ " \n",
+ " 156294 | \n",
+ " 1256136 | \n",
+ " 5.0 | \n",
+ " 2.0 | \n",
+ " 1.0 | \n",
+ " 22.0 | \n",
+ " 1.0 | \n",
+ " 1.0 | \n",
+ " 0.040394 | \n",
+ " 0.040394 | \n",
+ " 0.000000 | \n",
+ " ... | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaT | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaT | \n",
+ "
\n",
+ " \n",
+ " 156295 | \n",
+ " 1256137 | \n",
+ " 5.0 | \n",
+ " 2.0 | \n",
+ " 1.0 | \n",
+ " 22.0 | \n",
+ " 1.0 | \n",
+ " 1.0 | \n",
+ " 0.000000 | \n",
+ " 0.000000 | \n",
+ " 0.000000 | \n",
+ " ... | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaT | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaT | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
156296 rows × 40 columns
\n",
+ "
"
+ ],
"text/plain": [
- "Index(['customer_id', 'event_type_id', 'nb_tickets', 'nb_purchases',\n",
- " 'total_amount', 'nb_suppliers', 'vente_internet_max',\n",
- " 'purchase_date_min', 'purchase_date_max', 'time_between_purchase',\n",
- " 'nb_tickets_internet', 'name_event_types', 'avg_amount', 'birthdate',\n",
- " 'street_id', 'is_partner', 'gender', 'is_email_true', 'opt_in',\n",
- " 'structure_id', 'profession', 'language', 'mcp_contact_id',\n",
- " 'last_buying_date', 'max_price', 'ticket_sum', 'average_price',\n",
- " 'fidelity', 'average_purchase_delay', 'average_price_basket',\n",
- " 'average_ticket_basket', 'total_price', 'purchase_count',\n",
- " 'first_buying_date', 'country', 'age', 'tenant_id', 'nb_campaigns',\n",
- " 'nb_campaigns_opened', 'time_to_open'],\n",
- " dtype='object')"
+ " customer_id event_type_id nb_tickets nb_purchases total_amount \\\n",
+ "0 1 2.0 384226.0 194790.0 2686540.5 \n",
+ "1 1 4.0 453242.0 228945.0 3248965.5 \n",
+ "2 1 5.0 201750.0 107110.0 1459190.0 \n",
+ "3 1 6.0 217356.0 111786.0 1435871.5 \n",
+ "4 2 2.0 143.0 143.0 0.0 \n",
+ "... ... ... ... ... ... \n",
+ "156291 1256133 5.0 3.0 1.0 33.0 \n",
+ "156292 1256134 5.0 4.0 1.0 44.0 \n",
+ "156293 1256135 5.0 1.0 1.0 11.0 \n",
+ "156294 1256136 5.0 2.0 1.0 22.0 \n",
+ "156295 1256137 5.0 2.0 1.0 22.0 \n",
+ "\n",
+ " nb_suppliers vente_internet_max purchase_date_min \\\n",
+ "0 7.0 1.0 3262.190868 \n",
+ "1 6.0 1.0 3698.198229 \n",
+ "2 6.0 1.0 3803.369792 \n",
+ "3 5.0 1.0 2502.715509 \n",
+ "4 1.0 0.0 2041.274549 \n",
+ "... ... ... ... \n",
+ "156291 1.0 1.0 0.110521 \n",
+ "156292 1.0 1.0 0.092095 \n",
+ "156293 1.0 1.0 0.087894 \n",
+ "156294 1.0 1.0 0.040394 \n",
+ "156295 1.0 1.0 0.000000 \n",
+ "\n",
+ " purchase_date_max time_between_purchase ... average_ticket_basket \\\n",
+ "0 4.179306 3258.011562 ... 1.956087 \n",
+ "1 5.221840 3692.976389 ... 1.956087 \n",
+ "2 0.146331 3803.223461 ... 1.956087 \n",
+ "3 1408.715532 1093.999977 ... 1.956087 \n",
+ "4 1340.308160 700.966389 ... 1.000000 \n",
+ "... ... ... ... ... \n",
+ "156291 0.110521 0.000000 ... NaN \n",
+ "156292 0.092095 0.000000 ... NaN \n",
+ "156293 0.087894 0.000000 ... NaN \n",
+ "156294 0.040394 0.000000 ... NaN \n",
+ "156295 0.000000 0.000000 ... NaN \n",
+ "\n",
+ " total_price purchase_count first_buying_date country age \\\n",
+ "0 8821221.5 641472.0 2013-06-10 10:37:58+00:00 fr NaN \n",
+ "1 8821221.5 641472.0 2013-06-10 10:37:58+00:00 fr NaN \n",
+ "2 8821221.5 641472.0 2013-06-10 10:37:58+00:00 fr NaN \n",
+ "3 8821221.5 641472.0 2013-06-10 10:37:58+00:00 fr NaN \n",
+ "4 0.0 307.0 2018-04-07 12:55:07+00:00 fr NaN \n",
+ "... ... ... ... ... .. \n",
+ "156291 NaN NaN NaT NaN NaN \n",
+ "156292 NaN NaN NaT NaN NaN \n",
+ "156293 NaN NaN NaT NaN NaN \n",
+ "156294 NaN NaN NaT NaN NaN \n",
+ "156295 NaN NaN NaT NaN NaN \n",
+ "\n",
+ " tenant_id nb_campaigns nb_campaigns_opened time_to_open \n",
+ "0 1311.0 0.0 0.0 NaT \n",
+ "1 1311.0 0.0 0.0 NaT \n",
+ "2 1311.0 0.0 0.0 NaT \n",
+ "3 1311.0 0.0 0.0 NaT \n",
+ "4 1311.0 4.0 0.0 NaT \n",
+ "... ... ... ... ... \n",
+ "156291 NaN NaN NaN NaT \n",
+ "156292 NaN NaN NaN NaT \n",
+ "156293 NaN NaN NaN NaT \n",
+ "156294 NaN NaN NaN NaT \n",
+ "156295 NaN NaN NaN NaT \n",
+ "\n",
+ "[156296 rows x 40 columns]"
]
},
- "execution_count": 66,
+ "execution_count": 41,
"metadata": {},
"output_type": "execute_result"
}
@@ -2745,7 +3130,7 @@
},
{
"cell_type": "code",
- "execution_count": 68,
+ "execution_count": 42,
"id": "ebf6d843-dcc0-4e83-b063-94806c0bac17",
"metadata": {},
"outputs": [],
@@ -2778,7 +3163,7 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
- "version": "3.10.13"
+ "version": "3.11.6"
}
},
"nbformat": 4,
diff --git a/0_Cleaning_and_merge.py b/0_Cleaning_and_merge.py
index f461547..55fd043 100644
--- a/0_Cleaning_and_merge.py
+++ b/0_Cleaning_and_merge.py
@@ -34,6 +34,8 @@ for i in range(len(liste_database)) :
nom_dataframe = df_prefix + re.search(r'\/(\d+)\/(\d+)([a-zA-Z_]+)\.csv$', current_path).group(3)
globals()[nom_dataframe] = df
+## 1 - Cleaning of the datasets
+
# Cleaning customerplus
df1_customerplus_clean = preprocessing_customerplus(df1_customersplus)
@@ -61,29 +63,91 @@ df1_products_purchased = pd.merge(df1_ticket_information, products_global, left_
# Selection des variables d'intérêts
df1_products_purchased_reduced = df1_products_purchased[['ticket_id', 'customer_id', 'purchase_id' ,'event_type_id', 'supplier_name', 'purchase_date', 'type_of_ticket_name', 'amount', 'children', 'is_full_price', 'name_event_types', 'name_facilities', 'name_categories', 'name_events', 'name_seasons']]
-# Fusion de l'ensemble et creation des KPI
-df1_campaigns_kpi = campaigns_kpi_function(campaigns_information = df1_campaigns_information)
+## 2 - Construction of KPIs on a given period
-df1_tickets_kpi = tickets_kpi_function(tickets_information = df1_products_purchased_reduced)
+def explanatory_variables(min_date = "2021-09-01", max_date = "2023-09-01", df_campaigns_information = df1_campaigns_information, df_products_purchased_reduced = df1_products_purchased_reduced, df_customerplus_clean = df1_customerplus_clean):
-# Fusion avec KPI liés au customer
-df1_customer = pd.merge(df1_customerplus_clean, df1_campaigns_kpi, on = 'customer_id', how = 'left')
+    """Build the explanatory-variable table: cleaned customer data merged with campaign and ticket KPIs restricted to [min_date, max_date]."""
+    max_date = pd.to_datetime(max_date, utc = True, format = 'ISO8601')
+    min_date = pd.to_datetime(min_date, utc = True, format = 'ISO8601')
-# Fill NaN values
-df1_customer[['nb_campaigns', 'nb_campaigns_opened']] = df1_customer[['nb_campaigns', 'nb_campaigns_opened']].fillna(0)
+    # Keep campaigns sent inside the window; copy so the edit below is a real write that never touches the caller's frame
+    df_campaigns_information = df_campaigns_information[(df_campaigns_information['sent_at'] <= max_date) & (df_campaigns_information['sent_at'] >= min_date)].copy()
+    df_campaigns_information.loc[df_campaigns_information['opened_at'] >= max_date, 'opened_at'] = pd.NaT
+
+    # Keep purchases made inside the window
+    df_products_purchased_reduced = df_products_purchased_reduced[(df_products_purchased_reduced['purchase_date'] <= max_date) & (df_products_purchased_reduced['purchase_date'] >= min_date)]
-# Fusion avec KPI liés au comportement d'achat
-df1_customer_product = pd.merge(df1_tickets_kpi, df1_customer, on = 'customer_id', how = 'outer')
+    print("Data filtering : SUCCESS")
+
+    # Compute the campaign and ticket KPIs on the filtered data
+    df_campaigns_kpi = campaigns_kpi_function(campaigns_information = df_campaigns_information)
+    df_tickets_kpi = tickets_kpi_function(tickets_information = df_products_purchased_reduced)
-# Fill NaN values
-df1_customer_product[['nb_tickets', 'nb_purchases', 'total_amount', 'nb_suppliers', 'vente_internet_max', 'nb_tickets_internet']] = df1_customer_product[['nb_tickets', 'nb_purchases', 'total_amount', 'nb_suppliers', 'vente_internet_max', 'nb_tickets_internet']].fillna(0)
+    print("KPIs construction : SUCCESS")
+    # Left-join the campaign KPIs onto the cleaned customer table
+    df_customer = pd.merge(df_customerplus_clean, df_campaigns_kpi, on = 'customer_id', how = 'left')
+
+    # Customers with no campaign row after the join get 0 instead of NaN
+    df_customer[['nb_campaigns', 'nb_campaigns_opened']] = df_customer[['nb_campaigns', 'nb_campaigns_opened']].fillna(0)
+
+    # Outer-join with the purchase-behaviour KPIs
+    df_customer_product = pd.merge(df_tickets_kpi, df_customer, on = 'customer_id', how = 'outer')
+
+    # Rows with no ticket KPI after the join get 0 instead of NaN
+    df_customer_product[['nb_tickets', 'nb_purchases', 'total_amount', 'nb_suppliers', 'vente_internet_max', 'nb_tickets_internet']] = df_customer_product[['nb_tickets', 'nb_purchases', 'total_amount', 'nb_suppliers', 'vente_internet_max', 'nb_tickets_internet']].fillna(0)
+
+    return df_customer_product
+
+# Build the explained (target) variable
+def explained_variable(min_date = "2023-08-01", max_date = "2023-11-01", df_products_purchased_reduced = df1_products_purchased_reduced):
+    """Return the distinct (customer_id, event_type_id) pairs with a purchase_date in (min_date, max_date], each flagged y_has_purchased = 1."""
+    # Parse the bounds as UTC timestamps (the same call explanatory_variables uses) instead of comparing against raw strings
+    min_date, max_date = (pd.to_datetime(bound, utc = True, format = 'ISO8601') for bound in (min_date, max_date))
+    # Purchases inside the window; the lower bound is exclusive
+    in_window = (df_products_purchased_reduced['purchase_date'] <= max_date) & (df_products_purchased_reduced['purchase_date'] > min_date)
+
+    # De-duplicate first, then add the purchase indicator with assign() so no column is written onto a filtered slice
+    y = df_products_purchased_reduced.loc[in_window, ['customer_id', 'event_type_id']].drop_duplicates().assign(y_has_purchased = 1)
+
+    return y
## Exportation
-# Exportation vers 'projet-bdc2324-team1'
-BUCKET_OUT = "projet-bdc2324-team1"
-FILE_KEY_OUT_S3 = "1_Output/Company 1 - Segmentation base.csv"
+# Export folder
+BUCKET_OUT = "projet-bdc2324-team1/1_Output/Logistique Regression databases - First approach"
+# Test set: explanatory variables over 2021-08-01..2023-08-01, target from purchases after 2023-08-01 up to 2023-11-01
+X_test = explanatory_variables(min_date = "2021-08-01", max_date = "2023-08-01", df_campaigns_information = df1_campaigns_information, df_products_purchased_reduced = df1_products_purchased_reduced, df_customerplus_clean = df1_customerplus_clean)
+
+y_test = explained_variable(min_date = "2023-08-01", max_date = "2023-11-01", df_products_purchased_reduced = df1_products_purchased_reduced)
+
+dataset_test = pd.merge(X_test, y_test, on = ['customer_id', 'event_type_id'], how = 'left')
+
+FILE_KEY_OUT_S3 = "dataset_test.csv"
FILE_PATH_OUT_S3 = BUCKET_OUT + "/" + FILE_KEY_OUT_S3
with fs.open(FILE_PATH_OUT_S3, 'w') as file_out:
- df1_customer_product.to_csv(file_out, index = False)
+    dataset_test.to_csv(file_out, index = False)
+
+print("Exportation dataset test : SUCCESS")
+# Train set: explanatory variables over 2021-05-01..2023-05-01, target from purchases after 2023-05-01 up to 2023-08-01
+X_train = explanatory_variables(min_date = "2021-05-01", max_date = "2023-05-01", df_campaigns_information = df1_campaigns_information, df_products_purchased_reduced = df1_products_purchased_reduced, df_customerplus_clean = df1_customerplus_clean)
+
+y_train = explained_variable(min_date = "2023-05-01", max_date = "2023-08-01", df_products_purchased_reduced = df1_products_purchased_reduced)
+
+dataset_train = pd.merge(X_train, y_train, on = ['customer_id', 'event_type_id'], how = 'left')
+
+FILE_KEY_OUT_S3 = "dataset_train.csv"
+FILE_PATH_OUT_S3 = BUCKET_OUT + "/" + FILE_KEY_OUT_S3
+
+with fs.open(FILE_PATH_OUT_S3, 'w') as file_out:
+    dataset_train.to_csv(file_out, index = False)
+
+print("Exportation dataset train : SUCCESS")
+
+
+
+# NOTE(review): no further export happens below; the final message looks like a leftover from an earlier version - confirm before relying on it
+
+
+print("Exportation base de la base X d'entraînement : SUCCESS")
diff --git a/0_KPI_functions.py b/0_KPI_functions.py
index d79638a..69a5294 100644
--- a/0_KPI_functions.py
+++ b/0_KPI_functions.py
@@ -5,7 +5,7 @@ def campaigns_kpi_function(campaigns_information = None):
nb_campaigns = campaigns_information[['customer_id', 'campaign_name']].groupby('customer_id').count().reset_index()
nb_campaigns.rename(columns = {'campaign_name' : 'nb_campaigns'}, inplace = True)
# Temps d'ouverture en min moyen
- campaigns_information['time_to_open'] = campaigns_information['opened_at'] - campaigns_information['delivered_at']
+ campaigns_information['time_to_open'] = pd.to_datetime(campaigns_information['opened_at'], utc = True, format = 'ISO8601') - pd.to_datetime(campaigns_information['delivered_at'], utc = True, format = 'ISO8601')
time_to_open = campaigns_information[['customer_id', 'time_to_open']].groupby('customer_id').mean().reset_index()
# Nombre de mail ouvert
@@ -57,7 +57,7 @@ def tickets_kpi_function(tickets_information = None):
tickets_kpi.columns = tickets_kpi.columns.map('_'.join)
- tickets_kpi.rename(columns = {'ticket_id_count' : 'nb_tickets',
+ tickets_kpi.rename(columns = {'ticket_id_count' : 'nb_tickets',
'purchase_id_nunique' : 'nb_purchases',
'amount_sum' : 'total_amount',
'supplier_name_nunique' : 'nb_suppliers',