diff --git a/Spectacle/Stat_desc.ipynb b/Spectacle/Stat_desc.ipynb index 80a3be3..2fb2c0d 100644 --- a/Spectacle/Stat_desc.ipynb +++ b/Spectacle/Stat_desc.ipynb @@ -18,7 +18,7 @@ }, { "cell_type": "code", - "execution_count": 47, + "execution_count": 2, "id": "aa915888-cede-4eb0-8a26-7df573d29a3e", "metadata": {}, "outputs": [], @@ -34,7 +34,7 @@ }, { "cell_type": "code", - "execution_count": 8, + "execution_count": 3, "id": "17949e81-c30b-4fdf-9872-d7dc2b22ba9e", "metadata": {}, "outputs": [], @@ -46,7 +46,7 @@ }, { "cell_type": "code", - "execution_count": 9, + "execution_count": 4, "id": "9c1737a2-bad8-4266-8dec-452085d8cfe7", "metadata": {}, "outputs": [ @@ -59,7 +59,7 @@ " 'projet-bdc2324-team1/0_Input/Company_10/target_information.csv']" ] }, - "execution_count": 9, + "execution_count": 4, "metadata": {}, "output_type": "execute_result" } @@ -75,7 +75,7 @@ }, { "cell_type": "code", - "execution_count": 4, + "execution_count": 5, "id": "a35dc2f6-2017-4b21-abd2-2c4c112c96b2", "metadata": {}, "outputs": [], @@ -89,7 +89,7 @@ }, { "cell_type": "code", - "execution_count": 10, + "execution_count": 6, "id": "40b705eb-fd18-436b-b150-61611a3c6a84", "metadata": {}, "outputs": [], @@ -109,7 +109,512 @@ }, { "cell_type": "code", - "execution_count": 35, + "execution_count": 10, + "id": "c56decc3-de19-4786-82a4-1386c72a6bfb", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
idcustomer_idtarget_nametarget_type_is_importtarget_type_name
01165098618562Newsletter mensuelleFalsemanual_static_filter
11165100618559Newsletter mensuelleFalsemanual_static_filter
21165101618561Newsletter mensuelleFalsemanual_static_filter
31165102618560Newsletter mensuelleFalsemanual_static_filter
41165103618558Newsletter mensuelleFalsemanual_static_filter
..................
69253169815818580Newsletter mensuelleFalsemanual_static_filter
69254169815918569Newsletter mensuelleFalsemanual_static_filter
6925516981602962Newsletter mensuelleFalsemanual_static_filter
6925616981613825Newsletter mensuelleFalsemanual_static_filter
6925716981625731Newsletter mensuelleFalsemanual_static_filter
\n", + "

69258 rows × 5 columns

\n", + "
" + ], + "text/plain": [ + " id customer_id target_name target_type_is_import \\\n", + "0 1165098 618562 Newsletter mensuelle False \n", + "1 1165100 618559 Newsletter mensuelle False \n", + "2 1165101 618561 Newsletter mensuelle False \n", + "3 1165102 618560 Newsletter mensuelle False \n", + "4 1165103 618558 Newsletter mensuelle False \n", + "... ... ... ... ... \n", + "69253 1698158 18580 Newsletter mensuelle False \n", + "69254 1698159 18569 Newsletter mensuelle False \n", + "69255 1698160 2962 Newsletter mensuelle False \n", + "69256 1698161 3825 Newsletter mensuelle False \n", + "69257 1698162 5731 Newsletter mensuelle False \n", + "\n", + " target_type_name \n", + "0 manual_static_filter \n", + "1 manual_static_filter \n", + "2 manual_static_filter \n", + "3 manual_static_filter \n", + "4 manual_static_filter \n", + "... ... \n", + "69253 manual_static_filter \n", + "69254 manual_static_filter \n", + "69255 manual_static_filter \n", + "69256 manual_static_filter \n", + "69257 manual_static_filter \n", + "\n", + "[69258 rows x 5 columns]" + ] + }, + "execution_count": 10, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "target_information" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "id": "c825d64b-356c-4b71-aa3c-90e0dd7ca092", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " 
\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
ticket_idcustomer_idpurchase_idevent_type_idsupplier_namepurchase_dateamountis_full_pricename_event_typesname_facilitiesname_categoriesname_eventsname_seasonsstart_date_timeend_date_timeopen
01799177369844096132guichet2016-04-28 17:58:26+02:009.0Falsedansele grand tabo t gourmand jeunearinga rossatest 2016/20172016-09-27 00:00:00+02:001901-01-01 00:09:21+00:09True
11799178369844096133guichet2016-04-28 17:58:26+02:009.0Falsecirquele grand tabo t gourmand jeune5èmes hurlantstest 2016/20172016-11-18 00:00:00+01:001901-01-01 00:09:21+00:09True
21799179369844096131guichet2016-04-28 17:58:26+02:009.0Falsethéâtrele grand tabo t gourmand jeunedom juantest 2016/20172016-12-07 00:00:00+01:001901-01-01 00:09:21+00:09True
31799180369844096131guichet2016-04-28 17:58:26+02:009.0Falsethéâtrele grand tabo t gourmand jeunevanishing pointtest 2016/20172017-01-04 00:00:00+01:001901-01-01 00:09:21+00:09True
41799181369844096133guichet2016-04-28 17:58:26+02:0012.0Falsecirquela cite des congresabo t gourmand jeunea o lang photest 2016/20172017-01-03 00:00:00+01:001901-01-01 00:09:21+00:09True
...................................................
49230932522326217167100621guichet2023-03-09 12:08:45+01:007.0Falsethéâtrecap norttarif sco co 1 seance scolairesur moi, le temps2022/20232023-03-13 14:00:00+01:001901-01-01 00:09:21+00:09True
49231032522336217167100621guichet2023-03-09 12:08:45+01:007.0Falsethéâtrecap norttarif sco co 1 seance scolairesur moi, le temps2022/20232023-03-13 14:00:00+01:001901-01-01 00:09:21+00:09True
49231132522346217167100621guichet2023-03-09 12:08:45+01:007.0Falsethéâtrecap norttarif sco co 1 seance scolairesur moi, le temps2022/20232023-03-13 14:00:00+01:001901-01-01 00:09:21+00:09True
49231232522356217167100621guichet2023-03-09 12:08:45+01:007.0Falsethéâtrecap norttarif sco co 1 seance scolairesur moi, le temps2022/20232023-03-13 14:00:00+01:001901-01-01 00:09:21+00:09True
49231332522366217167100621guichet2023-03-09 12:08:45+01:007.0Falsethéâtrecap norttarif sco co 1 seance scolairesur moi, le temps2022/20232023-03-13 14:00:00+01:001901-01-01 00:09:21+00:09True
\n", + "

492314 rows × 16 columns

\n", + "
" + ], + "text/plain": [ + " ticket_id customer_id purchase_id event_type_id supplier_name \\\n", + "0 1799177 36984 409613 2 guichet \n", + "1 1799178 36984 409613 3 guichet \n", + "2 1799179 36984 409613 1 guichet \n", + "3 1799180 36984 409613 1 guichet \n", + "4 1799181 36984 409613 3 guichet \n", + "... ... ... ... ... ... \n", + "492309 3252232 621716 710062 1 guichet \n", + "492310 3252233 621716 710062 1 guichet \n", + "492311 3252234 621716 710062 1 guichet \n", + "492312 3252235 621716 710062 1 guichet \n", + "492313 3252236 621716 710062 1 guichet \n", + "\n", + " purchase_date amount is_full_price name_event_types \\\n", + "0 2016-04-28 17:58:26+02:00 9.0 False danse \n", + "1 2016-04-28 17:58:26+02:00 9.0 False cirque \n", + "2 2016-04-28 17:58:26+02:00 9.0 False théâtre \n", + "3 2016-04-28 17:58:26+02:00 9.0 False théâtre \n", + "4 2016-04-28 17:58:26+02:00 12.0 False cirque \n", + "... ... ... ... ... \n", + "492309 2023-03-09 12:08:45+01:00 7.0 False théâtre \n", + "492310 2023-03-09 12:08:45+01:00 7.0 False théâtre \n", + "492311 2023-03-09 12:08:45+01:00 7.0 False théâtre \n", + "492312 2023-03-09 12:08:45+01:00 7.0 False théâtre \n", + "492313 2023-03-09 12:08:45+01:00 7.0 False théâtre \n", + "\n", + " name_facilities name_categories \\\n", + "0 le grand t abo t gourmand jeune \n", + "1 le grand t abo t gourmand jeune \n", + "2 le grand t abo t gourmand jeune \n", + "3 le grand t abo t gourmand jeune \n", + "4 la cite des congres abo t gourmand jeune \n", + "... ... ... 
\n", + "492309 cap nort tarif sco co 1 seance scolaire \n", + "492310 cap nort tarif sco co 1 seance scolaire \n", + "492311 cap nort tarif sco co 1 seance scolaire \n", + "492312 cap nort tarif sco co 1 seance scolaire \n", + "492313 cap nort tarif sco co 1 seance scolaire \n", + "\n", + " name_events name_seasons start_date_time \\\n", + "0 aringa rossa test 2016/2017 2016-09-27 00:00:00+02:00 \n", + "1 5èmes hurlants test 2016/2017 2016-11-18 00:00:00+01:00 \n", + "2 dom juan test 2016/2017 2016-12-07 00:00:00+01:00 \n", + "3 vanishing point test 2016/2017 2017-01-04 00:00:00+01:00 \n", + "4 a o lang pho test 2016/2017 2017-01-03 00:00:00+01:00 \n", + "... ... ... ... \n", + "492309 sur moi, le temps 2022/2023 2023-03-13 14:00:00+01:00 \n", + "492310 sur moi, le temps 2022/2023 2023-03-13 14:00:00+01:00 \n", + "492311 sur moi, le temps 2022/2023 2023-03-13 14:00:00+01:00 \n", + "492312 sur moi, le temps 2022/2023 2023-03-13 14:00:00+01:00 \n", + "492313 sur moi, le temps 2022/2023 2023-03-13 14:00:00+01:00 \n", + "\n", + " end_date_time open \n", + "0 1901-01-01 00:09:21+00:09 True \n", + "1 1901-01-01 00:09:21+00:09 True \n", + "2 1901-01-01 00:09:21+00:09 True \n", + "3 1901-01-01 00:09:21+00:09 True \n", + "4 1901-01-01 00:09:21+00:09 True \n", + "... ... ... 
\n", + "492309 1901-01-01 00:09:21+00:09 True \n", + "492310 1901-01-01 00:09:21+00:09 True \n", + "492311 1901-01-01 00:09:21+00:09 True \n", + "492312 1901-01-01 00:09:21+00:09 True \n", + "492313 1901-01-01 00:09:21+00:09 True \n", + "\n", + "[492314 rows x 16 columns]" + ] + }, + "execution_count": 9, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "products_purchased_reduced" + ] + }, + { + "cell_type": "code", + "execution_count": 7, "id": "afd044b8-ac83-4a35-b959-700cae0b3b41", "metadata": {}, "outputs": [ @@ -124,7 +629,7 @@ "name": "stderr", "output_type": "stream", "text": [ - "/tmp/ipykernel_1173/2987234667.py:8: FutureWarning: The argument 'date_parser' is deprecated and will be removed in a future version. Please use 'date_format' instead, or read your data in as 'object' dtype and then call 'to_datetime'.\n", + "/tmp/ipykernel_430/3170175140.py:10: FutureWarning: The argument 'date_parser' is deprecated and will be removed in a future version. Please use 'date_format' instead, or read your data in as 'object' dtype and then call 'to_datetime'.\n", " df = pd.read_csv(file_in, sep=\",\", parse_dates = datetime_col, date_parser=custom_date_parser)\n" ] }, @@ -139,7 +644,7 @@ "name": "stderr", "output_type": "stream", "text": [ - "/tmp/ipykernel_1173/2987234667.py:8: FutureWarning: The argument 'date_parser' is deprecated and will be removed in a future version. Please use 'date_format' instead, or read your data in as 'object' dtype and then call 'to_datetime'.\n", + "/tmp/ipykernel_430/3170175140.py:10: FutureWarning: The argument 'date_parser' is deprecated and will be removed in a future version. 
Please use 'date_format' instead, or read your data in as 'object' dtype and then call 'to_datetime'.\n", " df = pd.read_csv(file_in, sep=\",\", parse_dates = datetime_col, date_parser=custom_date_parser)\n" ] }, @@ -154,7 +659,7 @@ "name": "stderr", "output_type": "stream", "text": [ - "/tmp/ipykernel_1173/2987234667.py:8: FutureWarning: The argument 'date_parser' is deprecated and will be removed in a future version. Please use 'date_format' instead, or read your data in as 'object' dtype and then call 'to_datetime'.\n", + "/tmp/ipykernel_430/3170175140.py:10: FutureWarning: The argument 'date_parser' is deprecated and will be removed in a future version. Please use 'date_format' instead, or read your data in as 'object' dtype and then call 'to_datetime'.\n", " df = pd.read_csv(file_in, sep=\",\", parse_dates = datetime_col, date_parser=custom_date_parser)\n" ] }, @@ -169,7 +674,7 @@ "name": "stderr", "output_type": "stream", "text": [ - "/tmp/ipykernel_1173/2987234667.py:8: FutureWarning: The argument 'date_parser' is deprecated and will be removed in a future version. Please use 'date_format' instead, or read your data in as 'object' dtype and then call 'to_datetime'.\n", + "/tmp/ipykernel_430/3170175140.py:10: FutureWarning: The argument 'date_parser' is deprecated and will be removed in a future version. 
Please use 'date_format' instead, or read your data in as 'object' dtype and then call 'to_datetime'.\n", " df = pd.read_csv(file_in, sep=\",\", parse_dates = datetime_col, date_parser=custom_date_parser)\n", ":27: SettingWithCopyWarning: \n", "A value is trying to be set on a copy of a slice from a DataFrame\n", @@ -181,21 +686,7 @@ "name": "stdout", "output_type": "stream", "text": [ - "File path : projet-bdc2324-team1/0_Input/Company_11/customerplus_cleaned.csv\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "/tmp/ipykernel_1173/2987234667.py:8: FutureWarning: The argument 'date_parser' is deprecated and will be removed in a future version. Please use 'date_format' instead, or read your data in as 'object' dtype and then call 'to_datetime'.\n", - " df = pd.read_csv(file_in, sep=\",\", parse_dates = datetime_col, date_parser=custom_date_parser)\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ + "File path : projet-bdc2324-team1/0_Input/Company_11/customerplus_cleaned.csv\n", "File path : projet-bdc2324-team1/0_Input/Company_11/campaigns_information.csv\n" ] }, @@ -203,7 +694,9 @@ "name": "stderr", "output_type": "stream", "text": [ - "/tmp/ipykernel_1173/2987234667.py:8: FutureWarning: The argument 'date_parser' is deprecated and will be removed in a future version. Please use 'date_format' instead, or read your data in as 'object' dtype and then call 'to_datetime'.\n", + "/tmp/ipykernel_430/3170175140.py:10: FutureWarning: The argument 'date_parser' is deprecated and will be removed in a future version. Please use 'date_format' instead, or read your data in as 'object' dtype and then call 'to_datetime'.\n", + " df = pd.read_csv(file_in, sep=\",\", parse_dates = datetime_col, date_parser=custom_date_parser)\n", + "/tmp/ipykernel_430/3170175140.py:10: FutureWarning: The argument 'date_parser' is deprecated and will be removed in a future version. 
Please use 'date_format' instead, or read your data in as 'object' dtype and then call 'to_datetime'.\n", " df = pd.read_csv(file_in, sep=\",\", parse_dates = datetime_col, date_parser=custom_date_parser)\n" ] }, @@ -218,7 +711,7 @@ "name": "stderr", "output_type": "stream", "text": [ - "/tmp/ipykernel_1173/2987234667.py:8: FutureWarning: The argument 'date_parser' is deprecated and will be removed in a future version. Please use 'date_format' instead, or read your data in as 'object' dtype and then call 'to_datetime'.\n", + "/tmp/ipykernel_430/3170175140.py:10: FutureWarning: The argument 'date_parser' is deprecated and will be removed in a future version. Please use 'date_format' instead, or read your data in as 'object' dtype and then call 'to_datetime'.\n", " df = pd.read_csv(file_in, sep=\",\", parse_dates = datetime_col, date_parser=custom_date_parser)\n" ] }, @@ -233,7 +726,7 @@ "name": "stderr", "output_type": "stream", "text": [ - "/tmp/ipykernel_1173/2987234667.py:8: FutureWarning: The argument 'date_parser' is deprecated and will be removed in a future version. Please use 'date_format' instead, or read your data in as 'object' dtype and then call 'to_datetime'.\n", + "/tmp/ipykernel_430/3170175140.py:10: FutureWarning: The argument 'date_parser' is deprecated and will be removed in a future version. Please use 'date_format' instead, or read your data in as 'object' dtype and then call 'to_datetime'.\n", " df = pd.read_csv(file_in, sep=\",\", parse_dates = datetime_col, date_parser=custom_date_parser)\n", ":27: SettingWithCopyWarning: \n", "A value is trying to be set on a copy of a slice from a DataFrame\n", @@ -252,7 +745,7 @@ "name": "stderr", "output_type": "stream", "text": [ - "/tmp/ipykernel_1173/2987234667.py:8: FutureWarning: The argument 'date_parser' is deprecated and will be removed in a future version. 
Please use 'date_format' instead, or read your data in as 'object' dtype and then call 'to_datetime'.\n", + "/tmp/ipykernel_430/3170175140.py:10: FutureWarning: The argument 'date_parser' is deprecated and will be removed in a future version. Please use 'date_format' instead, or read your data in as 'object' dtype and then call 'to_datetime'.\n", " df = pd.read_csv(file_in, sep=\",\", parse_dates = datetime_col, date_parser=custom_date_parser)\n" ] }, @@ -267,7 +760,7 @@ "name": "stderr", "output_type": "stream", "text": [ - "/tmp/ipykernel_1173/2987234667.py:8: FutureWarning: The argument 'date_parser' is deprecated and will be removed in a future version. Please use 'date_format' instead, or read your data in as 'object' dtype and then call 'to_datetime'.\n", + "/tmp/ipykernel_430/3170175140.py:10: FutureWarning: The argument 'date_parser' is deprecated and will be removed in a future version. Please use 'date_format' instead, or read your data in as 'object' dtype and then call 'to_datetime'.\n", " df = pd.read_csv(file_in, sep=\",\", parse_dates = datetime_col, date_parser=custom_date_parser)\n" ] }, @@ -282,9 +775,9 @@ "name": "stderr", "output_type": "stream", "text": [ - "/tmp/ipykernel_1173/2987234667.py:8: FutureWarning: The argument 'date_parser' is deprecated and will be removed in a future version. Please use 'date_format' instead, or read your data in as 'object' dtype and then call 'to_datetime'.\n", + "/tmp/ipykernel_430/3170175140.py:10: FutureWarning: The argument 'date_parser' is deprecated and will be removed in a future version. Please use 'date_format' instead, or read your data in as 'object' dtype and then call 'to_datetime'.\n", " df = pd.read_csv(file_in, sep=\",\", parse_dates = datetime_col, date_parser=custom_date_parser)\n", - "/tmp/ipykernel_1173/2987234667.py:8: DtypeWarning: Columns (4,8,10) have mixed types. 
Specify dtype option on import or set low_memory=False.\n", + "/tmp/ipykernel_430/3170175140.py:10: DtypeWarning: Columns (4,8,10) have mixed types. Specify dtype option on import or set low_memory=False.\n", " df = pd.read_csv(file_in, sep=\",\", parse_dates = datetime_col, date_parser=custom_date_parser)\n" ] }, @@ -299,7 +792,7 @@ "name": "stderr", "output_type": "stream", "text": [ - "/tmp/ipykernel_1173/2987234667.py:8: FutureWarning: The argument 'date_parser' is deprecated and will be removed in a future version. Please use 'date_format' instead, or read your data in as 'object' dtype and then call 'to_datetime'.\n", + "/tmp/ipykernel_430/3170175140.py:10: FutureWarning: The argument 'date_parser' is deprecated and will be removed in a future version. Please use 'date_format' instead, or read your data in as 'object' dtype and then call 'to_datetime'.\n", " df = pd.read_csv(file_in, sep=\",\", parse_dates = datetime_col, date_parser=custom_date_parser)\n", ":27: SettingWithCopyWarning: \n", "A value is trying to be set on a copy of a slice from a DataFrame\n", @@ -318,7 +811,7 @@ "name": "stderr", "output_type": "stream", "text": [ - "/tmp/ipykernel_1173/2987234667.py:8: FutureWarning: The argument 'date_parser' is deprecated and will be removed in a future version. Please use 'date_format' instead, or read your data in as 'object' dtype and then call 'to_datetime'.\n", + "/tmp/ipykernel_430/3170175140.py:10: FutureWarning: The argument 'date_parser' is deprecated and will be removed in a future version. Please use 'date_format' instead, or read your data in as 'object' dtype and then call 'to_datetime'.\n", " df = pd.read_csv(file_in, sep=\",\", parse_dates = datetime_col, date_parser=custom_date_parser)\n" ] }, @@ -333,7 +826,7 @@ "name": "stderr", "output_type": "stream", "text": [ - "/tmp/ipykernel_1173/2987234667.py:8: FutureWarning: The argument 'date_parser' is deprecated and will be removed in a future version. 
Please use 'date_format' instead, or read your data in as 'object' dtype and then call 'to_datetime'.\n", + "/tmp/ipykernel_430/3170175140.py:10: FutureWarning: The argument 'date_parser' is deprecated and will be removed in a future version. Please use 'date_format' instead, or read your data in as 'object' dtype and then call 'to_datetime'.\n", " df = pd.read_csv(file_in, sep=\",\", parse_dates = datetime_col, date_parser=custom_date_parser)\n" ] }, @@ -348,7 +841,7 @@ "name": "stderr", "output_type": "stream", "text": [ - "/tmp/ipykernel_1173/2987234667.py:8: FutureWarning: The argument 'date_parser' is deprecated and will be removed in a future version. Please use 'date_format' instead, or read your data in as 'object' dtype and then call 'to_datetime'.\n", + "/tmp/ipykernel_430/3170175140.py:10: FutureWarning: The argument 'date_parser' is deprecated and will be removed in a future version. Please use 'date_format' instead, or read your data in as 'object' dtype and then call 'to_datetime'.\n", " df = pd.read_csv(file_in, sep=\",\", parse_dates = datetime_col, date_parser=custom_date_parser)\n" ] }, @@ -363,7 +856,7 @@ "name": "stderr", "output_type": "stream", "text": [ - "/tmp/ipykernel_1173/2987234667.py:8: FutureWarning: The argument 'date_parser' is deprecated and will be removed in a future version. Please use 'date_format' instead, or read your data in as 'object' dtype and then call 'to_datetime'.\n", + "/tmp/ipykernel_430/3170175140.py:10: FutureWarning: The argument 'date_parser' is deprecated and will be removed in a future version. 
Please use 'date_format' instead, or read your data in as 'object' dtype and then call 'to_datetime'.\n", " df = pd.read_csv(file_in, sep=\",\", parse_dates = datetime_col, date_parser=custom_date_parser)\n", ":27: SettingWithCopyWarning: \n", "A value is trying to be set on a copy of a slice from a DataFrame\n", @@ -382,7 +875,7 @@ "name": "stderr", "output_type": "stream", "text": [ - "/tmp/ipykernel_1173/2987234667.py:8: FutureWarning: The argument 'date_parser' is deprecated and will be removed in a future version. Please use 'date_format' instead, or read your data in as 'object' dtype and then call 'to_datetime'.\n", + "/tmp/ipykernel_430/3170175140.py:10: FutureWarning: The argument 'date_parser' is deprecated and will be removed in a future version. Please use 'date_format' instead, or read your data in as 'object' dtype and then call 'to_datetime'.\n", " df = pd.read_csv(file_in, sep=\",\", parse_dates = datetime_col, date_parser=custom_date_parser)\n" ] }, @@ -397,7 +890,7 @@ "name": "stderr", "output_type": "stream", "text": [ - "/tmp/ipykernel_1173/2987234667.py:8: FutureWarning: The argument 'date_parser' is deprecated and will be removed in a future version. Please use 'date_format' instead, or read your data in as 'object' dtype and then call 'to_datetime'.\n", + "/tmp/ipykernel_430/3170175140.py:10: FutureWarning: The argument 'date_parser' is deprecated and will be removed in a future version. Please use 'date_format' instead, or read your data in as 'object' dtype and then call 'to_datetime'.\n", " df = pd.read_csv(file_in, sep=\",\", parse_dates = datetime_col, date_parser=custom_date_parser)\n" ] }, @@ -412,9 +905,9 @@ "name": "stderr", "output_type": "stream", "text": [ - "/tmp/ipykernel_1173/2987234667.py:8: FutureWarning: The argument 'date_parser' is deprecated and will be removed in a future version. 
Please use 'date_format' instead, or read your data in as 'object' dtype and then call 'to_datetime'.\n", + "/tmp/ipykernel_430/3170175140.py:10: FutureWarning: The argument 'date_parser' is deprecated and will be removed in a future version. Please use 'date_format' instead, or read your data in as 'object' dtype and then call 'to_datetime'.\n", " df = pd.read_csv(file_in, sep=\",\", parse_dates = datetime_col, date_parser=custom_date_parser)\n", - "/tmp/ipykernel_1173/2987234667.py:8: DtypeWarning: Columns (8,9) have mixed types. Specify dtype option on import or set low_memory=False.\n", + "/tmp/ipykernel_430/3170175140.py:10: DtypeWarning: Columns (8,9) have mixed types. Specify dtype option on import or set low_memory=False.\n", " df = pd.read_csv(file_in, sep=\",\", parse_dates = datetime_col, date_parser=custom_date_parser)\n" ] }, @@ -429,7 +922,7 @@ "name": "stderr", "output_type": "stream", "text": [ - "/tmp/ipykernel_1173/2987234667.py:8: FutureWarning: The argument 'date_parser' is deprecated and will be removed in a future version. Please use 'date_format' instead, or read your data in as 'object' dtype and then call 'to_datetime'.\n", + "/tmp/ipykernel_430/3170175140.py:10: FutureWarning: The argument 'date_parser' is deprecated and will be removed in a future version. Please use 'date_format' instead, or read your data in as 'object' dtype and then call 'to_datetime'.\n", " df = pd.read_csv(file_in, sep=\",\", parse_dates = datetime_col, date_parser=custom_date_parser)\n", ":27: SettingWithCopyWarning: \n", "A value is trying to be set on a copy of a slice from a DataFrame\n", @@ -642,164 +1135,20 @@ }, { "cell_type": "code", - "execution_count": 38, + "execution_count": 1, "id": "b9b6ec1f-36fb-4ee9-a1ed-09ff41878005", "metadata": {}, "outputs": [ { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
customer_idstreet_idstructure_idmcp_contact_idfidelitytenant_idis_partnerdeleted_atgenderis_email_true...purchase_countfirst_buying_datecountrygender_labelgender_femalegender_malegender_othercountry_frhas_tagsnumber_compagny
748961139NaNNaN0875FalseNaN2False...19NaNNaNother001NaN010
530061312707NaNNaN01556FalseNaN0True...102018-12-26 13:06:49+00:00frfemale1001.0011
14204212772NaN3954.02862FalseNaN2True...22016-09-09 17:02:00+00:00frother0011.0014
\n", - "

3 rows × 29 columns

\n", - "
" - ], - "text/plain": [ - " customer_id street_id structure_id mcp_contact_id fidelity \\\n", - "74896 1 139 NaN NaN 0 \n", - "53006 1 312707 NaN NaN 0 \n", - "142042 1 2772 NaN 3954.0 2 \n", - "\n", - " tenant_id is_partner deleted_at gender is_email_true ... \\\n", - "74896 875 False NaN 2 False ... \n", - "53006 1556 False NaN 0 True ... \n", - "142042 862 False NaN 2 True ... \n", - "\n", - " purchase_count first_buying_date country gender_label \\\n", - "74896 19 NaN NaN other \n", - "53006 10 2018-12-26 13:06:49+00:00 fr female \n", - "142042 2 2016-09-09 17:02:00+00:00 fr other \n", - "\n", - " gender_female gender_male gender_other country_fr has_tags \\\n", - "74896 0 0 1 NaN 0 \n", - "53006 1 0 0 1.0 0 \n", - "142042 0 0 1 1.0 0 \n", - "\n", - " number_compagny \n", - "74896 10 \n", - "53006 11 \n", - "142042 14 \n", - "\n", - "[3 rows x 29 columns]" - ] - }, - "execution_count": 38, - "metadata": {}, - "output_type": "execute_result" + "ename": "NameError", + "evalue": "name 'customerplus_clean_spectacle' is not defined", + "output_type": "error", + "traceback": [ + "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", + "\u001b[0;31mNameError\u001b[0m Traceback (most recent call last)", + "Cell \u001b[0;32mIn[1], line 1\u001b[0m\n\u001b[0;32m----> 1\u001b[0m \u001b[43mcustomerplus_clean_spectacle\u001b[49m[customerplus_clean_spectacle[\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mcustomer_id\u001b[39m\u001b[38;5;124m\"\u001b[39m]\u001b[38;5;241m==\u001b[39m\u001b[38;5;241m1\u001b[39m]\n", + "\u001b[0;31mNameError\u001b[0m: name 'customerplus_clean_spectacle' is not defined" + ] } ], "source": [ @@ -3465,7 +3814,7 @@ }, { "cell_type": "code", - "execution_count": 223, + "execution_count": 8, "id": "74534ded-8121-43fb-8cf8-af353bed2c77", "metadata": {}, "outputs": [ @@ -3493,7 +3842,7 @@ "dtype: int64" ] }, - "execution_count": 223, + "execution_count": 8, "metadata": {}, "output_type": "execute_result" } @@ 
-3651,6 +4000,20 @@ "products_purchased_reduced_spectacle.head()" ] }, + { + "cell_type": "code", + "execution_count": null, + "id": "e1a95b8f-6539-48bd-b09d-6f8f63d25fb2", + "metadata": {}, + "outputs": [], + "source": [ + "#nombre de ticket par compagnie\n", + "\n", + "company_campaigns_stats = campaigns_information_spectacle.groupby(\"number_compagny\")[[\"nb_campaigns\", \"nb_campaigns_opened\"]].sum().reset_index()\n", + "company_campaigns_stats[\"ratio_campaigns_opened\"] = company_campaigns_stats[\"nb_campaigns_opened\"] / company_campaigns_stats[\"nb_campaigns\"]\n", + "company_campaigns_stats" + ] + }, { "cell_type": "markdown", "id": "b9e84af4-a02b-4f83-81ae-b7a73475d060", @@ -3694,6 +4057,193 @@ "print(\"Nombre de lignes de la table : \",target_information_spectacle.shape[0])\n", "target_information_spectacle.isna().sum()" ] + }, + { + "cell_type": "code", + "execution_count": 11, + "id": "985b6403-3c75-420e-a4a4-d3045213e9ef", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
idcustomer_idtarget_nametarget_type_is_importtarget_type_namenumber_compagny
01165098618562Newsletter mensuelleFalsemanual_static_filter10
11165100618559Newsletter mensuelleFalsemanual_static_filter10
21165101618561Newsletter mensuelleFalsemanual_static_filter10
31165102618560Newsletter mensuelleFalsemanual_static_filter10
41165103618558Newsletter mensuelleFalsemanual_static_filter10
.....................
77965342070826764876INSCRIPTION NL VOYAGES HUMAFalsemanual_static_filter14
77965442070836764877Inscriptions newsletters (depuis 2019)Falsemanual_static_filter14
77965542070846801322Inscriptions newsletters (depuis 2019)Falsemanual_static_filter14
77965642070856837768Inscriptions newsletters (depuis 2019)Falsemanual_static_filter14
77965742070866837769Inscriptions newsletters (depuis 2019)Falsemanual_static_filter14
\n", + "

6240166 rows × 6 columns

\n", + "
" + ], + "text/plain": [ + " id customer_id target_name \\\n", + "0 1165098 618562 Newsletter mensuelle \n", + "1 1165100 618559 Newsletter mensuelle \n", + "2 1165101 618561 Newsletter mensuelle \n", + "3 1165102 618560 Newsletter mensuelle \n", + "4 1165103 618558 Newsletter mensuelle \n", + "... ... ... ... \n", + "779653 4207082 6764876 INSCRIPTION NL VOYAGES HUMA \n", + "779654 4207083 6764877 Inscriptions newsletters (depuis 2019) \n", + "779655 4207084 6801322 Inscriptions newsletters (depuis 2019) \n", + "779656 4207085 6837768 Inscriptions newsletters (depuis 2019) \n", + "779657 4207086 6837769 Inscriptions newsletters (depuis 2019) \n", + "\n", + " target_type_is_import target_type_name number_compagny \n", + "0 False manual_static_filter 10 \n", + "1 False manual_static_filter 10 \n", + "2 False manual_static_filter 10 \n", + "3 False manual_static_filter 10 \n", + "4 False manual_static_filter 10 \n", + "... ... ... ... \n", + "779653 False manual_static_filter 14 \n", + "779654 False manual_static_filter 14 \n", + "779655 False manual_static_filter 14 \n", + "779656 False manual_static_filter 14 \n", + "779657 False manual_static_filter 14 \n", + "\n", + "[6240166 rows x 6 columns]" + ] + }, + "execution_count": 11, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "target_information_spectacle" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "0a180f0a-c6de-4e66-9ae8-fdbfdf8837c9", + "metadata": {}, + "outputs": [], + "source": [] } ], "metadata": { diff --git a/code_base_train_test.ipynb b/code_base_train_test.ipynb new file mode 100644 index 0000000..b7e6578 --- /dev/null +++ b/code_base_train_test.ipynb @@ -0,0 +1,814 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 1, + "id": "bf34b03c-536f-4f93-93a5-e452552653aa", + "metadata": {}, + "outputs": [ + { + "name": "stdin", + "output_type": "stream", + "text": [ + "Choisissez le type de compagnie : sport ? musique ? musee ? 
musique\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "File path : projet-bdc2324-team1/0_Input/Company_10/products_purchased_reduced.csv\n", + "Couverture Company 10 : 2016-03-07 - 2023-09-25\n", + "File path : projet-bdc2324-team1/0_Input/Company_11/products_purchased_reduced.csv\n", + "Couverture Company 11 : 2015-06-26 - 2023-11-08\n", + "File path : projet-bdc2324-team1/0_Input/Company_12/products_purchased_reduced.csv\n", + "Couverture Company 12 : 2016-06-14 - 2023-11-08\n", + "File path : projet-bdc2324-team1/0_Input/Company_13/products_purchased_reduced.csv\n", + "Couverture Company 13 : 2010-07-31 - 2023-11-08\n", + "File path : projet-bdc2324-team1/0_Input/Company_14/products_purchased_reduced.csv\n", + "Couverture Company 14 : 1901-01-01 - 2023-11-08\n", + "File path : projet-bdc2324-team1/0_Input/Company_10/customerplus_cleaned.csv\n", + "File path : projet-bdc2324-team1/0_Input/Company_10/campaigns_information.csv\n", + "File path : projet-bdc2324-team1/0_Input/Company_10/products_purchased_reduced.csv\n", + "Data filtering : SUCCESS\n", + "KPIs construction : SUCCESS\n", + "Explanatory variable construction : SUCCESS\n", + "Explained variable construction : SUCCESS\n", + "Exportation dataset test : SUCCESS\n", + "File path : projet-bdc2324-team1/0_Input/Company_10/customerplus_cleaned.csv\n", + "File path : projet-bdc2324-team1/0_Input/Company_10/campaigns_information.csv\n", + "File path : projet-bdc2324-team1/0_Input/Company_10/products_purchased_reduced.csv\n", + "Data filtering : SUCCESS\n", + "KPIs construction : SUCCESS\n", + "Explanatory variable construction : SUCCESS\n", + "Explained variable construction : SUCCESS\n", + "Exportation dataset train : SUCCESS\n", + "File path : projet-bdc2324-team1/0_Input/Company_11/customerplus_cleaned.csv\n", + "File path : projet-bdc2324-team1/0_Input/Company_11/campaigns_information.csv\n", + "File path : 
projet-bdc2324-team1/0_Input/Company_11/products_purchased_reduced.csv\n", + "Data filtering : SUCCESS\n", + "KPIs construction : SUCCESS\n", + "Explanatory variable construction : SUCCESS\n", + "Explained variable construction : SUCCESS\n", + "Exportation dataset test : SUCCESS\n", + "File path : projet-bdc2324-team1/0_Input/Company_11/customerplus_cleaned.csv\n", + "File path : projet-bdc2324-team1/0_Input/Company_11/campaigns_information.csv\n", + "File path : projet-bdc2324-team1/0_Input/Company_11/products_purchased_reduced.csv\n", + "Data filtering : SUCCESS\n", + "KPIs construction : SUCCESS\n", + "Explanatory variable construction : SUCCESS\n", + "Explained variable construction : SUCCESS\n", + "Exportation dataset train : SUCCESS\n", + "File path : projet-bdc2324-team1/0_Input/Company_12/customerplus_cleaned.csv\n", + "File path : projet-bdc2324-team1/0_Input/Company_12/campaigns_information.csv\n", + "File path : projet-bdc2324-team1/0_Input/Company_12/products_purchased_reduced.csv\n", + "Data filtering : SUCCESS\n", + "KPIs construction : SUCCESS\n", + "Explanatory variable construction : SUCCESS\n", + "Explained variable construction : SUCCESS\n", + "Exportation dataset test : SUCCESS\n", + "File path : projet-bdc2324-team1/0_Input/Company_12/customerplus_cleaned.csv\n", + "File path : projet-bdc2324-team1/0_Input/Company_12/campaigns_information.csv\n", + "File path : projet-bdc2324-team1/0_Input/Company_12/products_purchased_reduced.csv\n", + "Data filtering : SUCCESS\n", + "KPIs construction : SUCCESS\n", + "Explanatory variable construction : SUCCESS\n", + "Explained variable construction : SUCCESS\n", + "Exportation dataset train : SUCCESS\n", + "File path : projet-bdc2324-team1/0_Input/Company_13/customerplus_cleaned.csv\n", + "File path : projet-bdc2324-team1/0_Input/Company_13/campaigns_information.csv\n", + "File path : projet-bdc2324-team1/0_Input/Company_13/products_purchased_reduced.csv\n", + "Data filtering : SUCCESS\n", + "KPIs 
construction : SUCCESS\n", + "Explanatory variable construction : SUCCESS\n", + "Explained variable construction : SUCCESS\n", + "Exportation dataset test : SUCCESS\n", + "File path : projet-bdc2324-team1/0_Input/Company_13/customerplus_cleaned.csv\n", + "File path : projet-bdc2324-team1/0_Input/Company_13/campaigns_information.csv\n", + "File path : projet-bdc2324-team1/0_Input/Company_13/products_purchased_reduced.csv\n", + "Data filtering : SUCCESS\n", + "KPIs construction : SUCCESS\n", + "Explanatory variable construction : SUCCESS\n", + "Explained variable construction : SUCCESS\n", + "Exportation dataset train : SUCCESS\n", + "File path : projet-bdc2324-team1/0_Input/Company_14/customerplus_cleaned.csv\n", + "File path : projet-bdc2324-team1/0_Input/Company_14/campaigns_information.csv\n", + "File path : projet-bdc2324-team1/0_Input/Company_14/products_purchased_reduced.csv\n", + "Data filtering : SUCCESS\n", + "KPIs construction : SUCCESS\n", + "Explanatory variable construction : SUCCESS\n", + "Explained variable construction : SUCCESS\n", + "Exportation dataset test : SUCCESS\n", + "File path : projet-bdc2324-team1/0_Input/Company_14/customerplus_cleaned.csv\n", + "File path : projet-bdc2324-team1/0_Input/Company_14/campaigns_information.csv\n", + "File path : projet-bdc2324-team1/0_Input/Company_14/products_purchased_reduced.csv\n", + "Data filtering : SUCCESS\n", + "KPIs construction : SUCCESS\n", + "Explanatory variable construction : SUCCESS\n", + "Explained variable construction : SUCCESS\n", + "Exportation dataset train : SUCCESS\n", + "FIN DE LA GENERATION DES DATASETS : SUCCESS\n" + ] + } + ], + "source": [ + "# Business Data Challenge - Team 1\n", + "\n", + "import pandas as pd\n", + "import numpy as np\n", + "import os\n", + "import s3fs\n", + "import re\n", + "import warnings\n", + "from datetime import date, timedelta, datetime\n", + "\n", + "# Create filesystem object\n", + "S3_ENDPOINT_URL = \"https://\" + os.environ[\"AWS_S3_ENDPOINT\"]\n", 
+ "fs = s3fs.S3FileSystem(client_kwargs={'endpoint_url': S3_ENDPOINT_URL})\n", + "\n", + "\n", + "# Import KPI construction functions\n", + "exec(open('0_KPI_functions.py').read())\n", + "\n", + "# Ignore warning\n", + "warnings.filterwarnings('ignore')\n", + "\n", + "\n", + "def display_covering_time(df, company, datecover):\n", + " \"\"\"\n", + " This function stores in datecover the list of days between the first and last purchase dates of a company and prints both dates\n", + " \"\"\"\n", + " min_date = df['purchase_date'].min().strftime(\"%Y-%m-%d\")\n", + " max_date = df['purchase_date'].max().strftime(\"%Y-%m-%d\")\n", + " datecover[company] = [datetime.strptime(min_date, \"%Y-%m-%d\") + timedelta(days=x) for x in range((datetime.strptime(max_date, \"%Y-%m-%d\") - datetime.strptime(min_date, \"%Y-%m-%d\")).days)]\n", + " print(f'Couverture Company {company} : {min_date} - {max_date}')\n", + " return datecover\n", + "\n", + "\n", + "def compute_time_intersection(datecover):\n", + " \"\"\"\n", + " This function returns the sorted list of days (YYYY-MM-DD strings) covered by every company of datecover\n", + " \"\"\"\n", + " timestamps_sets = [set(timestamps) for timestamps in datecover.values()]\n", + " intersection = set.intersection(*timestamps_sets)\n", + " intersection_list = list(intersection)\n", + " formated_dates = [dt.strftime(\"%Y-%m-%d\") for dt in intersection_list]\n", + " return sorted(formated_dates)\n", + "\n", + "\n", + "def df_coverage_modelization(sport, coverage_train = 0.7):\n", + " \"\"\"\n", + " This function returns start_date, end_of_features and final dates\n", + " that help to construct train and test datasets\n", + " \"\"\"\n", + " datecover = {}\n", + " for company in sport:\n", + " df_products_purchased_reduced = display_databases(company, file_name = \"products_purchased_reduced\",\n", + " datetime_col = ['purchase_date'])\n", + " datecover = display_covering_time(df_products_purchased_reduced, company, datecover)\n", + " #print(datecover.keys())\n", + " dt_coverage = compute_time_intersection(datecover)\n", + " start_date = dt_coverage[0]\n", + " 
end_of_features = dt_coverage[int(coverage_train * len(dt_coverage))]\n",
+    "    final_date = dt_coverage[-1]\n",
+    "    return start_date, end_of_features, final_date\n",
+    "\n",
+    "\n",
+    "def dataset_construction(min_date, end_features_date, max_date, directory_path):\n",
+    "    \"\"\"\n",
+    "    Build the modelling dataset of one company.\n",
+    "\n",
+    "    Explanatory variables are KPIs computed on the campaigns sent and the purchases\n",
+    "    made between min_date and end_features_date. The explained variable\n",
+    "    y_has_purchased is 1 for customers with at least one purchase after\n",
+    "    end_features_date and up to max_date, 0 otherwise.\n",
+    "\n",
+    "    min_date, end_features_date, max_date : dates parsed with pd.to_datetime (ISO8601, UTC)\n",
+    "    directory_path : company identifier given to display_databases and used as customer_id prefix\n",
+    "\n",
+    "    Returns the merged DataFrame, one row per customer_id.\n",
+    "    \"\"\"\n",
+    "\n",
+    "    # Import the customer, campaign and purchase tables of the company\n",
+    "    df_customerplus_clean_0 = display_databases(directory_path, file_name = \"customerplus_cleaned\")\n",
+    "    df_campaigns_information = display_databases(directory_path, file_name = \"campaigns_information\", datetime_col = ['opened_at', 'sent_at', 'campaign_sent_at'])\n",
+    "    df_products_purchased_reduced = display_databases(directory_path, file_name = \"products_purchased_reduced\", datetime_col = ['purchase_date'])\n",
+    "\n",
+    "    # Convert the period bounds to UTC timestamps before filtering\n",
+    "    max_date = pd.to_datetime(max_date, utc = True, format = 'ISO8601')\n",
+    "    end_features_date = pd.to_datetime(end_features_date, utc = True, format = 'ISO8601')\n",
+    "    min_date = pd.to_datetime(min_date, utc = True, format = 'ISO8601')\n",
+    "\n",
+    "    # Keep the campaigns sent during the feature period (copy: the frame is modified just below)\n",
+    "    df_campaigns_information = df_campaigns_information[(df_campaigns_information['sent_at'] <= end_features_date) & (df_campaigns_information['sent_at'] >= min_date)].copy()\n",
+    "    # Hide openings that happen after the feature period; .loc on the copy replaces the\n",
+    "    # chained assignment, which can write to a temporary object and leave the frame unchanged\n",
+    "    df_campaigns_information.loc[df_campaigns_information['opened_at'] >= end_features_date, 'opened_at'] = pd.NaT\n",
+    "\n",
+    "    # Keep the purchases of the feature period in a separate frame: the unfiltered\n",
+    "    # purchases are still needed below to build the explained variable\n",
+    "    df_products_purchased_features = df_products_purchased_reduced[(df_products_purchased_reduced['purchase_date'] <= end_features_date) & (df_products_purchased_reduced['purchase_date'] >= min_date)]\n",
+    "\n",
+    "    print(\"Data filtering : SUCCESS\")\n",
+    "\n",
+    "    # Merge everything and build the KPIs\n",
+    "\n",
+    "    # KPIs on advertising campaigns\n",
+    "    df_campaigns_kpi = campaigns_kpi_function(campaigns_information = df_campaigns_information)\n",
+    "\n",
+    "    # KPIs on purchasing behaviour\n",
+    "    df_tickets_kpi = tickets_kpi_function(tickets_information = df_products_purchased_features)\n",
+    "\n",
+    "    # KPIs on socio-demographic data\n",
+    "    df_customerplus_clean = customerplus_kpi_function(customerplus_clean = df_customerplus_clean_0)\n",
+    "\n",
+    "    print(\"KPIs construction : SUCCESS\")\n",
+    "\n",
+    "    # Merge with the customer-level KPIs\n",
+    "    df_customer = pd.merge(df_customerplus_clean, df_campaigns_kpi, on = 'customer_id', how = 'left')\n",
+    "\n",
+    "    # Fill NaN values\n",
+    "    df_customer[['nb_campaigns', 'nb_campaigns_opened']] = df_customer[['nb_campaigns', 'nb_campaigns_opened']].fillna(0)\n",
+    "\n",
+    "    # Merge with the purchasing-behaviour KPIs\n",
+    "    df_customer_product = pd.merge(df_tickets_kpi, df_customer, on = 'customer_id', how = 'outer')\n",
+    "\n",
+    "    # Fill NaN values\n",
+    "    df_customer_product[['nb_tickets', 'nb_purchases', 'total_amount', 'nb_suppliers', 'vente_internet_max', 'nb_tickets_internet']] = df_customer_product[['nb_tickets', 'nb_purchases', 'total_amount', 'nb_suppliers', 'vente_internet_max', 'nb_tickets_internet']].fillna(0)\n",
+    "\n",
+    "    print(\"Explanatory variable construction : SUCCESS\")\n",
+    "\n",
+    "    # 2. Construction of the explained variable\n",
+    "    # Use the unfiltered purchases: the feature-period frame stops at end_features_date,\n",
+    "    # so selecting later dates from it always produced an empty target\n",
+    "    df_products_purchased_to_predict = df_products_purchased_reduced[(df_products_purchased_reduced['purchase_date'] <= max_date) & (df_products_purchased_reduced['purchase_date'] > end_features_date)]\n",
+    "\n",
+    "    # Purchase indicator: one row per customer who bought during the target period\n",
+    "    y = df_products_purchased_to_predict[['customer_id']].drop_duplicates().assign(y_has_purchased = 1)\n",
+    "\n",
+    "    print(\"Explained variable construction : SUCCESS\")\n",
+    "\n",
+    "    # 3. Merge between explained and explanatory variables\n",
+    "    dataset = pd.merge(df_customer_product, y, on = ['customer_id'], how = 'left')\n",
+    "\n",
+    "    # 0 if there is no purchase (fillna returns a new object, so it must be assigned back)\n",
+    "    dataset['y_has_purchased'] = dataset['y_has_purchased'].fillna(0)\n",
+    "\n",
+    "    # add id_company prefix to customer_id\n",
+    "    dataset['customer_id'] = directory_path + '_' + dataset['customer_id'].astype('str')\n",
+    "\n",
+    "    return dataset\n",
+    "\n",
+    "## Exportation\n",
+    "\n",
+    "companies = {'musee' : ['1', '2', '3', '4', '101'],\n",
+    "             'sport': ['5', '6', '7', '8', '9'],\n",
+    "             'musique' : ['10', '11', '12', '13', '14']}\n",
+    "\n",
+    "type_of_comp = input('Choisissez le type de compagnie : sport ? musique ? musee ?')\n",
+    "list_of_comp = companies[type_of_comp]\n",
+    "# Export directory\n",
+    "BUCKET_OUT = f'projet-bdc2324-team1/Generalization/{type_of_comp}'\n",
+    "\n",
+    "# Create test dataset and train dataset for the selected type of company\n",
+    "\n",
+    "start_date, end_of_features, final_date = df_coverage_modelization(list_of_comp, coverage_train = 0.7)\n",
+    "\n",
+    "for company in list_of_comp:\n",
+    "    dataset_test = dataset_construction(min_date = start_date, end_features_date = end_of_features,\n",
+    "                                        max_date = final_date, directory_path = company)\n",
+    "\n",
+    "    # Exportation\n",
+    "    FILE_KEY_OUT_S3 = \"dataset_test\" + company + \".csv\"\n",
+    "    FILE_PATH_OUT_S3 = BUCKET_OUT + \"/Test_set/\" + FILE_KEY_OUT_S3\n",
+    "\n",
+    "    with fs.open(FILE_PATH_OUT_S3, 'w') as file_out:\n",
+    "        dataset_test.to_csv(file_out, index = False)\n",
+    "\n",
+    "    print(\"Exportation dataset test : SUCCESS\")\n",
+    "\n",
+    "    # Dataset train\n",
+    "    # NOTE(review): this call passes exactly the same arguments as the dataset_test call above,\n",
+    "    # so the exported train and test sets are built on the same periods - confirm the intended train window\n",
+    "    dataset_train = dataset_construction(min_date = start_date, end_features_date = end_of_features,\n",
+    "                                         max_date = final_date, directory_path = company)\n",
+    "    # Export\n",
+    "    FILE_KEY_OUT_S3 = \"dataset_train\" + company + \".csv\"\n",
+    "    FILE_PATH_OUT_S3 = BUCKET_OUT + \"/Train_test/\" + FILE_KEY_OUT_S3\n",
+    "\n",
+    "    with fs.open(FILE_PATH_OUT_S3,
'w') as file_out:\n", + " dataset_train.to_csv(file_out, index = False)\n", + " \n", + " print(\"Exportation dataset train : SUCCESS\")\n", + "\n", + "\n", + "print(\"FIN DE LA GENERATION DES DATASETS : SUCCESS\")\n" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "id": "3721427e-5957-4556-b278-2e7ffca892f4", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "'projet-bdc2324-team1/Generalization/musique/Train_test/dataset_train14.csv'" + ] + }, + "execution_count": 2, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "FILE_PATH_OUT_S3" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "id": "a3bfeeb6-2db0-4f1d-866c-8721343e97c5", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "customer_id 0.000000\n", + "nb_tickets 0.000000\n", + "nb_purchases 0.000000\n", + "total_amount 0.000000\n", + "nb_suppliers 0.000000\n", + "vente_internet_max 0.000000\n", + "purchase_date_min 0.858950\n", + "purchase_date_max 0.858950\n", + "time_between_purchase 0.858950\n", + "nb_tickets_internet 0.000000\n", + "street_id 0.000000\n", + "structure_id 0.869838\n", + "mcp_contact_id 0.276677\n", + "fidelity 0.000000\n", + "tenant_id 0.000000\n", + "is_partner 0.000000\n", + "deleted_at 1.000000\n", + "gender 0.000000\n", + "is_email_true 0.000000\n", + "opt_in 0.000000\n", + "last_buying_date 0.709626\n", + "max_price 0.709626\n", + "ticket_sum 0.000000\n", + "average_price 0.709626\n", + "average_purchase_delay 0.709731\n", + "average_price_basket 0.709731\n", + "average_ticket_basket 0.709731\n", + "total_price 0.000000\n", + "purchase_count 0.000000\n", + "first_buying_date 0.709626\n", + "country 0.152090\n", + "gender_label 0.000000\n", + "gender_female 0.000000\n", + "gender_male 0.000000\n", + "gender_other 0.000000\n", + "country_fr 0.152090\n", + "has_tags 0.000000\n", + "nb_campaigns 0.000000\n", + "nb_campaigns_opened 0.000000\n", + "time_to_open 0.848079\n", + "y_has_purchased 
1.000000\n", + "dtype: float64" + ] + }, + "execution_count": 6, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + " dataset_train.isna().sum()/dataset_train.shape[0]" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "id": "a4c4e994-231b-4467-aa1b-0a5283c59dd5", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " 
\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
customer_idnb_ticketsnb_purchasestotal_amountnb_suppliersvente_internet_maxpurchase_date_minpurchase_date_maxtime_between_purchasenb_tickets_internet...gender_labelgender_femalegender_malegender_othercountry_frhas_tagsnb_campaignsnb_campaigns_openedtime_to_openy_has_purchased
014_12.02.070.01.01.01464.9384491464.9380210.0004282.0...other0011.009.01.00 days 00:36:13NaN
114_27.03.0145.02.01.01466.030116365.3350001100.6951167.0...male0101.019.04.00 days 02:30:09.250000NaN
214_32.02.070.01.01.01476.9078941476.9076620.0002312.0...female1001.006.01.00 days 20:58:45NaN
314_42.02.032.01.01.01465.9078941465.9074650.0004282.0...male0101.006.00.0NaTNaN
414_52.02.070.01.01.01465.3738661465.3738190.0000462.0...female1001.007.00.0NaTNaN
..................................................................
34312114_68847480.00.00.00.00.0NaNNaNNaN0.0...male0101.000.00.0NaTNaN
34312214_68847490.00.00.00.00.0NaNNaNNaN0.0...male0101.000.00.0NaTNaN
34312314_68847500.00.00.00.00.0NaNNaNNaN0.0...male0101.000.00.0NaTNaN
34312414_68847510.00.00.00.00.0NaNNaNNaN0.0...female1001.000.00.0NaTNaN
34312514_68847530.00.00.00.00.0NaNNaNNaN0.0...male0101.000.00.0NaTNaN
\n", + "

343126 rows × 41 columns

\n", + "
" + ], + "text/plain": [ + " customer_id nb_tickets nb_purchases total_amount nb_suppliers \\\n", + "0 14_1 2.0 2.0 70.0 1.0 \n", + "1 14_2 7.0 3.0 145.0 2.0 \n", + "2 14_3 2.0 2.0 70.0 1.0 \n", + "3 14_4 2.0 2.0 32.0 1.0 \n", + "4 14_5 2.0 2.0 70.0 1.0 \n", + "... ... ... ... ... ... \n", + "343121 14_6884748 0.0 0.0 0.0 0.0 \n", + "343122 14_6884749 0.0 0.0 0.0 0.0 \n", + "343123 14_6884750 0.0 0.0 0.0 0.0 \n", + "343124 14_6884751 0.0 0.0 0.0 0.0 \n", + "343125 14_6884753 0.0 0.0 0.0 0.0 \n", + "\n", + " vente_internet_max purchase_date_min purchase_date_max \\\n", + "0 1.0 1464.938449 1464.938021 \n", + "1 1.0 1466.030116 365.335000 \n", + "2 1.0 1476.907894 1476.907662 \n", + "3 1.0 1465.907894 1465.907465 \n", + "4 1.0 1465.373866 1465.373819 \n", + "... ... ... ... \n", + "343121 0.0 NaN NaN \n", + "343122 0.0 NaN NaN \n", + "343123 0.0 NaN NaN \n", + "343124 0.0 NaN NaN \n", + "343125 0.0 NaN NaN \n", + "\n", + " time_between_purchase nb_tickets_internet ... gender_label \\\n", + "0 0.000428 2.0 ... other \n", + "1 1100.695116 7.0 ... male \n", + "2 0.000231 2.0 ... female \n", + "3 0.000428 2.0 ... male \n", + "4 0.000046 2.0 ... female \n", + "... ... ... ... ... \n", + "343121 NaN 0.0 ... male \n", + "343122 NaN 0.0 ... male \n", + "343123 NaN 0.0 ... male \n", + "343124 NaN 0.0 ... female \n", + "343125 NaN 0.0 ... male \n", + "\n", + " gender_female gender_male gender_other country_fr has_tags \\\n", + "0 0 0 1 1.0 0 \n", + "1 0 1 0 1.0 1 \n", + "2 1 0 0 1.0 0 \n", + "3 0 1 0 1.0 0 \n", + "4 1 0 0 1.0 0 \n", + "... ... ... ... ... ... \n", + "343121 0 1 0 1.0 0 \n", + "343122 0 1 0 1.0 0 \n", + "343123 0 1 0 1.0 0 \n", + "343124 1 0 0 1.0 0 \n", + "343125 0 1 0 1.0 0 \n", + "\n", + " nb_campaigns nb_campaigns_opened time_to_open \\\n", + "0 9.0 1.0 0 days 00:36:13 \n", + "1 9.0 4.0 0 days 02:30:09.250000 \n", + "2 6.0 1.0 0 days 20:58:45 \n", + "3 6.0 0.0 NaT \n", + "4 7.0 0.0 NaT \n", + "... ... ... ... 
\n", + "343121 0.0 0.0 NaT \n", + "343122 0.0 0.0 NaT \n", + "343123 0.0 0.0 NaT \n", + "343124 0.0 0.0 NaT \n", + "343125 0.0 0.0 NaT \n", + "\n", + " y_has_purchased \n", + "0 NaN \n", + "1 NaN \n", + "2 NaN \n", + "3 NaN \n", + "4 NaN \n", + "... ... \n", + "343121 NaN \n", + "343122 NaN \n", + "343123 NaN \n", + "343124 NaN \n", + "343125 NaN \n", + "\n", + "[343126 rows x 41 columns]" + ] + }, + "execution_count": 7, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "dataset_test" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "75f9a672-641f-49a2-a8d6-7673845506f5", + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.11.6" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +}