From 12427e7b18ea9e8acaf6644d20bacd681fe1b212 Mon Sep 17 00:00:00 2001 From: frodrigue-ensae Date: Wed, 28 Feb 2024 05:51:50 +0000 Subject: [PATCH] stat --- Spectacle/Exploration_spectacle.ipynb | 1392 +++++++++++++++++++------ 1 file changed, 1089 insertions(+), 303 deletions(-) diff --git a/Spectacle/Exploration_spectacle.ipynb b/Spectacle/Exploration_spectacle.ipynb index 6324287..1e42d16 100644 --- a/Spectacle/Exploration_spectacle.ipynb +++ b/Spectacle/Exploration_spectacle.ipynb @@ -29,7 +29,7 @@ }, { "cell_type": "code", - "execution_count": 42, + "execution_count": 70, "id": "cca62d72-f809-41a9-bb06-1be7d6b09307", "metadata": {}, "outputs": [ @@ -42,7 +42,7 @@ " 'projet-bdc2324-team1/0_Input/Company_10/target_information.csv']" ] }, - "execution_count": 42, + "execution_count": 70, "metadata": {}, "output_type": "execute_result" } @@ -58,7 +58,7 @@ }, { "cell_type": "code", - "execution_count": 49, + "execution_count": 71, "id": "68fb54f3-8eb3-4cd0-966b-000876912fb5", "metadata": {}, "outputs": [ @@ -83,149 +83,119 @@ " \n", " \n", " \n", + " ticket_id\n", " customer_id\n", - " street_id\n", - " structure_id\n", - " mcp_contact_id\n", - " fidelity\n", - " tenant_id\n", - " is_partner\n", - " deleted_at\n", - " gender\n", - " is_email_true\n", - " ...\n", - " max_price\n", - " ticket_sum\n", - " average_price\n", - " average_purchase_delay\n", - " average_price_basket\n", - " average_ticket_basket\n", - " total_price\n", - " purchase_count\n", - " first_buying_date\n", - " country\n", + " purchase_id\n", + " event_type_id\n", + " supplier_name\n", + " purchase_date\n", + " amount\n", + " is_full_price\n", + " name_event_types\n", + " name_facilities\n", + " name_categories\n", + " name_events\n", + " name_seasons\n", + " start_date_time\n", + " end_date_time\n", + " open\n", " \n", " \n", " \n", " \n", " 0\n", - " 821538\n", - " 139\n", - " NaN\n", - " NaN\n", - " 0\n", - " 875\n", - " False\n", - " NaN\n", + " 1799177\n", + " 36984\n", + " 409613\n", " 2\n", + " guichet\n", + " 2016-04-28 17:58:26+02:00\n", + " 9.0\n", + " False\n", + " danse\n", + " le grand t\n", + " abo t gourmand jeune\n", + " aringa rossa\n", + " test 2016/2017\n", + " 2016-09-27 00:00:00+02:00\n", + " 1901-01-01 00:09:21+00:09\n", " True\n", - " ...\n", - " NaN\n", - " 0\n", - " NaN\n", - " NaN\n", - " NaN\n", - " NaN\n", - " 0.0\n", - " 0\n", - " NaN\n", - " NaN\n", " \n", " \n", " 1\n", - " 809126\n", - " 1063\n", - " NaN\n", - " NaN\n", - " 0\n", - " 875\n", + " 1799178\n", + " 36984\n", + " 409613\n", + " 3\n", + " guichet\n", + " 2016-04-28 17:58:26+02:00\n", + " 9.0\n", " False\n", - " NaN\n", - " 2\n", + " cirque\n", + " le grand t\n", + " abo t gourmand jeune\n", + " 5èmes hurlants\n", + " test 2016/2017\n", + " 2016-11-18 00:00:00+01:00\n", + " 1901-01-01 00:09:21+00:09\n", " True\n", - " ...\n", - " NaN\n", - " 0\n", - " NaN\n", - " NaN\n", - " NaN\n", - " NaN\n", - " 0.0\n", - " 0\n", - " NaN\n", - " fr\n", " \n", " \n", " 2\n", - " 11005\n", - " 1063\n", - " NaN\n", - " NaN\n", - " 0\n", - " 875\n", + " 1799179\n", + " 36984\n", + " 409613\n", + " 1\n", + " guichet\n", + " 2016-04-28 17:58:26+02:00\n", + " 9.0\n", " False\n", - " NaN\n", - " 2\n", - " False\n", - " ...\n", - " NaN\n", - " 0\n", - " 0.0\n", - " NaN\n", - " NaN\n", - " NaN\n", - " NaN\n", - " 14\n", - " NaN\n", - " fr\n", + " théâtre\n", + " le grand t\n", + " abo t gourmand jeune\n", + " dom juan\n", + " test 2016/2017\n", + " 2016-12-07 00:00:00+01:00\n", + " 1901-01-01 00:09:21+00:09\n", + " True\n", " \n", " \n", " 3\n", - " 17663\n", - " 12731\n", - " NaN\n", - " NaN\n", - " 0\n", - " 875\n", - " False\n", - " NaN\n", - " 0\n", - " False\n", - " ...\n", - " NaN\n", - " 0\n", - " 0.0\n", - " NaN\n", - " NaN\n", - " NaN\n", - " NaN\n", + " 1799180\n", + " 36984\n", + " 409613\n", " 1\n", - " NaN\n", - " fr\n", + " guichet\n", + " 2016-04-28 17:58:26+02:00\n", + " 9.0\n", + " False\n", + " théâtre\n", + " le grand t\n", + " abo t gourmand jeune\n", + " vanishing point\n", + " test 2016/2017\n", + " 2017-01-04 00:00:00+01:00\n", + " 1901-01-01 00:09:21+00:09\n", + " True\n", " \n", " \n", " 4\n", - " 38100\n", - " 12395\n", - " NaN\n", - " NaN\n", - " 0\n", - " 875\n", + " 1799181\n", + " 36984\n", + " 409613\n", + " 3\n", + " guichet\n", + " 2016-04-28 17:58:26+02:00\n", + " 12.0\n", " False\n", - " NaN\n", - " 0\n", + " cirque\n", + " la cite des congres\n", + " abo t gourmand jeune\n", + " a o lang pho\n", + " test 2016/2017\n", + " 2017-01-03 00:00:00+01:00\n", + " 1901-01-01 00:09:21+00:09\n", " True\n", - " ...\n", - " NaN\n", - " 0\n", - " 0.0\n", - " NaN\n", - " NaN\n", - " NaN\n", - " NaN\n", - " 1\n", - " NaN\n", - " fr\n", " \n", " \n", " ...\n", @@ -245,213 +215,183 @@ " ...\n", " ...\n", " ...\n", - " ...\n", - " ...\n", - " ...\n", - " ...\n", - " ...\n", " \n", " \n", - " 98789\n", - " 766266\n", - " 139\n", - " NaN\n", - " 181304.0\n", - " 0\n", - " 875\n", + " 492309\n", + " 3252232\n", + " 621716\n", + " 710062\n", + " 1\n", + " guichet\n", + " 2023-03-09 12:08:45+01:00\n", + " 7.0\n", " False\n", - " NaN\n", - " 2\n", + " théâtre\n", + " cap nort\n", + " tarif sco co 1 seance scolaire\n", + " sur moi, le temps\n", + " 2022/2023\n", + " 2023-03-13 14:00:00+01:00\n", + " 1901-01-01 00:09:21+00:09\n", " True\n", - " ...\n", - " NaN\n", - " 0\n", - " NaN\n", - " NaN\n", - " NaN\n", - " NaN\n", - " 0.0\n", - " 0\n", - " NaN\n", - " NaN\n", " \n", " \n", - " 98790\n", - " 766336\n", - " 139\n", - " NaN\n", - " 178189.0\n", - " 0\n", - " 875\n", + " 492310\n", + " 3252233\n", + " 621716\n", + " 710062\n", + " 1\n", + " guichet\n", + " 2023-03-09 12:08:45+01:00\n", + " 7.0\n", " False\n", - " NaN\n", - " 2\n", + " théâtre\n", + " cap nort\n", + " tarif sco co 1 seance scolaire\n", + " sur moi, le temps\n", + " 2022/2023\n", + " 2023-03-13 14:00:00+01:00\n", + " 1901-01-01 00:09:21+00:09\n", " True\n", - " ...\n", - " NaN\n", - " 0\n", - " NaN\n", - " NaN\n", - " NaN\n", - " NaN\n", - " 0.0\n", - " 0\n", - " NaN\n", - " NaN\n", " \n", " \n", - " 98791\n", - " 766348\n", - " 139\n", - " NaN\n", - " 178141.0\n", - " 0\n", - " 875\n", + " 492311\n", + " 3252234\n", + " 621716\n", + " 710062\n", + " 1\n", + " guichet\n", + " 2023-03-09 12:08:45+01:00\n", + " 7.0\n", " False\n", - " NaN\n", - " 2\n", + " théâtre\n", + " cap nort\n", + " tarif sco co 1 seance scolaire\n", + " sur moi, le temps\n", + " 2022/2023\n", + " 2023-03-13 14:00:00+01:00\n", + " 1901-01-01 00:09:21+00:09\n", " True\n", - " ...\n", - " NaN\n", - " 0\n", - " NaN\n", - " NaN\n", - " NaN\n", - " NaN\n", - " 0.0\n", - " 0\n", - " NaN\n", - " NaN\n", " \n", " \n", - " 98792\n", - " 766363\n", - " 139\n", - " NaN\n", - " 176807.0\n", - " 0\n", - " 875\n", + " 492312\n", + " 3252235\n", + " 621716\n", + " 710062\n", + " 1\n", + " guichet\n", + " 2023-03-09 12:08:45+01:00\n", + " 7.0\n", " False\n", - " NaN\n", - " 2\n", + " théâtre\n", + " cap nort\n", + " tarif sco co 1 seance scolaire\n", + " sur moi, le temps\n", + " 2022/2023\n", + " 2023-03-13 14:00:00+01:00\n", + " 1901-01-01 00:09:21+00:09\n", " True\n", - " ...\n", - " NaN\n", - " 0\n", - " NaN\n", - " NaN\n", - " NaN\n", - " NaN\n", - " 0.0\n", - " 0\n", - " NaN\n", - " NaN\n", " \n", " \n", - " 98793\n", - " 766366\n", - " 139\n", - " NaN\n", - " 176788.0\n", - " 0\n", - " 875\n", + " 492313\n", + " 3252236\n", + " 621716\n", + " 710062\n", + " 1\n", + " guichet\n", + " 2023-03-09 12:08:45+01:00\n", + " 7.0\n", " False\n", - " NaN\n", - " 2\n", + " théâtre\n", + " cap nort\n", + " tarif sco co 1 seance scolaire\n", + " sur moi, le temps\n", + " 2022/2023\n", + " 2023-03-13 14:00:00+01:00\n", + " 1901-01-01 00:09:21+00:09\n", " True\n", - " ...\n", - " NaN\n", - " 0\n", - " NaN\n", - " NaN\n", - " NaN\n", - " NaN\n", - " 0.0\n", - " 0\n", - " NaN\n", - " NaN\n", " \n", " \n", "\n", - "

98794 rows × 22 columns

\n", + "

492314 rows × 16 columns

\n", "" ], "text/plain": [ - " customer_id street_id structure_id mcp_contact_id fidelity \\\n", - "0 821538 139 NaN NaN 0 \n", - "1 809126 1063 NaN NaN 0 \n", - "2 11005 1063 NaN NaN 0 \n", - "3 17663 12731 NaN NaN 0 \n", - "4 38100 12395 NaN NaN 0 \n", - "... ... ... ... ... ... \n", - "98789 766266 139 NaN 181304.0 0 \n", - "98790 766336 139 NaN 178189.0 0 \n", - "98791 766348 139 NaN 178141.0 0 \n", - "98792 766363 139 NaN 176807.0 0 \n", - "98793 766366 139 NaN 176788.0 0 \n", + " ticket_id customer_id purchase_id event_type_id supplier_name \\\n", + "0 1799177 36984 409613 2 guichet \n", + "1 1799178 36984 409613 3 guichet \n", + "2 1799179 36984 409613 1 guichet \n", + "3 1799180 36984 409613 1 guichet \n", + "4 1799181 36984 409613 3 guichet \n", + "... ... ... ... ... ... \n", + "492309 3252232 621716 710062 1 guichet \n", + "492310 3252233 621716 710062 1 guichet \n", + "492311 3252234 621716 710062 1 guichet \n", + "492312 3252235 621716 710062 1 guichet \n", + "492313 3252236 621716 710062 1 guichet \n", "\n", - " tenant_id is_partner deleted_at gender is_email_true ... \\\n", - "0 875 False NaN 2 True ... \n", - "1 875 False NaN 2 True ... \n", - "2 875 False NaN 2 False ... \n", - "3 875 False NaN 0 False ... \n", - "4 875 False NaN 0 True ... \n", - "... ... ... ... ... ... ... \n", - "98789 875 False NaN 2 True ... \n", - "98790 875 False NaN 2 True ... \n", - "98791 875 False NaN 2 True ... \n", - "98792 875 False NaN 2 True ... \n", - "98793 875 False NaN 2 True ... \n", + " purchase_date amount is_full_price name_event_types \\\n", + "0 2016-04-28 17:58:26+02:00 9.0 False danse \n", + "1 2016-04-28 17:58:26+02:00 9.0 False cirque \n", + "2 2016-04-28 17:58:26+02:00 9.0 False théâtre \n", + "3 2016-04-28 17:58:26+02:00 9.0 False théâtre \n", + "4 2016-04-28 17:58:26+02:00 12.0 False cirque \n", + "... ... ... ... ... \n", + "492309 2023-03-09 12:08:45+01:00 7.0 False théâtre \n", + "492310 2023-03-09 12:08:45+01:00 7.0 False théâtre \n", + "492311 2023-03-09 12:08:45+01:00 7.0 False théâtre \n", + "492312 2023-03-09 12:08:45+01:00 7.0 False théâtre \n", + "492313 2023-03-09 12:08:45+01:00 7.0 False théâtre \n", "\n", - " max_price ticket_sum average_price average_purchase_delay \\\n", - "0 NaN 0 NaN NaN \n", - "1 NaN 0 NaN NaN \n", - "2 NaN 0 0.0 NaN \n", - "3 NaN 0 0.0 NaN \n", - "4 NaN 0 0.0 NaN \n", - "... ... ... ... ... \n", - "98789 NaN 0 NaN NaN \n", - "98790 NaN 0 NaN NaN \n", - "98791 NaN 0 NaN NaN \n", - "98792 NaN 0 NaN NaN \n", - "98793 NaN 0 NaN NaN \n", + " name_facilities name_categories \\\n", + "0 le grand t abo t gourmand jeune \n", + "1 le grand t abo t gourmand jeune \n", + "2 le grand t abo t gourmand jeune \n", + "3 le grand t abo t gourmand jeune \n", + "4 la cite des congres abo t gourmand jeune \n", + "... ... ... \n", + "492309 cap nort tarif sco co 1 seance scolaire \n", + "492310 cap nort tarif sco co 1 seance scolaire \n", + "492311 cap nort tarif sco co 1 seance scolaire \n", + "492312 cap nort tarif sco co 1 seance scolaire \n", + "492313 cap nort tarif sco co 1 seance scolaire \n", "\n", - " average_price_basket average_ticket_basket total_price \\\n", - "0 NaN NaN 0.0 \n", - "1 NaN NaN 0.0 \n", - "2 NaN NaN NaN \n", - "3 NaN NaN NaN \n", - "4 NaN NaN NaN \n", - "... ... ... ... \n", - "98789 NaN NaN 0.0 \n", - "98790 NaN NaN 0.0 \n", - "98791 NaN NaN 0.0 \n", - "98792 NaN NaN 0.0 \n", - "98793 NaN NaN 0.0 \n", + " name_events name_seasons start_date_time \\\n", + "0 aringa rossa test 2016/2017 2016-09-27 00:00:00+02:00 \n", + "1 5èmes hurlants test 2016/2017 2016-11-18 00:00:00+01:00 \n", + "2 dom juan test 2016/2017 2016-12-07 00:00:00+01:00 \n", + "3 vanishing point test 2016/2017 2017-01-04 00:00:00+01:00 \n", + "4 a o lang pho test 2016/2017 2017-01-03 00:00:00+01:00 \n", + "... ... ... ... \n", + "492309 sur moi, le temps 2022/2023 2023-03-13 14:00:00+01:00 \n", + "492310 sur moi, le temps 2022/2023 2023-03-13 14:00:00+01:00 \n", + "492311 sur moi, le temps 2022/2023 2023-03-13 14:00:00+01:00 \n", + "492312 sur moi, le temps 2022/2023 2023-03-13 14:00:00+01:00 \n", + "492313 sur moi, le temps 2022/2023 2023-03-13 14:00:00+01:00 \n", "\n", - " purchase_count first_buying_date country \n", - "0 0 NaN NaN \n", - "1 0 NaN fr \n", - "2 14 NaN fr \n", - "3 1 NaN fr \n", - "4 1 NaN fr \n", - "... ... ... ... \n", - "98789 0 NaN NaN \n", - "98790 0 NaN NaN \n", - "98791 0 NaN NaN \n", - "98792 0 NaN NaN \n", - "98793 0 NaN NaN \n", + " end_date_time open \n", + "0 1901-01-01 00:09:21+00:09 True \n", + "1 1901-01-01 00:09:21+00:09 True \n", + "2 1901-01-01 00:09:21+00:09 True \n", + "3 1901-01-01 00:09:21+00:09 True \n", + "4 1901-01-01 00:09:21+00:09 True \n", + "... ... ... \n", + "492309 1901-01-01 00:09:21+00:09 True \n", + "492310 1901-01-01 00:09:21+00:09 True \n", + "492311 1901-01-01 00:09:21+00:09 True \n", + "492312 1901-01-01 00:09:21+00:09 True \n", + "492313 1901-01-01 00:09:21+00:09 True \n", "\n", - "[98794 rows x 22 columns]" + "[492314 rows x 16 columns]" ] }, - "execution_count": 49, + "execution_count": 71, "metadata": {}, "output_type": "execute_result" } ], "source": [ - "customerplus_cleaned" + "products_purchased_reduced" ] }, { @@ -842,9 +782,52 @@ }, { "cell_type": "code", - "execution_count": 53, + "execution_count": 73, "id": "df124880-1e4f-4eaf-b0ef-72bb4f840d45", "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "customer_id 0\n", + "nb_tickets 0\n", + "nb_purchases 0\n", + "total_amount 0\n", + "nb_suppliers 0\n", + "vente_internet_max 0\n", + "purchase_date_min 0\n", + "purchase_date_max 0\n", + "time_between_purchase 0\n", + "nb_tickets_internet 0\n", + "dtype: int64" + ] + }, + "execution_count": 73, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df_tickets_kpi.isna().sum()" + ] + }, + { + "cell_type": "code", + "execution_count": 76, + "id": "7e2ab67d-1cf6-41de-804e-23c14e0be7d5", + "metadata": {}, + "outputs": [], + "source": [ + " # KPI sur le comportement d'achat\n", + " \n", + "df_tickets_kpi = tickets_kpi_function(tickets_information = purchases)" + ] + }, + { + "cell_type": "code", + "execution_count": 77, + "id": "7be68aa3-16de-4319-93d4-0c28258e3dd8", + "metadata": {}, "outputs": [ { "data": { @@ -1071,7 +1054,7 @@ "[26105 rows x 10 columns]" ] }, - "execution_count": 53, + "execution_count": 77, "metadata": {}, "output_type": "execute_result" } @@ -1084,7 +1067,9 @@ "cell_type": "code", "execution_count": 57, "id": "4e8c0d75-117f-4400-8d55-b3ae3f43501b", - "metadata": {}, + "metadata": { + "scrolled": true + }, "outputs": [ { "data": { @@ -1468,6 +1453,55 @@ "df_customerplus_clean" ] }, + { + "cell_type": "code", + "execution_count": 72, + "id": "59e3a6f5-97e6-48c6-b3f8-4333a0d94eb5", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "customer_id 0\n", + "street_id 0\n", + "structure_id 96706\n", + "mcp_contact_id 19094\n", + "fidelity 0\n", + "tenant_id 0\n", + "is_partner 0\n", + "deleted_at 98794\n", + "gender 0\n", + "is_email_true 0\n", + "opt_in 0\n", + "last_buying_date 73081\n", + "max_price 73081\n", + "ticket_sum 0\n", + "average_price 35539\n", + "average_purchase_delay 73081\n", + "average_price_basket 73081\n", + "average_ticket_basket 73081\n", + "total_price 37542\n", + "purchase_count 0\n", + "first_buying_date 73081\n", + "country 44192\n", + "gender_label 0\n", + "gender_female 0\n", + "gender_male 0\n", + "gender_other 0\n", + "country_fr 44192\n", + "has_tags 0\n", + "dtype: int64" + ] + }, + "execution_count": 72, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df_customerplus_clean.isna().sum()" + ] + }, { "cell_type": "code", "execution_count": 29, @@ -1511,7 +1545,7 @@ }, { "cell_type": "code", - "execution_count": 33, + "execution_count": 65, "id": "17b89ca1-deea-4139-a6c0-7822cc4e7a90", "metadata": {}, "outputs": [ @@ -1667,7 +1701,7 @@ "[69258 rows x 5 columns]" ] }, - "execution_count": 33, + "execution_count": 65, "metadata": {}, "output_type": "execute_result" } @@ -1678,27 +1712,779 @@ }, { "cell_type": "code", - "execution_count": 40, - "id": "c90d94ab-cf0e-4d18-9d5e-cb1d22f4d58b", + "execution_count": 64, + "id": "27a3c2bf-0541-43b4-b62d-4621692f6c66", + "metadata": {}, + "outputs": [], + "source": [ + "pd.reset_option('display.max_rows',70000)\n" + ] + }, + { + "cell_type": "code", + "execution_count": 68, + "id": "51e57220-021f-4b0f-a2c9-360d612c9f75", "metadata": {}, "outputs": [ { - "ename": "SyntaxError", - "evalue": "f-string: expecting '}' (1665996669.py, line 1)", - "output_type": "error", - "traceback": [ - "\u001b[0;36m Cell \u001b[0;32mIn[40], line 1\u001b[0;36m\u001b[0m\n\u001b[0;31m BUCKET_OUT = f'projet-bdc2324-team1/Generalization/{'musee'}'\u001b[0m\n\u001b[0m ^\u001b[0m\n\u001b[0;31mSyntaxError\u001b[0m\u001b[0;31m:\u001b[0m f-string: expecting '}'\n" + "data": { + "text/plain": [ + "0 Newsletter mensuelle\n", + "1 Newsletter mensuelle\n", + "2 Newsletter mensuelle\n", + "3 Newsletter mensuelle\n", + "4 Newsletter mensuelle\n", + " ... \n", + "9995 Newsletter mensuelle\n", + "9996 Newsletter mensuelle\n", + "9997 Newsletter mensuelle\n", + "9998 Newsletter mensuelle\n", + "9999 Newsletter mensuelle\n", + "Name: target_name, Length: 10000, dtype: object" + ] + }, + "execution_count": 68, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "targets[\"target_name\"].head(10000)" + ] + }, + { + "cell_type": "code", + "execution_count": 88, + "id": "db3748e6-795e-459c-86dd-3389455af217", + "metadata": {}, + "outputs": [], + "source": [ + "companies = {'musee' : ['1', '2', '3', '4', '101'],\n", + " 'sport': ['5', '6', '7', '8', '9'],\n", + " 'musique' : ['10', '11', '12', '13', '14']}" + ] + }, + { + "cell_type": "code", + "execution_count": 90, + "id": "d6767ba6-94ef-43f9-8f67-15ecdb41a70b", + "metadata": {}, + "outputs": [ + { + "name": "stdin", + "output_type": "stream", + "text": [ + "Choisissez le type de compagnie : sport ? musique ? musee ? musique\n" ] } ], "source": [ - "BUCKET_OUT = f'projet-bdc2324-team1/Generalization/{'musee'}'" + "type_of_comp = input('Choisissez le type de compagnie : sport ? musique ? musee ?')\n", + "list_of_comp = companies[type_of_comp] \n" + ] + }, + { + "cell_type": "code", + "execution_count": 91, + "id": "050963aa-5cdc-4ff2-a380-16efec89adf0", + "metadata": {}, + "outputs": [], + "source": [ + "# Dossier d'exportation\n", + "BUCKET_OUT = f'projet-bdc2324-team1/Generalization/{type_of_comp}'" + ] + }, + { + "cell_type": "code", + "execution_count": 100, + "id": "21a32b69-de53-45ce-9e31-22c45c223924", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "'projet-bdc2324-team1/Generalization/musique'" + ] + }, + "execution_count": 100, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "BUCKET_OUT" + ] + }, + { + "cell_type": "code", + "execution_count": 96, + "id": "177c4742-5ec6-4326-b984-09e673791801", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "'projet-bdc2324-team1/Generalization/musique'" + ] + }, + "execution_count": 96, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "'projet-bdc2324-team1/Generalization/musique'" + ] + }, + { + "cell_type": "code", + "execution_count": 98, + "id": "80c6d397-117e-493d-ab0f-7698dbfa8cc4", + "metadata": {}, + "outputs": [], + "source": [ + "def display_covering_time(df, company, datecover):\n", + " \"\"\"\n", + " This function draws the time coverage of each company\n", + " \"\"\"\n", + " min_date = df['purchase_date'].min().strftime(\"%Y-%m-%d\")\n", + " max_date = df['purchase_date'].max().strftime(\"%Y-%m-%d\")\n", + " datecover[company] = [datetime.strptime(min_date, \"%Y-%m-%d\") + timedelta(days=x) for x in range((datetime.strptime(max_date, \"%Y-%m-%d\") - datetime.strptime(min_date, \"%Y-%m-%d\")).days)]\n", + " print(f'Couverture Company {company} : {min_date} - {max_date}')\n", + " return datecover\n", + "\n", + "\n", + "def compute_time_intersection(datecover):\n", + " \"\"\"\n", + " This function returns the time coverage for all companies\n", + " \"\"\"\n", + " timestamps_sets = [set(timestamps) for timestamps in datecover.values()]\n", + " intersection = set.intersection(*timestamps_sets)\n", + " intersection_list = list(intersection)\n", + " formated_dates = [dt.strftime(\"%Y-%m-%d\") for dt in intersection_list]\n", + " return sorted(formated_dates)\n", + "\n", + "\n", + "def df_coverage_modelization(sport, coverage_train = 0.7):\n", + " \"\"\"\n", + " This function returns start_date, end_of_features and final dates\n", + " that help to construct train and test datasets\n", + " \"\"\"\n", + " datecover = {}\n", + " for company in sport:\n", + " df_products_purchased_reduced = display_databases(company, file_name = \"products_purchased_reduced\",\n", + " datetime_col = ['purchase_date'])\n", + " datecover = display_covering_time(df_products_purchased_reduced, company, datecover)\n", + " #print(datecover.keys())\n", + " dt_coverage = compute_time_intersection(datecover)\n", + " start_date = dt_coverage[0]\n", + " end_of_features = dt_coverage[int(0.7 * len(dt_coverage))]\n", + " final_date = dt_coverage[-1]\n", + " return start_date, end_of_features, final_date\n", + " \n", + "\n", + "def dataset_construction(min_date, end_features_date, max_date, directory_path):\n", + " \n", + " # Import customerplus\n", + " df_customerplus_clean_0 = display_databases(directory_path, file_name = \"customerplus_cleaned\")\n", + " df_campaigns_information = display_databases(directory_path, file_name = \"campaigns_information\", datetime_col = ['opened_at', 'sent_at', 'campaign_sent_at'])\n", + " df_products_purchased_reduced = display_databases(directory_path, file_name = \"products_purchased_reduced\", datetime_col = ['purchase_date'])\n", + " \n", + " # Filtre de cohérence pour la mise en pratique de notre méthode\n", + " max_date = pd.to_datetime(max_date, utc = True, format = 'ISO8601') \n", + " end_features_date = pd.to_datetime(end_features_date, utc = True, format = 'ISO8601')\n", + " min_date = pd.to_datetime(min_date, utc = True, format = 'ISO8601')\n", + "\n", + " #Filtre de la base df_campaigns_information\n", + " df_campaigns_information = df_campaigns_information[(df_campaigns_information['sent_at'] <= end_features_date) & (df_campaigns_information['sent_at'] >= min_date)]\n", + " df_campaigns_information['opened_at'][df_campaigns_information['opened_at'] >= end_features_date] = np.datetime64('NaT')\n", + " \n", + " #Filtre de la base df_products_purchased_reduced\n", + " df_products_purchased_reduced = df_products_purchased_reduced[(df_products_purchased_reduced['purchase_date'] <= end_features_date) & (df_products_purchased_reduced['purchase_date'] >= min_date)]\n", + "\n", + " print(\"Data filtering : SUCCESS\")\n", + " \n", + " # Fusion de l'ensemble et creation des KPI\n", + "\n", + " # KPI sur les campagnes publicitaires\n", + " df_campaigns_kpi = campaigns_kpi_function(campaigns_information = df_campaigns_information) \n", + "\n", + " # KPI sur le comportement d'achat\n", + " df_tickets_kpi = tickets_kpi_function(tickets_information = df_products_purchased_reduced)\n", + "\n", + " # KPI sur les données socio-démographiques\n", + " df_customerplus_clean = customerplus_kpi_function(customerplus_clean = df_customerplus_clean_0)\n", + " \n", + " print(\"KPIs construction : SUCCESS\")\n", + " \n", + " # Fusion avec KPI liés au customer\n", + " df_customer = pd.merge(df_customerplus_clean, df_campaigns_kpi, on = 'customer_id', how = 'left')\n", + " \n", + " # Fill NaN values\n", + " df_customer[['nb_campaigns', 'nb_campaigns_opened']] = df_customer[['nb_campaigns', 'nb_campaigns_opened']].fillna(0)\n", + " \n", + " # Fusion avec KPI liés au comportement d'achat\n", + " df_customer_product = pd.merge(df_tickets_kpi, df_customer, on = 'customer_id', how = 'outer')\n", + " \n", + " # Fill NaN values\n", + " df_customer_product[['nb_tickets', 'nb_purchases', 'total_amount', 'nb_suppliers', 'vente_internet_max', 'nb_tickets_internet']] = df_customer_product[['nb_tickets', 'nb_purchases', 'total_amount', 'nb_suppliers', 'vente_internet_max', 'nb_tickets_internet']].fillna(0)\n", + "\n", + " print(\"Explanatory variable construction : SUCCESS\")\n", + "\n", + " # 2. Construction of the explained variable \n", + " df_products_purchased_to_predict = df_products_purchased_reduced[(df_products_purchased_reduced['purchase_date'] <= max_date) & (df_products_purchased_reduced['purchase_date'] > end_features_date)]\n", + "\n", + " # Indicatrice d'achat\n", + " df_products_purchased_to_predict['y_has_purchased'] = 1\n", + "\n", + " y = df_products_purchased_to_predict[['customer_id', 'y_has_purchased']].drop_duplicates()\n", + "\n", + " print(\"Explained variable construction : SUCCESS\")\n", + " \n", + " # 3. Merge between explained and explanatory variables\n", + " dataset = pd.merge(df_customer_product, y, on = ['customer_id'], how = 'left')\n", + "\n", + " # 0 if there is no purchase\n", + " dataset[['y_has_purchased']].fillna(0) \n", + " \n", + " return dataset" + ] + }, + { + "cell_type": "code", + "execution_count": 99, + "id": "2a746097-0cbf-4bd6-b13b-6ee3e5c36fad", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "File path : projet-bdc2324-team1/0_Input/Company_10/products_purchased_reduced.csv\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + ":13: FutureWarning: The argument 'date_parser' is deprecated and will be removed in a future version. Please use 'date_format' instead, or read your data in as 'object' dtype and then call 'to_datetime'.\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Couverture Company 10 : 2016-03-07 - 2023-09-25\n", + "File path : projet-bdc2324-team1/0_Input/Company_11/products_purchased_reduced.csv\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + ":13: FutureWarning: The argument 'date_parser' is deprecated and will be removed in a future version. Please use 'date_format' instead, or read your data in as 'object' dtype and then call 'to_datetime'.\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Couverture Company 11 : 2015-06-26 - 2023-11-08\n", + "File path : projet-bdc2324-team1/0_Input/Company_12/products_purchased_reduced.csv\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + ":13: FutureWarning: The argument 'date_parser' is deprecated and will be removed in a future version. Please use 'date_format' instead, or read your data in as 'object' dtype and then call 'to_datetime'.\n", + ":13: DtypeWarning: Columns (4,8,10) have mixed types. Specify dtype option on import or set low_memory=False.\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Couverture Company 12 : 2016-06-14 - 2023-11-08\n", + "File path : projet-bdc2324-team1/0_Input/Company_13/products_purchased_reduced.csv\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + ":13: FutureWarning: The argument 'date_parser' is deprecated and will be removed in a future version. Please use 'date_format' instead, or read your data in as 'object' dtype and then call 'to_datetime'.\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Couverture Company 13 : 2010-07-31 - 2023-11-08\n", + "File path : projet-bdc2324-team1/0_Input/Company_14/products_purchased_reduced.csv\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + ":13: FutureWarning: The argument 'date_parser' is deprecated and will be removed in a future version. Please use 'date_format' instead, or read your data in as 'object' dtype and then call 'to_datetime'.\n", + ":13: DtypeWarning: Columns (8,9) have mixed types. Specify dtype option on import or set low_memory=False.\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Couverture Company 14 : 1901-01-01 - 2023-11-08\n", + "File path : projet-bdc2324-team1/0_Input/Company_10/customerplus_cleaned.csv\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + ":13: FutureWarning: The argument 'date_parser' is deprecated and will be removed in a future version. Please use 'date_format' instead, or read your data in as 'object' dtype and then call 'to_datetime'.\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "File path : projet-bdc2324-team1/0_Input/Company_10/campaigns_information.csv\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + ":13: FutureWarning: The argument 'date_parser' is deprecated and will be removed in a future version. Please use 'date_format' instead, or read your data in as 'object' dtype and then call 'to_datetime'.\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "File path : projet-bdc2324-team1/0_Input/Company_10/products_purchased_reduced.csv\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + ":13: FutureWarning: The argument 'date_parser' is deprecated and will be removed in a future version. Please use 'date_format' instead, or read your data in as 'object' dtype and then call 'to_datetime'.\n", + "/tmp/ipykernel_438/573049956.py:55: FutureWarning: ChainedAssignmentError: behaviour will change in pandas 3.0!\n", + "You are setting values through chained assignment. Currently this works in certain cases, but when using Copy-on-Write (which will become the default behaviour in pandas 3.0) this will never work to update the original DataFrame or Series, because the intermediate object on which we are setting values will behave as a copy.\n", + "A typical example is when you are setting values in a column of a DataFrame, like:\n", + "\n", + "df[\"col\"][row_indexer] = value\n", + "\n", + "Use `df.loc[row_indexer, \"col\"] = values` instead, to perform the assignment in a single step and ensure this keeps updating the original `df`.\n", + "\n", + "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n", + "\n", + " df_campaigns_information['opened_at'][df_campaigns_information['opened_at'] >= end_features_date] = np.datetime64('NaT')\n", + "/tmp/ipykernel_438/573049956.py:55: FutureWarning: Setting an item of incompatible dtype is deprecated and will raise an error in a future version of pandas. Value 'NaT' has dtype incompatible with datetime64[ns, UTC], please explicitly cast to a compatible dtype first.\n", + " df_campaigns_information['opened_at'][df_campaigns_information['opened_at'] >= end_features_date] = np.datetime64('NaT')\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Data filtering : SUCCESS\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + ":27: SettingWithCopyWarning: \n", + "A value is trying to be set on a copy of a slice from a DataFrame\n", + "\n", + "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "KPIs construction : SUCCESS\n", + "Explanatory variable construction : SUCCESS\n", + "Explained variable construction : SUCCESS\n", + "File path : projet-bdc2324-team1/0_Input/Company_11/customerplus_cleaned.csv\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + ":13: FutureWarning: The argument 'date_parser' is deprecated and will be removed in a future version. Please use 'date_format' instead, or read your data in as 'object' dtype and then call 'to_datetime'.\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "File path : projet-bdc2324-team1/0_Input/Company_11/campaigns_information.csv\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + ":13: FutureWarning: The argument 'date_parser' is deprecated and will be removed in a future version. Please use 'date_format' instead, or read your data in as 'object' dtype and then call 'to_datetime'.\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "File path : projet-bdc2324-team1/0_Input/Company_11/products_purchased_reduced.csv\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + ":13: FutureWarning: The argument 'date_parser' is deprecated and will be removed in a future version. Please use 'date_format' instead, or read your data in as 'object' dtype and then call 'to_datetime'.\n", + "/tmp/ipykernel_438/573049956.py:55: FutureWarning: ChainedAssignmentError: behaviour will change in pandas 3.0!\n", + "You are setting values through chained assignment. Currently this works in certain cases, but when using Copy-on-Write (which will become the default behaviour in pandas 3.0) this will never work to update the original DataFrame or Series, because the intermediate object on which we are setting values will behave as a copy.\n", + "A typical example is when you are setting values in a column of a DataFrame, like:\n", + "\n", + "df[\"col\"][row_indexer] = value\n", + "\n", + "Use `df.loc[row_indexer, \"col\"] = values` instead, to perform the assignment in a single step and ensure this keeps updating the original `df`.\n", + "\n", + "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n", + "\n", + " df_campaigns_information['opened_at'][df_campaigns_information['opened_at'] >= end_features_date] = np.datetime64('NaT')\n", + "/tmp/ipykernel_438/573049956.py:55: FutureWarning: Setting an item of incompatible dtype is deprecated and will raise an error in a future version of pandas. Value 'NaT' has dtype incompatible with datetime64[ns, UTC], please explicitly cast to a compatible dtype first.\n", + " df_campaigns_information['opened_at'][df_campaigns_information['opened_at'] >= end_features_date] = np.datetime64('NaT')\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Data filtering : SUCCESS\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + ":27: SettingWithCopyWarning: \n", + "A value is trying to be set on a copy of a slice from a DataFrame\n", + "\n", + "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "KPIs construction : SUCCESS\n", + "Explanatory variable construction : SUCCESS\n", + "Explained variable construction : SUCCESS\n", + "File path : projet-bdc2324-team1/0_Input/Company_12/customerplus_cleaned.csv\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + ":13: FutureWarning: The argument 'date_parser' is deprecated and will be removed in a future version. Please use 'date_format' instead, or read your data in as 'object' dtype and then call 'to_datetime'.\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "File path : projet-bdc2324-team1/0_Input/Company_12/campaigns_information.csv\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + ":13: FutureWarning: The argument 'date_parser' is deprecated and will be removed in a future version. Please use 'date_format' instead, or read your data in as 'object' dtype and then call 'to_datetime'.\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "File path : projet-bdc2324-team1/0_Input/Company_12/products_purchased_reduced.csv\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + ":13: FutureWarning: The argument 'date_parser' is deprecated and will be removed in a future version. Please use 'date_format' instead, or read your data in as 'object' dtype and then call 'to_datetime'.\n", + ":13: DtypeWarning: Columns (4,8,10) have mixed types. Specify dtype option on import or set low_memory=False.\n", + "/tmp/ipykernel_438/573049956.py:55: FutureWarning: ChainedAssignmentError: behaviour will change in pandas 3.0!\n", + "You are setting values through chained assignment. Currently this works in certain cases, but when using Copy-on-Write (which will become the default behaviour in pandas 3.0) this will never work to update the original DataFrame or Series, because the intermediate object on which we are setting values will behave as a copy.\n", + "A typical example is when you are setting values in a column of a DataFrame, like:\n", + "\n", + "df[\"col\"][row_indexer] = value\n", + "\n", + "Use `df.loc[row_indexer, \"col\"] = values` instead, to perform the assignment in a single step and ensure this keeps updating the original `df`.\n", + "\n", + "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n", + "\n", + " df_campaigns_information['opened_at'][df_campaigns_information['opened_at'] >= end_features_date] = np.datetime64('NaT')\n", + ":27: SettingWithCopyWarning: \n", + "A value is trying to be set on a copy of a slice from a DataFrame\n", + "\n", + "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Data filtering : SUCCESS\n", + "KPIs construction : SUCCESS\n", + "Explanatory variable construction : SUCCESS\n", + "Explained variable construction : SUCCESS\n", + "File path : projet-bdc2324-team1/0_Input/Company_13/customerplus_cleaned.csv\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + ":13: FutureWarning: The argument 'date_parser' is deprecated and will be removed in a future version. Please use 'date_format' instead, or read your data in as 'object' dtype and then call 'to_datetime'.\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "File path : projet-bdc2324-team1/0_Input/Company_13/campaigns_information.csv\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + ":13: FutureWarning: The argument 'date_parser' is deprecated and will be removed in a future version. Please use 'date_format' instead, or read your data in as 'object' dtype and then call 'to_datetime'.\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "File path : projet-bdc2324-team1/0_Input/Company_13/products_purchased_reduced.csv\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + ":13: FutureWarning: The argument 'date_parser' is deprecated and will be removed in a future version. Please use 'date_format' instead, or read your data in as 'object' dtype and then call 'to_datetime'.\n", + "/tmp/ipykernel_438/573049956.py:55: FutureWarning: ChainedAssignmentError: behaviour will change in pandas 3.0!\n", + "You are setting values through chained assignment. Currently this works in certain cases, but when using Copy-on-Write (which will become the default behaviour in pandas 3.0) this will never work to update the original DataFrame or Series, because the intermediate object on which we are setting values will behave as a copy.\n", + "A typical example is when you are setting values in a column of a DataFrame, like:\n", + "\n", + "df[\"col\"][row_indexer] = value\n", + "\n", + "Use `df.loc[row_indexer, \"col\"] = values` instead, to perform the assignment in a single step and ensure this keeps updating the original `df`.\n", + "\n", + "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n", + "\n", + " df_campaigns_information['opened_at'][df_campaigns_information['opened_at'] >= end_features_date] = np.datetime64('NaT')\n", + "/tmp/ipykernel_438/573049956.py:55: FutureWarning: Setting an item of incompatible dtype is deprecated and will raise an error in a future version of pandas. Value 'NaT' has dtype incompatible with datetime64[ns, UTC], please explicitly cast to a compatible dtype first.\n", + " df_campaigns_information['opened_at'][df_campaigns_information['opened_at'] >= end_features_date] = np.datetime64('NaT')\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Data filtering : SUCCESS\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + ":27: SettingWithCopyWarning: \n", + "A value is trying to be set on a copy of a slice from a DataFrame\n", + "\n", + "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "KPIs construction : SUCCESS\n", + "Explanatory variable construction : SUCCESS\n", + "Explained variable construction : SUCCESS\n", + "File path : projet-bdc2324-team1/0_Input/Company_14/customerplus_cleaned.csv\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + ":13: FutureWarning: The argument 'date_parser' is deprecated and will be removed in a future version. Please use 'date_format' instead, or read your data in as 'object' dtype and then call 'to_datetime'.\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "File path : projet-bdc2324-team1/0_Input/Company_14/campaigns_information.csv\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + ":13: FutureWarning: The argument 'date_parser' is deprecated and will be removed in a future version. Please use 'date_format' instead, or read your data in as 'object' dtype and then call 'to_datetime'.\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "File path : projet-bdc2324-team1/0_Input/Company_14/products_purchased_reduced.csv\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + ":13: FutureWarning: The argument 'date_parser' is deprecated and will be removed in a future version. Please use 'date_format' instead, or read your data in as 'object' dtype and then call 'to_datetime'.\n", + ":13: DtypeWarning: Columns (8,9) have mixed types. Specify dtype option on import or set low_memory=False.\n", + "/tmp/ipykernel_438/573049956.py:55: FutureWarning: ChainedAssignmentError: behaviour will change in pandas 3.0!\n", + "You are setting values through chained assignment. Currently this works in certain cases, but when using Copy-on-Write (which will become the default behaviour in pandas 3.0) this will never work to update the original DataFrame or Series, because the intermediate object on which we are setting values will behave as a copy.\n", + "A typical example is when you are setting values in a column of a DataFrame, like:\n", + "\n", + "df[\"col\"][row_indexer] = value\n", + "\n", + "Use `df.loc[row_indexer, \"col\"] = values` instead, to perform the assignment in a single step and ensure this keeps updating the original `df`.\n", + "\n", + "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n", + "\n", + " df_campaigns_information['opened_at'][df_campaigns_information['opened_at'] >= end_features_date] = np.datetime64('NaT')\n", + "/tmp/ipykernel_438/573049956.py:55: FutureWarning: Setting an item of incompatible dtype is deprecated and will raise an error in a future version of pandas. Value 'NaT' has dtype incompatible with datetime64[ns, UTC], please explicitly cast to a compatible dtype first.\n", + " df_campaigns_information['opened_at'][df_campaigns_information['opened_at'] >= end_features_date] = np.datetime64('NaT')\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Data filtering : SUCCESS\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + ":27: SettingWithCopyWarning: \n", + "A value is trying to be set on a copy of a slice from a DataFrame\n", + "\n", + "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "KPIs construction : SUCCESS\n", + "Explanatory variable construction : SUCCESS\n", + "Explained variable construction : SUCCESS\n" + ] + } + ], + "source": [ + "# Create test dataset and train dataset for sport companies\n", + "\n", + "start_date, end_of_features, final_date = df_coverage_modelization(list_of_comp, coverage_train = 0.7)\n", + "\n", + "for company in list_of_comp:\n", + " dataset_test = dataset_construction(min_date = start_date, end_features_date = end_of_features,\n", + " max_date = final_date, directory_path = company) " + ] + }, + { + "cell_type": "code", + "execution_count": 103, + "id": "01900e04-61e7-4a1b-8c9c-b72e42ba9507", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Exportation dataset test : SUCCESS\n" + ] + } + ], + "source": [ + " # Exportation\n", + "FILE_KEY_OUT_S3 = \"dataset_test\" + company + \".csv\"\n", + "FILE_PATH_OUT_S3 = BUCKET_OUT + \"/\" + FILE_KEY_OUT_S3\n", + " \n", + "with fs.open(FILE_PATH_OUT_S3, 'w') as file_out:\n", + " dataset_test.to_csv(file_out, index = False)\n", + " \n", + "print(\"Exportation dataset test : SUCCESS\")" + ] + }, + { + "cell_type": "code", + "execution_count": 104, + "id": "b0de2e18-edff-416c-b623-e3e23016029d", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "'projet-bdc2324-team1/Generalization/musique/dataset_test14.csv'" + ] + }, + "execution_count": 104, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "FILE_PATH_OUT_S3" + ] + }, + { + "cell_type": "code", + "execution_count": 105, + "id": "8f56d6ee-82c9-43e2-813d-33d6aaa458dd", + "metadata": {}, + "outputs": [ + { + "ename": "NameError", + "evalue": "name 'dataset_test14' is not defined", + "output_type": "error", + "traceback": [ + "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", + "\u001b[0;31mNameError\u001b[0m Traceback (most recent call last)", + "Cell \u001b[0;32mIn[105], line 1\u001b[0m\n\u001b[0;32m----> 1\u001b[0m \u001b[43mdataset_test14\u001b[49m\n", + "\u001b[0;31mNameError\u001b[0m: name 'dataset_test14' is not defined" + ] + } + ], + "source": [ + "dataset_test14" ] }, { "cell_type": "code", "execution_count": null, - "id": "d6767ba6-94ef-43f9-8f67-15ecdb41a70b", + "id": "9232a8df-c51a-4f10-9fc8-ce4f8ad8aab4", "metadata": {}, "outputs": [], "source": []