diff --git a/0_Cleaning_and_merge.ipynb b/0_Cleaning_and_merge.ipynb index a8dfc0f..b2c2018 100644 --- a/0_Cleaning_and_merge.ipynb +++ b/0_Cleaning_and_merge.ipynb @@ -72,54 +72,6 @@ { "cell_type": "code", "execution_count": 4, - "id": "aaf64d60-bf92-470c-8210-d09abd6a653e", - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "['bdc2324-data/1/1campaign_stats.csv',\n", - " 'bdc2324-data/1/1campaigns.csv',\n", - " 'bdc2324-data/1/1categories.csv',\n", - " 'bdc2324-data/1/1countries.csv',\n", - " 'bdc2324-data/1/1currencies.csv',\n", - " 'bdc2324-data/1/1customer_target_mappings.csv',\n", - " 'bdc2324-data/1/1customersplus.csv',\n", - " 'bdc2324-data/1/1event_types.csv',\n", - " 'bdc2324-data/1/1events.csv',\n", - " 'bdc2324-data/1/1facilities.csv',\n", - " 'bdc2324-data/1/1link_stats.csv',\n", - " 'bdc2324-data/1/1pricing_formulas.csv',\n", - " 'bdc2324-data/1/1product_packs.csv',\n", - " 'bdc2324-data/1/1products.csv',\n", - " 'bdc2324-data/1/1products_groups.csv',\n", - " 'bdc2324-data/1/1purchases.csv',\n", - " 'bdc2324-data/1/1representation_category_capacities.csv',\n", - " 'bdc2324-data/1/1representations.csv',\n", - " 'bdc2324-data/1/1seasons.csv',\n", - " 'bdc2324-data/1/1structure_tag_mappings.csv',\n", - " 'bdc2324-data/1/1suppliers.csv',\n", - " 'bdc2324-data/1/1tags.csv',\n", - " 'bdc2324-data/1/1target_types.csv',\n", - " 'bdc2324-data/1/1targets.csv',\n", - " 'bdc2324-data/1/1tickets.csv',\n", - " 'bdc2324-data/1/1type_of_categories.csv',\n", - " 'bdc2324-data/1/1type_of_pricing_formulas.csv',\n", - " 'bdc2324-data/1/1type_ofs.csv']" - ] - }, - "execution_count": 4, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "liste_database" - ] - }, - { - "cell_type": "code", - "execution_count": 5, "id": "dd6a3518-b752-4a1e-b77b-9e03e853c3ed", "metadata": {}, "outputs": [ @@ -127,7 +79,7 @@ "name": "stderr", "output_type": "stream", "text": [ - "/tmp/ipykernel_2240/4081512283.py:10: DtypeWarning: Columns (1) have mixed types. Specify dtype option on import or set low_memory=False.\n", + "/tmp/ipykernel_3658/4081512283.py:10: DtypeWarning: Columns (1) have mixed types. Specify dtype option on import or set low_memory=False.\n", " df = pd.read_csv(file_in)\n" ] } @@ -158,7 +110,7 @@ }, { "cell_type": "code", - "execution_count": 6, + "execution_count": 5, "id": "d237be96-8c86-4a91-b7a1-487e87a16c3d", "metadata": {}, "outputs": [], @@ -199,7 +151,7 @@ }, { "cell_type": "code", - "execution_count": 7, + "execution_count": 6, "id": "7e7b90ce-da54-4f00-bc34-64c543b0858f", "metadata": {}, "outputs": [], @@ -221,7 +173,7 @@ }, { "cell_type": "code", - "execution_count": 8, + "execution_count": 7, "id": "03329e32-00a5-42c8-9470-75f7b6216ccd", "metadata": {}, "outputs": [], @@ -239,7 +191,7 @@ }, { "cell_type": "code", - "execution_count": 9, + "execution_count": 8, "id": "b95464b1-26bc-4aac-84b4-45da83b92251", "metadata": {}, "outputs": [], @@ -282,7 +234,7 @@ }, { "cell_type": "code", - "execution_count": 10, + "execution_count": 9, "id": "3e1d2ba7-ff4f-48eb-93a8-2bb648c70396", "metadata": {}, "outputs": [ @@ -290,17 +242,17 @@ "name": "stderr", "output_type": "stream", "text": [ - "/tmp/ipykernel_2240/1591303091.py:5: SettingWithCopyWarning: \n", + "/tmp/ipykernel_3658/1591303091.py:5: SettingWithCopyWarning: \n", "A value is trying to be set on a copy of a slice from a DataFrame\n", "\n", "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n", " tickets.rename(columns = {'id' : 'ticket_id'}, inplace = True)\n", - "/tmp/ipykernel_2240/1591303091.py:9: SettingWithCopyWarning: \n", + "/tmp/ipykernel_3658/1591303091.py:9: SettingWithCopyWarning: \n", "A value is trying to be set on a copy of a slice from a DataFrame\n", "\n", "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n", " suppliers.rename(columns = {'name' : 'supplier_name'}, inplace = True)\n", - "/tmp/ipykernel_2240/1591303091.py:13: SettingWithCopyWarning: \n", + "/tmp/ipykernel_3658/1591303091.py:13: SettingWithCopyWarning: \n", "A value is trying to be set on a copy of a slice from a DataFrame\n", "\n", "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n", @@ -314,7 +266,7 @@ }, { "cell_type": "code", - "execution_count": 11, + "execution_count": 10, "id": "4b18edfc-6450-4c6a-9e7b-ee5a5808c8c9", "metadata": {}, "outputs": [ @@ -405,127 +357,33 @@ "
1826672 rows × 8 columns
\n", "" ], "text/plain": [ - " ticket_id product_id is_from_subscription supplier_name \\\n", - "0 13070859 225251 False vente en ligne \n", - "1 13070860 224914 False vente en ligne \n", - "2 13070861 224914 False vente en ligne \n", - "3 13070862 224914 False vente en ligne \n", - "4 13070863 224914 False vente en ligne \n", - "... ... ... ... ... \n", - "1826667 18643847 350454 False vad \n", - "1826668 19853111 383564 False vad \n", - "1826669 19860514 383751 False vad \n", - "1826670 19860515 383751 False vad \n", - "1826671 19860516 383751 False vad \n", + " ticket_id product_id is_from_subscription supplier_name \\\n", + "0 13070859 225251 False vente en ligne \n", + "1 13070860 224914 False vente en ligne \n", + "2 13070861 224914 False vente en ligne \n", + "3 13070862 224914 False vente en ligne \n", + "4 13070863 224914 False vente en ligne \n", "\n", - " type_of_ticket_name children purchase_date \\\n", - "0 Atelier pricing_formula 2018-12-28 14:47:50+00:00 \n", - "1 Atelier pricing_formula 2018-12-28 14:47:50+00:00 \n", - "2 Atelier pricing_formula 2018-12-28 14:47:50+00:00 \n", - "3 Atelier pricing_formula 2018-12-28 14:47:50+00:00 \n", - "4 Atelier pricing_formula 2018-12-28 14:47:50+00:00 \n", - "... ... ... ... \n", - "1826667 Billet en nombre pricing_formula 2022-08-02 08:59:17+00:00 \n", - "1826668 Billet en nombre pricing_formula 2022-11-04 14:25:42+00:00 \n", - "1826669 Billet en nombre pricing_formula 2022-11-18 10:47:26+00:00 \n", - "1826670 Billet en nombre pricing_formula 2022-11-18 10:47:26+00:00 \n", - "1826671 Billet en nombre pricing_formula 2022-11-18 10:47:26+00:00 \n", - "\n", - " customer_id \n", - "0 48187 \n", - "1 48187 \n", - "2 48187 \n", - "3 48187 \n", - "4 48187 \n", - "... ... \n", - "1826667 41 \n", - "1826668 62763 \n", - "1826669 1195566 \n", - "1826670 1195566 \n", - "1826671 1195566 \n", - "\n", - "[1826672 rows x 8 columns]" + " type_of_ticket_name children purchase_date customer_id \n", + "0 Atelier pricing_formula 2018-12-28 14:47:50+00:00 48187 \n", + "1 Atelier pricing_formula 2018-12-28 14:47:50+00:00 48187 \n", + "2 Atelier pricing_formula 2018-12-28 14:47:50+00:00 48187 \n", + "3 Atelier pricing_formula 2018-12-28 14:47:50+00:00 48187 \n", + "4 Atelier pricing_formula 2018-12-28 14:47:50+00:00 48187 " ] }, - "execution_count": 11, + "execution_count": 10, "metadata": {}, "output_type": "execute_result" } ], "source": [ - "df1_ticket_information" + "df1_ticket_information.head()" ] }, { @@ -538,7 +396,7 @@ }, { "cell_type": "code", - "execution_count": 12, + "execution_count": 11, "id": "043303fe-e90f-4689-a2a9-5d690555a045", "metadata": {}, "outputs": [], @@ -567,7 +425,7 @@ }, { "cell_type": "code", - "execution_count": 13, + "execution_count": 12, "id": "5882234a-1ed5-4269-87a6-0d75613476e3", "metadata": {}, "outputs": [], @@ -577,7 +435,7 @@ }, { "cell_type": "code", - "execution_count": 32, + "execution_count": 13, "id": "a7a452a6-cd5e-4c8b-b250-8a7d26e48fad", "metadata": {}, "outputs": [ @@ -662,121 +520,33 @@ "128365 rows × 7 columns
\n", "" ], "text/plain": [ - " product_id customer_id nb_tickets nb_suppliers \\\n", - "0 107310 2805 4 2 \n", - "1 110089 54355 1 1 \n", - "2 110089 54356 1 1 \n", - "3 110089 54357 1 1 \n", - "4 110089 54358 1 1 \n", - "... ... ... ... ... \n", - "128360 406026 1 11 2 \n", - "128361 406027 1 31 2 \n", - "128362 406028 1 2 1 \n", - "128363 406029 1256130 2 1 \n", - "128364 406029 1256133 3 1 \n", + " product_id customer_id nb_tickets nb_suppliers \\\n", + "0 107310 2805 4 2 \n", + "1 110089 54355 1 1 \n", + "2 110089 54356 1 1 \n", + "3 110089 54357 1 1 \n", + "4 110089 54358 1 1 \n", "\n", - " purchase_date_max purchase_date_min \\\n", - "0 2019-06-05 14:37:13+00:00 2019-06-05 14:18:38+00:00 \n", - "1 2017-02-17 13:32:51+00:00 2017-02-17 13:32:51+00:00 \n", - "2 2017-03-02 14:36:16+00:00 2017-03-02 14:36:16+00:00 \n", - "3 2017-03-06 15:16:41+00:00 2017-03-06 15:16:41+00:00 \n", - "4 2017-03-13 16:07:27+00:00 2017-03-13 16:07:27+00:00 \n", - "... ... ... \n", - "128360 2023-11-08 12:53:31+00:00 2023-11-08 09:30:28+00:00 \n", - "128361 2023-11-08 15:59:11+00:00 2023-11-08 09:15:36+00:00 \n", - "128362 2023-11-08 14:56:08+00:00 2023-11-08 11:18:37+00:00 \n", - "128363 2023-11-08 10:35:43+00:00 2023-11-08 10:35:43+00:00 \n", - "128364 2023-11-08 16:51:19+00:00 2023-11-08 16:51:19+00:00 \n", - "\n", - " time_between_purchase \n", - "0 0 days 00:18:35 \n", - "1 0 days 00:00:00 \n", - "2 0 days 00:00:00 \n", - "3 0 days 00:00:00 \n", - "4 0 days 00:00:00 \n", - "... ... \n", - "128360 0 days 03:23:03 \n", - "128361 0 days 06:43:35 \n", - "128362 0 days 03:37:31 \n", - "128363 0 days 00:00:00 \n", - "128364 0 days 00:00:00 \n", - "\n", - "[128365 rows x 7 columns]" + " purchase_date_max purchase_date_min time_between_purchase \n", + "0 2019-06-05 14:37:13+00:00 2019-06-05 14:18:38+00:00 0 days 00:18:35 \n", + "1 2017-02-17 13:32:51+00:00 2017-02-17 13:32:51+00:00 0 days 00:00:00 \n", + "2 2017-03-02 14:36:16+00:00 2017-03-02 14:36:16+00:00 0 days 00:00:00 \n", + "3 2017-03-06 15:16:41+00:00 2017-03-06 15:16:41+00:00 0 days 00:00:00 \n", + "4 2017-03-13 16:07:27+00:00 2017-03-13 16:07:27+00:00 0 days 00:00:00 " ] }, - "execution_count": 32, + "execution_count": 13, "metadata": {}, "output_type": "execute_result" } ], "source": [ - "df1_tickets_kpi" + "df1_tickets_kpi.head()" ] }, { @@ -826,7 +596,7 @@ "name": "stderr", "output_type": "stream", "text": [ - "/tmp/ipykernel_2240/3848597476.py:4: SettingWithCopyWarning: \n", + "/tmp/ipykernel_3658/3848597476.py:4: SettingWithCopyWarning: \n", "A value is trying to be set on a copy of a slice from a DataFrame\n", "\n", "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n", @@ -893,51 +663,18 @@ "283 rows × 1 columns
\n", "" ], "text/plain": [ - " customer_id\n", - "target_name \n", - "consentement optin mediation specialisee 150000\n", - "consentement optin jeune public 149979\n", - "consentement optin b2c 108909\n", - "Arenametrix_bascule tel vers sib 35216\n", - "consentement optout b2c 34523\n", - "... ...\n", - "Automation_parrainage_newsletter_handicap_visuel 1\n", - "consentement optout mediation specialisee 1\n", - "Inscrits NL LSF formulaire 1\n", - "Market auto - contacts inactifs post-scénario 1\n", - "Inactifs - fin du scénario 1\n", - "\n", - "[283 rows x 1 columns]" + " customer_id\n", + "target_name \n", + "consentement optin mediation specialisee 150000\n", + "consentement optin jeune public 149979\n", + "consentement optin b2c 108909\n", + "Arenametrix_bascule tel vers sib 35216\n", + "consentement optout b2c 34523" ] }, "execution_count": 16, @@ -946,7 +683,7 @@ } ], "source": [ - "df1_target_information[['target_name', 'customer_id']].groupby('target_name').count().sort_values(by='customer_id', ascending=False)" + "df1_target_information[['target_name', 'customer_id']].groupby('target_name').count().sort_values(by='customer_id', ascending=False).head()" ] }, { @@ -1006,218 +743,18 @@ "6214808 rows × 8 columns
\n", "" ], "text/plain": [ - " id customer_id opened_at \\\n", - "0 19793 112597 NaT \n", - "1 14211 113666 NaT \n", - "2 13150 280561 NaT \n", - "3 7073 101007 2021-03-28 18:11:06+00:00 \n", - "4 5175 103972 NaT \n", - "... ... ... ... \n", - "6214803 8302994 266155 2023-10-23 09:43:25+00:00 \n", - "6214804 8303307 21355 2023-10-23 09:44:02+00:00 \n", - "6214805 8304346 21849 2023-10-23 09:45:52+00:00 \n", - "6214806 8302037 667789 2023-10-23 09:47:32+00:00 \n", - "6214807 8304939 294154 NaT \n", + " id customer_id opened_at sent_at \\\n", + "0 19793 112597 NaT 2021-03-28 16:01:09+00:00 \n", + "1 14211 113666 NaT 2021-03-28 16:01:09+00:00 \n", + "2 13150 280561 NaT 2021-03-28 16:00:59+00:00 \n", + "3 7073 101007 2021-03-28 18:11:06+00:00 2021-03-28 16:00:59+00:00 \n", + "4 5175 103972 NaT 2021-03-28 16:01:06+00:00 \n", "\n", - " sent_at delivered_at \\\n", - "0 2021-03-28 16:01:09+00:00 2021-03-28 16:24:18+00:00 \n", - "1 2021-03-28 16:01:09+00:00 2021-03-28 16:21:02+00:00 \n", - "2 2021-03-28 16:00:59+00:00 2021-03-28 16:08:45+00:00 \n", - "3 2021-03-28 16:00:59+00:00 2021-03-28 16:09:47+00:00 \n", - "4 2021-03-28 16:01:06+00:00 2021-03-28 16:05:03+00:00 \n", - "... ... ... \n", - "6214803 2023-10-23 09:32:33+00:00 2023-10-23 09:32:34+00:00 \n", - "6214804 2023-10-23 09:32:49+00:00 2023-10-23 09:32:49+00:00 \n", - "6214805 2023-10-23 09:33:28+00:00 2023-10-23 09:33:29+00:00 \n", - "6214806 2023-10-23 09:31:53+00:00 2023-10-23 09:31:54+00:00 \n", - "6214807 2023-10-23 09:33:54+00:00 2023-10-23 09:33:55+00:00 \n", + " delivered_at campaign_name \\\n", + "0 2021-03-28 16:24:18+00:00 Le Mucem chez vous, gardons le lien #22 \n", + "1 2021-03-28 16:21:02+00:00 Le Mucem chez vous, gardons le lien #22 \n", + "2 2021-03-28 16:08:45+00:00 Le Mucem chez vous, gardons le lien #22 \n", + "3 2021-03-28 16:09:47+00:00 Le Mucem chez vous, gardons le lien #22 \n", + "4 2021-03-28 16:05:03+00:00 Le Mucem chez vous, gardons le lien #22 \n", "\n", - " campaign_name campaign_service_id \\\n", - "0 Le Mucem chez vous, gardons le lien #22 404 \n", - "1 Le Mucem chez vous, gardons le lien #22 404 \n", - "2 Le Mucem chez vous, gardons le lien #22 404 \n", - "3 Le Mucem chez vous, gardons le lien #22 404 \n", - "4 Le Mucem chez vous, gardons le lien #22 404 \n", - "... ... ... \n", - "6214803 dre_nov_2023 1318 \n", - "6214804 dre_nov_2023 1318 \n", - "6214805 dre_nov_2023 1318 \n", - "6214806 dre_nov_2023 1318 \n", - "6214807 dre_nov_2023 1318 \n", - "\n", - " campaign_sent_at \n", - "0 2021-03-27 23:00:00+00:00 \n", - "1 2021-03-27 23:00:00+00:00 \n", - "2 2021-03-27 23:00:00+00:00 \n", - "3 2021-03-27 23:00:00+00:00 \n", - "4 2021-03-27 23:00:00+00:00 \n", - "... ... \n", - "6214803 2023-10-23 09:31:17+00:00 \n", - "6214804 2023-10-23 09:31:17+00:00 \n", - "6214805 2023-10-23 09:31:17+00:00 \n", - "6214806 2023-10-23 09:31:17+00:00 \n", - "6214807 2023-10-23 09:31:17+00:00 \n", - "\n", - "[6214808 rows x 8 columns]" + " campaign_service_id campaign_sent_at \n", + "0 404 2021-03-27 23:00:00+00:00 \n", + "1 404 2021-03-27 23:00:00+00:00 \n", + "2 404 2021-03-27 23:00:00+00:00 \n", + "3 404 2021-03-27 23:00:00+00:00 \n", + "4 404 2021-03-27 23:00:00+00:00 " ] }, "execution_count": 20, @@ -1526,7 +963,7 @@ } ], "source": [ - "df1_campaigns_information" + "df1_campaigns_information.head()" ] }, { @@ -1573,7 +1010,7 @@ "name": "stderr", "output_type": "stream", "text": [ - "/tmp/ipykernel_2240/3700263836.py:11: SettingWithCopyWarning: \n", + "/tmp/ipykernel_3658/3700263836.py:11: SettingWithCopyWarning: \n", "A value is trying to be set on a copy of a slice from a DataFrame\n", "\n", "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n", @@ -1654,81 +1091,17 @@ "130472 rows × 4 columns
\n", "" ], "text/plain": [ - " customer_id nb_campaigns nb_campaigns_opened \\\n", - "0 2 4 0.0 \n", - "1 3 222 124.0 \n", - "2 4 7 7.0 \n", - "3 5 4 0.0 \n", - "4 6 20 0.0 \n", - "... ... ... ... \n", - "130467 1256097 1 1.0 \n", - "130468 1256098 1 0.0 \n", - "130469 1256099 1 0.0 \n", - "130470 1256100 1 0.0 \n", - "130471 1256101 1 0.0 \n", - "\n", - " time_to_open \n", - "0 NaT \n", - "1 1 days 00:28:30.169354838 \n", - "2 1 days 04:31:01.428571428 \n", - "3 NaT \n", - "4 NaT \n", - "... ... \n", - "130467 0 days 02:11:15 \n", - "130468 NaT \n", - "130469 NaT \n", - "130470 NaT \n", - "130471 NaT \n", - "\n", - "[130472 rows x 4 columns]" + " customer_id nb_campaigns nb_campaigns_opened time_to_open\n", + "0 2 4 0.0 NaT\n", + "1 3 222 124.0 1 days 00:28:30.169354838\n", + "2 4 7 7.0 1 days 04:31:01.428571428\n", + "3 5 4 0.0 NaT\n", + "4 6 20 0.0 NaT" ] }, "execution_count": 23, @@ -1737,7 +1110,7 @@ } ], "source": [ - "df1_campaigns_kpi" + "df1_campaigns_kpi.head()" ] }, { @@ -2646,19 +2019,19 @@ }, { "cell_type": "code", - "execution_count": 42, + "execution_count": 32, "id": "46de1912-4a66-46e5-8b9e-7768b2d2723b", "metadata": {}, "outputs": [], "source": [ "# Fusion liée au product\n", - "df1_product_purchased = pd.merge(df1_tickets_kpi, products_global, left_on = 'product_id', right_on = 'id_products', how = 'inner')\n", + "df1_products_purchased = pd.merge(df1_tickets_kpi, products_global, left_on = 'product_id', right_on = 'id_products', how = 'inner')\n", "\n", "# Fusion liée au customer\n", "df1_customer = pd.merge(df1_customerplus_clean, df1_campaigns_kpi, on = 'customer_id', how = 'left')\n", "\n", "# Fusion product et customer\n", - "df1_customer_product = pd.merge(df1_customer, df1_product_purchased, on = 'customer_id', how = 'left')" + "df1_customer_product = pd.merge(df1_customer, df1_products_purchased, on = 'customer_id', how = 'left')" ] }, {