diff --git a/0_Cleaning_and_merge.ipynb b/0_Cleaning_and_merge.ipynb index 99d5ea7..5085051 100644 --- a/0_Cleaning_and_merge.ipynb +++ b/0_Cleaning_and_merge.ipynb @@ -79,7 +79,7 @@ "name": "stderr", "output_type": "stream", "text": [ - "/tmp/ipykernel_15815/4081512283.py:10: DtypeWarning: Columns (1) have mixed types. Specify dtype option on import or set low_memory=False.\n", + "/tmp/ipykernel_8302/4081512283.py:10: DtypeWarning: Columns (1) have mixed types. Specify dtype option on import or set low_memory=False.\n", " df = pd.read_csv(file_in)\n" ] } @@ -205,6 +205,7 @@ " # Base des fournisseurs\n", " suppliers = suppliers[['id', 'name']]\n", " suppliers.rename(columns = {'name' : 'supplier_name'}, inplace = True)\n", + " suppliers['supplier_name'] = suppliers['supplier_name'].fillna('')\n", "\n", " # Base des types de billets\n", " type_ofs = type_ofs[['id', 'name', 'children']]\n", @@ -242,17 +243,23 @@ "name": "stderr", "output_type": "stream", "text": [ - "/tmp/ipykernel_15815/1591303091.py:5: SettingWithCopyWarning: \n", + "/tmp/ipykernel_8302/3092893564.py:5: SettingWithCopyWarning: \n", "A value is trying to be set on a copy of a slice from a DataFrame\n", "\n", "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n", " tickets.rename(columns = {'id' : 'ticket_id'}, inplace = True)\n", - "/tmp/ipykernel_15815/1591303091.py:9: SettingWithCopyWarning: \n", + "/tmp/ipykernel_8302/3092893564.py:9: SettingWithCopyWarning: \n", "A value is trying to be set on a copy of a slice from a DataFrame\n", "\n", "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n", " suppliers.rename(columns = {'name' : 'supplier_name'}, inplace = True)\n", - "/tmp/ipykernel_15815/1591303091.py:13: SettingWithCopyWarning: \n", + "/tmp/ipykernel_8302/3092893564.py:10: SettingWithCopyWarning: \n", + "A value is trying to be set on a copy of a slice from a DataFrame.\n", + "Try using .loc[row_indexer,col_indexer] = value instead\n", + "\n", + "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n", + " suppliers['supplier_name'] = suppliers['supplier_name'].fillna('')\n", + "/tmp/ipykernel_8302/3092893564.py:14: SettingWithCopyWarning: \n", "A value is trying to be set on a copy of a slice from a DataFrame\n", "\n", "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n", @@ -433,7 +440,7 @@ "name": "stderr", "output_type": "stream", "text": [ - "/tmp/ipykernel_15815/3848597476.py:4: SettingWithCopyWarning: \n", + "/tmp/ipykernel_8302/3848597476.py:4: SettingWithCopyWarning: \n", "A value is trying to be set on a copy of a slice from a DataFrame\n", "\n", "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n", @@ -647,19 +654,19 @@ "name": "stderr", "output_type": "stream", "text": [ - "/tmp/ipykernel_15815/1967867975.py:15: SettingWithCopyWarning: \n", + "/tmp/ipykernel_8302/1967867975.py:15: SettingWithCopyWarning: \n", "A value is trying to be set on a copy of a slice from a DataFrame.\n", "Try using .loc[row_indexer,col_indexer] = value instead\n", "\n", "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n", " df[column_name] = pd.to_datetime(df[column_name], utc = True, format = 'ISO8601')\n", - "/tmp/ipykernel_15815/1967867975.py:15: SettingWithCopyWarning: \n", + "/tmp/ipykernel_8302/1967867975.py:15: SettingWithCopyWarning: \n", "A value is trying to be set on a copy of a slice from a DataFrame.\n", "Try using .loc[row_indexer,col_indexer] = value instead\n", "\n", "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n", " df[column_name] = pd.to_datetime(df[column_name], utc = True, format = 'ISO8601')\n", - "/tmp/ipykernel_15815/1967867975.py:15: SettingWithCopyWarning: \n", + "/tmp/ipykernel_8302/1967867975.py:15: SettingWithCopyWarning: \n", "A value is trying to be set on a copy of a slice from a DataFrame.\n", "Try using .loc[row_indexer,col_indexer] = value instead\n", "\n", @@ -847,7 +854,7 @@ "name": "stderr", "output_type": "stream", "text": [ - "/tmp/ipykernel_15815/3700263836.py:11: SettingWithCopyWarning: \n", + "/tmp/ipykernel_8302/3700263836.py:11: SettingWithCopyWarning: \n", "A value is trying to be set on a copy of a slice from a DataFrame\n", "\n", "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n", @@ -1971,7 +1978,7 @@ }, { "cell_type": "code", - "execution_count": 53, + "execution_count": 33, "id": "3d8b0875-b409-44ce-b688-d9d6758782d3", "metadata": {}, "outputs": [ @@ -2261,7 +2268,7 @@ "[1826672 rows x 14 columns]" ] }, - "execution_count": 53, + "execution_count": 33, "metadata": {}, "output_type": "execute_result" } @@ -2273,7 +2280,7 @@ }, { "cell_type": "code", - "execution_count": 35, + "execution_count": 34, "id": "2bda0b97-b28b-4070-a57d-aeab0e2f7dfe", "metadata": {}, "outputs": [], @@ -2284,38 +2291,51 @@ }, { "cell_type": "code", - "execution_count": 50, + "execution_count": 74, "id": "043303fe-e90f-4689-a2a9-5d690555a045", "metadata": {}, "outputs": [], "source": [ "def tickets_kpi_function(tickets_information = None):\n", + "\n", " tickets_information_copy = tickets_information.copy()\n", - " tickets_information_copy['purchase_date_max'] = tickets_information_copy['purchase_date']\n", - " tickets_kpi = (tickets_information_copy[['event_type_id', 'customer_id', 'ticket_id','supplier_name', 'purchase_date', 'purchase_date_max', 'amount']]\n", + "\n", + " # Dummy : Canal de vente en ligne\n", + " liste_mots = ['en ligne', 'internet', 'web', 'net', 'vad', 'online'] # vad = vente à distance\n", + " tickets_information_copy['vente_internet'] = tickets_information_copy['supplier_name'].str.contains('|'.join(liste_mots), case=False).astype(int)\n", + "\n", + " # Proportion de vente en ligne\n", + " prop_vente_internet = tickets_information_copy[tickets_information_copy['vente_internet'] == 1].groupby('customer_id')['ticket_id'].count().reset_index()\n", + " prop_vente_internet.rename(columns = {'ticket_id' : 'nb_tickets_internet'}, inplace = True)\n", + " \n", + " tickets_kpi = (tickets_information_copy[['event_type_id', 'customer_id', 'ticket_id','supplier_name', 'purchase_date', 'amount', 'vente_internet']]\n", " .groupby([ 'customer_id']) # 'event_type_id',\n", " .agg({'ticket_id': 'count', \n", " 'amount' : 'sum',\n", " 'supplier_name': 'nunique',\n", - " 'purchase_date_max' : 'max',\n", - " 'purchase_date' : 'min'})\n", + " 'vente_internet' : 'max',\n", + " 'purchase_date' : ['min', 'max']})\n", " .reset_index()\n", " )\n", + " tickets_kpi.columns = tickets_kpi.columns.map('_'.join)\n", " \n", - " tickets_kpi.rename(columns = {'ticket_id' : 'nb_tickets', \n", - " 'amount' : 'total_amount',\n", - " 'supplier_name' : 'nb_suppliers', \n", - " 'purchase_date' : 'purchase_date_min'}, inplace = True)\n", + " tickets_kpi.rename(columns = {'ticket_id_count' : 'nb_tickets', \n", + " 'amount_sum' : 'total_amount',\n", + " 'supplier_name_nunique' : 'nb_suppliers', \n", + " 'customer_id_' : 'customer_id'}, inplace = True)\n", " \n", " tickets_kpi['time_between_purchase'] = tickets_kpi['purchase_date_max'] - tickets_kpi['purchase_date_min']\n", - " \n", + "\n", + " tickets_kpi = tickets_kpi.merge(prop_vente_internet, on = 'customer_id', how = 'left')\n", + " tickets_kpi['nb_tickets_internet'] = tickets_kpi['nb_tickets_internet'].fillna(0)\n", + " \n", " return tickets_kpi\n", " " ] }, { "cell_type": "code", - "execution_count": 51, + "execution_count": 75, "id": "5882234a-1ed5-4269-87a6-0d75613476e3", "metadata": {}, "outputs": [], @@ -2325,8 +2345,8 @@ }, { "cell_type": "code", - "execution_count": 52, - "id": "a7a452a6-cd5e-4c8b-b250-8a7d26e48fad", + "execution_count": 76, + "id": "c78f5ade-c721-49d9-a474-73c217686ed1", "metadata": {}, "outputs": [ { @@ -2354,9 +2374,11 @@ " nb_tickets\n", " total_amount\n", " nb_suppliers\n", - " purchase_date_max\n", + " vente_internet_max\n", " purchase_date_min\n", + " purchase_date_max\n", " time_between_purchase\n", + " nb_tickets_internet\n", " \n", " \n", " \n", @@ -2366,407 +2388,330 @@ " 1256574\n", " 8830567.5\n", " 7\n", - " 2023-11-08 15:59:45+00:00\n", + " 1\n", " 2013-06-10 10:37:58+00:00\n", + " 2023-11-08 15:59:45+00:00\n", " 3803 days 05:21:47\n", + " 3053.0\n", " \n", " \n", - " 3615\n", - " 6733\n", - " 35527\n", - " 1188.0\n", - " 4\n", - " 2023-11-03 09:42:40+00:00\n", - " 2015-09-09 13:48:38+00:00\n", - " 2976 days 19:54:02\n", + " 1\n", + " 2\n", + " 307\n", + " 0.0\n", + " 1\n", + " 0\n", + " 2018-04-07 12:55:07+00:00\n", + " 2020-03-08 12:06:43+00:00\n", + " 700 days 23:11:36\n", + " 0.0\n", " \n", " \n", - " 39\n", - " 41\n", - " 16263\n", - " 37642.0\n", + " 2\n", + " 3\n", " 6\n", - " 2023-10-25 09:13:16+00:00\n", - " 2014-01-23 16:56:57+00:00\n", - " 3561 days 16:16:19\n", - " \n", - " \n", - " 11\n", - " 12\n", - " 5871\n", - " 38767.0\n", - " 2\n", - " 2023-11-04 13:46:59+00:00\n", - " 2018-04-04 07:46:31+00:00\n", - " 2040 days 06:00:28\n", - " \n", - " \n", - " 32809\n", - " 63488\n", - " 5851\n", - " 64350.0\n", + " 110.0\n", " 1\n", - " 2022-08-25 13:08:38+00:00\n", - " 2020-08-18 08:32:57+00:00\n", - " 737 days 04:35:41\n", - " \n", - " \n", - " 3708\n", - " 6916\n", - " 5482\n", - " 51489.5\n", - " 2\n", - " 2021-08-26 12:49:17+00:00\n", - " 2018-03-26 11:13:43+00:00\n", - " 1249 days 01:35:34\n", - " \n", - " \n", - " 32616\n", - " 63194\n", - " 4507\n", - " 13232.0\n", - " 3\n", - " 2022-09-07 12:55:33+00:00\n", - " 2017-11-28 13:52:15+00:00\n", - " 1743 days 23:03:18\n", - " \n", - " \n", - " 78\n", - " 81\n", - " 3562\n", - " 38746.0\n", " 1\n", - " 2022-08-30 11:51:34+00:00\n", - " 2017-01-05 13:04:58+00:00\n", - " 2062 days 22:46:36\n", + " 2019-09-19 15:15:01+00:00\n", + " 2023-09-27 09:13:09+00:00\n", + " 1468 days 17:58:08\n", + " 6.0\n", " \n", " \n", - " 35295\n", - " 84002\n", - " 3403\n", - " 19830.0\n", + " 3\n", " 4\n", - " 2023-11-06 15:59:22+00:00\n", - " 2021-05-28 10:22:33+00:00\n", - " 892 days 05:36:49\n", - " \n", - " \n", - " 3377\n", - " 5618\n", - " 3294\n", - " 31684.5\n", - " 1\n", - " 2022-02-24 07:47:20+00:00\n", - " 2018-10-25 11:04:24+00:00\n", - " 1217 days 20:42:56\n", - " \n", - " \n", - " 30011\n", - " 59259\n", - " 2591\n", - " 4350.0\n", - " 3\n", - " 2023-06-12 14:05:19+00:00\n", - " 2019-11-25 08:52:48+00:00\n", - " 1295 days 05:12:31\n", - " \n", - " \n", - " 34937\n", - " 74876\n", - " 2571\n", - " 2600.0\n", - " 2\n", - " 2023-10-02 08:13:05+00:00\n", - " 2018-02-08 12:54:01+00:00\n", - " 2061 days 19:19:04\n", - " \n", - " \n", - " 270\n", - " 295\n", - " 2570\n", - " 17678.5\n", - " 6\n", - " 2023-10-16 10:19:22+00:00\n", - " 2014-01-24 15:16:17+00:00\n", - " 3551 days 19:03:05\n", - " \n", - " \n", - " 866\n", - " 1221\n", - " 2320\n", - " 9652.0\n", - " 2\n", - " 2022-09-19 12:55:15+00:00\n", - " 2017-03-29 08:00:09+00:00\n", - " 2000 days 04:55:06\n", - " \n", - " \n", - " 1022\n", - " 1429\n", - " 2249\n", - " 3500.0\n", " 4\n", - " 2023-11-06 08:30:37+00:00\n", - " 2014-12-03 14:56:38+00:00\n", - " 3259 days 17:33:59\n", - " \n", - " \n", - " 3922\n", - " 7249\n", - " 1827\n", - " 13385.0\n", + " 41.0\n", " 1\n", - " 2021-10-26 12:28:40+00:00\n", - " 2019-05-07 12:34:56+00:00\n", - " 902 days 23:53:44\n", - " \n", - " \n", - " 54425\n", - " 1070539\n", - " 1800\n", - " 19800.0\n", " 1\n", - " 2022-07-25 12:49:27+00:00\n", - " 2022-05-02 16:09:03+00:00\n", - " 83 days 20:40:24\n", + " 2019-09-19 15:43:49+00:00\n", + " 2021-09-02 18:42:19+00:00\n", + " 714 days 02:58:30\n", + " 4.0\n", " \n", " \n", - " 69520\n", - " 1216801\n", - " 1623\n", - " 12562.0\n", + " 4\n", + " 5\n", " 2\n", - " 2023-09-29 16:34:38+00:00\n", - " 2023-06-16 14:16:04+00:00\n", - " 105 days 02:18:34\n", - " \n", - " \n", - " 30056\n", - " 59330\n", - " 1551\n", - " 0.0\n", + " 19.0\n", " 1\n", - " 2023-11-06 10:22:14+00:00\n", - " 2018-02-02 08:53:51+00:00\n", - " 2103 days 01:28:23\n", - " \n", - " \n", - " 3243\n", - " 5441\n", - " 1544\n", - " 14133.0\n", - " 2\n", - " 2022-09-22 08:21:47+00:00\n", - " 2017-12-14 12:50:23+00:00\n", - " 1742 days 19:31:24\n", - " \n", - " \n", - " 55195\n", - " 1084435\n", - " 1500\n", - " 16500.0\n", " 1\n", - " 2022-09-27 14:32:13+00:00\n", - " 2022-05-18 08:04:41+00:00\n", - " 132 days 06:27:32\n", + " 2019-09-19 15:45:36+00:00\n", + " 2019-09-19 15:45:36+00:00\n", + " 0 days 00:00:00\n", + " 2.0\n", " \n", " \n", - " 28983\n", - " 57816\n", - " 1485\n", - " 0.0\n", - " 2\n", - " 2023-05-22 07:30:55+00:00\n", - " 2019-01-21 14:19:18+00:00\n", - " 1581 days 17:11:37\n", + " ...\n", + " ...\n", + " ...\n", + " ...\n", + " ...\n", + " ...\n", + " ...\n", + " ...\n", + " ...\n", + " ...\n", " \n", " \n", - " 2231\n", - " 2942\n", - " 1307\n", - " 100.0\n", - " 2\n", - " 2023-06-29 09:33:58+00:00\n", - " 2017-10-25 15:06:58+00:00\n", - " 2072 days 18:27:00\n", + " 73513\n", + " 1256133\n", + " 3\n", + " 33.0\n", + " 1\n", + " 1\n", + " 2023-11-08 16:51:19+00:00\n", + " 2023-11-08 16:51:19+00:00\n", + " 0 days 00:00:00\n", + " 3.0\n", " \n", " \n", - " 23\n", - " 24\n", - " 1266\n", - " 0.0\n", - " 2\n", - " 2023-10-19 07:20:48+00:00\n", - " 2015-09-30 16:07:52+00:00\n", - " 2940 days 15:12:56\n", - " \n", - " \n", - " 4513\n", - " 9592\n", - " 1211\n", - " 62.0\n", + " 73514\n", + " 1256134\n", " 4\n", - " 2023-10-17 09:39:40+00:00\n", - " 2018-02-25 07:17:19+00:00\n", - " 2060 days 02:22:21\n", - " \n", - " \n", - " 2936\n", - " 5059\n", - " 1186\n", - " 6308.0\n", - " 3\n", - " 2023-05-22 13:41:22+00:00\n", - " 2018-02-01 11:16:51+00:00\n", - " 1936 days 02:24:31\n", - " \n", - " \n", - " 11484\n", - " 25100\n", - " 1123\n", - " 0.0\n", + " 44.0\n", " 1\n", - " 2021-07-13 07:39:57+00:00\n", - " 2015-12-21 15:38:05+00:00\n", - " 2030 days 16:01:52\n", - " \n", - " \n", - " 934\n", - " 1326\n", - " 1098\n", - " 798.0\n", - " 3\n", - " 2023-02-01 08:39:45+00:00\n", - " 2018-02-13 13:13:48+00:00\n", - " 1813 days 19:25:57\n", - " \n", - " \n", - " 30156\n", - " 59490\n", - " 1088\n", - " 0.0\n", " 1\n", - " 2023-10-05 08:23:50+00:00\n", - " 2019-12-06 12:59:20+00:00\n", - " 1398 days 19:24:30\n", + " 2023-11-08 17:17:51+00:00\n", + " 2023-11-08 17:17:51+00:00\n", + " 0 days 00:00:00\n", + " 4.0\n", " \n", " \n", - " 36478\n", - " 251268\n", - " 1086\n", - " 0.0\n", + " 73515\n", + " 1256135\n", + " 1\n", + " 11.0\n", + " 1\n", + " 1\n", + " 2023-11-08 17:23:54+00:00\n", + " 2023-11-08 17:23:54+00:00\n", + " 0 days 00:00:00\n", + " 1.0\n", + " \n", + " \n", + " 73516\n", + " 1256136\n", " 2\n", - " 2023-06-30 07:22:46+00:00\n", - " 2018-02-02 09:06:22+00:00\n", - " 1973 days 22:16:24\n", + " 22.0\n", + " 1\n", + " 1\n", + " 2023-11-08 18:32:18+00:00\n", + " 2023-11-08 18:32:18+00:00\n", + " 0 days 00:00:00\n", + " 2.0\n", + " \n", + " \n", + " 73517\n", + " 1256137\n", + " 2\n", + " 22.0\n", + " 1\n", + " 1\n", + " 2023-11-08 19:30:28+00:00\n", + " 2023-11-08 19:30:28+00:00\n", + " 0 days 00:00:00\n", + " 2.0\n", " \n", " \n", "\n", + "

73518 rows × 9 columns

\n", "" ], "text/plain": [ " customer_id nb_tickets total_amount nb_suppliers \\\n", "0 1 1256574 8830567.5 7 \n", - "3615 6733 35527 1188.0 4 \n", - "39 41 16263 37642.0 6 \n", - "11 12 5871 38767.0 2 \n", - "32809 63488 5851 64350.0 1 \n", - "3708 6916 5482 51489.5 2 \n", - "32616 63194 4507 13232.0 3 \n", - "78 81 3562 38746.0 1 \n", - "35295 84002 3403 19830.0 4 \n", - "3377 5618 3294 31684.5 1 \n", - "30011 59259 2591 4350.0 3 \n", - "34937 74876 2571 2600.0 2 \n", - "270 295 2570 17678.5 6 \n", - "866 1221 2320 9652.0 2 \n", - "1022 1429 2249 3500.0 4 \n", - "3922 7249 1827 13385.0 1 \n", - "54425 1070539 1800 19800.0 1 \n", - "69520 1216801 1623 12562.0 2 \n", - "30056 59330 1551 0.0 1 \n", - "3243 5441 1544 14133.0 2 \n", - "55195 1084435 1500 16500.0 1 \n", - "28983 57816 1485 0.0 2 \n", - "2231 2942 1307 100.0 2 \n", - "23 24 1266 0.0 2 \n", - "4513 9592 1211 62.0 4 \n", - "2936 5059 1186 6308.0 3 \n", - "11484 25100 1123 0.0 1 \n", - "934 1326 1098 798.0 3 \n", - "30156 59490 1088 0.0 1 \n", - "36478 251268 1086 0.0 2 \n", + "1 2 307 0.0 1 \n", + "2 3 6 110.0 1 \n", + "3 4 4 41.0 1 \n", + "4 5 2 19.0 1 \n", + "... ... ... ... ... \n", + "73513 1256133 3 33.0 1 \n", + "73514 1256134 4 44.0 1 \n", + "73515 1256135 1 11.0 1 \n", + "73516 1256136 2 22.0 1 \n", + "73517 1256137 2 22.0 1 \n", "\n", - " purchase_date_max purchase_date_min \\\n", - "0 2023-11-08 15:59:45+00:00 2013-06-10 10:37:58+00:00 \n", - "3615 2023-11-03 09:42:40+00:00 2015-09-09 13:48:38+00:00 \n", - "39 2023-10-25 09:13:16+00:00 2014-01-23 16:56:57+00:00 \n", - "11 2023-11-04 13:46:59+00:00 2018-04-04 07:46:31+00:00 \n", - "32809 2022-08-25 13:08:38+00:00 2020-08-18 08:32:57+00:00 \n", - "3708 2021-08-26 12:49:17+00:00 2018-03-26 11:13:43+00:00 \n", - "32616 2022-09-07 12:55:33+00:00 2017-11-28 13:52:15+00:00 \n", - "78 2022-08-30 11:51:34+00:00 2017-01-05 13:04:58+00:00 \n", - "35295 2023-11-06 15:59:22+00:00 2021-05-28 10:22:33+00:00 \n", - "3377 2022-02-24 07:47:20+00:00 2018-10-25 11:04:24+00:00 \n", - "30011 2023-06-12 14:05:19+00:00 2019-11-25 08:52:48+00:00 \n", - "34937 2023-10-02 08:13:05+00:00 2018-02-08 12:54:01+00:00 \n", - "270 2023-10-16 10:19:22+00:00 2014-01-24 15:16:17+00:00 \n", - "866 2022-09-19 12:55:15+00:00 2017-03-29 08:00:09+00:00 \n", - "1022 2023-11-06 08:30:37+00:00 2014-12-03 14:56:38+00:00 \n", - "3922 2021-10-26 12:28:40+00:00 2019-05-07 12:34:56+00:00 \n", - "54425 2022-07-25 12:49:27+00:00 2022-05-02 16:09:03+00:00 \n", - "69520 2023-09-29 16:34:38+00:00 2023-06-16 14:16:04+00:00 \n", - "30056 2023-11-06 10:22:14+00:00 2018-02-02 08:53:51+00:00 \n", - "3243 2022-09-22 08:21:47+00:00 2017-12-14 12:50:23+00:00 \n", - "55195 2022-09-27 14:32:13+00:00 2022-05-18 08:04:41+00:00 \n", - "28983 2023-05-22 07:30:55+00:00 2019-01-21 14:19:18+00:00 \n", - "2231 2023-06-29 09:33:58+00:00 2017-10-25 15:06:58+00:00 \n", - "23 2023-10-19 07:20:48+00:00 2015-09-30 16:07:52+00:00 \n", - "4513 2023-10-17 09:39:40+00:00 2018-02-25 07:17:19+00:00 \n", - "2936 2023-05-22 13:41:22+00:00 2018-02-01 11:16:51+00:00 \n", - "11484 2021-07-13 07:39:57+00:00 2015-12-21 15:38:05+00:00 \n", - "934 2023-02-01 08:39:45+00:00 2018-02-13 13:13:48+00:00 \n", - "30156 2023-10-05 08:23:50+00:00 2019-12-06 12:59:20+00:00 \n", - "36478 2023-06-30 07:22:46+00:00 2018-02-02 09:06:22+00:00 \n", + " vente_internet_max purchase_date_min purchase_date_max \\\n", + "0 1 2013-06-10 10:37:58+00:00 2023-11-08 15:59:45+00:00 \n", + "1 0 2018-04-07 12:55:07+00:00 2020-03-08 12:06:43+00:00 \n", + "2 1 2019-09-19 15:15:01+00:00 2023-09-27 09:13:09+00:00 \n", + "3 1 2019-09-19 15:43:49+00:00 2021-09-02 18:42:19+00:00 \n", + "4 1 2019-09-19 15:45:36+00:00 2019-09-19 15:45:36+00:00 \n", + "... ... ... ... \n", + "73513 1 2023-11-08 16:51:19+00:00 2023-11-08 16:51:19+00:00 \n", + "73514 1 2023-11-08 17:17:51+00:00 2023-11-08 17:17:51+00:00 \n", + "73515 1 2023-11-08 17:23:54+00:00 2023-11-08 17:23:54+00:00 \n", + "73516 1 2023-11-08 18:32:18+00:00 2023-11-08 18:32:18+00:00 \n", + "73517 1 2023-11-08 19:30:28+00:00 2023-11-08 19:30:28+00:00 \n", "\n", - " time_between_purchase \n", - "0 3803 days 05:21:47 \n", - "3615 2976 days 19:54:02 \n", - "39 3561 days 16:16:19 \n", - "11 2040 days 06:00:28 \n", - "32809 737 days 04:35:41 \n", - "3708 1249 days 01:35:34 \n", - "32616 1743 days 23:03:18 \n", - "78 2062 days 22:46:36 \n", - "35295 892 days 05:36:49 \n", - "3377 1217 days 20:42:56 \n", - "30011 1295 days 05:12:31 \n", - "34937 2061 days 19:19:04 \n", - "270 3551 days 19:03:05 \n", - "866 2000 days 04:55:06 \n", - "1022 3259 days 17:33:59 \n", - "3922 902 days 23:53:44 \n", - "54425 83 days 20:40:24 \n", - "69520 105 days 02:18:34 \n", - "30056 2103 days 01:28:23 \n", - "3243 1742 days 19:31:24 \n", - "55195 132 days 06:27:32 \n", - "28983 1581 days 17:11:37 \n", - "2231 2072 days 18:27:00 \n", - "23 2940 days 15:12:56 \n", - "4513 2060 days 02:22:21 \n", - "2936 1936 days 02:24:31 \n", - "11484 2030 days 16:01:52 \n", - "934 1813 days 19:25:57 \n", - "30156 1398 days 19:24:30 \n", - "36478 1973 days 22:16:24 " + " time_between_purchase nb_tickets_internet \n", + "0 3803 days 05:21:47 3053.0 \n", + "1 700 days 23:11:36 0.0 \n", + "2 1468 days 17:58:08 6.0 \n", + "3 714 days 02:58:30 4.0 \n", + "4 0 days 00:00:00 2.0 \n", + "... ... ... \n", + "73513 0 days 00:00:00 3.0 \n", + "73514 0 days 00:00:00 4.0 \n", + "73515 0 days 00:00:00 1.0 \n", + "73516 0 days 00:00:00 2.0 \n", + "73517 0 days 00:00:00 2.0 \n", + "\n", + "[73518 rows x 9 columns]" ] }, - "execution_count": 52, + "execution_count": 76, "metadata": {}, "output_type": "execute_result" } ], + "source": [ + "df1_tickets_kpi" + ] + }, + { + "cell_type": "code", + "execution_count": 63, + "id": "de1ecf15-1aa1-4aa2-9467-8ad8e9be5856", + "metadata": {}, + "outputs": [], + "source": [ + " df_tickets_information_copy = df1_products_purchased_reduced.copy()\n", + "\n", + " # Dummy : Canal de vente en ligne\n", + " liste_mots = ['en ligne', 'internet', 'web', 'net', 'vad', 'online'] # vad = vente à distance\n", + " df_tickets_information_copy['vente_internet'] = df_tickets_information_copy['supplier_name'].str.contains('|'.join(liste_mots), case=False).astype(int)\n", + "\n", + " # Proportion de vente en ligne\n", + " prop_vente_internet = df_tickets_information_copy[df_tickets_information_copy['vente_internet'] == 1].groupby('customer_id')['ticket_id'].count().reset_index()\n" + ] + }, + { + "cell_type": "code", + "execution_count": 64, + "id": "9cd36178-11dc-409c-b148-fb1d208c2faf", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
customer_idticket_id
013053
136
244
352
462
.........
5674412561333
5674512561344
5674612561351
5674712561362
5674812561372
\n", + "

56749 rows × 2 columns

\n", + "
" + ], + "text/plain": [ + " customer_id ticket_id\n", + "0 1 3053\n", + "1 3 6\n", + "2 4 4\n", + "3 5 2\n", + "4 6 2\n", + "... ... ...\n", + "56744 1256133 3\n", + "56745 1256134 4\n", + "56746 1256135 1\n", + "56747 1256136 2\n", + "56748 1256137 2\n", + "\n", + "[56749 rows x 2 columns]" + ] + }, + "execution_count": 64, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "prop_vente_internet" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "a7a452a6-cd5e-4c8b-b250-8a7d26e48fad", + "metadata": {}, + "outputs": [], "source": [ "df1_tickets_kpi.sort_values(by='nb_tickets', ascending=False).head(30)" ] @@ -2781,7 +2726,7 @@ }, { "cell_type": "code", - "execution_count": 39, + "execution_count": null, "id": "46de1912-4a66-46e5-8b9e-7768b2d2723b", "metadata": {}, "outputs": [], @@ -2792,339 +2737,20 @@ }, { "cell_type": "code", - "execution_count": 40, + "execution_count": null, "id": "9740d64a-e5eb-4967-a534-ca6177546465", "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
customer_idbirthdatestreet_idis_partnergenderis_email_trueopt_instructure_idprofessionlanguage...average_ticket_baskettotal_pricepurchase_countfirst_buying_datecountryagetenant_idnb_campaignsnb_campaigns_openedtime_to_open
012751NaN2False1TrueTrueNaNNaNNaN...NaNNaN0NaTfrNaN1311NaNNaNNaT
112825NaN2False2TrueTrueNaNNaNNaN...NaNNaN0NaTfrNaN1311NaNNaNNaT
211261NaN2False1TrueTrueNaNNaNNaN...NaNNaN0NaTfrNaN1311NaNNaNNaT
313071NaN2False2TrueTrueNaNNaNNaN...NaNNaN0NaTfrNaN1311NaNNaNNaT
4653061NaN10False2TrueFalseNaNNaNNaN...NaNNaN0NaTNaNNaN131180.02.00 days 19:53:02.500000
\n", - "

5 rows × 28 columns

\n", - "
" - ], - "text/plain": [ - " customer_id birthdate street_id is_partner gender is_email_true \\\n", - "0 12751 NaN 2 False 1 True \n", - "1 12825 NaN 2 False 2 True \n", - "2 11261 NaN 2 False 1 True \n", - "3 13071 NaN 2 False 2 True \n", - "4 653061 NaN 10 False 2 True \n", - "\n", - " opt_in structure_id profession language ... average_ticket_basket \\\n", - "0 True NaN NaN NaN ... NaN \n", - "1 True NaN NaN NaN ... NaN \n", - "2 True NaN NaN NaN ... NaN \n", - "3 True NaN NaN NaN ... NaN \n", - "4 False NaN NaN NaN ... NaN \n", - "\n", - " total_price purchase_count first_buying_date country age tenant_id \\\n", - "0 NaN 0 NaT fr NaN 1311 \n", - "1 NaN 0 NaT fr NaN 1311 \n", - "2 NaN 0 NaT fr NaN 1311 \n", - "3 NaN 0 NaT fr NaN 1311 \n", - "4 NaN 0 NaT NaN NaN 1311 \n", - "\n", - " nb_campaigns nb_campaigns_opened time_to_open \n", - "0 NaN NaN NaT \n", - "1 NaN NaN NaT \n", - "2 NaN NaN NaT \n", - "3 NaN NaN NaT \n", - "4 80.0 2.0 0 days 19:53:02.500000 \n", - "\n", - "[5 rows x 28 columns]" - ] - }, - "execution_count": 40, - "metadata": {}, - "output_type": "execute_result" - } - ], + "outputs": [], "source": [ "df1_customer.head()" ] }, { "cell_type": "code", - "execution_count": 49, + "execution_count": null, "id": "b5c4418c-ad2e-4bb9-bd5c-3b769e9c87d4", "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
customer_idbirthdatestreet_idis_partnergenderis_email_trueopt_instructure_idprofessionlanguagemcp_contact_idlast_buying_datemax_priceticket_sumaverage_pricefidelityaverage_purchase_delayaverage_price_basketaverage_ticket_baskettotal_pricepurchase_countfirst_buying_datecountryagetenant_idnb_campaignsnb_campaigns_openedtime_to_open
582011NaN2False2TrueFalseNaNNaNNaNNaN2023-11-08 03:20:0745.012547757.030122330831-67.79096913.751531.9560878821221.56414722013-06-10 10:37:58+00:00frNaN1311NaNNaNNaT
\n", - "
" - ], - "text/plain": [ - " customer_id birthdate street_id is_partner gender is_email_true \\\n", - "58201 1 NaN 2 False 2 True \n", - "\n", - " opt_in structure_id profession language mcp_contact_id \\\n", - "58201 False NaN NaN NaN NaN \n", - "\n", - " last_buying_date max_price ticket_sum average_price fidelity \\\n", - "58201 2023-11-08 03:20:07 45.0 1254775 7.030122 330831 \n", - "\n", - " average_purchase_delay average_price_basket average_ticket_basket \\\n", - "58201 -67.790969 13.75153 1.956087 \n", - "\n", - " total_price purchase_count first_buying_date country age \\\n", - "58201 8821221.5 641472 2013-06-10 10:37:58+00:00 fr NaN \n", - "\n", - " tenant_id nb_campaigns nb_campaigns_opened time_to_open \n", - "58201 1311 NaN NaN NaT " - ] - }, - "execution_count": 49, - "metadata": {}, - "output_type": "execute_result" - } - ], + "outputs": [], "source": [ "pd.set_option('display.max_columns', None)\n", "\n", @@ -3134,7 +2760,7 @@ }, { "cell_type": "code", - "execution_count": 41, + "execution_count": null, "id": "1e42a790-b215-4107-a969-85005da06ebd", "metadata": {}, "outputs": [], @@ -3145,7 +2771,7 @@ }, { "cell_type": "code", - "execution_count": 42, + "execution_count": null, "id": "d950f24d-a5d1-4f1e-aeaa-ca826470365f", "metadata": {}, "outputs": [],