From 9b7d6a57850aabbd660835c0d4934330689040b5 Mon Sep 17 00:00:00 2001 From: ajoubrel-ensae Date: Wed, 7 Feb 2024 22:28:55 +0000 Subject: [PATCH 1/8] Ajout KPI + debut traitement NLP --- 0_Cleaning_and_merge.ipynb | 1545 ++++++++++++++++++++++++++++++----- Exploration_billet_AJ.ipynb | 527 ++++++------ 2 files changed, 1574 insertions(+), 498 deletions(-) diff --git a/0_Cleaning_and_merge.ipynb b/0_Cleaning_and_merge.ipynb index 9f3f20b..99d5ea7 100644 --- a/0_Cleaning_and_merge.ipynb +++ b/0_Cleaning_and_merge.ipynb @@ -79,7 +79,7 @@ "name": "stderr", "output_type": "stream", "text": [ - "/tmp/ipykernel_492/4081512283.py:10: DtypeWarning: Columns (1) have mixed types. Specify dtype option on import or set low_memory=False.\n", + "/tmp/ipykernel_15815/4081512283.py:10: DtypeWarning: Columns (1) have mixed types. Specify dtype option on import or set low_memory=False.\n", " df = pd.read_csv(file_in)\n" ] } @@ -242,17 +242,17 @@ "name": "stderr", "output_type": "stream", "text": [ - "/tmp/ipykernel_492/1591303091.py:5: SettingWithCopyWarning: \n", + "/tmp/ipykernel_15815/1591303091.py:5: SettingWithCopyWarning: \n", "A value is trying to be set on a copy of a slice from a DataFrame\n", "\n", "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n", " tickets.rename(columns = {'id' : 'ticket_id'}, inplace = True)\n", - "/tmp/ipykernel_492/1591303091.py:9: SettingWithCopyWarning: \n", + "/tmp/ipykernel_15815/1591303091.py:9: SettingWithCopyWarning: \n", "A value is trying to be set on a copy of a slice from a DataFrame\n", "\n", "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n", " suppliers.rename(columns = {'name' : 'supplier_name'}, inplace = True)\n", - "/tmp/ipykernel_492/1591303091.py:13: SettingWithCopyWarning: \n", + "/tmp/ipykernel_15815/1591303091.py:13: SettingWithCopyWarning: \n", "A value is trying to be set on a copy of a slice from a DataFrame\n", "\n", "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n", @@ -386,169 +386,6 @@ "df1_ticket_information.head()" ] }, - { - "cell_type": "markdown", - "id": "37499eae-1a7f-4dce-83b0-ff942ccf7a9d", - "metadata": {}, - "source": [ - "### KPI tickets" - ] - }, - { - "cell_type": "code", - "execution_count": 11, - "id": "043303fe-e90f-4689-a2a9-5d690555a045", - "metadata": {}, - "outputs": [], - "source": [ - "def tickets_kpi_function(tickets_information = None):\n", - " tickets_information_copy = tickets_information.copy()\n", - " tickets_information_copy['purchase_date_max'] = tickets_information_copy['purchase_date']\n", - " tickets_kpi = (tickets_information_copy[['product_id', 'customer_id', 'ticket_id','supplier_name', 'purchase_date', 'purchase_date_max']]\n", - " .groupby(['product_id', 'customer_id'])\n", - " .agg({'ticket_id': 'count', \n", - " 'supplier_name': 'nunique',\n", - " 'purchase_date_max' : 'max',\n", - " 'purchase_date' : 'min'})\n", - " .reset_index()\n", - " )\n", - " \n", - " tickets_kpi.rename(columns = {'ticket_id' : 'nb_tickets', \n", - " 'supplier_name' : 'nb_suppliers', \n", - " 'purchase_date' : 'purchase_date_min'}, inplace = True)\n", - " \n", - " tickets_kpi['time_between_purchase'] = tickets_kpi['purchase_date_max'] - tickets_kpi['purchase_date_min']\n", - " \n", - " return tickets_kpi\n", - " " - ] - }, - { - "cell_type": "code", - "execution_count": 12, - "id": "5882234a-1ed5-4269-87a6-0d75613476e3", - "metadata": {}, - "outputs": [], - "source": [ - "df1_tickets_kpi = tickets_kpi_function(tickets_information = df1_ticket_information)" - ] - }, - { - "cell_type": "code", - "execution_count": 13, - "id": "a7a452a6-cd5e-4c8b-b250-8a7d26e48fad", - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
product_idcustomer_idnb_ticketsnb_supplierspurchase_date_maxpurchase_date_mintime_between_purchase
01073102805422019-06-05 14:37:13+00:002019-06-05 14:18:38+00:000 days 00:18:35
111008954355112017-02-17 13:32:51+00:002017-02-17 13:32:51+00:000 days 00:00:00
211008954356112017-03-02 14:36:16+00:002017-03-02 14:36:16+00:000 days 00:00:00
311008954357112017-03-06 15:16:41+00:002017-03-06 15:16:41+00:000 days 00:00:00
411008954358112017-03-13 16:07:27+00:002017-03-13 16:07:27+00:000 days 00:00:00
\n", - "
" - ], - "text/plain": [ - " product_id customer_id nb_tickets nb_suppliers \\\n", - "0 107310 2805 4 2 \n", - "1 110089 54355 1 1 \n", - "2 110089 54356 1 1 \n", - "3 110089 54357 1 1 \n", - "4 110089 54358 1 1 \n", - "\n", - " purchase_date_max purchase_date_min time_between_purchase \n", - "0 2019-06-05 14:37:13+00:00 2019-06-05 14:18:38+00:00 0 days 00:18:35 \n", - "1 2017-02-17 13:32:51+00:00 2017-02-17 13:32:51+00:00 0 days 00:00:00 \n", - "2 2017-03-02 14:36:16+00:00 2017-03-02 14:36:16+00:00 0 days 00:00:00 \n", - "3 2017-03-06 15:16:41+00:00 2017-03-06 15:16:41+00:00 0 days 00:00:00 \n", - "4 2017-03-13 16:07:27+00:00 2017-03-13 16:07:27+00:00 0 days 00:00:00 " - ] - }, - "execution_count": 13, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "df1_tickets_kpi.head()" - ] - }, { "cell_type": "markdown", "id": "096e47f4-1d65-4575-989d-83227eedad2b", @@ -559,7 +396,7 @@ }, { "cell_type": "code", - "execution_count": 14, + "execution_count": 11, "id": "baed146a-9d3a-4397-a812-3d50c9a2f038", "metadata": {}, "outputs": [], @@ -588,7 +425,7 @@ }, { "cell_type": "code", - "execution_count": 15, + "execution_count": 12, "id": "5fbfd88b-b94c-489c-9201-670e96e453e7", "metadata": {}, "outputs": [ @@ -596,7 +433,7 @@ "name": "stderr", "output_type": "stream", "text": [ - "/tmp/ipykernel_492/3848597476.py:4: SettingWithCopyWarning: \n", + "/tmp/ipykernel_15815/3848597476.py:4: SettingWithCopyWarning: \n", "A value is trying to be set on a copy of a slice from a DataFrame\n", "\n", "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n", @@ -610,7 +447,7 @@ }, { "cell_type": "code", - "execution_count": 16, + "execution_count": 13, "id": "b4f05142-2a22-42ef-a60d-f23cc4b5cb09", "metadata": {}, "outputs": [ @@ -677,7 +514,7 @@ "consentement optout b2c 34523" ] }, - "execution_count": 16, + "execution_count": 13, "metadata": {}, "output_type": "execute_result" } @@ -688,7 +525,7 @@ }, { "cell_type": "code", - "execution_count": 17, + "execution_count": 14, "id": "4417ff51-f501-4ab9-a192-4ab75764a8ed", "metadata": { "scrolled": true @@ -757,7 +594,7 @@ "DDCP MD Procès du Siècle 1684" ] }, - "execution_count": 17, + "execution_count": 14, "metadata": {}, "output_type": "execute_result" } @@ -777,7 +614,7 @@ }, { "cell_type": "code", - "execution_count": 18, + "execution_count": 15, "id": "d883cc7b-ac43-4485-b86f-eaf595fbad85", "metadata": {}, "outputs": [], @@ -802,7 +639,7 @@ }, { "cell_type": "code", - "execution_count": 19, + "execution_count": 16, "id": "c8552dd6-52c5-4431-b43d-3cd6c578fd9f", "metadata": {}, "outputs": [ @@ -810,19 +647,19 @@ "name": "stderr", "output_type": "stream", "text": [ - "/tmp/ipykernel_492/1967867975.py:15: SettingWithCopyWarning: \n", + "/tmp/ipykernel_15815/1967867975.py:15: SettingWithCopyWarning: \n", "A value is trying to be set on a copy of a slice from a DataFrame.\n", "Try using .loc[row_indexer,col_indexer] = value instead\n", "\n", "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n", " df[column_name] = pd.to_datetime(df[column_name], utc = True, format = 'ISO8601')\n", - "/tmp/ipykernel_492/1967867975.py:15: SettingWithCopyWarning: \n", + "/tmp/ipykernel_15815/1967867975.py:15: SettingWithCopyWarning: \n", "A value is trying to be set on a copy of a slice from a DataFrame.\n", "Try using .loc[row_indexer,col_indexer] = value instead\n", "\n", "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n", " df[column_name] = pd.to_datetime(df[column_name], utc = True, format = 'ISO8601')\n", - "/tmp/ipykernel_492/1967867975.py:15: SettingWithCopyWarning: \n", + "/tmp/ipykernel_15815/1967867975.py:15: SettingWithCopyWarning: \n", "A value is trying to be set on a copy of a slice from a DataFrame.\n", "Try using .loc[row_indexer,col_indexer] = value instead\n", "\n", @@ -837,7 +674,7 @@ }, { "cell_type": "code", - "execution_count": 20, + "execution_count": 17, "id": "c24457e7-3cad-451a-a65b-7373b656bd6e", "metadata": { "scrolled": true @@ -957,7 +794,7 @@ "4 404 2021-03-27 23:00:00+00:00 " ] }, - "execution_count": 20, + "execution_count": 17, "metadata": {}, "output_type": "execute_result" } @@ -968,7 +805,7 @@ }, { "cell_type": "code", - "execution_count": 21, + "execution_count": 18, "id": "e2c88552-b863-47a2-be23-8d2898fb28bc", "metadata": {}, "outputs": [], @@ -1002,7 +839,7 @@ }, { "cell_type": "code", - "execution_count": 22, + "execution_count": 19, "id": "24537647-bc29-4777-9848-ac4120a4aa60", "metadata": {}, "outputs": [ @@ -1010,7 +847,7 @@ "name": "stderr", "output_type": "stream", "text": [ - "/tmp/ipykernel_492/3700263836.py:11: SettingWithCopyWarning: \n", + "/tmp/ipykernel_15815/3700263836.py:11: SettingWithCopyWarning: \n", "A value is trying to be set on a copy of a slice from a DataFrame\n", "\n", "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n", @@ -1024,7 +861,7 @@ }, { "cell_type": "code", - "execution_count": 23, + "execution_count": 20, "id": "6be2a9a6-056b-4e19-8c26-a18ba3df36b3", "metadata": {}, "outputs": [ @@ -1104,7 +941,7 @@ "4 6 20 0.0 NaT" ] }, - "execution_count": 23, + "execution_count": 20, "metadata": {}, "output_type": "execute_result" } @@ -1131,7 +968,7 @@ }, { "cell_type": "code", - "execution_count": 24, + "execution_count": 21, "id": "30488a40-1b38-4b9a-9d3b-26a0597c5e6d", "metadata": {}, "outputs": [], @@ -1142,7 +979,7 @@ }, { "cell_type": "code", - "execution_count": 25, + "execution_count": 22, "id": "607eb4b4-eed9-4b50-b823-f75c116dd37c", "metadata": {}, "outputs": [], @@ -1213,7 +1050,7 @@ }, { "cell_type": "code", - "execution_count": 26, + "execution_count": 23, "id": "350b09b9-451f-4d47-81fe-f34b892db027", "metadata": {}, "outputs": [], @@ -1301,7 +1138,7 @@ }, { "cell_type": "code", - "execution_count": 27, + "execution_count": 24, "id": "0fccc8ef-e575-4857-a401-94a7274394df", "metadata": {}, "outputs": [ @@ -1454,7 +1291,7 @@ "4 indiv entrées tp " ] }, - "execution_count": 27, + "execution_count": 24, "metadata": {}, "output_type": "execute_result" } @@ -1466,7 +1303,7 @@ }, { "cell_type": "code", - "execution_count": 28, + "execution_count": 25, "id": "779d8aaf-6668-4f66-8852-847304407ea3", "metadata": {}, "outputs": [ @@ -1636,7 +1473,7 @@ "4 spectacle vivant mucem " ] }, - "execution_count": 28, + "execution_count": 25, "metadata": {}, "output_type": "execute_result" } @@ -1648,7 +1485,7 @@ }, { "cell_type": "code", - "execution_count": 29, + "execution_count": 26, "id": "7714fa32-303b-4ea7-b174-3fd0fcab5af0", "metadata": {}, "outputs": [ @@ -1747,7 +1584,7 @@ "4 37 383 269 1" ] }, - "execution_count": 29, + "execution_count": 26, "metadata": {}, "output_type": "execute_result" } @@ -1767,7 +1604,7 @@ }, { "cell_type": "code", - "execution_count": 30, + "execution_count": 27, "id": "15a62ed6-35e4-4abc-aeef-a7daeec0a4ba", "metadata": {}, "outputs": [], @@ -1789,13 +1626,13 @@ " products_global = order_columns_id(products_global)\n", "\n", " # remove useless columns \n", - " products_global = products_global.drop(columns = ['type_of_id', 'name_events', 'name_seasons', 'name_categories'])\n", + " products_global = products_global.drop(columns = ['type_of_id']) # 'name_events', 'name_seasons', 'name_categories'\n", " return products_global" ] }, { "cell_type": "code", - "execution_count": 31, + "execution_count": 28, "id": "89dc9685-1de9-4ce3-a6c0-8d7f1931a951", "metadata": {}, "outputs": [ @@ -1849,12 +1686,15 @@ " id_representation_cap\n", " season_id\n", " facility_id\n", - " event_type_id\n", + " ...\n", " event_type_key_id\n", " facility_key_id\n", " street_id\n", " amount\n", " is_full_price\n", + " name_categories\n", + " name_events\n", + " name_seasons\n", " name_event_types\n", " name_facilities\n", " \n", @@ -1872,12 +1712,15 @@ " 8789\n", " 4\n", " 1\n", - " 2\n", + " ...\n", " 5\n", " 1\n", " 1\n", " 9.0\n", " False\n", + " indiv activité tr\n", + " visite-jeu \"le classico des minots\" (1h30)\n", + " 2017\n", " offre muséale individuel\n", " mucem\n", " \n", @@ -1893,12 +1736,15 @@ " 390\n", " 2\n", " 1\n", - " 2\n", + " ...\n", " 2\n", " 1\n", " 1\n", " 9.5\n", " False\n", + " indiv entrées tp\n", + " billet mucem picasso\n", + " 2016\n", " offre muséale individuel\n", " mucem\n", " \n", @@ -1914,12 +1760,15 @@ " 395\n", " 2\n", " 1\n", - " 2\n", + " ...\n", " 2\n", " 1\n", " 1\n", " 11.5\n", " False\n", + " indiv entrées tp\n", + " billet mucem picasso\n", + " 2016\n", " offre muséale individuel\n", " mucem\n", " \n", @@ -1935,12 +1784,15 @@ " 120199\n", " 1754\n", " 1\n", - " 2\n", + " ...\n", " 4\n", " 1\n", " 1\n", " 8.0\n", " False\n", + " indiv entrées tr\n", + " NaN\n", + " NaN\n", " offre muséale individuel\n", " mucem\n", " \n", @@ -1956,17 +1808,21 @@ " 21\n", " 4\n", " 1\n", - " 3\n", + " ...\n", " 6\n", " 1\n", " 1\n", " 8.5\n", " False\n", + " indiv entrées tp\n", + " non défini\n", + " 2017\n", " non défini\n", " mucem\n", " \n", " \n", "\n", + "

5 rows × 21 columns

\n", "" ], "text/plain": [ @@ -1984,22 +1840,38 @@ "3 156773 1 12365 120199 \n", "4 1175 1 8 21 \n", "\n", - " season_id facility_id event_type_id event_type_key_id facility_key_id \\\n", - "0 4 1 2 5 1 \n", - "1 2 1 2 2 1 \n", - "2 2 1 2 2 1 \n", - "3 1754 1 2 4 1 \n", - "4 4 1 3 6 1 \n", + " season_id facility_id ... event_type_key_id facility_key_id street_id \\\n", + "0 4 1 ... 5 1 1 \n", + "1 2 1 ... 2 1 1 \n", + "2 2 1 ... 2 1 1 \n", + "3 1754 1 ... 4 1 1 \n", + "4 4 1 ... 6 1 1 \n", "\n", - " street_id amount is_full_price name_event_types name_facilities \n", - "0 1 9.0 False offre muséale individuel mucem \n", - "1 1 9.5 False offre muséale individuel mucem \n", - "2 1 11.5 False offre muséale individuel mucem \n", - "3 1 8.0 False offre muséale individuel mucem \n", - "4 1 8.5 False non défini mucem " + " amount is_full_price name_categories \\\n", + "0 9.0 False indiv activité tr \n", + "1 9.5 False indiv entrées tp \n", + "2 11.5 False indiv entrées tp \n", + "3 8.0 False indiv entrées tr \n", + "4 8.5 False indiv entrées tp \n", + "\n", + " name_events name_seasons \\\n", + "0 visite-jeu \"le classico des minots\" (1h30) 2017 \n", + "1 billet mucem picasso 2016 \n", + "2 billet mucem picasso 2016 \n", + "3 NaN NaN \n", + "4 non défini 2017 \n", + "\n", + " name_event_types name_facilities \n", + "0 offre muséale individuel mucem \n", + "1 offre muséale individuel mucem \n", + "2 offre muséale individuel mucem \n", + "3 offre muséale individuel mucem \n", + "4 non défini mucem \n", + "\n", + "[5 rows x 21 columns]" ] }, - "execution_count": 31, + "execution_count": 28, "metadata": {}, "output_type": "execute_result" } @@ -2011,7 +1883,7 @@ }, { "cell_type": "code", - "execution_count": 32, + "execution_count": 29, "id": "98f78cd5-b694-4cc6-b033-20170aa13e8d", "metadata": {}, "outputs": [], @@ -2020,6 +1892,885 @@ "df1_products_purchased = pd.merge(df1_ticket_information, products_global, left_on = 'product_id', right_on = 'id_products', how = 'inner')" ] }, + { + "cell_type": "code", + "execution_count": null, + "id": "52db7bcb-3fb7-48e5-b612-4e22bdab4a94", + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "markdown", + "id": "d4dcfbe0-c6ce-497e-b75e-dc9e938801b2", + "metadata": {}, + "source": [ + "### KPI tickets" + ] + }, + { + "cell_type": "code", + "execution_count": 30, + "id": "665a5925-9c0e-425a-8f11-c33a0a9ec444", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "Index(['ticket_id', 'product_id', 'is_from_subscription', 'supplier_name',\n", + " 'type_of_ticket_name', 'children', 'purchase_date', 'customer_id',\n", + " 'id_products', 'representation_id', 'pricing_formula_id', 'category_id',\n", + " 'products_group_id', 'product_pack_id', 'event_id',\n", + " 'id_representation_cap', 'season_id', 'facility_id', 'event_type_id',\n", + " 'event_type_key_id', 'facility_key_id', 'street_id', 'amount',\n", + " 'is_full_price', 'name_categories', 'name_events', 'name_seasons',\n", + " 'name_event_types', 'name_facilities'],\n", + " dtype='object')" + ] + }, + "execution_count": 30, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df1_products_purchased.columns" + ] + }, + { + "cell_type": "code", + "execution_count": 31, + "id": "b913a69e-3146-4919-b5f6-a6108532bffa", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "array(['spectacle vivant', 'offre muséale individuel', 'formule adhésion',\n", + " 'offre muséale groupe'], dtype=object)" + ] + }, + "execution_count": 31, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df1_products_purchased['name_event_types'].unique()" + ] + }, + { + "cell_type": "code", + "execution_count": 32, + "id": "e01e8cf9-1187-4a4b-993d-b7b4321cd8f0", + "metadata": {}, + "outputs": [], + "source": [ + "df1_products_purchased_reduced = df1_products_purchased[['ticket_id', 'customer_id', 'event_type_id', 'supplier_name', 'purchase_date', 'type_of_ticket_name', 'amount', 'children', 'is_full_price', 'name_event_types', 'name_facilities', 'name_categories', 'name_events', 'name_seasons']]" + ] + }, + { + "cell_type": "code", + "execution_count": 53, + "id": "3d8b0875-b409-44ce-b688-d9d6758782d3", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
ticket_idcustomer_idevent_type_idsupplier_namepurchase_datetype_of_ticket_nameamountchildrenis_full_pricename_event_typesname_facilitiesname_categoriesname_eventsname_seasons
013070859481874vente en ligne2018-12-28 14:47:50+00:00Atelier8.0pricing_formulaFalsespectacle vivantmucemindiv prog enfantl'école des magiciens2018
113070855481874vente en ligne2018-12-28 14:47:50+00:00Atelier8.0pricing_formulaFalsespectacle vivantmucemindiv prog enfantl'école des magiciens2018
213070856481874vente en ligne2018-12-28 14:47:50+00:00Atelier8.0pricing_formulaFalsespectacle vivantmucemindiv prog enfantl'école des magiciens2018
313070857481874vente en ligne2018-12-28 14:47:50+00:00Atelier8.0pricing_formulaFalsespectacle vivantmucemindiv prog enfantl'école des magiciens2018
413070858481874vente en ligne2018-12-28 14:47:50+00:00Atelier8.0pricing_formulaFalsespectacle vivantmucemindiv prog enfantl'école des magiciens2018
.............................................
182666718643494814vad2022-08-02 12:18:16+00:00Billet en nombre11.0pricing_formulaFalsespectacle vivantmucemen nb entrées trNaN2022
182666818643495814vad2022-08-02 12:18:16+00:00Billet en nombre11.0pricing_formulaFalsespectacle vivantmucemen nb entrées trNaN2022
182666918643496814vad2022-08-02 12:18:16+00:00Billet en nombre11.0pricing_formulaFalsespectacle vivantmucemen nb entrées trNaN2022
182667018643497814vad2022-08-02 12:18:16+00:00Billet en nombre11.0pricing_formulaFalsespectacle vivantmucemen nb entrées trNaN2022
182667119853111627634vad2022-11-04 14:25:42+00:00Billet en nombre0.0pricing_formulaFalsespectacle vivantmucemindiv entrées grNaN2022
\n", + "

1826672 rows × 14 columns

\n", + "
" + ], + "text/plain": [ + " ticket_id customer_id event_type_id supplier_name \\\n", + "0 13070859 48187 4 vente en ligne \n", + "1 13070855 48187 4 vente en ligne \n", + "2 13070856 48187 4 vente en ligne \n", + "3 13070857 48187 4 vente en ligne \n", + "4 13070858 48187 4 vente en ligne \n", + "... ... ... ... ... \n", + "1826667 18643494 81 4 vad \n", + "1826668 18643495 81 4 vad \n", + "1826669 18643496 81 4 vad \n", + "1826670 18643497 81 4 vad \n", + "1826671 19853111 62763 4 vad \n", + "\n", + " purchase_date type_of_ticket_name amount \\\n", + "0 2018-12-28 14:47:50+00:00 Atelier 8.0 \n", + "1 2018-12-28 14:47:50+00:00 Atelier 8.0 \n", + "2 2018-12-28 14:47:50+00:00 Atelier 8.0 \n", + "3 2018-12-28 14:47:50+00:00 Atelier 8.0 \n", + "4 2018-12-28 14:47:50+00:00 Atelier 8.0 \n", + "... ... ... ... \n", + "1826667 2022-08-02 12:18:16+00:00 Billet en nombre 11.0 \n", + "1826668 2022-08-02 12:18:16+00:00 Billet en nombre 11.0 \n", + "1826669 2022-08-02 12:18:16+00:00 Billet en nombre 11.0 \n", + "1826670 2022-08-02 12:18:16+00:00 Billet en nombre 11.0 \n", + "1826671 2022-11-04 14:25:42+00:00 Billet en nombre 0.0 \n", + "\n", + " children is_full_price name_event_types name_facilities \\\n", + "0 pricing_formula False spectacle vivant mucem \n", + "1 pricing_formula False spectacle vivant mucem \n", + "2 pricing_formula False spectacle vivant mucem \n", + "3 pricing_formula False spectacle vivant mucem \n", + "4 pricing_formula False spectacle vivant mucem \n", + "... ... ... ... ... \n", + "1826667 pricing_formula False spectacle vivant mucem \n", + "1826668 pricing_formula False spectacle vivant mucem \n", + "1826669 pricing_formula False spectacle vivant mucem \n", + "1826670 pricing_formula False spectacle vivant mucem \n", + "1826671 pricing_formula False spectacle vivant mucem \n", + "\n", + " name_categories name_events name_seasons \n", + "0 indiv prog enfant l'école des magiciens 2018 \n", + "1 indiv prog enfant l'école des magiciens 2018 \n", + "2 indiv prog enfant l'école des magiciens 2018 \n", + "3 indiv prog enfant l'école des magiciens 2018 \n", + "4 indiv prog enfant l'école des magiciens 2018 \n", + "... ... ... ... \n", + "1826667 en nb entrées tr NaN 2022 \n", + "1826668 en nb entrées tr NaN 2022 \n", + "1826669 en nb entrées tr NaN 2022 \n", + "1826670 en nb entrées tr NaN 2022 \n", + "1826671 indiv entrées gr NaN 2022 \n", + "\n", + "[1826672 rows x 14 columns]" + ] + }, + "execution_count": 53, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# Importance des suppliers\n", + "df1_products_purchased_reduced" + ] + }, + { + "cell_type": "code", + "execution_count": 35, + "id": "2bda0b97-b28b-4070-a57d-aeab0e2f7dfe", + "metadata": {}, + "outputs": [], + "source": [ + "# Nombre de client assistant à plus de 2 type d'événement\n", + "nb_event_types = df1_products_purchased_reduced[['customer_id', 'name_event_types']].groupby('customer_id').nunique()\n" + ] + }, + { + "cell_type": "code", + "execution_count": 50, + "id": "043303fe-e90f-4689-a2a9-5d690555a045", + "metadata": {}, + "outputs": [], + "source": [ + "def tickets_kpi_function(tickets_information = None):\n", + " tickets_information_copy = tickets_information.copy()\n", + " tickets_information_copy['purchase_date_max'] = tickets_information_copy['purchase_date']\n", + " tickets_kpi = (tickets_information_copy[['event_type_id', 'customer_id', 'ticket_id','supplier_name', 'purchase_date', 'purchase_date_max', 'amount']]\n", + " .groupby([ 'customer_id']) # 'event_type_id',\n", + " .agg({'ticket_id': 'count', \n", + " 'amount' : 'sum',\n", + " 'supplier_name': 'nunique',\n", + " 'purchase_date_max' : 'max',\n", + " 'purchase_date' : 'min'})\n", + " .reset_index()\n", + " )\n", + " \n", + " tickets_kpi.rename(columns = {'ticket_id' : 'nb_tickets', \n", + " 'amount' : 'total_amount',\n", + " 'supplier_name' : 'nb_suppliers', \n", + " 'purchase_date' : 'purchase_date_min'}, inplace = True)\n", + " \n", + " tickets_kpi['time_between_purchase'] = tickets_kpi['purchase_date_max'] - tickets_kpi['purchase_date_min']\n", + " \n", + " return tickets_kpi\n", + " " + ] + }, + { + "cell_type": "code", + "execution_count": 51, + "id": "5882234a-1ed5-4269-87a6-0d75613476e3", + "metadata": {}, + "outputs": [], + "source": [ + "df1_tickets_kpi = tickets_kpi_function(tickets_information = df1_products_purchased_reduced)" + ] + }, + { + "cell_type": "code", + "execution_count": 52, + "id": "a7a452a6-cd5e-4c8b-b250-8a7d26e48fad", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
customer_idnb_ticketstotal_amountnb_supplierspurchase_date_maxpurchase_date_mintime_between_purchase
0112565748830567.572023-11-08 15:59:45+00:002013-06-10 10:37:58+00:003803 days 05:21:47
36156733355271188.042023-11-03 09:42:40+00:002015-09-09 13:48:38+00:002976 days 19:54:02
39411626337642.062023-10-25 09:13:16+00:002014-01-23 16:56:57+00:003561 days 16:16:19
1112587138767.022023-11-04 13:46:59+00:002018-04-04 07:46:31+00:002040 days 06:00:28
3280963488585164350.012022-08-25 13:08:38+00:002020-08-18 08:32:57+00:00737 days 04:35:41
37086916548251489.522021-08-26 12:49:17+00:002018-03-26 11:13:43+00:001249 days 01:35:34
3261663194450713232.032022-09-07 12:55:33+00:002017-11-28 13:52:15+00:001743 days 23:03:18
7881356238746.012022-08-30 11:51:34+00:002017-01-05 13:04:58+00:002062 days 22:46:36
3529584002340319830.042023-11-06 15:59:22+00:002021-05-28 10:22:33+00:00892 days 05:36:49
33775618329431684.512022-02-24 07:47:20+00:002018-10-25 11:04:24+00:001217 days 20:42:56
300115925925914350.032023-06-12 14:05:19+00:002019-11-25 08:52:48+00:001295 days 05:12:31
349377487625712600.022023-10-02 08:13:05+00:002018-02-08 12:54:01+00:002061 days 19:19:04
270295257017678.562023-10-16 10:19:22+00:002014-01-24 15:16:17+00:003551 days 19:03:05
866122123209652.022022-09-19 12:55:15+00:002017-03-29 08:00:09+00:002000 days 04:55:06
1022142922493500.042023-11-06 08:30:37+00:002014-12-03 14:56:38+00:003259 days 17:33:59
39227249182713385.012021-10-26 12:28:40+00:002019-05-07 12:34:56+00:00902 days 23:53:44
544251070539180019800.012022-07-25 12:49:27+00:002022-05-02 16:09:03+00:0083 days 20:40:24
695201216801162312562.022023-09-29 16:34:38+00:002023-06-16 14:16:04+00:00105 days 02:18:34
300565933015510.012023-11-06 10:22:14+00:002018-02-02 08:53:51+00:002103 days 01:28:23
32435441154414133.022022-09-22 08:21:47+00:002017-12-14 12:50:23+00:001742 days 19:31:24
551951084435150016500.012022-09-27 14:32:13+00:002022-05-18 08:04:41+00:00132 days 06:27:32
289835781614850.022023-05-22 07:30:55+00:002019-01-21 14:19:18+00:001581 days 17:11:37
223129421307100.022023-06-29 09:33:58+00:002017-10-25 15:06:58+00:002072 days 18:27:00
232412660.022023-10-19 07:20:48+00:002015-09-30 16:07:52+00:002940 days 15:12:56
45139592121162.042023-10-17 09:39:40+00:002018-02-25 07:17:19+00:002060 days 02:22:21
2936505911866308.032023-05-22 13:41:22+00:002018-02-01 11:16:51+00:001936 days 02:24:31
114842510011230.012021-07-13 07:39:57+00:002015-12-21 15:38:05+00:002030 days 16:01:52
93413261098798.032023-02-01 08:39:45+00:002018-02-13 13:13:48+00:001813 days 19:25:57
301565949010880.012023-10-05 08:23:50+00:002019-12-06 12:59:20+00:001398 days 19:24:30
3647825126810860.022023-06-30 07:22:46+00:002018-02-02 09:06:22+00:001973 days 22:16:24
\n", + "
" + ], + "text/plain": [ + " customer_id nb_tickets total_amount nb_suppliers \\\n", + "0 1 1256574 8830567.5 7 \n", + "3615 6733 35527 1188.0 4 \n", + "39 41 16263 37642.0 6 \n", + "11 12 5871 38767.0 2 \n", + "32809 63488 5851 64350.0 1 \n", + "3708 6916 5482 51489.5 2 \n", + "32616 63194 4507 13232.0 3 \n", + "78 81 3562 38746.0 1 \n", + "35295 84002 3403 19830.0 4 \n", + "3377 5618 3294 31684.5 1 \n", + "30011 59259 2591 4350.0 3 \n", + "34937 74876 2571 2600.0 2 \n", + "270 295 2570 17678.5 6 \n", + "866 1221 2320 9652.0 2 \n", + "1022 1429 2249 3500.0 4 \n", + "3922 7249 1827 13385.0 1 \n", + "54425 1070539 1800 19800.0 1 \n", + "69520 1216801 1623 12562.0 2 \n", + "30056 59330 1551 0.0 1 \n", + "3243 5441 1544 14133.0 2 \n", + "55195 1084435 1500 16500.0 1 \n", + "28983 57816 1485 0.0 2 \n", + "2231 2942 1307 100.0 2 \n", + "23 24 1266 0.0 2 \n", + "4513 9592 1211 62.0 4 \n", + "2936 5059 1186 6308.0 3 \n", + "11484 25100 1123 0.0 1 \n", + "934 1326 1098 798.0 3 \n", + "30156 59490 1088 0.0 1 \n", + "36478 251268 1086 0.0 2 \n", + "\n", + " purchase_date_max purchase_date_min \\\n", + "0 2023-11-08 15:59:45+00:00 2013-06-10 10:37:58+00:00 \n", + "3615 2023-11-03 09:42:40+00:00 2015-09-09 13:48:38+00:00 \n", + "39 2023-10-25 09:13:16+00:00 2014-01-23 16:56:57+00:00 \n", + "11 2023-11-04 13:46:59+00:00 2018-04-04 07:46:31+00:00 \n", + "32809 2022-08-25 13:08:38+00:00 2020-08-18 08:32:57+00:00 \n", + "3708 2021-08-26 12:49:17+00:00 2018-03-26 11:13:43+00:00 \n", + "32616 2022-09-07 12:55:33+00:00 2017-11-28 13:52:15+00:00 \n", + "78 2022-08-30 11:51:34+00:00 2017-01-05 13:04:58+00:00 \n", + "35295 2023-11-06 15:59:22+00:00 2021-05-28 10:22:33+00:00 \n", + "3377 2022-02-24 07:47:20+00:00 2018-10-25 11:04:24+00:00 \n", + "30011 2023-06-12 14:05:19+00:00 2019-11-25 08:52:48+00:00 \n", + "34937 2023-10-02 08:13:05+00:00 2018-02-08 12:54:01+00:00 \n", + "270 2023-10-16 10:19:22+00:00 2014-01-24 15:16:17+00:00 \n", + "866 2022-09-19 12:55:15+00:00 2017-03-29 08:00:09+00:00 \n", + "1022 2023-11-06 08:30:37+00:00 2014-12-03 14:56:38+00:00 \n", + "3922 2021-10-26 12:28:40+00:00 2019-05-07 12:34:56+00:00 \n", + "54425 2022-07-25 12:49:27+00:00 2022-05-02 16:09:03+00:00 \n", + "69520 2023-09-29 16:34:38+00:00 2023-06-16 14:16:04+00:00 \n", + "30056 2023-11-06 10:22:14+00:00 2018-02-02 08:53:51+00:00 \n", + "3243 2022-09-22 08:21:47+00:00 2017-12-14 12:50:23+00:00 \n", + "55195 2022-09-27 14:32:13+00:00 2022-05-18 08:04:41+00:00 \n", + "28983 2023-05-22 07:30:55+00:00 2019-01-21 14:19:18+00:00 \n", + "2231 2023-06-29 09:33:58+00:00 2017-10-25 15:06:58+00:00 \n", + "23 2023-10-19 07:20:48+00:00 2015-09-30 16:07:52+00:00 \n", + "4513 2023-10-17 09:39:40+00:00 2018-02-25 07:17:19+00:00 \n", + "2936 2023-05-22 13:41:22+00:00 2018-02-01 11:16:51+00:00 \n", + "11484 2021-07-13 07:39:57+00:00 2015-12-21 15:38:05+00:00 \n", + "934 2023-02-01 08:39:45+00:00 2018-02-13 13:13:48+00:00 \n", + "30156 2023-10-05 08:23:50+00:00 2019-12-06 12:59:20+00:00 \n", + "36478 2023-06-30 07:22:46+00:00 2018-02-02 09:06:22+00:00 \n", + "\n", + " time_between_purchase \n", + "0 3803 days 05:21:47 \n", + "3615 2976 days 19:54:02 \n", + "39 3561 days 16:16:19 \n", + "11 2040 days 06:00:28 \n", + "32809 737 days 04:35:41 \n", + "3708 1249 days 01:35:34 \n", + "32616 1743 days 23:03:18 \n", + "78 2062 days 22:46:36 \n", + "35295 892 days 05:36:49 \n", + "3377 1217 days 20:42:56 \n", + "30011 1295 days 05:12:31 \n", + "34937 2061 days 19:19:04 \n", + "270 3551 days 19:03:05 \n", + "866 2000 days 04:55:06 \n", + "1022 3259 days 17:33:59 \n", + "3922 902 days 23:53:44 \n", + "54425 83 days 20:40:24 \n", + "69520 105 days 02:18:34 \n", + "30056 2103 days 01:28:23 \n", + "3243 1742 days 19:31:24 \n", + "55195 132 days 06:27:32 \n", + "28983 1581 days 17:11:37 \n", + "2231 2072 days 18:27:00 \n", + "23 2940 days 15:12:56 \n", + "4513 2060 days 02:22:21 \n", + "2936 1936 days 02:24:31 \n", + "11484 2030 days 16:01:52 \n", + "934 1813 days 19:25:57 \n", + "30156 1398 days 19:24:30 \n", + "36478 1973 days 22:16:24 " + ] + }, + "execution_count": 52, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df1_tickets_kpi.sort_values(by='nb_tickets', ascending=False).head(30)" + ] + }, { "cell_type": "markdown", "id": "7c3211a5-a851-43bc-a1f0-b39d51857fb7", @@ -2030,7 +2781,7 @@ }, { "cell_type": "code", - "execution_count": 33, + "execution_count": 39, "id": "46de1912-4a66-46e5-8b9e-7768b2d2723b", "metadata": {}, "outputs": [], @@ -2041,13 +2792,365 @@ }, { "cell_type": "code", - "execution_count": 34, + "execution_count": 40, + "id": "9740d64a-e5eb-4967-a534-ca6177546465", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
customer_idbirthdatestreet_idis_partnergenderis_email_trueopt_instructure_idprofessionlanguage...average_ticket_baskettotal_pricepurchase_countfirst_buying_datecountryagetenant_idnb_campaignsnb_campaigns_openedtime_to_open
012751NaN2False1TrueTrueNaNNaNNaN...NaNNaN0NaTfrNaN1311NaNNaNNaT
112825NaN2False2TrueTrueNaNNaNNaN...NaNNaN0NaTfrNaN1311NaNNaNNaT
211261NaN2False1TrueTrueNaNNaNNaN...NaNNaN0NaTfrNaN1311NaNNaNNaT
313071NaN2False2TrueTrueNaNNaNNaN...NaNNaN0NaTfrNaN1311NaNNaNNaT
4653061NaN10False2TrueFalseNaNNaNNaN...NaNNaN0NaTNaNNaN131180.02.00 days 19:53:02.500000
\n", + "

5 rows × 28 columns

\n", + "
" + ], + "text/plain": [ + " customer_id birthdate street_id is_partner gender is_email_true \\\n", + "0 12751 NaN 2 False 1 True \n", + "1 12825 NaN 2 False 2 True \n", + "2 11261 NaN 2 False 1 True \n", + "3 13071 NaN 2 False 2 True \n", + "4 653061 NaN 10 False 2 True \n", + "\n", + " opt_in structure_id profession language ... average_ticket_basket \\\n", + "0 True NaN NaN NaN ... NaN \n", + "1 True NaN NaN NaN ... NaN \n", + "2 True NaN NaN NaN ... NaN \n", + "3 True NaN NaN NaN ... NaN \n", + "4 False NaN NaN NaN ... NaN \n", + "\n", + " total_price purchase_count first_buying_date country age tenant_id \\\n", + "0 NaN 0 NaT fr NaN 1311 \n", + "1 NaN 0 NaT fr NaN 1311 \n", + "2 NaN 0 NaT fr NaN 1311 \n", + "3 NaN 0 NaT fr NaN 1311 \n", + "4 NaN 0 NaT NaN NaN 1311 \n", + "\n", + " nb_campaigns nb_campaigns_opened time_to_open \n", + "0 NaN NaN NaT \n", + "1 NaN NaN NaT \n", + "2 NaN NaN NaT \n", + "3 NaN NaN NaT \n", + "4 80.0 2.0 0 days 19:53:02.500000 \n", + "\n", + "[5 rows x 28 columns]" + ] + }, + "execution_count": 40, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df1_customer.head()" + ] + }, + { + "cell_type": "code", + "execution_count": 49, + "id": "b5c4418c-ad2e-4bb9-bd5c-3b769e9c87d4", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
customer_idbirthdatestreet_idis_partnergenderis_email_trueopt_instructure_idprofessionlanguagemcp_contact_idlast_buying_datemax_priceticket_sumaverage_pricefidelityaverage_purchase_delayaverage_price_basketaverage_ticket_baskettotal_pricepurchase_countfirst_buying_datecountryagetenant_idnb_campaignsnb_campaigns_openedtime_to_open
582011NaN2False2TrueFalseNaNNaNNaNNaN2023-11-08 03:20:0745.012547757.030122330831-67.79096913.751531.9560878821221.56414722013-06-10 10:37:58+00:00frNaN1311NaNNaNNaT
\n", + "
" + ], + "text/plain": [ + " customer_id birthdate street_id is_partner gender is_email_true \\\n", + "58201 1 NaN 2 False 2 True \n", + "\n", + " opt_in structure_id profession language mcp_contact_id \\\n", + "58201 False NaN NaN NaN NaN \n", + "\n", + " last_buying_date max_price ticket_sum average_price fidelity \\\n", + "58201 2023-11-08 03:20:07 45.0 1254775 7.030122 330831 \n", + "\n", + " average_purchase_delay average_price_basket average_ticket_basket \\\n", + "58201 -67.790969 13.75153 1.956087 \n", + "\n", + " total_price purchase_count first_buying_date country age \\\n", + "58201 8821221.5 641472 2013-06-10 10:37:58+00:00 fr NaN \n", + "\n", + " tenant_id nb_campaigns nb_campaigns_opened time_to_open \n", + "58201 1311 NaN NaN NaT " + ] + }, + "execution_count": 49, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "pd.set_option('display.max_columns', None)\n", + "\n", + "\n", + "df1_customer[df1_customer['customer_id'] == 1]" + ] + }, + { + "cell_type": "code", + "execution_count": 41, "id": "1e42a790-b215-4107-a969-85005da06ebd", "metadata": {}, "outputs": [], "source": [ "# Fusion avec KPI liés au comportement d'achat\n", - "# df1_customer_product = pd.merge(df1_customer, df1_products_purchased, on = 'customer_id', how = 'left')" + "# df1_customer_product = pd.merge(df1_products_purchased_reduced, df1_products_purchased, on = 'customer_id', how = 'outer')" + ] + }, + { + "cell_type": "code", + "execution_count": 42, + "id": "d950f24d-a5d1-4f1e-aeaa-ca826470365f", + "metadata": {}, + "outputs": [], + "source": [ + "# df1_customer_product" ] } ], diff --git a/Exploration_billet_AJ.ipynb b/Exploration_billet_AJ.ipynb index 6af213e..344dd7b 100644 --- a/Exploration_billet_AJ.ipynb +++ b/Exploration_billet_AJ.ipynb @@ -143,7 +143,7 @@ }, { "cell_type": "code", - "execution_count": 25, + "execution_count": 6, "id": "dd6a3518-b752-4a1e-b77b-9e03e853c3ed", "metadata": {}, "outputs": [ @@ -151,7 +151,7 @@ "name": "stderr", "output_type": "stream", "text": [ - "/tmp/ipykernel_683/4081512283.py:10: DtypeWarning: Columns (1) have mixed types. Specify dtype option on import or set low_memory=False.\n", + "/tmp/ipykernel_15285/4081512283.py:10: DtypeWarning: Columns (1) have mixed types. Specify dtype option on import or set low_memory=False.\n", " df = pd.read_csv(file_in)\n" ] } @@ -2731,7 +2731,7 @@ }, { "cell_type": "code", - "execution_count": 60, + "execution_count": 8, "id": "da5d4708-7147-4cc8-8686-52d4bcba5a7a", "metadata": {}, "outputs": [ @@ -2739,7 +2739,7 @@ "name": "stderr", "output_type": "stream", "text": [ - "/tmp/ipykernel_619/2625134041.py:3: SettingWithCopyWarning: \n", + "/tmp/ipykernel_15285/2625134041.py:3: SettingWithCopyWarning: \n", "A value is trying to be set on a copy of a slice from a DataFrame\n", "\n", "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n", @@ -2795,11 +2795,9 @@ }, { "cell_type": "code", - "execution_count": 57, - "id": "8072bbb7-1360-4882-bb2b-2f43b6beea0d", - "metadata": { - "scrolled": true - }, + "execution_count": 10, + "id": "c74746de-0bf4-4b83-9a75-f1d3183abf1c", + "metadata": {}, "outputs": [ { "data": { @@ -2831,226 +2829,42 @@ " \n", " \n", " \n", - " 8793\n", - " 4584599\n", - " 1\n", - " consentement optin jeune public\n", + " 0\n", + " 1184824\n", + " 645400\n", + " DDCP PROMO Réseau livres\n", " False\n", " manual_static_filter\n", " \n", " \n", - " 13249\n", - " 4567465\n", - " 1\n", - " DDCP rentrée culturelle 2023\n", + " 1\n", + " 210571\n", + " 2412\n", + " DDCP PROMO Réseau livres\n", " False\n", " manual_static_filter\n", " \n", " \n", - " 21424\n", - " 4544805\n", - " 1\n", - " spectateurs cine dimanche_cine concert_2122\n", + " 2\n", + " 210572\n", + " 4536\n", + " DDCP PROMO Réseau livres\n", " False\n", " manual_static_filter\n", " \n", " \n", - " 21665\n", - " 4544911\n", - " 1\n", - " DDCP Cine 2023\n", + " 3\n", + " 210573\n", + " 6736\n", + " DDCP PROMO Réseau livres\n", " False\n", " manual_static_filter\n", " \n", " \n", - " 22811\n", - " 4545766\n", - " 1\n", - " DDCP OLBJ! 2023\n", - " False\n", - " manual_static_filter\n", - " \n", - " \n", - " 57305\n", - " 4457909\n", - " 1\n", - " ddcp_promo_visiteurs occasionnels_musee_8mois\n", - " False\n", - " manual_dynamic_filter\n", - " \n", - " \n", - " 58843\n", - " 3688872\n", - " 1\n", - " DDCP promo livemag\n", - " False\n", - " manual_static_filter\n", - " \n", - " \n", - " 66813\n", - " 4313646\n", - " 1\n", - " DDCP spectateurs Classique mais pas que 2022\n", - " False\n", - " manual_static_filter\n", - " \n", - " \n", - " 68367\n", - " 4547662\n", - " 1\n", - " ddcp_promo_musee_au moins 3 achats_dps8mois\n", - " False\n", - " manual_dynamic_filter\n", - " \n", - " \n", - " 77320\n", - " 4285520\n", - " 1\n", - " DDCP spectateurs Iminente\n", - " False\n", - " manual_static_filter\n", - " \n", - " \n", - " 84350\n", - " 4037805\n", - " 1\n", - " DDCP spectateurs Marseille Jazz 18-19-21\n", - " False\n", - " manual_static_filter\n", - " \n", - " \n", - " 85383\n", - " 4569504\n", - " 1\n", - " DDCP rendez-vous de septembre offre spéciale\n", - " False\n", - " manual_static_filter\n", - " \n", - " \n", - " 92868\n", - " 4433064\n", - " 1\n", - " ddcp_promo_plein air_ateliers_jardins\n", - " False\n", - " manual_static_filter\n", - " \n", - " \n", - " 99670\n", - " 3858684\n", - " 1\n", - " Acid Arab\n", - " False\n", - " manual_static_filter\n", - " \n", - " \n", - " 105477\n", - " 4321810\n", - " 1\n", - " Arenametrix_bascule tel vers sib\n", - " False\n", - " manual_static_filter\n", - " \n", - " \n", - " 169513\n", - " 3697992\n", - " 1\n", - " ddcp_achats billets nb dps 19052021\n", - " False\n", - " manual_static_filter\n", - " \n", - " \n", - " 214421\n", - " 2925324\n", - " 1\n", - " consentement optout scolaires\n", - " False\n", - " manual_static_filter\n", - " \n", - " \n", - " 234546\n", - " 4575957\n", - " 1\n", - " Portrait de Leila shahid\n", - " False\n", - " manual_static_filter\n", - " \n", - " \n", - " 259808\n", - " 3722259\n", - " 1\n", - " consentement optin b2b\n", - " False\n", - " manual_static_filter\n", - " \n", - " \n", - " 274380\n", - " 4510423\n", - " 1\n", - " DDCP_marseille_jazz_2023\n", - " False\n", - " manual_static_filter\n", - " \n", - " \n", - " 307511\n", - " 5174466\n", - " 1\n", - " ddcp actoral 21-22\n", - " False\n", - " manual_static_filter\n", - " \n", - " \n", - " 357509\n", - " 4442526\n", - " 1\n", - " ddcp musique barvalo\n", - " False\n", - " manual_static_filter\n", - " \n", - " \n", - " 392920\n", - " 4390642\n", - " 1\n", - " ddcp_md_promo_spectateurs theatre contempo\n", - " False\n", - " manual_static_filter\n", - " \n", - " \n", - " 449620\n", - " 4411897\n", - " 1\n", - " FORMATION _ acheteurs optin last year\n", - " False\n", - " manual_dynamic_filter\n", - " \n", - " \n", - " 503809\n", - " 4734591\n", - " 1\n", - " consentement optin mediation specialisee\n", - " False\n", - " manual_static_filter\n", - " \n", - " \n", - " 651222\n", - " 3554426\n", - " 1\n", - " consentement optin b2c\n", - " False\n", - " manual_static_filter\n", - " \n", - " \n", - " 654246\n", - " 5182212\n", - " 1\n", - " DDCP spectateurs Festival de Marseille 2023\n", - " False\n", - " manual_static_filter\n", - " \n", - " \n", - " 654395\n", - " 5182456\n", - " 1\n", - " rencontres_echelle_spectateurs_2021_2023\n", + " 4\n", + " 210574\n", + " 38210\n", + " DDCP PROMO Réseau livres\n", " False\n", " manual_static_filter\n", " \n", @@ -3059,80 +2873,241 @@ "" ], "text/plain": [ - " id customer_id target_name \\\n", - "8793 4584599 1 consentement optin jeune public \n", - "13249 4567465 1 DDCP rentrée culturelle 2023 \n", - "21424 4544805 1 spectateurs cine dimanche_cine concert_2122 \n", - "21665 4544911 1 DDCP Cine 2023 \n", - "22811 4545766 1 DDCP OLBJ! 2023 \n", - "57305 4457909 1 ddcp_promo_visiteurs occasionnels_musee_8mois \n", - "58843 3688872 1 DDCP promo livemag \n", - "66813 4313646 1 DDCP spectateurs Classique mais pas que 2022 \n", - "68367 4547662 1 ddcp_promo_musee_au moins 3 achats_dps8mois \n", - "77320 4285520 1 DDCP spectateurs Iminente \n", - "84350 4037805 1 DDCP spectateurs Marseille Jazz 18-19-21 \n", - "85383 4569504 1 DDCP rendez-vous de septembre offre spéciale \n", - "92868 4433064 1 ddcp_promo_plein air_ateliers_jardins \n", - "99670 3858684 1 Acid Arab \n", - "105477 4321810 1 Arenametrix_bascule tel vers sib \n", - "169513 3697992 1 ddcp_achats billets nb dps 19052021 \n", - "214421 2925324 1 consentement optout scolaires \n", - "234546 4575957 1 Portrait de Leila shahid \n", - "259808 3722259 1 consentement optin b2b \n", - "274380 4510423 1 DDCP_marseille_jazz_2023 \n", - "307511 5174466 1 ddcp actoral 21-22 \n", - "357509 4442526 1 ddcp musique barvalo \n", - "392920 4390642 1 ddcp_md_promo_spectateurs theatre contempo \n", - "449620 4411897 1 FORMATION _ acheteurs optin last year \n", - "503809 4734591 1 consentement optin mediation specialisee \n", - "651222 3554426 1 consentement optin b2c \n", - "654246 5182212 1 DDCP spectateurs Festival de Marseille 2023 \n", - "654395 5182456 1 rencontres_echelle_spectateurs_2021_2023 \n", + " id customer_id target_name target_type_is_import \\\n", + "0 1184824 645400 DDCP PROMO Réseau livres False \n", + "1 210571 2412 DDCP PROMO Réseau livres False \n", + "2 210572 4536 DDCP PROMO Réseau livres False \n", + "3 210573 6736 DDCP PROMO Réseau livres False \n", + "4 210574 38210 DDCP PROMO Réseau livres False \n", "\n", - " target_type_is_import target_type_name \n", - "8793 False manual_static_filter \n", - "13249 False manual_static_filter \n", - "21424 False manual_static_filter \n", - "21665 False manual_static_filter \n", - "22811 False manual_static_filter \n", - "57305 False manual_dynamic_filter \n", - "58843 False manual_static_filter \n", - "66813 False manual_static_filter \n", - "68367 False manual_dynamic_filter \n", - "77320 False manual_static_filter \n", - "84350 False manual_static_filter \n", - "85383 False manual_static_filter \n", - "92868 False manual_static_filter \n", - "99670 False manual_static_filter \n", - "105477 False manual_static_filter \n", - "169513 False manual_static_filter \n", - "214421 False manual_static_filter \n", - "234546 False manual_static_filter \n", - "259808 False manual_static_filter \n", - "274380 False manual_static_filter \n", - "307511 False manual_static_filter \n", - "357509 False manual_static_filter \n", - "392920 False manual_static_filter \n", - "449620 False manual_dynamic_filter \n", - "503809 False manual_static_filter \n", - "651222 False manual_static_filter \n", - "654246 False manual_static_filter \n", - "654395 False manual_static_filter " + " target_type_name \n", + "0 manual_static_filter \n", + "1 manual_static_filter \n", + "2 manual_static_filter \n", + "3 manual_static_filter \n", + "4 manual_static_filter " ] }, - "execution_count": 57, + "execution_count": 10, "metadata": {}, "output_type": "execute_result" } ], "source": [ - "df1_targets_full[df1_targets_full['customer_id'] == 1]" + "df1_targets_full.head()" + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "id": "47c55fa0-b2f3-46f9-9abf-c4ab66bd9fcb", + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "[nltk_data] Downloading package punkt to /home/onyxia/nltk_data...\n", + "[nltk_data] Package punkt is already up-to-date!\n", + "[nltk_data] Downloading package stopwords to /home/onyxia/nltk_data...\n", + "[nltk_data] Package stopwords is already up-to-date!\n", + "[nltk_data] Downloading package wordnet to /home/onyxia/nltk_data...\n", + "[nltk_data] Package wordnet is already up-to-date!\n" + ] + }, + { + "data": { + "text/plain": [ + "True" + ] + }, + "execution_count": 14, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# Catégorisation des target_name\n", + "import pandas as pd\n", + "import nltk\n", + "from nltk.tokenize import word_tokenize\n", + "from nltk.corpus import stopwords\n", + "from nltk.stem import WordNetLemmatizer\n", + "from nltk.probability import FreqDist\n", + "\n", + "# Téléchargement des ressources nécessaires\n", + "nltk.download('punkt')\n", + "nltk.download('stopwords')\n", + "nltk.download('wordnet')\n", + "\n" + ] + }, + { + "cell_type": "code", + "execution_count": 19, + "id": "8af1aeb9-ebdd-4286-a14c-3b7d801ea172", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Mots les plus fréquents:\n", + "consentement: 550777\n", + "optin: 463579\n", + "jeune: 155103\n", + "public: 155103\n", + "mediation: 150001\n" + ] + } + ], + "source": [ + "# Définition des fonctions de tokenisation, suppression des mots vides et lemmatisation\n", + "def preprocess_text(texte):\n", + " # Concaténation des éléments de la liste en une seule chaîne de caractères\n", + " texte_concat = ' '.join(texte)\n", + " \n", + " # Tokenisation des mots\n", + " tokens = word_tokenize(texte_concat.lower())\n", + " \n", + " # Suppression des mots vides (stopwords)\n", + " stop_words = set(stopwords.words('french'))\n", + " filtered_tokens = [word for word in tokens if word not in stop_words]\n", + " \n", + " # Lemmatisation des mots\n", + " lemmatizer = WordNetLemmatizer()\n", + " lemmatized_tokens = [lemmatizer.lemmatize(word) for word in filtered_tokens]\n", + " \n", + " return lemmatized_tokens\n", + "\n", + "\n", + "# Appliquer le prétraitement à la colonne de texte\n", + "df1_targets_full['target_name_tokened'] = df1_targets_full['target_name'].apply(preprocess_text)\n", + "\n", + "# Concaténer les listes de mots pour obtenir une liste de tous les mots dans le corpus\n", + "all_words = [word for tokens in df1_targets_full['target_name_tokened'] for word in tokens]\n", + "\n", + "# Calculer la fréquence des mots\n", + "freq_dist = FreqDist(all_words)\n", + "\n", + "\n" + ] + }, + { + "cell_type": "code", + "execution_count": 22, + "id": "ceb069e5-76c9-46e4-9ea7-8c16eb4ed3cd", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Mots les plus fréquents:\n", + "consentement: 550777\n", + "optin: 463579\n", + "jeune: 155103\n", + "public: 155103\n", + "mediation: 150001\n", + "specialisee: 150001\n", + "b2c: 143432\n", + "optout: 97683\n", + "newsletter: 56022\n", + "(: 46084\n", + "): 46084\n", + "inscrits: 42296\n", + "nl: 42294\n", + "générale: 41037\n", + "generale: 40950\n" + ] + } + ], + "source": [ + "# Affichage des mots les plus fréquents\n", + "print(\"Mots les plus fréquents:\")\n", + "for mot, freq in freq_dist.most_common(15):\n", + " print(f\"{mot}: {freq}\")" + ] + }, + { + "cell_type": "code", + "execution_count": 18, + "id": "8bffef87-542e-4775-bc7c-2c0323fda581", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + " texte \\\n", + "0 Le chat noir mange une souris. \n", + "1 Le chien blanc aboie. \n", + "\n", + " texte_preprocessed \n", + "0 [e, h, a, o, i, r, a, g, e, u, e, o, u, r, i, .] \n", + "1 [e, h, i, e, b, a, a, b, o, i, e, .] \n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "[nltk_data] Downloading package punkt to /home/onyxia/nltk_data...\n", + "[nltk_data] Package punkt is already up-to-date!\n", + "[nltk_data] Downloading package stopwords to /home/onyxia/nltk_data...\n", + "[nltk_data] Package stopwords is already up-to-date!\n", + "[nltk_data] Downloading package wordnet to /home/onyxia/nltk_data...\n", + "[nltk_data] Package wordnet is already up-to-date!\n" + ] + } + ], + "source": [ + "import pandas as pd\n", + "import nltk\n", + "from nltk.tokenize import word_tokenize\n", + "from nltk.corpus import stopwords\n", + "from nltk.stem import WordNetLemmatizer\n", + "\n", + "# Téléchargement des ressources nécessaires\n", + "nltk.download('punkt')\n", + "nltk.download('stopwords')\n", + "nltk.download('wordnet')\n", + "\n", + "# Création de la DataFrame d'exemple\n", + "data = {'texte': [\"Le chat noir mange une souris.\", \"Le chien blanc aboie.\"]}\n", + "df = pd.DataFrame(data)\n", + "\n", + "# Fonction pour prétraiter le texte\n", + "def preprocess_text(texte):\n", + " # Concaténation des éléments de la liste en une seule chaîne de caractères\n", + " texte_concat = ' '.join(texte)\n", + " \n", + " # Tokenisation des mots\n", + " tokens = word_tokenize(texte_concat.lower())\n", + " \n", + " # Suppression des mots vides (stopwords)\n", + " stop_words = set(stopwords.words('french'))\n", + " filtered_tokens = [word for word in tokens if word not in stop_words]\n", + " \n", + " # Lemmatisation des mots\n", + " lemmatizer = WordNetLemmatizer()\n", + " lemmatized_tokens = [lemmatizer.lemmatize(word) for word in filtered_tokens]\n", + " \n", + " return lemmatized_tokens\n", + "\n", + "# Appliquer la fonction de prétraitement à la colonne de texte\n", + "df['texte_preprocessed'] = df['texte'].apply(preprocess_text)\n", + "\n", + "# Afficher le résultat\n", + "print(df)\n" ] }, { "cell_type": "markdown", "id": "2f665824-a026-4acd-8358-b408a61854b4", - "metadata": {}, + "metadata": { + "jp-MarkdownHeadingCollapsed": true + }, "source": [ "## Campaign area" ] @@ -3902,9 +3877,7 @@ { "cell_type": "markdown", "id": "96ea2523-38dc-47ef-a49e-2c2d9ad0b1c6", - "metadata": { - "jp-MarkdownHeadingCollapsed": true - }, + "metadata": {}, "source": [ "## Exploration variables" ] From c682d436b89bb8c6470674dde080a8283014ca3f Mon Sep 17 00:00:00 2001 From: ajoubrel-ensae Date: Sat, 10 Feb 2024 13:23:17 +0000 Subject: [PATCH 2/8] =?UTF-8?q?Ajout=20canal=20de=20vente=20en=20ligne=20e?= =?UTF-8?q?t=20nombre=20tickets=20achet=C3=A9=20en=20lignet?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- 0_Cleaning_and_merge.ipynb | 1036 ++++++++++++------------------------ 1 file changed, 331 insertions(+), 705 deletions(-) diff --git a/0_Cleaning_and_merge.ipynb b/0_Cleaning_and_merge.ipynb index 99d5ea7..5085051 100644 --- a/0_Cleaning_and_merge.ipynb +++ b/0_Cleaning_and_merge.ipynb @@ -79,7 +79,7 @@ "name": "stderr", "output_type": "stream", "text": [ - "/tmp/ipykernel_15815/4081512283.py:10: DtypeWarning: Columns (1) have mixed types. Specify dtype option on import or set low_memory=False.\n", + "/tmp/ipykernel_8302/4081512283.py:10: DtypeWarning: Columns (1) have mixed types. Specify dtype option on import or set low_memory=False.\n", " df = pd.read_csv(file_in)\n" ] } @@ -205,6 +205,7 @@ " # Base des fournisseurs\n", " suppliers = suppliers[['id', 'name']]\n", " suppliers.rename(columns = {'name' : 'supplier_name'}, inplace = True)\n", + " suppliers['supplier_name'] = suppliers['supplier_name'].fillna('')\n", "\n", " # Base des types de billets\n", " type_ofs = type_ofs[['id', 'name', 'children']]\n", @@ -242,17 +243,23 @@ "name": "stderr", "output_type": "stream", "text": [ - "/tmp/ipykernel_15815/1591303091.py:5: SettingWithCopyWarning: \n", + "/tmp/ipykernel_8302/3092893564.py:5: SettingWithCopyWarning: \n", "A value is trying to be set on a copy of a slice from a DataFrame\n", "\n", "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n", " tickets.rename(columns = {'id' : 'ticket_id'}, inplace = True)\n", - "/tmp/ipykernel_15815/1591303091.py:9: SettingWithCopyWarning: \n", + "/tmp/ipykernel_8302/3092893564.py:9: SettingWithCopyWarning: \n", "A value is trying to be set on a copy of a slice from a DataFrame\n", "\n", "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n", " suppliers.rename(columns = {'name' : 'supplier_name'}, inplace = True)\n", - "/tmp/ipykernel_15815/1591303091.py:13: SettingWithCopyWarning: \n", + "/tmp/ipykernel_8302/3092893564.py:10: SettingWithCopyWarning: \n", + "A value is trying to be set on a copy of a slice from a DataFrame.\n", + "Try using .loc[row_indexer,col_indexer] = value instead\n", + "\n", + "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n", + " suppliers['supplier_name'] = suppliers['supplier_name'].fillna('')\n", + "/tmp/ipykernel_8302/3092893564.py:14: SettingWithCopyWarning: \n", "A value is trying to be set on a copy of a slice from a DataFrame\n", "\n", "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n", @@ -433,7 +440,7 @@ "name": "stderr", "output_type": "stream", "text": [ - "/tmp/ipykernel_15815/3848597476.py:4: SettingWithCopyWarning: \n", + "/tmp/ipykernel_8302/3848597476.py:4: SettingWithCopyWarning: \n", "A value is trying to be set on a copy of a slice from a DataFrame\n", "\n", "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n", @@ -647,19 +654,19 @@ "name": "stderr", "output_type": "stream", "text": [ - "/tmp/ipykernel_15815/1967867975.py:15: SettingWithCopyWarning: \n", + "/tmp/ipykernel_8302/1967867975.py:15: SettingWithCopyWarning: \n", "A value is trying to be set on a copy of a slice from a DataFrame.\n", "Try using .loc[row_indexer,col_indexer] = value instead\n", "\n", "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n", " df[column_name] = pd.to_datetime(df[column_name], utc = True, format = 'ISO8601')\n", - "/tmp/ipykernel_15815/1967867975.py:15: SettingWithCopyWarning: \n", + "/tmp/ipykernel_8302/1967867975.py:15: SettingWithCopyWarning: \n", "A value is trying to be set on a copy of a slice from a DataFrame.\n", "Try using .loc[row_indexer,col_indexer] = value instead\n", "\n", "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n", " df[column_name] = pd.to_datetime(df[column_name], utc = True, format = 'ISO8601')\n", - "/tmp/ipykernel_15815/1967867975.py:15: SettingWithCopyWarning: \n", + "/tmp/ipykernel_8302/1967867975.py:15: SettingWithCopyWarning: \n", "A value is trying to be set on a copy of a slice from a DataFrame.\n", "Try using .loc[row_indexer,col_indexer] = value instead\n", "\n", @@ -847,7 +854,7 @@ "name": "stderr", "output_type": "stream", "text": [ - "/tmp/ipykernel_15815/3700263836.py:11: SettingWithCopyWarning: \n", + "/tmp/ipykernel_8302/3700263836.py:11: SettingWithCopyWarning: \n", "A value is trying to be set on a copy of a slice from a DataFrame\n", "\n", "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n", @@ -1971,7 +1978,7 @@ }, { "cell_type": "code", - "execution_count": 53, + "execution_count": 33, "id": "3d8b0875-b409-44ce-b688-d9d6758782d3", "metadata": {}, "outputs": [ @@ -2261,7 +2268,7 @@ "[1826672 rows x 14 columns]" ] }, - "execution_count": 53, + "execution_count": 33, "metadata": {}, "output_type": "execute_result" } @@ -2273,7 +2280,7 @@ }, { "cell_type": "code", - "execution_count": 35, + "execution_count": 34, "id": "2bda0b97-b28b-4070-a57d-aeab0e2f7dfe", "metadata": {}, "outputs": [], @@ -2284,38 +2291,51 @@ }, { "cell_type": "code", - "execution_count": 50, + "execution_count": 74, "id": "043303fe-e90f-4689-a2a9-5d690555a045", "metadata": {}, "outputs": [], "source": [ "def tickets_kpi_function(tickets_information = None):\n", + "\n", " tickets_information_copy = tickets_information.copy()\n", - " tickets_information_copy['purchase_date_max'] = tickets_information_copy['purchase_date']\n", - " tickets_kpi = (tickets_information_copy[['event_type_id', 'customer_id', 'ticket_id','supplier_name', 'purchase_date', 'purchase_date_max', 'amount']]\n", + "\n", + " # Dummy : Canal de vente en ligne\n", + " liste_mots = ['en ligne', 'internet', 'web', 'net', 'vad', 'online'] # vad = vente à distance\n", + " tickets_information_copy['vente_internet'] = tickets_information_copy['supplier_name'].str.contains('|'.join(liste_mots), case=False).astype(int)\n", + "\n", + " # Proportion de vente en ligne\n", + " prop_vente_internet = tickets_information_copy[tickets_information_copy['vente_internet'] == 1].groupby('customer_id')['ticket_id'].count().reset_index()\n", + " prop_vente_internet.rename(columns = {'ticket_id' : 'nb_tickets_internet'}, inplace = True)\n", + " \n", + " tickets_kpi = (tickets_information_copy[['event_type_id', 'customer_id', 'ticket_id','supplier_name', 'purchase_date', 'amount', 'vente_internet']]\n", " .groupby([ 'customer_id']) # 'event_type_id',\n", " .agg({'ticket_id': 'count', \n", " 'amount' : 'sum',\n", " 'supplier_name': 'nunique',\n", - " 'purchase_date_max' : 'max',\n", - " 'purchase_date' : 'min'})\n", + " 'vente_internet' : 'max',\n", + " 'purchase_date' : ['min', 'max']})\n", " .reset_index()\n", " )\n", + " tickets_kpi.columns = tickets_kpi.columns.map('_'.join)\n", " \n", - " tickets_kpi.rename(columns = {'ticket_id' : 'nb_tickets', \n", - " 'amount' : 'total_amount',\n", - " 'supplier_name' : 'nb_suppliers', \n", - " 'purchase_date' : 'purchase_date_min'}, inplace = True)\n", + " tickets_kpi.rename(columns = {'ticket_id_count' : 'nb_tickets', \n", + " 'amount_sum' : 'total_amount',\n", + " 'supplier_name_nunique' : 'nb_suppliers', \n", + " 'customer_id_' : 'customer_id'}, inplace = True)\n", " \n", " tickets_kpi['time_between_purchase'] = tickets_kpi['purchase_date_max'] - tickets_kpi['purchase_date_min']\n", - " \n", + "\n", + " tickets_kpi = tickets_kpi.merge(prop_vente_internet, on = 'customer_id', how = 'left')\n", + " tickets_kpi['nb_tickets_internet'] = tickets_kpi['nb_tickets_internet'].fillna(0)\n", + " \n", " return tickets_kpi\n", " " ] }, { "cell_type": "code", - "execution_count": 51, + "execution_count": 75, "id": "5882234a-1ed5-4269-87a6-0d75613476e3", "metadata": {}, "outputs": [], @@ -2325,8 +2345,8 @@ }, { "cell_type": "code", - "execution_count": 52, - "id": "a7a452a6-cd5e-4c8b-b250-8a7d26e48fad", + "execution_count": 76, + "id": "c78f5ade-c721-49d9-a474-73c217686ed1", "metadata": {}, "outputs": [ { @@ -2354,9 +2374,11 @@ " nb_tickets\n", " total_amount\n", " nb_suppliers\n", - " purchase_date_max\n", + " vente_internet_max\n", " purchase_date_min\n", + " purchase_date_max\n", " time_between_purchase\n", + " nb_tickets_internet\n", " \n", " \n", " \n", @@ -2366,407 +2388,330 @@ " 1256574\n", " 8830567.5\n", " 7\n", - " 2023-11-08 15:59:45+00:00\n", + " 1\n", " 2013-06-10 10:37:58+00:00\n", + " 2023-11-08 15:59:45+00:00\n", " 3803 days 05:21:47\n", + " 3053.0\n", " \n", " \n", - " 3615\n", - " 6733\n", - " 35527\n", - " 1188.0\n", - " 4\n", - " 2023-11-03 09:42:40+00:00\n", - " 2015-09-09 13:48:38+00:00\n", - " 2976 days 19:54:02\n", + " 1\n", + " 2\n", + " 307\n", + " 0.0\n", + " 1\n", + " 0\n", + " 2018-04-07 12:55:07+00:00\n", + " 2020-03-08 12:06:43+00:00\n", + " 700 days 23:11:36\n", + " 0.0\n", " \n", " \n", - " 39\n", - " 41\n", - " 16263\n", - " 37642.0\n", + " 2\n", + " 3\n", " 6\n", - " 2023-10-25 09:13:16+00:00\n", - " 2014-01-23 16:56:57+00:00\n", - " 3561 days 16:16:19\n", - " \n", - " \n", - " 11\n", - " 12\n", - " 5871\n", - " 38767.0\n", - " 2\n", - " 2023-11-04 13:46:59+00:00\n", - " 2018-04-04 07:46:31+00:00\n", - " 2040 days 06:00:28\n", - " \n", - " \n", - " 32809\n", - " 63488\n", - " 5851\n", - " 64350.0\n", + " 110.0\n", " 1\n", - " 2022-08-25 13:08:38+00:00\n", - " 2020-08-18 08:32:57+00:00\n", - " 737 days 04:35:41\n", - " \n", - " \n", - " 3708\n", - " 6916\n", - " 5482\n", - " 51489.5\n", - " 2\n", - " 2021-08-26 12:49:17+00:00\n", - " 2018-03-26 11:13:43+00:00\n", - " 1249 days 01:35:34\n", - " \n", - " \n", - " 32616\n", - " 63194\n", - " 4507\n", - " 13232.0\n", - " 3\n", - " 2022-09-07 12:55:33+00:00\n", - " 2017-11-28 13:52:15+00:00\n", - " 1743 days 23:03:18\n", - " \n", - " \n", - " 78\n", - " 81\n", - " 3562\n", - " 38746.0\n", " 1\n", - " 2022-08-30 11:51:34+00:00\n", - " 2017-01-05 13:04:58+00:00\n", - " 2062 days 22:46:36\n", + " 2019-09-19 15:15:01+00:00\n", + " 2023-09-27 09:13:09+00:00\n", + " 1468 days 17:58:08\n", + " 6.0\n", " \n", " \n", - " 35295\n", - " 84002\n", - " 3403\n", - " 19830.0\n", + " 3\n", " 4\n", - " 2023-11-06 15:59:22+00:00\n", - " 2021-05-28 10:22:33+00:00\n", - " 892 days 05:36:49\n", - " \n", - " \n", - " 3377\n", - " 5618\n", - " 3294\n", - " 31684.5\n", - " 1\n", - " 2022-02-24 07:47:20+00:00\n", - " 2018-10-25 11:04:24+00:00\n", - " 1217 days 20:42:56\n", - " \n", - " \n", - " 30011\n", - " 59259\n", - " 2591\n", - " 4350.0\n", - " 3\n", - " 2023-06-12 14:05:19+00:00\n", - " 2019-11-25 08:52:48+00:00\n", - " 1295 days 05:12:31\n", - " \n", - " \n", - " 34937\n", - " 74876\n", - " 2571\n", - " 2600.0\n", - " 2\n", - " 2023-10-02 08:13:05+00:00\n", - " 2018-02-08 12:54:01+00:00\n", - " 2061 days 19:19:04\n", - " \n", - " \n", - " 270\n", - " 295\n", - " 2570\n", - " 17678.5\n", - " 6\n", - " 2023-10-16 10:19:22+00:00\n", - " 2014-01-24 15:16:17+00:00\n", - " 3551 days 19:03:05\n", - " \n", - " \n", - " 866\n", - " 1221\n", - " 2320\n", - " 9652.0\n", - " 2\n", - " 2022-09-19 12:55:15+00:00\n", - " 2017-03-29 08:00:09+00:00\n", - " 2000 days 04:55:06\n", - " \n", - " \n", - " 1022\n", - " 1429\n", - " 2249\n", - " 3500.0\n", " 4\n", - " 2023-11-06 08:30:37+00:00\n", - " 2014-12-03 14:56:38+00:00\n", - " 3259 days 17:33:59\n", - " \n", - " \n", - " 3922\n", - " 7249\n", - " 1827\n", - " 13385.0\n", + " 41.0\n", " 1\n", - " 2021-10-26 12:28:40+00:00\n", - " 2019-05-07 12:34:56+00:00\n", - " 902 days 23:53:44\n", - " \n", - " \n", - " 54425\n", - " 1070539\n", - " 1800\n", - " 19800.0\n", " 1\n", - " 2022-07-25 12:49:27+00:00\n", - " 2022-05-02 16:09:03+00:00\n", - " 83 days 20:40:24\n", + " 2019-09-19 15:43:49+00:00\n", + " 2021-09-02 18:42:19+00:00\n", + " 714 days 02:58:30\n", + " 4.0\n", " \n", " \n", - " 69520\n", - " 1216801\n", - " 1623\n", - " 12562.0\n", + " 4\n", + " 5\n", " 2\n", - " 2023-09-29 16:34:38+00:00\n", - " 2023-06-16 14:16:04+00:00\n", - " 105 days 02:18:34\n", - " \n", - " \n", - " 30056\n", - " 59330\n", - " 1551\n", - " 0.0\n", + " 19.0\n", " 1\n", - " 2023-11-06 10:22:14+00:00\n", - " 2018-02-02 08:53:51+00:00\n", - " 2103 days 01:28:23\n", - " \n", - " \n", - " 3243\n", - " 5441\n", - " 1544\n", - " 14133.0\n", - " 2\n", - " 2022-09-22 08:21:47+00:00\n", - " 2017-12-14 12:50:23+00:00\n", - " 1742 days 19:31:24\n", - " \n", - " \n", - " 55195\n", - " 1084435\n", - " 1500\n", - " 16500.0\n", " 1\n", - " 2022-09-27 14:32:13+00:00\n", - " 2022-05-18 08:04:41+00:00\n", - " 132 days 06:27:32\n", + " 2019-09-19 15:45:36+00:00\n", + " 2019-09-19 15:45:36+00:00\n", + " 0 days 00:00:00\n", + " 2.0\n", " \n", " \n", - " 28983\n", - " 57816\n", - " 1485\n", - " 0.0\n", - " 2\n", - " 2023-05-22 07:30:55+00:00\n", - " 2019-01-21 14:19:18+00:00\n", - " 1581 days 17:11:37\n", + " ...\n", + " ...\n", + " ...\n", + " ...\n", + " ...\n", + " ...\n", + " ...\n", + " ...\n", + " ...\n", + " ...\n", " \n", " \n", - " 2231\n", - " 2942\n", - " 1307\n", - " 100.0\n", - " 2\n", - " 2023-06-29 09:33:58+00:00\n", - " 2017-10-25 15:06:58+00:00\n", - " 2072 days 18:27:00\n", + " 73513\n", + " 1256133\n", + " 3\n", + " 33.0\n", + " 1\n", + " 1\n", + " 2023-11-08 16:51:19+00:00\n", + " 2023-11-08 16:51:19+00:00\n", + " 0 days 00:00:00\n", + " 3.0\n", " \n", " \n", - " 23\n", - " 24\n", - " 1266\n", - " 0.0\n", - " 2\n", - " 2023-10-19 07:20:48+00:00\n", - " 2015-09-30 16:07:52+00:00\n", - " 2940 days 15:12:56\n", - " \n", - " \n", - " 4513\n", - " 9592\n", - " 1211\n", - " 62.0\n", + " 73514\n", + " 1256134\n", " 4\n", - " 2023-10-17 09:39:40+00:00\n", - " 2018-02-25 07:17:19+00:00\n", - " 2060 days 02:22:21\n", - " \n", - " \n", - " 2936\n", - " 5059\n", - " 1186\n", - " 6308.0\n", - " 3\n", - " 2023-05-22 13:41:22+00:00\n", - " 2018-02-01 11:16:51+00:00\n", - " 1936 days 02:24:31\n", - " \n", - " \n", - " 11484\n", - " 25100\n", - " 1123\n", - " 0.0\n", + " 44.0\n", " 1\n", - " 2021-07-13 07:39:57+00:00\n", - " 2015-12-21 15:38:05+00:00\n", - " 2030 days 16:01:52\n", - " \n", - " \n", - " 934\n", - " 1326\n", - " 1098\n", - " 798.0\n", - " 3\n", - " 2023-02-01 08:39:45+00:00\n", - " 2018-02-13 13:13:48+00:00\n", - " 1813 days 19:25:57\n", - " \n", - " \n", - " 30156\n", - " 59490\n", - " 1088\n", - " 0.0\n", " 1\n", - " 2023-10-05 08:23:50+00:00\n", - " 2019-12-06 12:59:20+00:00\n", - " 1398 days 19:24:30\n", + " 2023-11-08 17:17:51+00:00\n", + " 2023-11-08 17:17:51+00:00\n", + " 0 days 00:00:00\n", + " 4.0\n", " \n", " \n", - " 36478\n", - " 251268\n", - " 1086\n", - " 0.0\n", + " 73515\n", + " 1256135\n", + " 1\n", + " 11.0\n", + " 1\n", + " 1\n", + " 2023-11-08 17:23:54+00:00\n", + " 2023-11-08 17:23:54+00:00\n", + " 0 days 00:00:00\n", + " 1.0\n", + " \n", + " \n", + " 73516\n", + " 1256136\n", " 2\n", - " 2023-06-30 07:22:46+00:00\n", - " 2018-02-02 09:06:22+00:00\n", - " 1973 days 22:16:24\n", + " 22.0\n", + " 1\n", + " 1\n", + " 2023-11-08 18:32:18+00:00\n", + " 2023-11-08 18:32:18+00:00\n", + " 0 days 00:00:00\n", + " 2.0\n", + " \n", + " \n", + " 73517\n", + " 1256137\n", + " 2\n", + " 22.0\n", + " 1\n", + " 1\n", + " 2023-11-08 19:30:28+00:00\n", + " 2023-11-08 19:30:28+00:00\n", + " 0 days 00:00:00\n", + " 2.0\n", " \n", " \n", "\n", + "

73518 rows × 9 columns

\n", "" ], "text/plain": [ " customer_id nb_tickets total_amount nb_suppliers \\\n", "0 1 1256574 8830567.5 7 \n", - "3615 6733 35527 1188.0 4 \n", - "39 41 16263 37642.0 6 \n", - "11 12 5871 38767.0 2 \n", - "32809 63488 5851 64350.0 1 \n", - "3708 6916 5482 51489.5 2 \n", - "32616 63194 4507 13232.0 3 \n", - "78 81 3562 38746.0 1 \n", - "35295 84002 3403 19830.0 4 \n", - "3377 5618 3294 31684.5 1 \n", - "30011 59259 2591 4350.0 3 \n", - "34937 74876 2571 2600.0 2 \n", - "270 295 2570 17678.5 6 \n", - "866 1221 2320 9652.0 2 \n", - "1022 1429 2249 3500.0 4 \n", - "3922 7249 1827 13385.0 1 \n", - "54425 1070539 1800 19800.0 1 \n", - "69520 1216801 1623 12562.0 2 \n", - "30056 59330 1551 0.0 1 \n", - "3243 5441 1544 14133.0 2 \n", - "55195 1084435 1500 16500.0 1 \n", - "28983 57816 1485 0.0 2 \n", - "2231 2942 1307 100.0 2 \n", - "23 24 1266 0.0 2 \n", - "4513 9592 1211 62.0 4 \n", - "2936 5059 1186 6308.0 3 \n", - "11484 25100 1123 0.0 1 \n", - "934 1326 1098 798.0 3 \n", - "30156 59490 1088 0.0 1 \n", - "36478 251268 1086 0.0 2 \n", + "1 2 307 0.0 1 \n", + "2 3 6 110.0 1 \n", + "3 4 4 41.0 1 \n", + "4 5 2 19.0 1 \n", + "... ... ... ... ... \n", + "73513 1256133 3 33.0 1 \n", + "73514 1256134 4 44.0 1 \n", + "73515 1256135 1 11.0 1 \n", + "73516 1256136 2 22.0 1 \n", + "73517 1256137 2 22.0 1 \n", "\n", - " purchase_date_max purchase_date_min \\\n", - "0 2023-11-08 15:59:45+00:00 2013-06-10 10:37:58+00:00 \n", - "3615 2023-11-03 09:42:40+00:00 2015-09-09 13:48:38+00:00 \n", - "39 2023-10-25 09:13:16+00:00 2014-01-23 16:56:57+00:00 \n", - "11 2023-11-04 13:46:59+00:00 2018-04-04 07:46:31+00:00 \n", - "32809 2022-08-25 13:08:38+00:00 2020-08-18 08:32:57+00:00 \n", - "3708 2021-08-26 12:49:17+00:00 2018-03-26 11:13:43+00:00 \n", - "32616 2022-09-07 12:55:33+00:00 2017-11-28 13:52:15+00:00 \n", - "78 2022-08-30 11:51:34+00:00 2017-01-05 13:04:58+00:00 \n", - "35295 2023-11-06 15:59:22+00:00 2021-05-28 10:22:33+00:00 \n", - "3377 2022-02-24 07:47:20+00:00 2018-10-25 11:04:24+00:00 \n", - "30011 2023-06-12 14:05:19+00:00 2019-11-25 08:52:48+00:00 \n", - "34937 2023-10-02 08:13:05+00:00 2018-02-08 12:54:01+00:00 \n", - "270 2023-10-16 10:19:22+00:00 2014-01-24 15:16:17+00:00 \n", - "866 2022-09-19 12:55:15+00:00 2017-03-29 08:00:09+00:00 \n", - "1022 2023-11-06 08:30:37+00:00 2014-12-03 14:56:38+00:00 \n", - "3922 2021-10-26 12:28:40+00:00 2019-05-07 12:34:56+00:00 \n", - "54425 2022-07-25 12:49:27+00:00 2022-05-02 16:09:03+00:00 \n", - "69520 2023-09-29 16:34:38+00:00 2023-06-16 14:16:04+00:00 \n", - "30056 2023-11-06 10:22:14+00:00 2018-02-02 08:53:51+00:00 \n", - "3243 2022-09-22 08:21:47+00:00 2017-12-14 12:50:23+00:00 \n", - "55195 2022-09-27 14:32:13+00:00 2022-05-18 08:04:41+00:00 \n", - "28983 2023-05-22 07:30:55+00:00 2019-01-21 14:19:18+00:00 \n", - "2231 2023-06-29 09:33:58+00:00 2017-10-25 15:06:58+00:00 \n", - "23 2023-10-19 07:20:48+00:00 2015-09-30 16:07:52+00:00 \n", - "4513 2023-10-17 09:39:40+00:00 2018-02-25 07:17:19+00:00 \n", - "2936 2023-05-22 13:41:22+00:00 2018-02-01 11:16:51+00:00 \n", - "11484 2021-07-13 07:39:57+00:00 2015-12-21 15:38:05+00:00 \n", - "934 2023-02-01 08:39:45+00:00 2018-02-13 13:13:48+00:00 \n", - "30156 2023-10-05 08:23:50+00:00 2019-12-06 12:59:20+00:00 \n", - "36478 2023-06-30 07:22:46+00:00 2018-02-02 09:06:22+00:00 \n", + " vente_internet_max purchase_date_min purchase_date_max \\\n", + "0 1 2013-06-10 10:37:58+00:00 2023-11-08 15:59:45+00:00 \n", + "1 0 2018-04-07 12:55:07+00:00 2020-03-08 12:06:43+00:00 \n", + "2 1 2019-09-19 15:15:01+00:00 2023-09-27 09:13:09+00:00 \n", + "3 1 2019-09-19 15:43:49+00:00 2021-09-02 18:42:19+00:00 \n", + "4 1 2019-09-19 15:45:36+00:00 2019-09-19 15:45:36+00:00 \n", + "... ... ... ... \n", + "73513 1 2023-11-08 16:51:19+00:00 2023-11-08 16:51:19+00:00 \n", + "73514 1 2023-11-08 17:17:51+00:00 2023-11-08 17:17:51+00:00 \n", + "73515 1 2023-11-08 17:23:54+00:00 2023-11-08 17:23:54+00:00 \n", + "73516 1 2023-11-08 18:32:18+00:00 2023-11-08 18:32:18+00:00 \n", + "73517 1 2023-11-08 19:30:28+00:00 2023-11-08 19:30:28+00:00 \n", "\n", - " time_between_purchase \n", - "0 3803 days 05:21:47 \n", - "3615 2976 days 19:54:02 \n", - "39 3561 days 16:16:19 \n", - "11 2040 days 06:00:28 \n", - "32809 737 days 04:35:41 \n", - "3708 1249 days 01:35:34 \n", - "32616 1743 days 23:03:18 \n", - "78 2062 days 22:46:36 \n", - "35295 892 days 05:36:49 \n", - "3377 1217 days 20:42:56 \n", - "30011 1295 days 05:12:31 \n", - "34937 2061 days 19:19:04 \n", - "270 3551 days 19:03:05 \n", - "866 2000 days 04:55:06 \n", - "1022 3259 days 17:33:59 \n", - "3922 902 days 23:53:44 \n", - "54425 83 days 20:40:24 \n", - "69520 105 days 02:18:34 \n", - "30056 2103 days 01:28:23 \n", - "3243 1742 days 19:31:24 \n", - "55195 132 days 06:27:32 \n", - "28983 1581 days 17:11:37 \n", - "2231 2072 days 18:27:00 \n", - "23 2940 days 15:12:56 \n", - "4513 2060 days 02:22:21 \n", - "2936 1936 days 02:24:31 \n", - "11484 2030 days 16:01:52 \n", - "934 1813 days 19:25:57 \n", - "30156 1398 days 19:24:30 \n", - "36478 1973 days 22:16:24 " + " time_between_purchase nb_tickets_internet \n", + "0 3803 days 05:21:47 3053.0 \n", + "1 700 days 23:11:36 0.0 \n", + "2 1468 days 17:58:08 6.0 \n", + "3 714 days 02:58:30 4.0 \n", + "4 0 days 00:00:00 2.0 \n", + "... ... ... \n", + "73513 0 days 00:00:00 3.0 \n", + "73514 0 days 00:00:00 4.0 \n", + "73515 0 days 00:00:00 1.0 \n", + "73516 0 days 00:00:00 2.0 \n", + "73517 0 days 00:00:00 2.0 \n", + "\n", + "[73518 rows x 9 columns]" ] }, - "execution_count": 52, + "execution_count": 76, "metadata": {}, "output_type": "execute_result" } ], + "source": [ + "df1_tickets_kpi" + ] + }, + { + "cell_type": "code", + "execution_count": 63, + "id": "de1ecf15-1aa1-4aa2-9467-8ad8e9be5856", + "metadata": {}, + "outputs": [], + "source": [ + " df_tickets_information_copy = df1_products_purchased_reduced.copy()\n", + "\n", + " # Dummy : Canal de vente en ligne\n", + " liste_mots = ['en ligne', 'internet', 'web', 'net', 'vad', 'online'] # vad = vente à distance\n", + " df_tickets_information_copy['vente_internet'] = df_tickets_information_copy['supplier_name'].str.contains('|'.join(liste_mots), case=False).astype(int)\n", + "\n", + " # Proportion de vente en ligne\n", + " prop_vente_internet = df_tickets_information_copy[df_tickets_information_copy['vente_internet'] == 1].groupby('customer_id')['ticket_id'].count().reset_index()\n" + ] + }, + { + "cell_type": "code", + "execution_count": 64, + "id": "9cd36178-11dc-409c-b148-fb1d208c2faf", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
customer_idticket_id
013053
136
244
352
462
.........
5674412561333
5674512561344
5674612561351
5674712561362
5674812561372
\n", + "

56749 rows × 2 columns

\n", + "
" + ], + "text/plain": [ + " customer_id ticket_id\n", + "0 1 3053\n", + "1 3 6\n", + "2 4 4\n", + "3 5 2\n", + "4 6 2\n", + "... ... ...\n", + "56744 1256133 3\n", + "56745 1256134 4\n", + "56746 1256135 1\n", + "56747 1256136 2\n", + "56748 1256137 2\n", + "\n", + "[56749 rows x 2 columns]" + ] + }, + "execution_count": 64, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "prop_vente_internet" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "a7a452a6-cd5e-4c8b-b250-8a7d26e48fad", + "metadata": {}, + "outputs": [], "source": [ "df1_tickets_kpi.sort_values(by='nb_tickets', ascending=False).head(30)" ] @@ -2781,7 +2726,7 @@ }, { "cell_type": "code", - "execution_count": 39, + "execution_count": null, "id": "46de1912-4a66-46e5-8b9e-7768b2d2723b", "metadata": {}, "outputs": [], @@ -2792,339 +2737,20 @@ }, { "cell_type": "code", - "execution_count": 40, + "execution_count": null, "id": "9740d64a-e5eb-4967-a534-ca6177546465", "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
customer_idbirthdatestreet_idis_partnergenderis_email_trueopt_instructure_idprofessionlanguage...average_ticket_baskettotal_pricepurchase_countfirst_buying_datecountryagetenant_idnb_campaignsnb_campaigns_openedtime_to_open
012751NaN2False1TrueTrueNaNNaNNaN...NaNNaN0NaTfrNaN1311NaNNaNNaT
112825NaN2False2TrueTrueNaNNaNNaN...NaNNaN0NaTfrNaN1311NaNNaNNaT
211261NaN2False1TrueTrueNaNNaNNaN...NaNNaN0NaTfrNaN1311NaNNaNNaT
313071NaN2False2TrueTrueNaNNaNNaN...NaNNaN0NaTfrNaN1311NaNNaNNaT
4653061NaN10False2TrueFalseNaNNaNNaN...NaNNaN0NaTNaNNaN131180.02.00 days 19:53:02.500000
\n", - "

5 rows × 28 columns

\n", - "
" - ], - "text/plain": [ - " customer_id birthdate street_id is_partner gender is_email_true \\\n", - "0 12751 NaN 2 False 1 True \n", - "1 12825 NaN 2 False 2 True \n", - "2 11261 NaN 2 False 1 True \n", - "3 13071 NaN 2 False 2 True \n", - "4 653061 NaN 10 False 2 True \n", - "\n", - " opt_in structure_id profession language ... average_ticket_basket \\\n", - "0 True NaN NaN NaN ... NaN \n", - "1 True NaN NaN NaN ... NaN \n", - "2 True NaN NaN NaN ... NaN \n", - "3 True NaN NaN NaN ... NaN \n", - "4 False NaN NaN NaN ... NaN \n", - "\n", - " total_price purchase_count first_buying_date country age tenant_id \\\n", - "0 NaN 0 NaT fr NaN 1311 \n", - "1 NaN 0 NaT fr NaN 1311 \n", - "2 NaN 0 NaT fr NaN 1311 \n", - "3 NaN 0 NaT fr NaN 1311 \n", - "4 NaN 0 NaT NaN NaN 1311 \n", - "\n", - " nb_campaigns nb_campaigns_opened time_to_open \n", - "0 NaN NaN NaT \n", - "1 NaN NaN NaT \n", - "2 NaN NaN NaT \n", - "3 NaN NaN NaT \n", - "4 80.0 2.0 0 days 19:53:02.500000 \n", - "\n", - "[5 rows x 28 columns]" - ] - }, - "execution_count": 40, - "metadata": {}, - "output_type": "execute_result" - } - ], + "outputs": [], "source": [ "df1_customer.head()" ] }, { "cell_type": "code", - "execution_count": 49, + "execution_count": null, "id": "b5c4418c-ad2e-4bb9-bd5c-3b769e9c87d4", "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
customer_idbirthdatestreet_idis_partnergenderis_email_trueopt_instructure_idprofessionlanguagemcp_contact_idlast_buying_datemax_priceticket_sumaverage_pricefidelityaverage_purchase_delayaverage_price_basketaverage_ticket_baskettotal_pricepurchase_countfirst_buying_datecountryagetenant_idnb_campaignsnb_campaigns_openedtime_to_open
582011NaN2False2TrueFalseNaNNaNNaNNaN2023-11-08 03:20:0745.012547757.030122330831-67.79096913.751531.9560878821221.56414722013-06-10 10:37:58+00:00frNaN1311NaNNaNNaT
\n", - "
" - ], - "text/plain": [ - " customer_id birthdate street_id is_partner gender is_email_true \\\n", - "58201 1 NaN 2 False 2 True \n", - "\n", - " opt_in structure_id profession language mcp_contact_id \\\n", - "58201 False NaN NaN NaN NaN \n", - "\n", - " last_buying_date max_price ticket_sum average_price fidelity \\\n", - "58201 2023-11-08 03:20:07 45.0 1254775 7.030122 330831 \n", - "\n", - " average_purchase_delay average_price_basket average_ticket_basket \\\n", - "58201 -67.790969 13.75153 1.956087 \n", - "\n", - " total_price purchase_count first_buying_date country age \\\n", - "58201 8821221.5 641472 2013-06-10 10:37:58+00:00 fr NaN \n", - "\n", - " tenant_id nb_campaigns nb_campaigns_opened time_to_open \n", - "58201 1311 NaN NaN NaT " - ] - }, - "execution_count": 49, - "metadata": {}, - "output_type": "execute_result" - } - ], + "outputs": [], "source": [ "pd.set_option('display.max_columns', None)\n", "\n", @@ -3134,7 +2760,7 @@ }, { "cell_type": "code", - "execution_count": 41, + "execution_count": null, "id": "1e42a790-b215-4107-a969-85005da06ebd", "metadata": {}, "outputs": [], @@ -3145,7 +2771,7 @@ }, { "cell_type": "code", - "execution_count": 42, + "execution_count": null, "id": "d950f24d-a5d1-4f1e-aeaa-ca826470365f", "metadata": {}, "outputs": [], From a83055cc48acd1001625d25327219c121b6d0102 Mon Sep 17 00:00:00 2001 From: ajoubrel-ensae Date: Sat, 10 Feb 2024 13:25:12 +0000 Subject: [PATCH 3/8] Nettoyage --- 0_Cleaning_and_merge.ipynb | 687 ++++++++++++++++++++++--------------- 1 file changed, 412 insertions(+), 275 deletions(-) diff --git a/0_Cleaning_and_merge.ipynb b/0_Cleaning_and_merge.ipynb index 5085051..8b723e0 100644 --- a/0_Cleaning_and_merge.ipynb +++ b/0_Cleaning_and_merge.ipynb @@ -2345,8 +2345,8 @@ }, { "cell_type": "code", - "execution_count": 76, - "id": "c78f5ade-c721-49d9-a474-73c217686ed1", + "execution_count": 77, + "id": "a7a452a6-cd5e-4c8b-b250-8a7d26e48fad", "metadata": {}, "outputs": [ { @@ -2395,323 +2395,460 @@ " 3053.0\n", " \n", " \n", - " 1\n", + " 3615\n", + " 6733\n", + " 35527\n", + " 1188.0\n", + " 4\n", + " 1\n", + " 2015-09-09 13:48:38+00:00\n", + " 2023-11-03 09:42:40+00:00\n", + " 2976 days 19:54:02\n", + " 30896.0\n", + " \n", + " \n", + " 39\n", + " 41\n", + " 16263\n", + " 37642.0\n", + " 6\n", + " 1\n", + " 2014-01-23 16:56:57+00:00\n", + " 2023-10-25 09:13:16+00:00\n", + " 3561 days 16:16:19\n", + " 13993.0\n", + " \n", + " \n", + " 11\n", + " 12\n", + " 5871\n", + " 38767.0\n", " 2\n", - " 307\n", + " 1\n", + " 2018-04-04 07:46:31+00:00\n", + " 2023-11-04 13:46:59+00:00\n", + " 2040 days 06:00:28\n", + " 167.0\n", + " \n", + " \n", + " 32809\n", + " 63488\n", + " 5851\n", + " 64350.0\n", + " 1\n", + " 1\n", + " 2020-08-18 08:32:57+00:00\n", + " 2022-08-25 13:08:38+00:00\n", + " 737 days 04:35:41\n", + " 5851.0\n", + " \n", + " \n", + " 3708\n", + " 6916\n", + " 5482\n", + " 51489.5\n", + " 2\n", + " 1\n", + " 2018-03-26 11:13:43+00:00\n", + " 2021-08-26 12:49:17+00:00\n", + " 1249 days 01:35:34\n", + " 5481.0\n", + " \n", + " \n", + " 32616\n", + " 63194\n", + " 4507\n", + " 13232.0\n", + " 3\n", + " 1\n", + " 2017-11-28 13:52:15+00:00\n", + " 2022-09-07 12:55:33+00:00\n", + " 1743 days 23:03:18\n", + " 826.0\n", + " \n", + " \n", + " 78\n", + " 81\n", + " 3562\n", + " 38746.0\n", + " 1\n", + " 1\n", + " 2017-01-05 13:04:58+00:00\n", + " 2022-08-30 11:51:34+00:00\n", + " 2062 days 22:46:36\n", + " 3562.0\n", + " \n", + " \n", + " 35295\n", + " 84002\n", + " 3403\n", + " 19830.0\n", + " 4\n", + " 1\n", + " 2021-05-28 10:22:33+00:00\n", + " 2023-11-06 15:59:22+00:00\n", + " 892 days 05:36:49\n", + " 869.0\n", + " \n", + " \n", + " 3377\n", + " 5618\n", + " 3294\n", + " 31684.5\n", + " 1\n", + " 1\n", + " 2018-10-25 11:04:24+00:00\n", + " 2022-02-24 07:47:20+00:00\n", + " 1217 days 20:42:56\n", + " 3294.0\n", + " \n", + " \n", + " 30011\n", + " 59259\n", + " 2591\n", + " 4350.0\n", + " 3\n", + " 1\n", + " 2019-11-25 08:52:48+00:00\n", + " 2023-06-12 14:05:19+00:00\n", + " 1295 days 05:12:31\n", + " 52.0\n", + " \n", + " \n", + " 34937\n", + " 74876\n", + " 2571\n", + " 2600.0\n", + " 2\n", + " 1\n", + " 2018-02-08 12:54:01+00:00\n", + " 2023-10-02 08:13:05+00:00\n", + " 2061 days 19:19:04\n", + " 448.0\n", + " \n", + " \n", + " 270\n", + " 295\n", + " 2570\n", + " 17678.5\n", + " 6\n", + " 1\n", + " 2014-01-24 15:16:17+00:00\n", + " 2023-10-16 10:19:22+00:00\n", + " 3551 days 19:03:05\n", + " 1479.0\n", + " \n", + " \n", + " 866\n", + " 1221\n", + " 2320\n", + " 9652.0\n", + " 2\n", + " 1\n", + " 2017-03-29 08:00:09+00:00\n", + " 2022-09-19 12:55:15+00:00\n", + " 2000 days 04:55:06\n", + " 104.0\n", + " \n", + " \n", + " 1022\n", + " 1429\n", + " 2249\n", + " 3500.0\n", + " 4\n", + " 1\n", + " 2014-12-03 14:56:38+00:00\n", + " 2023-11-06 08:30:37+00:00\n", + " 3259 days 17:33:59\n", + " 690.0\n", + " \n", + " \n", + " 3922\n", + " 7249\n", + " 1827\n", + " 13385.0\n", + " 1\n", + " 1\n", + " 2019-05-07 12:34:56+00:00\n", + " 2021-10-26 12:28:40+00:00\n", + " 902 days 23:53:44\n", + " 1827.0\n", + " \n", + " \n", + " 54425\n", + " 1070539\n", + " 1800\n", + " 19800.0\n", + " 1\n", + " 1\n", + " 2022-05-02 16:09:03+00:00\n", + " 2022-07-25 12:49:27+00:00\n", + " 83 days 20:40:24\n", + " 1800.0\n", + " \n", + " \n", + " 69520\n", + " 1216801\n", + " 1623\n", + " 12562.0\n", + " 2\n", + " 0\n", + " 2023-06-16 14:16:04+00:00\n", + " 2023-09-29 16:34:38+00:00\n", + " 105 days 02:18:34\n", + " 0.0\n", + " \n", + " \n", + " 30056\n", + " 59330\n", + " 1551\n", " 0.0\n", " 1\n", " 0\n", - " 2018-04-07 12:55:07+00:00\n", - " 2020-03-08 12:06:43+00:00\n", - " 700 days 23:11:36\n", + " 2018-02-02 08:53:51+00:00\n", + " 2023-11-06 10:22:14+00:00\n", + " 2103 days 01:28:23\n", " 0.0\n", " \n", " \n", - " 2\n", + " 3243\n", + " 5441\n", + " 1544\n", + " 14133.0\n", + " 2\n", + " 1\n", + " 2017-12-14 12:50:23+00:00\n", + " 2022-09-22 08:21:47+00:00\n", + " 1742 days 19:31:24\n", + " 1384.0\n", + " \n", + " \n", + " 55195\n", + " 1084435\n", + " 1500\n", + " 16500.0\n", + " 1\n", + " 1\n", + " 2022-05-18 08:04:41+00:00\n", + " 2022-09-27 14:32:13+00:00\n", + " 132 days 06:27:32\n", + " 1500.0\n", + " \n", + " \n", + " 28983\n", + " 57816\n", + " 1485\n", + " 0.0\n", + " 2\n", + " 1\n", + " 2019-01-21 14:19:18+00:00\n", + " 2023-05-22 07:30:55+00:00\n", + " 1581 days 17:11:37\n", + " 357.0\n", + " \n", + " \n", + " 2231\n", + " 2942\n", + " 1307\n", + " 100.0\n", + " 2\n", + " 1\n", + " 2017-10-25 15:06:58+00:00\n", + " 2023-06-29 09:33:58+00:00\n", + " 2072 days 18:27:00\n", + " 676.0\n", + " \n", + " \n", + " 23\n", + " 24\n", + " 1266\n", + " 0.0\n", + " 2\n", + " 1\n", + " 2015-09-30 16:07:52+00:00\n", + " 2023-10-19 07:20:48+00:00\n", + " 2940 days 15:12:56\n", + " 556.0\n", + " \n", + " \n", + " 4513\n", + " 9592\n", + " 1211\n", + " 62.0\n", + " 4\n", + " 1\n", + " 2018-02-25 07:17:19+00:00\n", + " 2023-10-17 09:39:40+00:00\n", + " 2060 days 02:22:21\n", + " 353.0\n", + " \n", + " \n", + " 2936\n", + " 5059\n", + " 1186\n", + " 6308.0\n", " 3\n", - " 6\n", - " 110.0\n", " 1\n", - " 1\n", - " 2019-09-19 15:15:01+00:00\n", - " 2023-09-27 09:13:09+00:00\n", - " 1468 days 17:58:08\n", - " 6.0\n", + " 2018-02-01 11:16:51+00:00\n", + " 2023-05-22 13:41:22+00:00\n", + " 1936 days 02:24:31\n", + " 1182.0\n", " \n", " \n", - " 3\n", - " 4\n", - " 4\n", - " 41.0\n", + " 11484\n", + " 25100\n", + " 1123\n", + " 0.0\n", " 1\n", " 1\n", - " 2019-09-19 15:43:49+00:00\n", - " 2021-09-02 18:42:19+00:00\n", - " 714 days 02:58:30\n", - " 4.0\n", + " 2015-12-21 15:38:05+00:00\n", + " 2021-07-13 07:39:57+00:00\n", + " 2030 days 16:01:52\n", + " 1123.0\n", " \n", " \n", - " 4\n", - " 5\n", - " 2\n", - " 19.0\n", - " 1\n", - " 1\n", - " 2019-09-19 15:45:36+00:00\n", - " 2019-09-19 15:45:36+00:00\n", - " 0 days 00:00:00\n", - " 2.0\n", - " \n", - " \n", - " ...\n", - " ...\n", - " ...\n", - " ...\n", - " ...\n", - " ...\n", - " ...\n", - " ...\n", - " ...\n", - " ...\n", - " \n", - " \n", - " 73513\n", - " 1256133\n", + " 934\n", + " 1326\n", + " 1098\n", + " 798.0\n", " 3\n", - " 33.0\n", " 1\n", - " 1\n", - " 2023-11-08 16:51:19+00:00\n", - " 2023-11-08 16:51:19+00:00\n", - " 0 days 00:00:00\n", - " 3.0\n", + " 2018-02-13 13:13:48+00:00\n", + " 2023-02-01 08:39:45+00:00\n", + " 1813 days 19:25:57\n", + " 266.0\n", " \n", " \n", - " 73514\n", - " 1256134\n", - " 4\n", - " 44.0\n", + " 30156\n", + " 59490\n", + " 1088\n", + " 0.0\n", " 1\n", - " 1\n", - " 2023-11-08 17:17:51+00:00\n", - " 2023-11-08 17:17:51+00:00\n", - " 0 days 00:00:00\n", - " 4.0\n", + " 0\n", + " 2019-12-06 12:59:20+00:00\n", + " 2023-10-05 08:23:50+00:00\n", + " 1398 days 19:24:30\n", + " 0.0\n", " \n", " \n", - " 73515\n", - " 1256135\n", - " 1\n", - " 11.0\n", - " 1\n", - " 1\n", - " 2023-11-08 17:23:54+00:00\n", - " 2023-11-08 17:23:54+00:00\n", - " 0 days 00:00:00\n", - " 1.0\n", - " \n", - " \n", - " 73516\n", - " 1256136\n", + " 36478\n", + " 251268\n", + " 1086\n", + " 0.0\n", " 2\n", - " 22.0\n", " 1\n", - " 1\n", - " 2023-11-08 18:32:18+00:00\n", - " 2023-11-08 18:32:18+00:00\n", - " 0 days 00:00:00\n", - " 2.0\n", - " \n", - " \n", - " 73517\n", - " 1256137\n", - " 2\n", - " 22.0\n", - " 1\n", - " 1\n", - " 2023-11-08 19:30:28+00:00\n", - " 2023-11-08 19:30:28+00:00\n", - " 0 days 00:00:00\n", - " 2.0\n", + " 2018-02-02 09:06:22+00:00\n", + " 2023-06-30 07:22:46+00:00\n", + " 1973 days 22:16:24\n", + " 279.0\n", " \n", " \n", "\n", - "

73518 rows × 9 columns

\n", "" ], "text/plain": [ " customer_id nb_tickets total_amount nb_suppliers \\\n", "0 1 1256574 8830567.5 7 \n", - "1 2 307 0.0 1 \n", - "2 3 6 110.0 1 \n", - "3 4 4 41.0 1 \n", - "4 5 2 19.0 1 \n", - "... ... ... ... ... \n", - "73513 1256133 3 33.0 1 \n", - "73514 1256134 4 44.0 1 \n", - "73515 1256135 1 11.0 1 \n", - "73516 1256136 2 22.0 1 \n", - "73517 1256137 2 22.0 1 \n", + "3615 6733 35527 1188.0 4 \n", + "39 41 16263 37642.0 6 \n", + "11 12 5871 38767.0 2 \n", + "32809 63488 5851 64350.0 1 \n", + "3708 6916 5482 51489.5 2 \n", + "32616 63194 4507 13232.0 3 \n", + "78 81 3562 38746.0 1 \n", + "35295 84002 3403 19830.0 4 \n", + "3377 5618 3294 31684.5 1 \n", + "30011 59259 2591 4350.0 3 \n", + "34937 74876 2571 2600.0 2 \n", + "270 295 2570 17678.5 6 \n", + "866 1221 2320 9652.0 2 \n", + "1022 1429 2249 3500.0 4 \n", + "3922 7249 1827 13385.0 1 \n", + "54425 1070539 1800 19800.0 1 \n", + "69520 1216801 1623 12562.0 2 \n", + "30056 59330 1551 0.0 1 \n", + "3243 5441 1544 14133.0 2 \n", + "55195 1084435 1500 16500.0 1 \n", + "28983 57816 1485 0.0 2 \n", + "2231 2942 1307 100.0 2 \n", + "23 24 1266 0.0 2 \n", + "4513 9592 1211 62.0 4 \n", + "2936 5059 1186 6308.0 3 \n", + "11484 25100 1123 0.0 1 \n", + "934 1326 1098 798.0 3 \n", + "30156 59490 1088 0.0 1 \n", + "36478 251268 1086 0.0 2 \n", "\n", " vente_internet_max purchase_date_min purchase_date_max \\\n", "0 1 2013-06-10 10:37:58+00:00 2023-11-08 15:59:45+00:00 \n", - "1 0 2018-04-07 12:55:07+00:00 2020-03-08 12:06:43+00:00 \n", - "2 1 2019-09-19 15:15:01+00:00 2023-09-27 09:13:09+00:00 \n", - "3 1 2019-09-19 15:43:49+00:00 2021-09-02 18:42:19+00:00 \n", - "4 1 2019-09-19 15:45:36+00:00 2019-09-19 15:45:36+00:00 \n", - "... ... ... ... \n", - "73513 1 2023-11-08 16:51:19+00:00 2023-11-08 16:51:19+00:00 \n", - "73514 1 2023-11-08 17:17:51+00:00 2023-11-08 17:17:51+00:00 \n", - "73515 1 2023-11-08 17:23:54+00:00 2023-11-08 17:23:54+00:00 \n", - "73516 1 2023-11-08 18:32:18+00:00 2023-11-08 18:32:18+00:00 \n", - "73517 1 2023-11-08 19:30:28+00:00 2023-11-08 19:30:28+00:00 \n", + "3615 1 2015-09-09 13:48:38+00:00 2023-11-03 09:42:40+00:00 \n", + "39 1 2014-01-23 16:56:57+00:00 2023-10-25 09:13:16+00:00 \n", + "11 1 2018-04-04 07:46:31+00:00 2023-11-04 13:46:59+00:00 \n", + "32809 1 2020-08-18 08:32:57+00:00 2022-08-25 13:08:38+00:00 \n", + "3708 1 2018-03-26 11:13:43+00:00 2021-08-26 12:49:17+00:00 \n", + "32616 1 2017-11-28 13:52:15+00:00 2022-09-07 12:55:33+00:00 \n", + "78 1 2017-01-05 13:04:58+00:00 2022-08-30 11:51:34+00:00 \n", + "35295 1 2021-05-28 10:22:33+00:00 2023-11-06 15:59:22+00:00 \n", + "3377 1 2018-10-25 11:04:24+00:00 2022-02-24 07:47:20+00:00 \n", + "30011 1 2019-11-25 08:52:48+00:00 2023-06-12 14:05:19+00:00 \n", + "34937 1 2018-02-08 12:54:01+00:00 2023-10-02 08:13:05+00:00 \n", + "270 1 2014-01-24 15:16:17+00:00 2023-10-16 10:19:22+00:00 \n", + "866 1 2017-03-29 08:00:09+00:00 2022-09-19 12:55:15+00:00 \n", + "1022 1 2014-12-03 14:56:38+00:00 2023-11-06 08:30:37+00:00 \n", + "3922 1 2019-05-07 12:34:56+00:00 2021-10-26 12:28:40+00:00 \n", + "54425 1 2022-05-02 16:09:03+00:00 2022-07-25 12:49:27+00:00 \n", + "69520 0 2023-06-16 14:16:04+00:00 2023-09-29 16:34:38+00:00 \n", + "30056 0 2018-02-02 08:53:51+00:00 2023-11-06 10:22:14+00:00 \n", + "3243 1 2017-12-14 12:50:23+00:00 2022-09-22 08:21:47+00:00 \n", + "55195 1 2022-05-18 08:04:41+00:00 2022-09-27 14:32:13+00:00 \n", + "28983 1 2019-01-21 14:19:18+00:00 2023-05-22 07:30:55+00:00 \n", + "2231 1 2017-10-25 15:06:58+00:00 2023-06-29 09:33:58+00:00 \n", + "23 1 2015-09-30 16:07:52+00:00 2023-10-19 07:20:48+00:00 \n", + "4513 1 2018-02-25 07:17:19+00:00 2023-10-17 09:39:40+00:00 \n", + "2936 1 2018-02-01 11:16:51+00:00 2023-05-22 13:41:22+00:00 \n", + "11484 1 2015-12-21 15:38:05+00:00 2021-07-13 07:39:57+00:00 \n", + "934 1 2018-02-13 13:13:48+00:00 2023-02-01 08:39:45+00:00 \n", + "30156 0 2019-12-06 12:59:20+00:00 2023-10-05 08:23:50+00:00 \n", + "36478 1 2018-02-02 09:06:22+00:00 2023-06-30 07:22:46+00:00 \n", "\n", " time_between_purchase nb_tickets_internet \n", "0 3803 days 05:21:47 3053.0 \n", - "1 700 days 23:11:36 0.0 \n", - "2 1468 days 17:58:08 6.0 \n", - "3 714 days 02:58:30 4.0 \n", - "4 0 days 00:00:00 2.0 \n", - "... ... ... \n", - "73513 0 days 00:00:00 3.0 \n", - "73514 0 days 00:00:00 4.0 \n", - "73515 0 days 00:00:00 1.0 \n", - "73516 0 days 00:00:00 2.0 \n", - "73517 0 days 00:00:00 2.0 \n", - "\n", - "[73518 rows x 9 columns]" + "3615 2976 days 19:54:02 30896.0 \n", + "39 3561 days 16:16:19 13993.0 \n", + "11 2040 days 06:00:28 167.0 \n", + "32809 737 days 04:35:41 5851.0 \n", + "3708 1249 days 01:35:34 5481.0 \n", + "32616 1743 days 23:03:18 826.0 \n", + "78 2062 days 22:46:36 3562.0 \n", + "35295 892 days 05:36:49 869.0 \n", + "3377 1217 days 20:42:56 3294.0 \n", + "30011 1295 days 05:12:31 52.0 \n", + "34937 2061 days 19:19:04 448.0 \n", + "270 3551 days 19:03:05 1479.0 \n", + "866 2000 days 04:55:06 104.0 \n", + "1022 3259 days 17:33:59 690.0 \n", + "3922 902 days 23:53:44 1827.0 \n", + "54425 83 days 20:40:24 1800.0 \n", + "69520 105 days 02:18:34 0.0 \n", + "30056 2103 days 01:28:23 0.0 \n", + "3243 1742 days 19:31:24 1384.0 \n", + "55195 132 days 06:27:32 1500.0 \n", + "28983 1581 days 17:11:37 357.0 \n", + "2231 2072 days 18:27:00 676.0 \n", + "23 2940 days 15:12:56 556.0 \n", + "4513 2060 days 02:22:21 353.0 \n", + "2936 1936 days 02:24:31 1182.0 \n", + "11484 2030 days 16:01:52 1123.0 \n", + "934 1813 days 19:25:57 266.0 \n", + "30156 1398 days 19:24:30 0.0 \n", + "36478 1973 days 22:16:24 279.0 " ] }, - "execution_count": 76, + "execution_count": 77, "metadata": {}, "output_type": "execute_result" } ], - "source": [ - "df1_tickets_kpi" - ] - }, - { - "cell_type": "code", - "execution_count": 63, - "id": "de1ecf15-1aa1-4aa2-9467-8ad8e9be5856", - "metadata": {}, - "outputs": [], - "source": [ - " df_tickets_information_copy = df1_products_purchased_reduced.copy()\n", - "\n", - " # Dummy : Canal de vente en ligne\n", - " liste_mots = ['en ligne', 'internet', 'web', 'net', 'vad', 'online'] # vad = vente à distance\n", - " df_tickets_information_copy['vente_internet'] = df_tickets_information_copy['supplier_name'].str.contains('|'.join(liste_mots), case=False).astype(int)\n", - "\n", - " # Proportion de vente en ligne\n", - " prop_vente_internet = df_tickets_information_copy[df_tickets_information_copy['vente_internet'] == 1].groupby('customer_id')['ticket_id'].count().reset_index()\n" - ] - }, - { - "cell_type": "code", - "execution_count": 64, - "id": "9cd36178-11dc-409c-b148-fb1d208c2faf", - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
customer_idticket_id
013053
136
244
352
462
.........
5674412561333
5674512561344
5674612561351
5674712561362
5674812561372
\n", - "

56749 rows × 2 columns

\n", - "
" - ], - "text/plain": [ - " customer_id ticket_id\n", - "0 1 3053\n", - "1 3 6\n", - "2 4 4\n", - "3 5 2\n", - "4 6 2\n", - "... ... ...\n", - "56744 1256133 3\n", - "56745 1256134 4\n", - "56746 1256135 1\n", - "56747 1256136 2\n", - "56748 1256137 2\n", - "\n", - "[56749 rows x 2 columns]" - ] - }, - "execution_count": 64, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "prop_vente_internet" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "a7a452a6-cd5e-4c8b-b250-8a7d26e48fad", - "metadata": {}, - "outputs": [], "source": [ "df1_tickets_kpi.sort_values(by='nb_tickets', ascending=False).head(30)" ] From d23d047fb47817aea9d628066605912a9f07eb55 Mon Sep 17 00:00:00 2001 From: ajoubrel-ensae Date: Sat, 10 Feb 2024 18:26:46 +0000 Subject: [PATCH 4/8] Ajout exportation vers bucket commun et modif access key --- 0_Cleaning_and_merge.ipynb | 1061 ++++++++++-------------------------- 1 file changed, 291 insertions(+), 770 deletions(-) diff --git a/0_Cleaning_and_merge.ipynb b/0_Cleaning_and_merge.ipynb index 8b723e0..89910f7 100644 --- a/0_Cleaning_and_merge.ipynb +++ b/0_Cleaning_and_merge.ipynb @@ -38,8 +38,7 @@ "outputs": [], "source": [ "# Create filesystem object\n", - "S3_ENDPOINT_URL = \"https://\" + os.environ[\"AWS_S3_ENDPOINT\"]\n", - "fs = s3fs.S3FileSystem(client_kwargs={'endpoint_url': S3_ENDPOINT_URL})" + "fs = s3fs.S3FileSystem(client_kwargs={'endpoint_url': 'https://'+'minio-simple.lab.groupe-genes.fr'},key ='WKTGH4YGUBAT3TR0OSUR', secret = 'g8ozi6ZUrBy8DzaAip4F7zOizbr4DKf4RgYNseqU', token = 'eyJhbGciOiJIUzUxMiIsInR5cCI6IkpXVCJ9.eyJhY2Nlc3NLZXkiOiJXS1RHSDRZR1VCQVQzVFIwT1NVUiIsImFjciI6IjAiLCJhbGxvd2VkLW9yaWdpbnMiOlsiaHR0cHM6Ly9vbnl4aWEubGFiLmdyb3VwZS1nZW5lcy5mciJdLCJhdWQiOlsibWluaW8iLCJhY2NvdW50Il0sImF1dGhfdGltZSI6MTcwNzU4NjUwMCwiYXpwIjoib255eGlhLW1pbmlvIiwiZW1haWwiOiJhbnRvaW5lLmpvdWJyZWxAZW5zYWUuZnIiLCJlbWFpbF92ZXJpZmllZCI6dHJ1ZSwiZXhwIjoxNzA3NjczMDQ3LCJmYW1pbHlfbmFtZSI6IkpPVUJSRUwiLCJnaXZlbl9uYW1lIjoiQW50b2luZSIsImdyb3VwcyI6WyJiZGMyMzI0LXRlYW0xIl0sImlhdCI6MTcwNzU4NjY0NywiaXNzIjoiaHR0cHM6Ly9hdXRoLmdyb3VwZS1nZW5lcy5mci9yZWFsbXMvZ2VuZXMiLCJqdGkiOiI1MjQ2MDZmMS1lYWM3LTQxZDgtYTEzMy04MGZjMDk0MGVlNzEiLCJuYW1lIjoiQW50b2luZSBKT1VCUkVMIiwicG9saWN5Ijoic3Rzb25seSIsInByZWZlcnJlZF91c2VybmFtZSI6ImFqb3VicmVsLWVuc2FlIiwicmVhbG1fYWNjZXNzIjp7InJvbGVzIjpbIm9mZmxpbmVfYWNjZXNzIiwiZGVmYXVsdC1yb2xlcy1nZW5lcyIsInVtYV9hdXRob3JpemF0aW9uIl19LCJyZXNvdXJjZV9hY2Nlc3MiOnsiYWNjb3VudCI6eyJyb2xlcyI6WyJtYW5hZ2UtYWNjb3VudCIsIm1hbmFnZS1hY2NvdW50LWxpbmtzIiwidmlldy1wcm9maWxlIl19fSwic2NvcGUiOiJvcGVuaWQgcHJvZmlsZSBlbWFpbCIsInNlc3Npb25fc3RhdGUiOiI1OTk2MWNkYy0xNmFiLTQ4MTAtYWE4Zi1iZGUyMjkwNjhiNzUiLCJzaWQiOiI1OTk2MWNkYy0xNmFiLTQ4MTAtYWE4Zi1iZGUyMjkwNjhiNzUiLCJzdWIiOiIwNWYwZDk3Mi1jNWM4LTQyNmYtODAwZC00NmQ0OGU4NjkwMzUiLCJ0eXAiOiJCZWFyZXIifQ.-imw-N4bk1uCcQGobkxhsRoeBAqxC9rT7PifElbC7ODOStnwIulc7HRR2fmtiqI2PdyrfnVvzfmIPK1g056HbA')" ] }, { @@ -60,7 +59,7 @@ }, { "cell_type": "code", - "execution_count": 3, + "execution_count": 93, "id": "699664b9-eee4-4f8d-a207-e524526560c5", "metadata": {}, "outputs": [], @@ -810,153 +809,6 @@ "df1_campaigns_information.head()" ] }, - { - "cell_type": "code", - "execution_count": 18, - "id": "e2c88552-b863-47a2-be23-8d2898fb28bc", - "metadata": {}, - "outputs": [], - "source": [ - "def campaigns_kpi_function(campaigns_information = None):\n", - " # Nombre de campagnes de mails\n", - " nb_campaigns = campaigns_information[['customer_id', 'campaign_name']].groupby('customer_id').count().reset_index()\n", - " nb_campaigns.rename(columns = {'campaign_name' : 'nb_campaigns'}, inplace = True)\n", - " # Temps d'ouverture en min moyen \n", - " campaigns_information['time_to_open'] = campaigns_information['opened_at'] - campaigns_information['delivered_at']\n", - " time_to_open = campaigns_information[['customer_id', 'time_to_open']].groupby('customer_id').mean().reset_index()\n", - "\n", - " # Nombre de mail ouvert \n", - " opened_campaign = campaigns_information[['customer_id', 'campaign_name', 'opened_at']]\n", - " opened_campaign.dropna(subset=['opened_at'], inplace=True)\n", - " opened_campaign = opened_campaign[['customer_id', 'campaign_name']].groupby('customer_id').count().reset_index()\n", - " opened_campaign.rename(columns = {'campaign_name' : 'nb_campaigns_opened' }, inplace = True)\n", - "\n", - " # Fusion des indicateurs\n", - " campaigns_reduced = pd.merge(nb_campaigns, opened_campaign, on = 'customer_id', how = 'left')\n", - " campaigns_reduced = pd.merge(campaigns_reduced, time_to_open, on = 'customer_id', how = 'left')\n", - "\n", - " # Remplir les NaN : nb_campaigns_opened\n", - " campaigns_reduced['nb_campaigns_opened'].fillna(0, inplace=True)\n", - "\n", - " # Remplir les NaT : time_to_open (??)\n", - "\n", - " return campaigns_reduced\n", - " " - ] - }, - { - "cell_type": "code", - "execution_count": 19, - "id": "24537647-bc29-4777-9848-ac4120a4aa60", - "metadata": {}, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "/tmp/ipykernel_8302/3700263836.py:11: SettingWithCopyWarning: \n", - "A value is trying to be set on a copy of a slice from a DataFrame\n", - "\n", - "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n", - " opened_campaign.dropna(subset=['opened_at'], inplace=True)\n" - ] - } - ], - "source": [ - "df1_campaigns_kpi = campaigns_kpi_function(campaigns_information = df1_campaigns_information) " - ] - }, - { - "cell_type": "code", - "execution_count": 20, - "id": "6be2a9a6-056b-4e19-8c26-a18ba3df36b3", - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
customer_idnb_campaignsnb_campaigns_openedtime_to_open
0240.0NaT
13222124.01 days 00:28:30.169354838
2477.01 days 04:31:01.428571428
3540.0NaT
46200.0NaT
\n", - "
" - ], - "text/plain": [ - " customer_id nb_campaigns nb_campaigns_opened time_to_open\n", - "0 2 4 0.0 NaT\n", - "1 3 222 124.0 1 days 00:28:30.169354838\n", - "2 4 7 7.0 1 days 04:31:01.428571428\n", - "3 5 4 0.0 NaT\n", - "4 6 20 0.0 NaT" - ] - }, - "execution_count": 20, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "df1_campaigns_kpi.head()" - ] - }, { "cell_type": "markdown", "id": "56520a97-ede8-4920-a211-3b5b136af33d", @@ -1899,20 +1751,175 @@ "df1_products_purchased = pd.merge(df1_ticket_information, products_global, left_on = 'product_id', right_on = 'id_products', how = 'inner')" ] }, + { + "cell_type": "markdown", + "id": "d7c3668a-c016-4bd0-837e-04af328ff14f", + "metadata": {}, + "source": [ + "# Construction des variables explicatives" + ] + }, + { + "cell_type": "markdown", + "id": "314f1b7f-ae48-4c6f-8469-9ce879043243", + "metadata": {}, + "source": [ + "## KPI campaigns" + ] + }, { "cell_type": "code", - "execution_count": null, - "id": "52db7bcb-3fb7-48e5-b612-4e22bdab4a94", + "execution_count": 18, + "id": "e2c88552-b863-47a2-be23-8d2898fb28bc", "metadata": {}, "outputs": [], - "source": [] + "source": [ + "def campaigns_kpi_function(campaigns_information = None):\n", + " # Nombre de campagnes de mails\n", + " nb_campaigns = campaigns_information[['customer_id', 'campaign_name']].groupby('customer_id').count().reset_index()\n", + " nb_campaigns.rename(columns = {'campaign_name' : 'nb_campaigns'}, inplace = True)\n", + " # Temps d'ouverture en min moyen \n", + " campaigns_information['time_to_open'] = campaigns_information['opened_at'] - campaigns_information['delivered_at']\n", + " time_to_open = campaigns_information[['customer_id', 'time_to_open']].groupby('customer_id').mean().reset_index()\n", + "\n", + " # Nombre de mail ouvert \n", + " opened_campaign = campaigns_information[['customer_id', 'campaign_name', 'opened_at']]\n", + " opened_campaign.dropna(subset=['opened_at'], inplace=True)\n", + " opened_campaign = opened_campaign[['customer_id', 'campaign_name']].groupby('customer_id').count().reset_index()\n", + " opened_campaign.rename(columns = {'campaign_name' : 'nb_campaigns_opened' }, inplace = True)\n", + "\n", + " # Fusion des indicateurs\n", + " campaigns_reduced = pd.merge(nb_campaigns, opened_campaign, on = 'customer_id', how = 'left')\n", + " campaigns_reduced = pd.merge(campaigns_reduced, time_to_open, on = 'customer_id', how = 'left')\n", + "\n", + " # Remplir les NaN : nb_campaigns_opened\n", + " campaigns_reduced['nb_campaigns_opened'].fillna(0, inplace=True)\n", + "\n", + " # Remplir les NaT : time_to_open (??)\n", + "\n", + " return campaigns_reduced\n", + " " + ] + }, + { + "cell_type": "code", + "execution_count": 19, + "id": "24537647-bc29-4777-9848-ac4120a4aa60", + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/tmp/ipykernel_8302/3700263836.py:11: SettingWithCopyWarning: \n", + "A value is trying to be set on a copy of a slice from a DataFrame\n", + "\n", + "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n", + " opened_campaign.dropna(subset=['opened_at'], inplace=True)\n" + ] + } + ], + "source": [ + "df1_campaigns_kpi = campaigns_kpi_function(campaigns_information = df1_campaigns_information) " + ] + }, + { + "cell_type": "code", + "execution_count": 20, + "id": "6be2a9a6-056b-4e19-8c26-a18ba3df36b3", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
customer_idnb_campaignsnb_campaigns_openedtime_to_open
0240.0NaT
13222124.01 days 00:28:30.169354838
2477.01 days 04:31:01.428571428
3540.0NaT
46200.0NaT
\n", + "
" + ], + "text/plain": [ + " customer_id nb_campaigns nb_campaigns_opened time_to_open\n", + "0 2 4 0.0 NaT\n", + "1 3 222 124.0 1 days 00:28:30.169354838\n", + "2 4 7 7.0 1 days 04:31:01.428571428\n", + "3 5 4 0.0 NaT\n", + "4 6 20 0.0 NaT" + ] + }, + "execution_count": 20, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df1_campaigns_kpi.head()" + ] }, { "cell_type": "markdown", "id": "d4dcfbe0-c6ce-497e-b75e-dc9e938801b2", "metadata": {}, "source": [ - "### KPI tickets" + "## KPI tickets" ] }, { @@ -1978,7 +1985,7 @@ }, { "cell_type": "code", - "execution_count": 33, + "execution_count": 78, "id": "3d8b0875-b409-44ce-b688-d9d6758782d3", "metadata": {}, "outputs": [ @@ -2105,177 +2112,48 @@ " l'école des magiciens\n", " 2018\n", " \n", - " \n", - " ...\n", - " ...\n", - " ...\n", - " ...\n", - " ...\n", - " ...\n", - " ...\n", - " ...\n", - " ...\n", - " ...\n", - " ...\n", - " ...\n", - " ...\n", - " ...\n", - " ...\n", - " \n", - " \n", - " 1826667\n", - " 18643494\n", - " 81\n", - " 4\n", - " vad\n", - " 2022-08-02 12:18:16+00:00\n", - " Billet en nombre\n", - " 11.0\n", - " pricing_formula\n", - " False\n", - " spectacle vivant\n", - " mucem\n", - " en nb entrées tr\n", - " NaN\n", - " 2022\n", - " \n", - " \n", - " 1826668\n", - " 18643495\n", - " 81\n", - " 4\n", - " vad\n", - " 2022-08-02 12:18:16+00:00\n", - " Billet en nombre\n", - " 11.0\n", - " pricing_formula\n", - " False\n", - " spectacle vivant\n", - " mucem\n", - " en nb entrées tr\n", - " NaN\n", - " 2022\n", - " \n", - " \n", - " 1826669\n", - " 18643496\n", - " 81\n", - " 4\n", - " vad\n", - " 2022-08-02 12:18:16+00:00\n", - " Billet en nombre\n", - " 11.0\n", - " pricing_formula\n", - " False\n", - " spectacle vivant\n", - " mucem\n", - " en nb entrées tr\n", - " NaN\n", - " 2022\n", - " \n", - " \n", - " 1826670\n", - " 18643497\n", - " 81\n", - " 4\n", - " vad\n", - " 2022-08-02 12:18:16+00:00\n", - " Billet en nombre\n", - " 11.0\n", - " pricing_formula\n", - " False\n", - " spectacle vivant\n", - " mucem\n", - " en nb entrées tr\n", - " NaN\n", - " 2022\n", - " \n", - " \n", - " 1826671\n", - " 19853111\n", - " 62763\n", - " 4\n", - " vad\n", - " 2022-11-04 14:25:42+00:00\n", - " Billet en nombre\n", - " 0.0\n", - " pricing_formula\n", - " False\n", - " spectacle vivant\n", - " mucem\n", - " indiv entrées gr\n", - " NaN\n", - " 2022\n", - " \n", " \n", "\n", - "

1826672 rows × 14 columns

\n", "" ], "text/plain": [ - " ticket_id customer_id event_type_id supplier_name \\\n", - "0 13070859 48187 4 vente en ligne \n", - "1 13070855 48187 4 vente en ligne \n", - "2 13070856 48187 4 vente en ligne \n", - "3 13070857 48187 4 vente en ligne \n", - "4 13070858 48187 4 vente en ligne \n", - "... ... ... ... ... \n", - "1826667 18643494 81 4 vad \n", - "1826668 18643495 81 4 vad \n", - "1826669 18643496 81 4 vad \n", - "1826670 18643497 81 4 vad \n", - "1826671 19853111 62763 4 vad \n", + " ticket_id customer_id event_type_id supplier_name \\\n", + "0 13070859 48187 4 vente en ligne \n", + "1 13070855 48187 4 vente en ligne \n", + "2 13070856 48187 4 vente en ligne \n", + "3 13070857 48187 4 vente en ligne \n", + "4 13070858 48187 4 vente en ligne \n", "\n", - " purchase_date type_of_ticket_name amount \\\n", - "0 2018-12-28 14:47:50+00:00 Atelier 8.0 \n", - "1 2018-12-28 14:47:50+00:00 Atelier 8.0 \n", - "2 2018-12-28 14:47:50+00:00 Atelier 8.0 \n", - "3 2018-12-28 14:47:50+00:00 Atelier 8.0 \n", - "4 2018-12-28 14:47:50+00:00 Atelier 8.0 \n", - "... ... ... ... \n", - "1826667 2022-08-02 12:18:16+00:00 Billet en nombre 11.0 \n", - "1826668 2022-08-02 12:18:16+00:00 Billet en nombre 11.0 \n", - "1826669 2022-08-02 12:18:16+00:00 Billet en nombre 11.0 \n", - "1826670 2022-08-02 12:18:16+00:00 Billet en nombre 11.0 \n", - "1826671 2022-11-04 14:25:42+00:00 Billet en nombre 0.0 \n", + " purchase_date type_of_ticket_name amount children \\\n", + "0 2018-12-28 14:47:50+00:00 Atelier 8.0 pricing_formula \n", + "1 2018-12-28 14:47:50+00:00 Atelier 8.0 pricing_formula \n", + "2 2018-12-28 14:47:50+00:00 Atelier 8.0 pricing_formula \n", + "3 2018-12-28 14:47:50+00:00 Atelier 8.0 pricing_formula \n", + "4 2018-12-28 14:47:50+00:00 Atelier 8.0 pricing_formula \n", "\n", - " children is_full_price name_event_types name_facilities \\\n", - "0 pricing_formula False spectacle vivant mucem \n", - "1 pricing_formula False spectacle vivant mucem \n", - "2 pricing_formula False spectacle vivant mucem \n", - "3 pricing_formula False spectacle vivant mucem \n", - "4 pricing_formula False spectacle vivant mucem \n", - "... ... ... ... ... \n", - "1826667 pricing_formula False spectacle vivant mucem \n", - "1826668 pricing_formula False spectacle vivant mucem \n", - "1826669 pricing_formula False spectacle vivant mucem \n", - "1826670 pricing_formula False spectacle vivant mucem \n", - "1826671 pricing_formula False spectacle vivant mucem \n", + " is_full_price name_event_types name_facilities name_categories \\\n", + "0 False spectacle vivant mucem indiv prog enfant \n", + "1 False spectacle vivant mucem indiv prog enfant \n", + "2 False spectacle vivant mucem indiv prog enfant \n", + "3 False spectacle vivant mucem indiv prog enfant \n", + "4 False spectacle vivant mucem indiv prog enfant \n", "\n", - " name_categories name_events name_seasons \n", - "0 indiv prog enfant l'école des magiciens 2018 \n", - "1 indiv prog enfant l'école des magiciens 2018 \n", - "2 indiv prog enfant l'école des magiciens 2018 \n", - "3 indiv prog enfant l'école des magiciens 2018 \n", - "4 indiv prog enfant l'école des magiciens 2018 \n", - "... ... ... ... \n", - "1826667 en nb entrées tr NaN 2022 \n", - "1826668 en nb entrées tr NaN 2022 \n", - "1826669 en nb entrées tr NaN 2022 \n", - "1826670 en nb entrées tr NaN 2022 \n", - "1826671 indiv entrées gr NaN 2022 \n", - "\n", - "[1826672 rows x 14 columns]" + " name_events name_seasons \n", + "0 l'école des magiciens 2018 \n", + "1 l'école des magiciens 2018 \n", + "2 l'école des magiciens 2018 \n", + "3 l'école des magiciens 2018 \n", + "4 l'école des magiciens 2018 " ] }, - "execution_count": 33, + "execution_count": 78, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Importance des suppliers\n", - "df1_products_purchased_reduced" + "df1_products_purchased_reduced.head()" ] }, { @@ -2291,7 +2169,7 @@ }, { "cell_type": "code", - "execution_count": 74, + "execution_count": 82, "id": "043303fe-e90f-4689-a2a9-5d690555a045", "metadata": {}, "outputs": [], @@ -2305,11 +2183,11 @@ " tickets_information_copy['vente_internet'] = tickets_information_copy['supplier_name'].str.contains('|'.join(liste_mots), case=False).astype(int)\n", "\n", " # Proportion de vente en ligne\n", - " prop_vente_internet = tickets_information_copy[tickets_information_copy['vente_internet'] == 1].groupby('customer_id')['ticket_id'].count().reset_index()\n", + " prop_vente_internet = tickets_information_copy[tickets_information_copy['vente_internet'] == 1].groupby(['customer_id', 'event_type_id'])['ticket_id'].count().reset_index()\n", " prop_vente_internet.rename(columns = {'ticket_id' : 'nb_tickets_internet'}, inplace = True)\n", " \n", " tickets_kpi = (tickets_information_copy[['event_type_id', 'customer_id', 'ticket_id','supplier_name', 'purchase_date', 'amount', 'vente_internet']]\n", - " .groupby([ 'customer_id']) # 'event_type_id',\n", + " .groupby(['customer_id', 'event_type_id']) \n", " .agg({'ticket_id': 'count', \n", " 'amount' : 'sum',\n", " 'supplier_name': 'nunique',\n", @@ -2322,11 +2200,12 @@ " tickets_kpi.rename(columns = {'ticket_id_count' : 'nb_tickets', \n", " 'amount_sum' : 'total_amount',\n", " 'supplier_name_nunique' : 'nb_suppliers', \n", - " 'customer_id_' : 'customer_id'}, inplace = True)\n", + " 'customer_id_' : 'customer_id',\n", + " 'event_type_id_' : 'event_type_id'}, inplace = True)\n", " \n", " tickets_kpi['time_between_purchase'] = tickets_kpi['purchase_date_max'] - tickets_kpi['purchase_date_min']\n", "\n", - " tickets_kpi = tickets_kpi.merge(prop_vente_internet, on = 'customer_id', how = 'left')\n", + " tickets_kpi = tickets_kpi.merge(prop_vente_internet, on = ['customer_id', 'event_type_id'], how = 'left')\n", " tickets_kpi['nb_tickets_internet'] = tickets_kpi['nb_tickets_internet'].fillna(0)\n", " \n", " return tickets_kpi\n", @@ -2335,7 +2214,7 @@ }, { "cell_type": "code", - "execution_count": 75, + "execution_count": 83, "id": "5882234a-1ed5-4269-87a6-0d75613476e3", "metadata": {}, "outputs": [], @@ -2343,9 +2222,33 @@ "df1_tickets_kpi = tickets_kpi_function(tickets_information = df1_products_purchased_reduced)" ] }, + { + "cell_type": "markdown", + "id": "597b241e-a83d-4b7c-8ad7-eec50295dff2", + "metadata": {}, + "source": [ + "#### Exportation" + ] + }, { "cell_type": "code", - "execution_count": 77, + "execution_count": 92, + "id": "a4a2311d-8a72-4030-afd5-218004d5d2a5", + "metadata": {}, + "outputs": [], + "source": [ + "# Exportation vers 'projet-bdc2324-team1'\n", + "BUCKET_OUT = \"projet-bdc2324-team1\"\n", + "FILE_KEY_OUT_S3 = \"0_Temp/Company 1 - Purchasing behaviour.csv\"\n", + "FILE_PATH_OUT_S3 = BUCKET_OUT + \"/\" + FILE_KEY_OUT_S3\n", + "\n", + "with fs.open(FILE_PATH_OUT_S3, 'w') as file_out:\n", + " df1_tickets_kpi.to_csv(file_out)" + ] + }, + { + "cell_type": "code", + "execution_count": 84, "id": "a7a452a6-cd5e-4c8b-b250-8a7d26e48fad", "metadata": {}, "outputs": [ @@ -2371,6 +2274,7 @@ " \n", " \n", " customer_id\n", + " event_type_id\n", " nb_tickets\n", " total_amount\n", " nb_suppliers\n", @@ -2383,474 +2287,104 @@ " \n", " \n", " \n", + " 1\n", + " 1\n", + " 4\n", + " 453242\n", + " 3248965.5\n", + " 6\n", + " 1\n", + " 2013-09-23 14:45:01+00:00\n", + " 2023-11-03 14:11:01+00:00\n", + " 3692 days 23:26:00\n", + " 2988.0\n", + " \n", + " \n", " 0\n", " 1\n", - " 1256574\n", - " 8830567.5\n", + " 2\n", + " 384226\n", + " 2686540.5\n", " 7\n", " 1\n", + " 2014-12-03 14:55:37+00:00\n", + " 2023-11-04 15:12:16+00:00\n", + " 3258 days 00:16:39\n", + " 51.0\n", + " \n", + " \n", + " 3\n", + " 1\n", + " 6\n", + " 217356\n", + " 1435871.5\n", + " 5\n", + " 1\n", + " 2017-01-01 02:20:08+00:00\n", + " 2019-12-31 02:20:06+00:00\n", + " 1093 days 23:59:58\n", + " 5.0\n", + " \n", + " \n", + " 2\n", + " 1\n", + " 5\n", + " 201750\n", + " 1459190.0\n", + " 6\n", + " 1\n", " 2013-06-10 10:37:58+00:00\n", " 2023-11-08 15:59:45+00:00\n", " 3803 days 05:21:47\n", - " 3053.0\n", + " 9.0\n", " \n", " \n", - " 3615\n", + " 5032\n", " 6733\n", - " 35527\n", - " 1188.0\n", - " 4\n", - " 1\n", - " 2015-09-09 13:48:38+00:00\n", - " 2023-11-03 09:42:40+00:00\n", - " 2976 days 19:54:02\n", - " 30896.0\n", - " \n", - " \n", - " 39\n", - " 41\n", - " 16263\n", - " 37642.0\n", " 6\n", - " 1\n", - " 2014-01-23 16:56:57+00:00\n", - " 2023-10-25 09:13:16+00:00\n", - " 3561 days 16:16:19\n", - " 13993.0\n", - " \n", - " \n", - " 11\n", - " 12\n", - " 5871\n", - " 38767.0\n", - " 2\n", - " 1\n", - " 2018-04-04 07:46:31+00:00\n", - " 2023-11-04 13:46:59+00:00\n", - " 2040 days 06:00:28\n", - " 167.0\n", - " \n", - " \n", - " 32809\n", - " 63488\n", - " 5851\n", - " 64350.0\n", - " 1\n", - " 1\n", - " 2020-08-18 08:32:57+00:00\n", - " 2022-08-25 13:08:38+00:00\n", - " 737 days 04:35:41\n", - " 5851.0\n", - " \n", - " \n", - " 3708\n", - " 6916\n", - " 5482\n", - " 51489.5\n", - " 2\n", - " 1\n", - " 2018-03-26 11:13:43+00:00\n", - " 2021-08-26 12:49:17+00:00\n", - " 1249 days 01:35:34\n", - " 5481.0\n", - " \n", - " \n", - " 32616\n", - " 63194\n", - " 4507\n", - " 13232.0\n", + " 14208\n", + " 0.0\n", " 3\n", " 1\n", - " 2017-11-28 13:52:15+00:00\n", - " 2022-09-07 12:55:33+00:00\n", - " 1743 days 23:03:18\n", - " 826.0\n", - " \n", - " \n", - " 78\n", - " 81\n", - " 3562\n", - " 38746.0\n", - " 1\n", - " 1\n", - " 2017-01-05 13:04:58+00:00\n", - " 2022-08-30 11:51:34+00:00\n", - " 2062 days 22:46:36\n", - " 3562.0\n", - " \n", - " \n", - " 35295\n", - " 84002\n", - " 3403\n", - " 19830.0\n", - " 4\n", - " 1\n", - " 2021-05-28 10:22:33+00:00\n", - " 2023-11-06 15:59:22+00:00\n", - " 892 days 05:36:49\n", - " 869.0\n", - " \n", - " \n", - " 3377\n", - " 5618\n", - " 3294\n", - " 31684.5\n", - " 1\n", - " 1\n", - " 2018-10-25 11:04:24+00:00\n", - " 2022-02-24 07:47:20+00:00\n", - " 1217 days 20:42:56\n", - " 3294.0\n", - " \n", - " \n", - " 30011\n", - " 59259\n", - " 2591\n", - " 4350.0\n", - " 3\n", - " 1\n", - " 2019-11-25 08:52:48+00:00\n", - " 2023-06-12 14:05:19+00:00\n", - " 1295 days 05:12:31\n", - " 52.0\n", - " \n", - " \n", - " 34937\n", - " 74876\n", - " 2571\n", - " 2600.0\n", - " 2\n", - " 1\n", - " 2018-02-08 12:54:01+00:00\n", - " 2023-10-02 08:13:05+00:00\n", - " 2061 days 19:19:04\n", - " 448.0\n", - " \n", - " \n", - " 270\n", - " 295\n", - " 2570\n", - " 17678.5\n", - " 6\n", - " 1\n", - " 2014-01-24 15:16:17+00:00\n", - " 2023-10-16 10:19:22+00:00\n", - " 3551 days 19:03:05\n", - " 1479.0\n", - " \n", - " \n", - " 866\n", - " 1221\n", - " 2320\n", - " 9652.0\n", - " 2\n", - " 1\n", - " 2017-03-29 08:00:09+00:00\n", - " 2022-09-19 12:55:15+00:00\n", - " 2000 days 04:55:06\n", - " 104.0\n", - " \n", - " \n", - " 1022\n", - " 1429\n", - " 2249\n", - " 3500.0\n", - " 4\n", - " 1\n", - " 2014-12-03 14:56:38+00:00\n", - " 2023-11-06 08:30:37+00:00\n", - " 3259 days 17:33:59\n", - " 690.0\n", - " \n", - " \n", - " 3922\n", - " 7249\n", - " 1827\n", - " 13385.0\n", - " 1\n", - " 1\n", - " 2019-05-07 12:34:56+00:00\n", - " 2021-10-26 12:28:40+00:00\n", - " 902 days 23:53:44\n", - " 1827.0\n", - " \n", - " \n", - " 54425\n", - " 1070539\n", - " 1800\n", - " 19800.0\n", - " 1\n", - " 1\n", - " 2022-05-02 16:09:03+00:00\n", - " 2022-07-25 12:49:27+00:00\n", - " 83 days 20:40:24\n", - " 1800.0\n", - " \n", - " \n", - " 69520\n", - " 1216801\n", - " 1623\n", - " 12562.0\n", - " 2\n", - " 0\n", - " 2023-06-16 14:16:04+00:00\n", - " 2023-09-29 16:34:38+00:00\n", - " 105 days 02:18:34\n", - " 0.0\n", - " \n", - " \n", - " 30056\n", - " 59330\n", - " 1551\n", - " 0.0\n", - " 1\n", - " 0\n", - " 2018-02-02 08:53:51+00:00\n", - " 2023-11-06 10:22:14+00:00\n", - " 2103 days 01:28:23\n", - " 0.0\n", - " \n", - " \n", - " 3243\n", - " 5441\n", - " 1544\n", - " 14133.0\n", - " 2\n", - " 1\n", - " 2017-12-14 12:50:23+00:00\n", - " 2022-09-22 08:21:47+00:00\n", - " 1742 days 19:31:24\n", - " 1384.0\n", - " \n", - " \n", - " 55195\n", - " 1084435\n", - " 1500\n", - " 16500.0\n", - " 1\n", - " 1\n", - " 2022-05-18 08:04:41+00:00\n", - " 2022-09-27 14:32:13+00:00\n", - " 132 days 06:27:32\n", - " 1500.0\n", - " \n", - " \n", - " 28983\n", - " 57816\n", - " 1485\n", - " 0.0\n", - " 2\n", - " 1\n", - " 2019-01-21 14:19:18+00:00\n", - " 2023-05-22 07:30:55+00:00\n", - " 1581 days 17:11:37\n", - " 357.0\n", - " \n", - " \n", - " 2231\n", - " 2942\n", - " 1307\n", - " 100.0\n", - " 2\n", - " 1\n", - " 2017-10-25 15:06:58+00:00\n", - " 2023-06-29 09:33:58+00:00\n", - " 2072 days 18:27:00\n", - " 676.0\n", - " \n", - " \n", - " 23\n", - " 24\n", - " 1266\n", - " 0.0\n", - " 2\n", - " 1\n", - " 2015-09-30 16:07:52+00:00\n", - " 2023-10-19 07:20:48+00:00\n", - " 2940 days 15:12:56\n", - " 556.0\n", - " \n", - " \n", - " 4513\n", - " 9592\n", - " 1211\n", - " 62.0\n", - " 4\n", - " 1\n", - " 2018-02-25 07:17:19+00:00\n", - " 2023-10-17 09:39:40+00:00\n", - " 2060 days 02:22:21\n", - " 353.0\n", - " \n", - " \n", - " 2936\n", - " 5059\n", - " 1186\n", - " 6308.0\n", - " 3\n", - " 1\n", - " 2018-02-01 11:16:51+00:00\n", - " 2023-05-22 13:41:22+00:00\n", - " 1936 days 02:24:31\n", - " 1182.0\n", - " \n", - " \n", - " 11484\n", - " 25100\n", - " 1123\n", - " 0.0\n", - " 1\n", - " 1\n", - " 2015-12-21 15:38:05+00:00\n", - " 2021-07-13 07:39:57+00:00\n", - " 2030 days 16:01:52\n", - " 1123.0\n", - " \n", - " \n", - " 934\n", - " 1326\n", - " 1098\n", - " 798.0\n", - " 3\n", - " 1\n", - " 2018-02-13 13:13:48+00:00\n", - " 2023-02-01 08:39:45+00:00\n", - " 1813 days 19:25:57\n", - " 266.0\n", - " \n", - " \n", - " 30156\n", - " 59490\n", - " 1088\n", - " 0.0\n", - " 1\n", - " 0\n", - " 2019-12-06 12:59:20+00:00\n", - " 2023-10-05 08:23:50+00:00\n", - " 1398 days 19:24:30\n", - " 0.0\n", - " \n", - " \n", - " 36478\n", - " 251268\n", - " 1086\n", - " 0.0\n", - " 2\n", - " 1\n", - " 2018-02-02 09:06:22+00:00\n", - " 2023-06-30 07:22:46+00:00\n", - " 1973 days 22:16:24\n", - " 279.0\n", + " 2017-01-11 15:00:54+00:00\n", + " 2019-11-27 09:47:06+00:00\n", + " 1049 days 18:46:12\n", + " 13497.0\n", " \n", " \n", "\n", "" ], "text/plain": [ - " customer_id nb_tickets total_amount nb_suppliers \\\n", - "0 1 1256574 8830567.5 7 \n", - "3615 6733 35527 1188.0 4 \n", - "39 41 16263 37642.0 6 \n", - "11 12 5871 38767.0 2 \n", - "32809 63488 5851 64350.0 1 \n", - "3708 6916 5482 51489.5 2 \n", - "32616 63194 4507 13232.0 3 \n", - "78 81 3562 38746.0 1 \n", - "35295 84002 3403 19830.0 4 \n", - "3377 5618 3294 31684.5 1 \n", - "30011 59259 2591 4350.0 3 \n", - "34937 74876 2571 2600.0 2 \n", - "270 295 2570 17678.5 6 \n", - "866 1221 2320 9652.0 2 \n", - "1022 1429 2249 3500.0 4 \n", - "3922 7249 1827 13385.0 1 \n", - "54425 1070539 1800 19800.0 1 \n", - "69520 1216801 1623 12562.0 2 \n", - "30056 59330 1551 0.0 1 \n", - "3243 5441 1544 14133.0 2 \n", - "55195 1084435 1500 16500.0 1 \n", - "28983 57816 1485 0.0 2 \n", - "2231 2942 1307 100.0 2 \n", - "23 24 1266 0.0 2 \n", - "4513 9592 1211 62.0 4 \n", - "2936 5059 1186 6308.0 3 \n", - "11484 25100 1123 0.0 1 \n", - "934 1326 1098 798.0 3 \n", - "30156 59490 1088 0.0 1 \n", - "36478 251268 1086 0.0 2 \n", + " customer_id event_type_id nb_tickets total_amount nb_suppliers \\\n", + "1 1 4 453242 3248965.5 6 \n", + "0 1 2 384226 2686540.5 7 \n", + "3 1 6 217356 1435871.5 5 \n", + "2 1 5 201750 1459190.0 6 \n", + "5032 6733 6 14208 0.0 3 \n", "\n", - " vente_internet_max purchase_date_min purchase_date_max \\\n", - "0 1 2013-06-10 10:37:58+00:00 2023-11-08 15:59:45+00:00 \n", - "3615 1 2015-09-09 13:48:38+00:00 2023-11-03 09:42:40+00:00 \n", - "39 1 2014-01-23 16:56:57+00:00 2023-10-25 09:13:16+00:00 \n", - "11 1 2018-04-04 07:46:31+00:00 2023-11-04 13:46:59+00:00 \n", - "32809 1 2020-08-18 08:32:57+00:00 2022-08-25 13:08:38+00:00 \n", - "3708 1 2018-03-26 11:13:43+00:00 2021-08-26 12:49:17+00:00 \n", - "32616 1 2017-11-28 13:52:15+00:00 2022-09-07 12:55:33+00:00 \n", - "78 1 2017-01-05 13:04:58+00:00 2022-08-30 11:51:34+00:00 \n", - "35295 1 2021-05-28 10:22:33+00:00 2023-11-06 15:59:22+00:00 \n", - "3377 1 2018-10-25 11:04:24+00:00 2022-02-24 07:47:20+00:00 \n", - "30011 1 2019-11-25 08:52:48+00:00 2023-06-12 14:05:19+00:00 \n", - "34937 1 2018-02-08 12:54:01+00:00 2023-10-02 08:13:05+00:00 \n", - "270 1 2014-01-24 15:16:17+00:00 2023-10-16 10:19:22+00:00 \n", - "866 1 2017-03-29 08:00:09+00:00 2022-09-19 12:55:15+00:00 \n", - "1022 1 2014-12-03 14:56:38+00:00 2023-11-06 08:30:37+00:00 \n", - "3922 1 2019-05-07 12:34:56+00:00 2021-10-26 12:28:40+00:00 \n", - "54425 1 2022-05-02 16:09:03+00:00 2022-07-25 12:49:27+00:00 \n", - "69520 0 2023-06-16 14:16:04+00:00 2023-09-29 16:34:38+00:00 \n", - "30056 0 2018-02-02 08:53:51+00:00 2023-11-06 10:22:14+00:00 \n", - "3243 1 2017-12-14 12:50:23+00:00 2022-09-22 08:21:47+00:00 \n", - "55195 1 2022-05-18 08:04:41+00:00 2022-09-27 14:32:13+00:00 \n", - "28983 1 2019-01-21 14:19:18+00:00 2023-05-22 07:30:55+00:00 \n", - "2231 1 2017-10-25 15:06:58+00:00 2023-06-29 09:33:58+00:00 \n", - "23 1 2015-09-30 16:07:52+00:00 2023-10-19 07:20:48+00:00 \n", - "4513 1 2018-02-25 07:17:19+00:00 2023-10-17 09:39:40+00:00 \n", - "2936 1 2018-02-01 11:16:51+00:00 2023-05-22 13:41:22+00:00 \n", - "11484 1 2015-12-21 15:38:05+00:00 2021-07-13 07:39:57+00:00 \n", - "934 1 2018-02-13 13:13:48+00:00 2023-02-01 08:39:45+00:00 \n", - "30156 0 2019-12-06 12:59:20+00:00 2023-10-05 08:23:50+00:00 \n", - "36478 1 2018-02-02 09:06:22+00:00 2023-06-30 07:22:46+00:00 \n", + " vente_internet_max purchase_date_min purchase_date_max \\\n", + "1 1 2013-09-23 14:45:01+00:00 2023-11-03 14:11:01+00:00 \n", + "0 1 2014-12-03 14:55:37+00:00 2023-11-04 15:12:16+00:00 \n", + "3 1 2017-01-01 02:20:08+00:00 2019-12-31 02:20:06+00:00 \n", + "2 1 2013-06-10 10:37:58+00:00 2023-11-08 15:59:45+00:00 \n", + "5032 1 2017-01-11 15:00:54+00:00 2019-11-27 09:47:06+00:00 \n", "\n", - " time_between_purchase nb_tickets_internet \n", - "0 3803 days 05:21:47 3053.0 \n", - "3615 2976 days 19:54:02 30896.0 \n", - "39 3561 days 16:16:19 13993.0 \n", - "11 2040 days 06:00:28 167.0 \n", - "32809 737 days 04:35:41 5851.0 \n", - "3708 1249 days 01:35:34 5481.0 \n", - "32616 1743 days 23:03:18 826.0 \n", - "78 2062 days 22:46:36 3562.0 \n", - "35295 892 days 05:36:49 869.0 \n", - "3377 1217 days 20:42:56 3294.0 \n", - "30011 1295 days 05:12:31 52.0 \n", - "34937 2061 days 19:19:04 448.0 \n", - "270 3551 days 19:03:05 1479.0 \n", - "866 2000 days 04:55:06 104.0 \n", - "1022 3259 days 17:33:59 690.0 \n", - "3922 902 days 23:53:44 1827.0 \n", - "54425 83 days 20:40:24 1800.0 \n", - "69520 105 days 02:18:34 0.0 \n", - "30056 2103 days 01:28:23 0.0 \n", - "3243 1742 days 19:31:24 1384.0 \n", - "55195 132 days 06:27:32 1500.0 \n", - "28983 1581 days 17:11:37 357.0 \n", - "2231 2072 days 18:27:00 676.0 \n", - "23 2940 days 15:12:56 556.0 \n", - "4513 2060 days 02:22:21 353.0 \n", - "2936 1936 days 02:24:31 1182.0 \n", - "11484 2030 days 16:01:52 1123.0 \n", - "934 1813 days 19:25:57 266.0 \n", - "30156 1398 days 19:24:30 0.0 \n", - "36478 1973 days 22:16:24 279.0 " + " time_between_purchase nb_tickets_internet \n", + "1 3692 days 23:26:00 2988.0 \n", + "0 3258 days 00:16:39 51.0 \n", + "3 1093 days 23:59:58 5.0 \n", + "2 3803 days 05:21:47 9.0 \n", + "5032 1049 days 18:46:12 13497.0 " ] }, - "execution_count": 77, + "execution_count": 84, "metadata": {}, "output_type": "execute_result" } ], "source": [ - "df1_tickets_kpi.sort_values(by='nb_tickets', ascending=False).head(30)" + "df1_tickets_kpi.sort_values(by='nb_tickets', ascending=False).head(5)" ] }, { @@ -2882,19 +2416,6 @@ "df1_customer.head()" ] }, - { - "cell_type": "code", - "execution_count": null, - "id": "b5c4418c-ad2e-4bb9-bd5c-3b769e9c87d4", - "metadata": {}, - "outputs": [], - "source": [ - "pd.set_option('display.max_columns', None)\n", - "\n", - "\n", - "df1_customer[df1_customer['customer_id'] == 1]" - ] - }, { "cell_type": "code", "execution_count": null, From 071b9fda3685ce03e6cbceff0203da35733e20b0 Mon Sep 17 00:00:00 2001 From: ajoubrel-ensae Date: Sat, 10 Feb 2024 18:43:11 +0000 Subject: [PATCH 5/8] Modification de l'exportation --- 0_Cleaning_and_merge.ipynb | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/0_Cleaning_and_merge.ipynb b/0_Cleaning_and_merge.ipynb index 89910f7..68efb55 100644 --- a/0_Cleaning_and_merge.ipynb +++ b/0_Cleaning_and_merge.ipynb @@ -2232,7 +2232,7 @@ }, { "cell_type": "code", - "execution_count": 92, + "execution_count": 94, "id": "a4a2311d-8a72-4030-afd5-218004d5d2a5", "metadata": {}, "outputs": [], @@ -2243,7 +2243,7 @@ "FILE_PATH_OUT_S3 = BUCKET_OUT + \"/\" + FILE_KEY_OUT_S3\n", "\n", "with fs.open(FILE_PATH_OUT_S3, 'w') as file_out:\n", - " df1_tickets_kpi.to_csv(file_out)" + " df1_tickets_kpi.to_csv(file_out, index = False)" ] }, { From 5f02915ae99daeee25f8aba304c85f3f87d6775b Mon Sep 17 00:00:00 2001 From: ajoubrel-ensae Date: Sat, 10 Feb 2024 18:53:59 +0000 Subject: [PATCH 6/8] Ajout travail Alexis --- 0_Cleaning_and_merge.ipynb | 766 +++++++++++++++++++++++++++++++++++++ 1 file changed, 766 insertions(+) diff --git a/0_Cleaning_and_merge.ipynb b/0_Cleaning_and_merge.ipynb index 68efb55..be348ed 100644 --- a/0_Cleaning_and_merge.ipynb +++ b/0_Cleaning_and_merge.ipynb @@ -1759,6 +1759,772 @@ "# Construction des variables explicatives" ] }, + { + "cell_type": "markdown", + "id": "b09c2964-bef9-489e-ad71-84959054531b", + "metadata": {}, + "source": [ + "## Alexis' work" + ] + }, + { + "cell_type": "code", + "execution_count": 142, + "id": "4ab1c0d2-0097-4669-b984-b6822c976740", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
event_type_idavg_amount
026.150659
147.762474
254.452618
366.439463
\n", + "
" + ], + "text/plain": [ + " event_type_id avg_amount\n", + "0 2 6.150659\n", + "1 4 7.762474\n", + "2 5 4.452618\n", + "3 6 6.439463" + ] + }, + "execution_count": 142, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "avg_amount = (df1_products_purchased_reduced.groupby([\"event_type_id\"])\n", + " .agg({\"amount\" : \"mean\"}).reset_index()\n", + " .rename(columns = {'amount' : 'avg_amount'}))\n", + "\n", + "avg_amount" + ] + }, + { + "cell_type": "code", + "execution_count": 143, + "id": "a9c62b39-389e-4dac-89a6-ac8a59fea58a", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
customer_idevent_type_idnb_ticketsavg_amount
0123842266.150659
1144532427.762474
2152017504.452618
3162173566.439463
4221436.150659
\n", + "
" + ], + "text/plain": [ + " customer_id event_type_id nb_tickets avg_amount\n", + "0 1 2 384226 6.150659\n", + "1 1 4 453242 7.762474\n", + "2 1 5 201750 4.452618\n", + "3 1 6 217356 6.439463\n", + "4 2 2 143 6.150659" + ] + }, + "execution_count": 143, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "nb_tickets = (df1_products_purchased_reduced.groupby([\"customer_id\", \"event_type_id\"])\n", + " .agg({\"ticket_id\" : \"count\"}).reset_index()\n", + " .rename(columns = {'ticket_id' : 'nb_tickets'})\n", + " .merge(avg_amount, how='left', on='event_type_id'))\n", + "nb_tickets.head()" + ] + }, + { + "cell_type": "code", + "execution_count": 144, + "id": "8710611c-7eb8-45ca-bdcc-009f4081f9e2", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
customer_idbirthdatestreet_idis_partnergenderis_email_trueopt_instructure_idprofessionlanguagemcp_contact_idlast_buying_datemax_priceticket_sumaverage_pricefidelityaverage_purchase_delayaverage_price_basketaverage_ticket_baskettotal_pricepurchase_countfirst_buying_datecountryagetenant_idnb_campaignsnb_campaigns_openedtime_to_open
012751NaN2False1TrueTrueNaNNaNNaNNaNNaNNaN00.00NaNNaNNaNNaN0NaTfrNaN1311NaNNaNNaT
112825NaN2False2TrueTrueNaNNaNNaNNaNNaNNaN00.00NaNNaNNaNNaN0NaTfrNaN1311NaNNaNNaT
211261NaN2False1TrueTrueNaNNaNNaNNaNNaNNaN00.00NaNNaNNaNNaN0NaTfrNaN1311NaNNaNNaT
313071NaN2False2TrueTrueNaNNaNNaNNaNNaNNaN00.00NaNNaNNaNNaN0NaTfrNaN1311NaNNaNNaT
4653061NaN10False2TrueFalseNaNNaNNaNNaNNaNNaN00.00NaNNaNNaNNaN0NaTNaNNaN131180.02.00 days 19:53:02.500000
\n", + "
" + ], + "text/plain": [ + " customer_id birthdate street_id is_partner gender is_email_true \\\n", + "0 12751 NaN 2 False 1 True \n", + "1 12825 NaN 2 False 2 True \n", + "2 11261 NaN 2 False 1 True \n", + "3 13071 NaN 2 False 2 True \n", + "4 653061 NaN 10 False 2 True \n", + "\n", + " opt_in structure_id profession language mcp_contact_id last_buying_date \\\n", + "0 True NaN NaN NaN NaN NaN \n", + "1 True NaN NaN NaN NaN NaN \n", + "2 True NaN NaN NaN NaN NaN \n", + "3 True NaN NaN NaN NaN NaN \n", + "4 False NaN NaN NaN NaN NaN \n", + "\n", + " max_price ticket_sum average_price fidelity average_purchase_delay \\\n", + "0 NaN 0 0.0 0 NaN \n", + "1 NaN 0 0.0 0 NaN \n", + "2 NaN 0 0.0 0 NaN \n", + "3 NaN 0 0.0 0 NaN \n", + "4 NaN 0 0.0 0 NaN \n", + "\n", + " average_price_basket average_ticket_basket total_price purchase_count \\\n", + "0 NaN NaN NaN 0 \n", + "1 NaN NaN NaN 0 \n", + "2 NaN NaN NaN 0 \n", + "3 NaN NaN NaN 0 \n", + "4 NaN NaN NaN 0 \n", + "\n", + " first_buying_date country age tenant_id nb_campaigns \\\n", + "0 NaT fr NaN 1311 NaN \n", + "1 NaT fr NaN 1311 NaN \n", + "2 NaT fr NaN 1311 NaN \n", + "3 NaT fr NaN 1311 NaN \n", + "4 NaT NaN NaN 1311 80.0 \n", + "\n", + " nb_campaigns_opened time_to_open \n", + "0 NaN NaT \n", + "1 NaN NaT \n", + "2 NaN NaT \n", + "3 NaN NaT \n", + "4 2.0 0 days 19:53:02.500000 " + ] + }, + "execution_count": 144, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# Fusion avec KPI campaigns liés au customer\n", + "df1_customer = pd.merge(df1_customerplus_clean, df1_campaigns_kpi, on = 'customer_id', how = 'left')\n", + "df1_customer.head()" + ] + }, + { + "cell_type": "code", + "execution_count": 146, + "id": "a89fad43-ee68-4081-9384-3e9f08ec6a59", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "shape : (156289, 31)\n" + ] + }, + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
customer_idbirthdatestreet_idis_partnergenderis_email_trueopt_instructure_idprofessionlanguagemcp_contact_idlast_buying_datemax_priceticket_sumaverage_pricefidelityaverage_purchase_delayaverage_price_basketaverage_ticket_baskettotal_pricepurchase_countfirst_buying_datecountryagetenant_idnb_campaignsnb_campaigns_openedtime_to_openevent_type_idnb_ticketsavg_amount
012751NaN2False1TrueTrueNaNNaNNaNNaNNaNNaN00.00NaNNaNNaNNaN0NaTfrNaN1311NaNNaNNaTNaNNaNNaN
112825NaN2False2TrueTrueNaNNaNNaNNaNNaNNaN00.00NaNNaNNaNNaN0NaTfrNaN1311NaNNaNNaTNaNNaNNaN
211261NaN2False1TrueTrueNaNNaNNaNNaNNaNNaN00.00NaNNaNNaNNaN0NaTfrNaN1311NaNNaNNaTNaNNaNNaN
313071NaN2False2TrueTrueNaNNaNNaNNaNNaNNaN00.00NaNNaNNaNNaN0NaTfrNaN1311NaNNaNNaTNaNNaNNaN
4653061NaN10False2TrueFalseNaNNaNNaNNaNNaNNaN00.00NaNNaNNaNNaN0NaTNaNNaN131180.02.00 days 19:53:02.500000NaNNaNNaN
\n", + "
" + ], + "text/plain": [ + " customer_id birthdate street_id is_partner gender is_email_true \\\n", + "0 12751 NaN 2 False 1 True \n", + "1 12825 NaN 2 False 2 True \n", + "2 11261 NaN 2 False 1 True \n", + "3 13071 NaN 2 False 2 True \n", + "4 653061 NaN 10 False 2 True \n", + "\n", + " opt_in structure_id profession language mcp_contact_id last_buying_date \\\n", + "0 True NaN NaN NaN NaN NaN \n", + "1 True NaN NaN NaN NaN NaN \n", + "2 True NaN NaN NaN NaN NaN \n", + "3 True NaN NaN NaN NaN NaN \n", + "4 False NaN NaN NaN NaN NaN \n", + "\n", + " max_price ticket_sum average_price fidelity average_purchase_delay \\\n", + "0 NaN 0 0.0 0 NaN \n", + "1 NaN 0 0.0 0 NaN \n", + "2 NaN 0 0.0 0 NaN \n", + "3 NaN 0 0.0 0 NaN \n", + "4 NaN 0 0.0 0 NaN \n", + "\n", + " average_price_basket average_ticket_basket total_price purchase_count \\\n", + "0 NaN NaN NaN 0 \n", + "1 NaN NaN NaN 0 \n", + "2 NaN NaN NaN 0 \n", + "3 NaN NaN NaN 0 \n", + "4 NaN NaN NaN 0 \n", + "\n", + " first_buying_date country age tenant_id nb_campaigns \\\n", + "0 NaT fr NaN 1311 NaN \n", + "1 NaT fr NaN 1311 NaN \n", + "2 NaT fr NaN 1311 NaN \n", + "3 NaT fr NaN 1311 NaN \n", + "4 NaT NaN NaN 1311 80.0 \n", + "\n", + " nb_campaigns_opened time_to_open event_type_id nb_tickets \\\n", + "0 NaN NaT NaN NaN \n", + "1 NaN NaT NaN NaN \n", + "2 NaN NaT NaN NaN \n", + "3 NaN NaT NaN NaN \n", + "4 2.0 0 days 19:53:02.500000 NaN NaN \n", + "\n", + " avg_amount \n", + "0 NaN \n", + "1 NaN \n", + "2 NaN \n", + "3 NaN \n", + "4 NaN " + ] + }, + "execution_count": 146, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df1_customer_product = pd.merge(df1_customer, nb_tickets, on = 'customer_id', how = 'left')\n", + "print(\"shape : \", df1_customer_product.shape)\n", + "df1_customer_product.head()" + ] + }, + { + "cell_type": "code", + "execution_count": 147, + "id": "a19fec00-4ece-400c-937c-ce5cd8daccfd", + "metadata": {}, + "outputs": [], + "source": [ + "df1_customer_product.to_csv(\"customer_product.csv\", index = False)" + ] + }, { "cell_type": "markdown", "id": "314f1b7f-ae48-4c6f-8469-9ce879043243", From 05538d34901e7656bbd0b4a1f86a5e3dbb768275 Mon Sep 17 00:00:00 2001 From: ajoubrel-ensae Date: Sat, 10 Feb 2024 18:57:33 +0000 Subject: [PATCH 7/8] Ajout fusions --- 0_Cleaning_and_merge.ipynb | 12 +----------- 1 file changed, 1 insertion(+), 11 deletions(-) diff --git a/0_Cleaning_and_merge.ipynb b/0_Cleaning_and_merge.ipynb index be348ed..b95fec5 100644 --- a/0_Cleaning_and_merge.ipynb +++ b/0_Cleaning_and_merge.ipynb @@ -3172,16 +3172,6 @@ "df1_customer = pd.merge(df1_customerplus_clean, df1_campaigns_kpi, on = 'customer_id', how = 'left')" ] }, - { - "cell_type": "code", - "execution_count": null, - "id": "9740d64a-e5eb-4967-a534-ca6177546465", - "metadata": {}, - "outputs": [], - "source": [ - "df1_customer.head()" - ] - }, { "cell_type": "code", "execution_count": null, @@ -3190,7 +3180,7 @@ "outputs": [], "source": [ "# Fusion avec KPI liés au comportement d'achat\n", - "# df1_customer_product = pd.merge(df1_products_purchased_reduced, df1_products_purchased, on = 'customer_id', how = 'outer')" + "df1_customer_product = pd.merge(df1_tickets_kpi, df1_customer, on = 'customer_id', how = 'outer')" ] }, { From 28b69938321ac8a75ed3947c361a7feb12d4d6b0 Mon Sep 17 00:00:00 2001 From: ajoubrel-ensae Date: Sat, 10 Feb 2024 21:24:37 +0000 Subject: [PATCH 8/8] =?UTF-8?q?Nettoyage=20et=20r=C3=A9organisation?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- 0_Cleaning_and_merge.ipynb | 1862 +++++++++++++----------------------- 1 file changed, 682 insertions(+), 1180 deletions(-) diff --git a/0_Cleaning_and_merge.ipynb b/0_Cleaning_and_merge.ipynb index b95fec5..ced5bdf 100644 --- a/0_Cleaning_and_merge.ipynb +++ b/0_Cleaning_and_merge.ipynb @@ -59,7 +59,7 @@ }, { "cell_type": "code", - "execution_count": 93, + "execution_count": 3, "id": "699664b9-eee4-4f8d-a207-e524526560c5", "metadata": {}, "outputs": [], @@ -78,7 +78,7 @@ "name": "stderr", "output_type": "stream", "text": [ - "/tmp/ipykernel_8302/4081512283.py:10: DtypeWarning: Columns (1) have mixed types. Specify dtype option on import or set low_memory=False.\n", + "/tmp/ipykernel_42764/4081512283.py:10: DtypeWarning: Columns (1) have mixed types. Specify dtype option on import or set low_memory=False.\n", " df = pd.read_csv(file_in)\n" ] } @@ -242,23 +242,23 @@ "name": "stderr", "output_type": "stream", "text": [ - "/tmp/ipykernel_8302/3092893564.py:5: SettingWithCopyWarning: \n", + "/tmp/ipykernel_42764/3092893564.py:5: SettingWithCopyWarning: \n", "A value is trying to be set on a copy of a slice from a DataFrame\n", "\n", "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n", " tickets.rename(columns = {'id' : 'ticket_id'}, inplace = True)\n", - "/tmp/ipykernel_8302/3092893564.py:9: SettingWithCopyWarning: \n", + "/tmp/ipykernel_42764/3092893564.py:9: SettingWithCopyWarning: \n", "A value is trying to be set on a copy of a slice from a DataFrame\n", "\n", "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n", " suppliers.rename(columns = {'name' : 'supplier_name'}, inplace = True)\n", - "/tmp/ipykernel_8302/3092893564.py:10: SettingWithCopyWarning: \n", + "/tmp/ipykernel_42764/3092893564.py:10: SettingWithCopyWarning: \n", "A value is trying to be set on a copy of a slice from a DataFrame.\n", "Try using .loc[row_indexer,col_indexer] = value instead\n", "\n", "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n", " suppliers['supplier_name'] = suppliers['supplier_name'].fillna('')\n", - "/tmp/ipykernel_8302/3092893564.py:14: SettingWithCopyWarning: \n", + "/tmp/ipykernel_42764/3092893564.py:14: SettingWithCopyWarning: \n", "A value is trying to be set on a copy of a slice from a DataFrame\n", "\n", "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n", @@ -439,7 +439,7 @@ "name": "stderr", "output_type": "stream", "text": [ - "/tmp/ipykernel_8302/3848597476.py:4: SettingWithCopyWarning: \n", + "/tmp/ipykernel_42764/3848597476.py:4: SettingWithCopyWarning: \n", "A value is trying to be set on a copy of a slice from a DataFrame\n", "\n", "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n", @@ -451,165 +451,6 @@ "df1_target_information = preprocessing_target_area(targets = df1_targets, target_types = df1_target_types, customer_target_mappings = df1_customer_target_mappings)" ] }, - { - "cell_type": "code", - "execution_count": 13, - "id": "b4f05142-2a22-42ef-a60d-f23cc4b5cb09", - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
customer_id
target_name
consentement optin mediation specialisee150000
consentement optin jeune public149979
consentement optin b2c108909
Arenametrix_bascule tel vers sib35216
consentement optout b2c34523
\n", - "
" - ], - "text/plain": [ - " customer_id\n", - "target_name \n", - "consentement optin mediation specialisee 150000\n", - "consentement optin jeune public 149979\n", - "consentement optin b2c 108909\n", - "Arenametrix_bascule tel vers sib 35216\n", - "consentement optout b2c 34523" - ] - }, - "execution_count": 13, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "df1_target_information[['target_name', 'customer_id']].groupby('target_name').count().sort_values(by='customer_id', ascending=False).head()" - ] - }, - { - "cell_type": "code", - "execution_count": 14, - "id": "4417ff51-f501-4ab9-a192-4ab75764a8ed", - "metadata": { - "scrolled": true - }, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
customer_id
target_name
Arenametrix_bascule tel vers sib35216
Autres_interet_exposition1021
COM Inscrits NL générale (historique)23005
Contacts_prenomsdoubles11643
DDCP MD Procès du Siècle1684
\n", - "
" - ], - "text/plain": [ - " customer_id\n", - "target_name \n", - "Arenametrix_bascule tel vers sib 35216\n", - "Autres_interet_exposition 1021\n", - "COM Inscrits NL générale (historique) 23005\n", - "Contacts_prenomsdoubles 11643\n", - "DDCP MD Procès du Siècle 1684" - ] - }, - "execution_count": 14, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "df1_target_information_reduced = df1_target_information[['target_name', 'customer_id']].groupby('target_name').count()\n", - "df1_target_information_reduced[df1_target_information_reduced['customer_id'] >= 1000].head()" - ] - }, { "cell_type": "markdown", "id": "cdbb48b4-5e16-4ef4-8791-ed213d68d52f", @@ -620,7 +461,7 @@ }, { "cell_type": "code", - "execution_count": 15, + "execution_count": 13, "id": "d883cc7b-ac43-4485-b86f-eaf595fbad85", "metadata": {}, "outputs": [], @@ -645,7 +486,7 @@ }, { "cell_type": "code", - "execution_count": 16, + "execution_count": 14, "id": "c8552dd6-52c5-4431-b43d-3cd6c578fd9f", "metadata": {}, "outputs": [ @@ -653,19 +494,19 @@ "name": "stderr", "output_type": "stream", "text": [ - "/tmp/ipykernel_8302/1967867975.py:15: SettingWithCopyWarning: \n", + "/tmp/ipykernel_42764/1967867975.py:15: SettingWithCopyWarning: \n", "A value is trying to be set on a copy of a slice from a DataFrame.\n", "Try using .loc[row_indexer,col_indexer] = value instead\n", "\n", "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n", " df[column_name] = pd.to_datetime(df[column_name], utc = True, format = 'ISO8601')\n", - "/tmp/ipykernel_8302/1967867975.py:15: SettingWithCopyWarning: \n", + "/tmp/ipykernel_42764/1967867975.py:15: SettingWithCopyWarning: \n", "A value is trying to be set on a copy of a slice from a DataFrame.\n", "Try using .loc[row_indexer,col_indexer] = value instead\n", "\n", "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n", " df[column_name] = pd.to_datetime(df[column_name], utc = True, format = 'ISO8601')\n", - "/tmp/ipykernel_8302/1967867975.py:15: SettingWithCopyWarning: \n", + "/tmp/ipykernel_42764/1967867975.py:15: SettingWithCopyWarning: \n", "A value is trying to be set on a copy of a slice from a DataFrame.\n", "Try using .loc[row_indexer,col_indexer] = value instead\n", "\n", @@ -680,7 +521,7 @@ }, { "cell_type": "code", - "execution_count": 17, + "execution_count": 15, "id": "c24457e7-3cad-451a-a65b-7373b656bd6e", "metadata": { "scrolled": true @@ -800,7 +641,7 @@ "4 404 2021-03-27 23:00:00+00:00 " ] }, - "execution_count": 17, + "execution_count": 15, "metadata": {}, "output_type": "execute_result" } @@ -814,7 +655,7 @@ "id": "56520a97-ede8-4920-a211-3b5b136af33d", "metadata": {}, "source": [ - "## Create Products Table" + "## Product area" ] }, { @@ -827,7 +668,7 @@ }, { "cell_type": "code", - "execution_count": 21, + "execution_count": 16, "id": "30488a40-1b38-4b9a-9d3b-26a0597c5e6d", "metadata": {}, "outputs": [], @@ -838,7 +679,7 @@ }, { "cell_type": "code", - "execution_count": 22, + "execution_count": 17, "id": "607eb4b4-eed9-4b50-b823-f75c116dd37c", "metadata": {}, "outputs": [], @@ -909,7 +750,7 @@ }, { "cell_type": "code", - "execution_count": 23, + "execution_count": 18, "id": "350b09b9-451f-4d47-81fe-f34b892db027", "metadata": {}, "outputs": [], @@ -997,7 +838,7 @@ }, { "cell_type": "code", - "execution_count": 24, + "execution_count": 19, "id": "0fccc8ef-e575-4857-a401-94a7274394df", "metadata": {}, "outputs": [ @@ -1150,7 +991,7 @@ "4 indiv entrées tp " ] }, - "execution_count": 24, + "execution_count": 19, "metadata": {}, "output_type": "execute_result" } @@ -1162,7 +1003,7 @@ }, { "cell_type": "code", - "execution_count": 25, + "execution_count": 20, "id": "779d8aaf-6668-4f66-8852-847304407ea3", "metadata": {}, "outputs": [ @@ -1332,7 +1173,7 @@ "4 spectacle vivant mucem " ] }, - "execution_count": 25, + "execution_count": 20, "metadata": {}, "output_type": "execute_result" } @@ -1344,7 +1185,7 @@ }, { "cell_type": "code", - "execution_count": 26, + "execution_count": 21, "id": "7714fa32-303b-4ea7-b174-3fd0fcab5af0", "metadata": {}, "outputs": [ @@ -1443,7 +1284,7 @@ "4 37 383 269 1" ] }, - "execution_count": 26, + "execution_count": 21, "metadata": {}, "output_type": "execute_result" } @@ -1463,7 +1304,7 @@ }, { "cell_type": "code", - "execution_count": 27, + "execution_count": 22, "id": "15a62ed6-35e4-4abc-aeef-a7daeec0a4ba", "metadata": {}, "outputs": [], @@ -1491,7 +1332,7 @@ }, { "cell_type": "code", - "execution_count": 28, + "execution_count": 23, "id": "89dc9685-1de9-4ce3-a6c0-8d7f1931a951", "metadata": {}, "outputs": [ @@ -1730,7 +1571,7 @@ "[5 rows x 21 columns]" ] }, - "execution_count": 28, + "execution_count": 23, "metadata": {}, "output_type": "execute_result" } @@ -1742,13 +1583,16 @@ }, { "cell_type": "code", - "execution_count": 29, + "execution_count": 24, "id": "98f78cd5-b694-4cc6-b033-20170aa13e8d", "metadata": {}, "outputs": [], "source": [ "# Fusion liée au product\n", - "df1_products_purchased = pd.merge(df1_ticket_information, products_global, left_on = 'product_id', right_on = 'id_products', how = 'inner')" + "df1_products_purchased = pd.merge(df1_ticket_information, products_global, left_on = 'product_id', right_on = 'id_products', how = 'inner')\n", + "\n", + "# Selection des variables d'intérêts\n", + "df1_products_purchased_reduced = df1_products_purchased[['ticket_id', 'customer_id', 'event_type_id', 'supplier_name', 'purchase_date', 'type_of_ticket_name', 'amount', 'children', 'is_full_price', 'name_event_types', 'name_facilities', 'name_categories', 'name_events', 'name_seasons']]" ] }, { @@ -1759,772 +1603,6 @@ "# Construction des variables explicatives" ] }, - { - "cell_type": "markdown", - "id": "b09c2964-bef9-489e-ad71-84959054531b", - "metadata": {}, - "source": [ - "## Alexis' work" - ] - }, - { - "cell_type": "code", - "execution_count": 142, - "id": "4ab1c0d2-0097-4669-b984-b6822c976740", - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
event_type_idavg_amount
026.150659
147.762474
254.452618
366.439463
\n", - "
" - ], - "text/plain": [ - " event_type_id avg_amount\n", - "0 2 6.150659\n", - "1 4 7.762474\n", - "2 5 4.452618\n", - "3 6 6.439463" - ] - }, - "execution_count": 142, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "avg_amount = (df1_products_purchased_reduced.groupby([\"event_type_id\"])\n", - " .agg({\"amount\" : \"mean\"}).reset_index()\n", - " .rename(columns = {'amount' : 'avg_amount'}))\n", - "\n", - "avg_amount" - ] - }, - { - "cell_type": "code", - "execution_count": 143, - "id": "a9c62b39-389e-4dac-89a6-ac8a59fea58a", - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
customer_idevent_type_idnb_ticketsavg_amount
0123842266.150659
1144532427.762474
2152017504.452618
3162173566.439463
4221436.150659
\n", - "
" - ], - "text/plain": [ - " customer_id event_type_id nb_tickets avg_amount\n", - "0 1 2 384226 6.150659\n", - "1 1 4 453242 7.762474\n", - "2 1 5 201750 4.452618\n", - "3 1 6 217356 6.439463\n", - "4 2 2 143 6.150659" - ] - }, - "execution_count": 143, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "nb_tickets = (df1_products_purchased_reduced.groupby([\"customer_id\", \"event_type_id\"])\n", - " .agg({\"ticket_id\" : \"count\"}).reset_index()\n", - " .rename(columns = {'ticket_id' : 'nb_tickets'})\n", - " .merge(avg_amount, how='left', on='event_type_id'))\n", - "nb_tickets.head()" - ] - }, - { - "cell_type": "code", - "execution_count": 144, - "id": "8710611c-7eb8-45ca-bdcc-009f4081f9e2", - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
customer_idbirthdatestreet_idis_partnergenderis_email_trueopt_instructure_idprofessionlanguagemcp_contact_idlast_buying_datemax_priceticket_sumaverage_pricefidelityaverage_purchase_delayaverage_price_basketaverage_ticket_baskettotal_pricepurchase_countfirst_buying_datecountryagetenant_idnb_campaignsnb_campaigns_openedtime_to_open
012751NaN2False1TrueTrueNaNNaNNaNNaNNaNNaN00.00NaNNaNNaNNaN0NaTfrNaN1311NaNNaNNaT
112825NaN2False2TrueTrueNaNNaNNaNNaNNaNNaN00.00NaNNaNNaNNaN0NaTfrNaN1311NaNNaNNaT
211261NaN2False1TrueTrueNaNNaNNaNNaNNaNNaN00.00NaNNaNNaNNaN0NaTfrNaN1311NaNNaNNaT
313071NaN2False2TrueTrueNaNNaNNaNNaNNaNNaN00.00NaNNaNNaNNaN0NaTfrNaN1311NaNNaNNaT
4653061NaN10False2TrueFalseNaNNaNNaNNaNNaNNaN00.00NaNNaNNaNNaN0NaTNaNNaN131180.02.00 days 19:53:02.500000
\n", - "
" - ], - "text/plain": [ - " customer_id birthdate street_id is_partner gender is_email_true \\\n", - "0 12751 NaN 2 False 1 True \n", - "1 12825 NaN 2 False 2 True \n", - "2 11261 NaN 2 False 1 True \n", - "3 13071 NaN 2 False 2 True \n", - "4 653061 NaN 10 False 2 True \n", - "\n", - " opt_in structure_id profession language mcp_contact_id last_buying_date \\\n", - "0 True NaN NaN NaN NaN NaN \n", - "1 True NaN NaN NaN NaN NaN \n", - "2 True NaN NaN NaN NaN NaN \n", - "3 True NaN NaN NaN NaN NaN \n", - "4 False NaN NaN NaN NaN NaN \n", - "\n", - " max_price ticket_sum average_price fidelity average_purchase_delay \\\n", - "0 NaN 0 0.0 0 NaN \n", - "1 NaN 0 0.0 0 NaN \n", - "2 NaN 0 0.0 0 NaN \n", - "3 NaN 0 0.0 0 NaN \n", - "4 NaN 0 0.0 0 NaN \n", - "\n", - " average_price_basket average_ticket_basket total_price purchase_count \\\n", - "0 NaN NaN NaN 0 \n", - "1 NaN NaN NaN 0 \n", - "2 NaN NaN NaN 0 \n", - "3 NaN NaN NaN 0 \n", - "4 NaN NaN NaN 0 \n", - "\n", - " first_buying_date country age tenant_id nb_campaigns \\\n", - "0 NaT fr NaN 1311 NaN \n", - "1 NaT fr NaN 1311 NaN \n", - "2 NaT fr NaN 1311 NaN \n", - "3 NaT fr NaN 1311 NaN \n", - "4 NaT NaN NaN 1311 80.0 \n", - "\n", - " nb_campaigns_opened time_to_open \n", - "0 NaN NaT \n", - "1 NaN NaT \n", - "2 NaN NaT \n", - "3 NaN NaT \n", - "4 2.0 0 days 19:53:02.500000 " - ] - }, - "execution_count": 144, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "# Fusion avec KPI campaigns liés au customer\n", - "df1_customer = pd.merge(df1_customerplus_clean, df1_campaigns_kpi, on = 'customer_id', how = 'left')\n", - "df1_customer.head()" - ] - }, - { - "cell_type": "code", - "execution_count": 146, - "id": "a89fad43-ee68-4081-9384-3e9f08ec6a59", - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "shape : (156289, 31)\n" - ] - }, - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
customer_idbirthdatestreet_idis_partnergenderis_email_trueopt_instructure_idprofessionlanguagemcp_contact_idlast_buying_datemax_priceticket_sumaverage_pricefidelityaverage_purchase_delayaverage_price_basketaverage_ticket_baskettotal_pricepurchase_countfirst_buying_datecountryagetenant_idnb_campaignsnb_campaigns_openedtime_to_openevent_type_idnb_ticketsavg_amount
012751NaN2False1TrueTrueNaNNaNNaNNaNNaNNaN00.00NaNNaNNaNNaN0NaTfrNaN1311NaNNaNNaTNaNNaNNaN
112825NaN2False2TrueTrueNaNNaNNaNNaNNaNNaN00.00NaNNaNNaNNaN0NaTfrNaN1311NaNNaNNaTNaNNaNNaN
211261NaN2False1TrueTrueNaNNaNNaNNaNNaNNaN00.00NaNNaNNaNNaN0NaTfrNaN1311NaNNaNNaTNaNNaNNaN
313071NaN2False2TrueTrueNaNNaNNaNNaNNaNNaN00.00NaNNaNNaNNaN0NaTfrNaN1311NaNNaNNaTNaNNaNNaN
4653061NaN10False2TrueFalseNaNNaNNaNNaNNaNNaN00.00NaNNaNNaNNaN0NaTNaNNaN131180.02.00 days 19:53:02.500000NaNNaNNaN
\n", - "
" - ], - "text/plain": [ - " customer_id birthdate street_id is_partner gender is_email_true \\\n", - "0 12751 NaN 2 False 1 True \n", - "1 12825 NaN 2 False 2 True \n", - "2 11261 NaN 2 False 1 True \n", - "3 13071 NaN 2 False 2 True \n", - "4 653061 NaN 10 False 2 True \n", - "\n", - " opt_in structure_id profession language mcp_contact_id last_buying_date \\\n", - "0 True NaN NaN NaN NaN NaN \n", - "1 True NaN NaN NaN NaN NaN \n", - "2 True NaN NaN NaN NaN NaN \n", - "3 True NaN NaN NaN NaN NaN \n", - "4 False NaN NaN NaN NaN NaN \n", - "\n", - " max_price ticket_sum average_price fidelity average_purchase_delay \\\n", - "0 NaN 0 0.0 0 NaN \n", - "1 NaN 0 0.0 0 NaN \n", - "2 NaN 0 0.0 0 NaN \n", - "3 NaN 0 0.0 0 NaN \n", - "4 NaN 0 0.0 0 NaN \n", - "\n", - " average_price_basket average_ticket_basket total_price purchase_count \\\n", - "0 NaN NaN NaN 0 \n", - "1 NaN NaN NaN 0 \n", - "2 NaN NaN NaN 0 \n", - "3 NaN NaN NaN 0 \n", - "4 NaN NaN NaN 0 \n", - "\n", - " first_buying_date country age tenant_id nb_campaigns \\\n", - "0 NaT fr NaN 1311 NaN \n", - "1 NaT fr NaN 1311 NaN \n", - "2 NaT fr NaN 1311 NaN \n", - "3 NaT fr NaN 1311 NaN \n", - "4 NaT NaN NaN 1311 80.0 \n", - "\n", - " nb_campaigns_opened time_to_open event_type_id nb_tickets \\\n", - "0 NaN NaT NaN NaN \n", - "1 NaN NaT NaN NaN \n", - "2 NaN NaT NaN NaN \n", - "3 NaN NaT NaN NaN \n", - "4 2.0 0 days 19:53:02.500000 NaN NaN \n", - "\n", - " avg_amount \n", - "0 NaN \n", - "1 NaN \n", - "2 NaN \n", - "3 NaN \n", - "4 NaN " - ] - }, - "execution_count": 146, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "df1_customer_product = pd.merge(df1_customer, nb_tickets, on = 'customer_id', how = 'left')\n", - "print(\"shape : \", df1_customer_product.shape)\n", - "df1_customer_product.head()" - ] - }, - { - "cell_type": "code", - "execution_count": 147, - "id": "a19fec00-4ece-400c-937c-ce5cd8daccfd", - "metadata": {}, - "outputs": [], - "source": [ - "df1_customer_product.to_csv(\"customer_product.csv\", index = False)" - ] - }, { "cell_type": "markdown", "id": "314f1b7f-ae48-4c6f-8469-9ce879043243", @@ -2535,7 +1613,7 @@ }, { "cell_type": "code", - "execution_count": 18, + "execution_count": 28, "id": "e2c88552-b863-47a2-be23-8d2898fb28bc", "metadata": {}, "outputs": [], @@ -2569,7 +1647,7 @@ }, { "cell_type": "code", - "execution_count": 19, + "execution_count": 29, "id": "24537647-bc29-4777-9848-ac4120a4aa60", "metadata": {}, "outputs": [ @@ -2577,7 +1655,7 @@ "name": "stderr", "output_type": "stream", "text": [ - "/tmp/ipykernel_8302/3700263836.py:11: SettingWithCopyWarning: \n", + "/tmp/ipykernel_42764/3700263836.py:11: SettingWithCopyWarning: \n", "A value is trying to be set on a copy of a slice from a DataFrame\n", "\n", "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n", @@ -2591,7 +1669,7 @@ }, { "cell_type": "code", - "execution_count": 20, + "execution_count": 30, "id": "6be2a9a6-056b-4e19-8c26-a18ba3df36b3", "metadata": {}, "outputs": [ @@ -2671,7 +1749,7 @@ "4 6 20 0.0 NaT" ] }, - "execution_count": 20, + "execution_count": 30, "metadata": {}, "output_type": "execute_result" } @@ -2688,35 +1766,6 @@ "## KPI tickets" ] }, - { - "cell_type": "code", - "execution_count": 30, - "id": "665a5925-9c0e-425a-8f11-c33a0a9ec444", - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "Index(['ticket_id', 'product_id', 'is_from_subscription', 'supplier_name',\n", - " 'type_of_ticket_name', 'children', 'purchase_date', 'customer_id',\n", - " 'id_products', 'representation_id', 'pricing_formula_id', 'category_id',\n", - " 'products_group_id', 'product_pack_id', 'event_id',\n", - " 'id_representation_cap', 'season_id', 'facility_id', 'event_type_id',\n", - " 'event_type_key_id', 'facility_key_id', 'street_id', 'amount',\n", - " 'is_full_price', 'name_categories', 'name_events', 'name_seasons',\n", - " 'name_event_types', 'name_facilities'],\n", - " dtype='object')" - ] - }, - "execution_count": 30, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "df1_products_purchased.columns" - ] - }, { "cell_type": "code", "execution_count": 31, @@ -2736,206 +1785,23 @@ } ], "source": [ - "df1_products_purchased['name_event_types'].unique()" + "df1_products_purchased_reduced['name_event_types'].unique()" ] }, { "cell_type": "code", "execution_count": 32, - "id": "e01e8cf9-1187-4a4b-993d-b7b4321cd8f0", - "metadata": {}, - "outputs": [], - "source": [ - "df1_products_purchased_reduced = df1_products_purchased[['ticket_id', 'customer_id', 'event_type_id', 'supplier_name', 'purchase_date', 'type_of_ticket_name', 'amount', 'children', 'is_full_price', 'name_event_types', 'name_facilities', 'name_categories', 'name_events', 'name_seasons']]" - ] - }, - { - "cell_type": "code", - "execution_count": 78, - "id": "3d8b0875-b409-44ce-b688-d9d6758782d3", - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
ticket_idcustomer_idevent_type_idsupplier_namepurchase_datetype_of_ticket_nameamountchildrenis_full_pricename_event_typesname_facilitiesname_categoriesname_eventsname_seasons
013070859481874vente en ligne2018-12-28 14:47:50+00:00Atelier8.0pricing_formulaFalsespectacle vivantmucemindiv prog enfantl'école des magiciens2018
113070855481874vente en ligne2018-12-28 14:47:50+00:00Atelier8.0pricing_formulaFalsespectacle vivantmucemindiv prog enfantl'école des magiciens2018
213070856481874vente en ligne2018-12-28 14:47:50+00:00Atelier8.0pricing_formulaFalsespectacle vivantmucemindiv prog enfantl'école des magiciens2018
313070857481874vente en ligne2018-12-28 14:47:50+00:00Atelier8.0pricing_formulaFalsespectacle vivantmucemindiv prog enfantl'école des magiciens2018
413070858481874vente en ligne2018-12-28 14:47:50+00:00Atelier8.0pricing_formulaFalsespectacle vivantmucemindiv prog enfantl'école des magiciens2018
\n", - "
" - ], - "text/plain": [ - " ticket_id customer_id event_type_id supplier_name \\\n", - "0 13070859 48187 4 vente en ligne \n", - "1 13070855 48187 4 vente en ligne \n", - "2 13070856 48187 4 vente en ligne \n", - "3 13070857 48187 4 vente en ligne \n", - "4 13070858 48187 4 vente en ligne \n", - "\n", - " purchase_date type_of_ticket_name amount children \\\n", - "0 2018-12-28 14:47:50+00:00 Atelier 8.0 pricing_formula \n", - "1 2018-12-28 14:47:50+00:00 Atelier 8.0 pricing_formula \n", - "2 2018-12-28 14:47:50+00:00 Atelier 8.0 pricing_formula \n", - "3 2018-12-28 14:47:50+00:00 Atelier 8.0 pricing_formula \n", - "4 2018-12-28 14:47:50+00:00 Atelier 8.0 pricing_formula \n", - "\n", - " is_full_price name_event_types name_facilities name_categories \\\n", - "0 False spectacle vivant mucem indiv prog enfant \n", - "1 False spectacle vivant mucem indiv prog enfant \n", - "2 False spectacle vivant mucem indiv prog enfant \n", - "3 False spectacle vivant mucem indiv prog enfant \n", - "4 False spectacle vivant mucem indiv prog enfant \n", - "\n", - " name_events name_seasons \n", - "0 l'école des magiciens 2018 \n", - "1 l'école des magiciens 2018 \n", - "2 l'école des magiciens 2018 \n", - "3 l'école des magiciens 2018 \n", - "4 l'école des magiciens 2018 " - ] - }, - "execution_count": 78, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "# Importance des suppliers\n", - "df1_products_purchased_reduced.head()" - ] - }, - { - "cell_type": "code", - "execution_count": 34, "id": "2bda0b97-b28b-4070-a57d-aeab0e2f7dfe", "metadata": {}, "outputs": [], "source": [ "# Nombre de client assistant à plus de 2 type d'événement\n", - "nb_event_types = df1_products_purchased_reduced[['customer_id', 'name_event_types']].groupby('customer_id').nunique()\n" + "nb_event_types = df1_products_purchased_reduced[['customer_id', 'name_event_types']].groupby('customer_id').nunique()" ] }, { "cell_type": "code", - "execution_count": 82, + "execution_count": 33, "id": "043303fe-e90f-4689-a2a9-5d690555a045", "metadata": {}, "outputs": [], @@ -2961,6 +1827,7 @@ " 'purchase_date' : ['min', 'max']})\n", " .reset_index()\n", " )\n", + " \n", " tickets_kpi.columns = tickets_kpi.columns.map('_'.join)\n", " \n", " tickets_kpi.rename(columns = {'ticket_id_count' : 'nb_tickets', \n", @@ -2980,7 +1847,7 @@ }, { "cell_type": "code", - "execution_count": 83, + "execution_count": 34, "id": "5882234a-1ed5-4269-87a6-0d75613476e3", "metadata": {}, "outputs": [], @@ -2998,7 +1865,7 @@ }, { "cell_type": "code", - "execution_count": 94, + "execution_count": 35, "id": "a4a2311d-8a72-4030-afd5-218004d5d2a5", "metadata": {}, "outputs": [], @@ -3014,7 +1881,7 @@ }, { "cell_type": "code", - "execution_count": 84, + "execution_count": 36, "id": "a7a452a6-cd5e-4c8b-b250-8a7d26e48fad", "metadata": {}, "outputs": [ @@ -3144,7 +2011,7 @@ "5032 1049 days 18:46:12 13497.0 " ] }, - "execution_count": 84, + "execution_count": 36, "metadata": {}, "output_type": "execute_result" } @@ -3153,6 +2020,641 @@ "df1_tickets_kpi.sort_values(by='nb_tickets', ascending=False).head(5)" ] }, + { + "cell_type": "markdown", + "id": "f1d7f7ba-361b-467d-b375-b09c149185f7", + "metadata": {}, + "source": [ + "## Alexis' work" + ] + }, + { + "cell_type": "code", + "execution_count": 37, + "id": "4ab1c0d2-0097-4669-b984-b6822c976740", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
event_type_idavg_amount
026.150659
147.762474
254.452618
366.439463
\n", + "
" + ], + "text/plain": [ + " event_type_id avg_amount\n", + "0 2 6.150659\n", + "1 4 7.762474\n", + "2 5 4.452618\n", + "3 6 6.439463" + ] + }, + "execution_count": 37, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "avg_amount = (df1_products_purchased_reduced.groupby([\"event_type_id\"])\n", + " .agg({\"amount\" : \"mean\"}).reset_index()\n", + " .rename(columns = {'amount' : 'avg_amount'}))\n", + "\n", + "avg_amount" + ] + }, + { + "cell_type": "code", + "execution_count": 38, + "id": "a9c62b39-389e-4dac-89a6-ac8a59fea58a", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
customer_idevent_type_idnb_ticketsavg_amount
0123842266.150659
1144532427.762474
2152017504.452618
3162173566.439463
4221436.150659
\n", + "
" + ], + "text/plain": [ + " customer_id event_type_id nb_tickets avg_amount\n", + "0 1 2 384226 6.150659\n", + "1 1 4 453242 7.762474\n", + "2 1 5 201750 4.452618\n", + "3 1 6 217356 6.439463\n", + "4 2 2 143 6.150659" + ] + }, + "execution_count": 38, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "nb_tickets = (df1_products_purchased_reduced.groupby([\"customer_id\", \"event_type_id\"])\n", + " .agg({\"ticket_id\" : \"count\"}).reset_index()\n", + " .rename(columns = {'ticket_id' : 'nb_tickets'})\n", + " .merge(avg_amount, how='left', on='event_type_id'))\n", + "nb_tickets.head()" + ] + }, + { + "cell_type": "code", + "execution_count": 39, + "id": "8710611c-7eb8-45ca-bdcc-009f4081f9e2", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
customer_idbirthdatestreet_idis_partnergenderis_email_trueopt_instructure_idprofessionlanguage...average_ticket_baskettotal_pricepurchase_countfirst_buying_datecountryagetenant_idnb_campaignsnb_campaigns_openedtime_to_open
012751NaN2False1TrueTrueNaNNaNNaN...NaNNaN0NaTfrNaN1311NaNNaNNaT
112825NaN2False2TrueTrueNaNNaNNaN...NaNNaN0NaTfrNaN1311NaNNaNNaT
211261NaN2False1TrueTrueNaNNaNNaN...NaNNaN0NaTfrNaN1311NaNNaNNaT
313071NaN2False2TrueTrueNaNNaNNaN...NaNNaN0NaTfrNaN1311NaNNaNNaT
4653061NaN10False2TrueFalseNaNNaNNaN...NaNNaN0NaTNaNNaN131180.02.00 days 19:53:02.500000
\n", + "

5 rows × 28 columns

\n", + "
" + ], + "text/plain": [ + " customer_id birthdate street_id is_partner gender is_email_true \\\n", + "0 12751 NaN 2 False 1 True \n", + "1 12825 NaN 2 False 2 True \n", + "2 11261 NaN 2 False 1 True \n", + "3 13071 NaN 2 False 2 True \n", + "4 653061 NaN 10 False 2 True \n", + "\n", + " opt_in structure_id profession language ... average_ticket_basket \\\n", + "0 True NaN NaN NaN ... NaN \n", + "1 True NaN NaN NaN ... NaN \n", + "2 True NaN NaN NaN ... NaN \n", + "3 True NaN NaN NaN ... NaN \n", + "4 False NaN NaN NaN ... NaN \n", + "\n", + " total_price purchase_count first_buying_date country age tenant_id \\\n", + "0 NaN 0 NaT fr NaN 1311 \n", + "1 NaN 0 NaT fr NaN 1311 \n", + "2 NaN 0 NaT fr NaN 1311 \n", + "3 NaN 0 NaT fr NaN 1311 \n", + "4 NaN 0 NaT NaN NaN 1311 \n", + "\n", + " nb_campaigns nb_campaigns_opened time_to_open \n", + "0 NaN NaN NaT \n", + "1 NaN NaN NaT \n", + "2 NaN NaN NaT \n", + "3 NaN NaN NaT \n", + "4 80.0 2.0 0 days 19:53:02.500000 \n", + "\n", + "[5 rows x 28 columns]" + ] + }, + "execution_count": 39, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# Fusion avec KPI campaigns liés au customer\n", + "df1_customer = pd.merge(df1_customerplus_clean, df1_campaigns_kpi, on = 'customer_id', how = 'left')\n", + "df1_customer.head()" + ] + }, + { + "cell_type": "code", + "execution_count": 40, + "id": "a89fad43-ee68-4081-9384-3e9f08ec6a59", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "shape : (156289, 31)\n" + ] + }, + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
customer_idbirthdatestreet_idis_partnergenderis_email_trueopt_instructure_idprofessionlanguage...first_buying_datecountryagetenant_idnb_campaignsnb_campaigns_openedtime_to_openevent_type_idnb_ticketsavg_amount
012751NaN2False1TrueTrueNaNNaNNaN...NaTfrNaN1311NaNNaNNaTNaNNaNNaN
112825NaN2False2TrueTrueNaNNaNNaN...NaTfrNaN1311NaNNaNNaTNaNNaNNaN
211261NaN2False1TrueTrueNaNNaNNaN...NaTfrNaN1311NaNNaNNaTNaNNaNNaN
313071NaN2False2TrueTrueNaNNaNNaN...NaTfrNaN1311NaNNaNNaTNaNNaNNaN
4653061NaN10False2TrueFalseNaNNaNNaN...NaTNaNNaN131180.02.00 days 19:53:02.500000NaNNaNNaN
\n", + "

5 rows × 31 columns

\n", + "
" + ], + "text/plain": [ + " customer_id birthdate street_id is_partner gender is_email_true \\\n", + "0 12751 NaN 2 False 1 True \n", + "1 12825 NaN 2 False 2 True \n", + "2 11261 NaN 2 False 1 True \n", + "3 13071 NaN 2 False 2 True \n", + "4 653061 NaN 10 False 2 True \n", + "\n", + " opt_in structure_id profession language ... first_buying_date country \\\n", + "0 True NaN NaN NaN ... NaT fr \n", + "1 True NaN NaN NaN ... NaT fr \n", + "2 True NaN NaN NaN ... NaT fr \n", + "3 True NaN NaN NaN ... NaT fr \n", + "4 False NaN NaN NaN ... NaT NaN \n", + "\n", + " age tenant_id nb_campaigns nb_campaigns_opened time_to_open \\\n", + "0 NaN 1311 NaN NaN NaT \n", + "1 NaN 1311 NaN NaN NaT \n", + "2 NaN 1311 NaN NaN NaT \n", + "3 NaN 1311 NaN NaN NaT \n", + "4 NaN 1311 80.0 2.0 0 days 19:53:02.500000 \n", + "\n", + " event_type_id nb_tickets avg_amount \n", + "0 NaN NaN NaN \n", + "1 NaN NaN NaN \n", + "2 NaN NaN NaN \n", + "3 NaN NaN NaN \n", + "4 NaN NaN NaN \n", + "\n", + "[5 rows x 31 columns]" + ] + }, + "execution_count": 40, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df1_customer_product = pd.merge(df1_customer, nb_tickets, on = 'customer_id', how = 'left')\n", + "print(\"shape : \", df1_customer_product.shape)\n", + "df1_customer_product.head()" + ] + }, + { + "cell_type": "code", + "execution_count": 41, + "id": "a19fec00-4ece-400c-937c-ce5cd8daccfd", + "metadata": {}, + "outputs": [], + "source": [ + "# df1_customer_product.to_csv(\"customer_product.csv\", index = False)" + ] + }, { "cell_type": "markdown", "id": "7c3211a5-a851-43bc-a1f0-b39d51857fb7", @@ -3163,7 +2665,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 42, "id": "46de1912-4a66-46e5-8b9e-7768b2d2723b", "metadata": {}, "outputs": [], @@ -3174,7 +2676,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 43, "id": "1e42a790-b215-4107-a969-85005da06ebd", "metadata": {}, "outputs": [],