From 15ddb5463f25d61f322e44fdc8aade55d937456e Mon Sep 17 00:00:00 2001 From: ajoubrel-ensae Date: Sat, 13 Jan 2024 13:47:24 +0000 Subject: [PATCH] Exploration secteur billettique et fusion en une base --- Exploration_billet_AJ.ipynb | 4526 ++++++++++------------------------- 1 file changed, 1215 insertions(+), 3311 deletions(-) diff --git a/Exploration_billet_AJ.ipynb b/Exploration_billet_AJ.ipynb index f8931c2..d5db714 100644 --- a/Exploration_billet_AJ.ipynb +++ b/Exploration_billet_AJ.ipynb @@ -71,7 +71,7 @@ }, { "cell_type": "code", - "execution_count": 28, + "execution_count": 4, "id": "0cb92854-903b-4efd-ac1b-197e29f044b4", "metadata": {}, "outputs": [ @@ -79,7 +79,7 @@ "name": "stdout", "output_type": "stream", "text": [ - "['bdc2324-data/1/1campaign_stats.csv', 'bdc2324-data/1/1campaigns.csv', 'bdc2324-data/1/1customer_target_mappings.csv', 'bdc2324-data/1/1customersplus.csv', 'bdc2324-data/1/1event_types.csv', 'bdc2324-data/1/1events.csv', 'bdc2324-data/1/1product_packs.csv', 'bdc2324-data/1/1products.csv', 'bdc2324-data/1/1products_groups.csv', 'bdc2324-data/1/1purchases.csv', 'bdc2324-data/1/1suppliers.csv', 'bdc2324-data/1/1target_types.csv', 'bdc2324-data/1/1targets.csv', 'bdc2324-data/1/1tickets.csv']\n" + "['bdc2324-data/1/1purchases.csv', 'bdc2324-data/1/1suppliers.csv', 'bdc2324-data/1/1tickets.csv', 'bdc2324-data/1/1type_ofs.csv']\n" ] } ], @@ -95,7 +95,7 @@ }, { "cell_type": "code", - "execution_count": 29, + "execution_count": 5, "id": "dd6a3518-b752-4a1e-b77b-9e03e853c3ed", "metadata": {}, "outputs": [ @@ -103,7 +103,7 @@ "name": "stderr", "output_type": "stream", "text": [ - "/tmp/ipykernel_4561/4135596479.py:10: DtypeWarning: Columns (1) have mixed types. Specify dtype option on import or set low_memory=False.\n", + "/tmp/ipykernel_15896/4135596479.py:10: DtypeWarning: Columns (1) have mixed types. Specify dtype option on import or set low_memory=False.\n", " df = pd.read_csv(file_in)\n" ] } @@ -124,6 +124,370 @@ " globals()[nom_dataframe] = df" ] }, + { + "cell_type": "markdown", + "id": "e908f516-2a74-45d6-8492-7dcdc3afbe1f", + "metadata": {}, + "source": [ + "## tickets.csv" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "id": "14f4158e-c9c0-4beb-826a-5e0f949434a4", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
idnumbercreated_atupdated_atpurchase_idproduct_idis_from_subscriptiontype_ofsupplier_idbarcodeidentifier
013070859135930026612882021-12-28 20:47:10.320641+01:002022-02-14 18:46:53.614229+01:005107462225251False13NaNb6ad7fc36f33b5e05f58c7fca06688a6
113070860135930026613992021-12-28 20:47:10.321037+01:002022-02-14 18:46:53.614761+01:005107462224914False13NaNb0903af480266f27802fe5c38c277c9e
213070861135930026614192021-12-28 20:47:10.321629+01:002022-02-14 18:46:53.615521+01:005107462224914False13NaN64ca12b7e26a65b90335c0702ea0faba
313070862135930026615082021-12-28 20:47:10.322029+01:002022-02-14 18:46:53.616000+01:005107462224914False13NaN5ac2f8150aa9f3a6b1599df08cc2f0c7
413070863135930026616892021-12-28 20:47:10.322449+01:002022-02-14 18:46:53.616447+01:005107462224914False13NaNdfe30081bae020d12094279926136b9c
....................................
182666720662815135930161543902023-11-09 07:51:34.935983+01:002023-11-09 07:51:34.935983+01:008007697405689False13NaNdba9aa428f843b79ae69dfacfe8fc579
182666820662816135930161545012023-11-09 07:51:34.937038+01:002023-11-09 07:51:34.937038+01:008007698403658False13NaN93f1fcfc6ba4fa68f92eb4b4a619fcf0
182666920662817135930161546802023-11-09 07:51:34.938224+01:002023-11-09 07:51:34.938224+01:008007698403658False13NaNc8bbbd25df2c158767ceef42c3237f23
182667020662818135930161548992023-11-09 07:51:34.939328+01:002023-11-09 07:51:34.939328+01:008007699403658False13NaN738f0a8b5088b5056bc3b32eff2dca1f
182667120662819135930161549882023-11-09 07:51:34.940680+01:002023-11-09 07:51:34.940680+01:008007699403658False13NaN4c5a6195434377380b4e6ae63b2e9cf6
\n", + "

1826672 rows × 11 columns

\n", + "
" + ], + "text/plain": [ + " id number created_at \\\n", + "0 13070859 13593002661288 2021-12-28 20:47:10.320641+01:00 \n", + "1 13070860 13593002661399 2021-12-28 20:47:10.321037+01:00 \n", + "2 13070861 13593002661419 2021-12-28 20:47:10.321629+01:00 \n", + "3 13070862 13593002661508 2021-12-28 20:47:10.322029+01:00 \n", + "4 13070863 13593002661689 2021-12-28 20:47:10.322449+01:00 \n", + "... ... ... ... \n", + "1826667 20662815 13593016154390 2023-11-09 07:51:34.935983+01:00 \n", + "1826668 20662816 13593016154501 2023-11-09 07:51:34.937038+01:00 \n", + "1826669 20662817 13593016154680 2023-11-09 07:51:34.938224+01:00 \n", + "1826670 20662818 13593016154899 2023-11-09 07:51:34.939328+01:00 \n", + "1826671 20662819 13593016154988 2023-11-09 07:51:34.940680+01:00 \n", + "\n", + " updated_at purchase_id product_id \\\n", + "0 2022-02-14 18:46:53.614229+01:00 5107462 225251 \n", + "1 2022-02-14 18:46:53.614761+01:00 5107462 224914 \n", + "2 2022-02-14 18:46:53.615521+01:00 5107462 224914 \n", + "3 2022-02-14 18:46:53.616000+01:00 5107462 224914 \n", + "4 2022-02-14 18:46:53.616447+01:00 5107462 224914 \n", + "... ... ... ... \n", + "1826667 2023-11-09 07:51:34.935983+01:00 8007697 405689 \n", + "1826668 2023-11-09 07:51:34.937038+01:00 8007698 403658 \n", + "1826669 2023-11-09 07:51:34.938224+01:00 8007698 403658 \n", + "1826670 2023-11-09 07:51:34.939328+01:00 8007699 403658 \n", + "1826671 2023-11-09 07:51:34.940680+01:00 8007699 403658 \n", + "\n", + " is_from_subscription type_of supplier_id barcode \\\n", + "0 False 1 3 NaN \n", + "1 False 1 3 NaN \n", + "2 False 1 3 NaN \n", + "3 False 1 3 NaN \n", + "4 False 1 3 NaN \n", + "... ... ... ... ... \n", + "1826667 False 1 3 NaN \n", + "1826668 False 1 3 NaN \n", + "1826669 False 1 3 NaN \n", + "1826670 False 1 3 NaN \n", + "1826671 False 1 3 NaN \n", + "\n", + " identifier \n", + "0 b6ad7fc36f33b5e05f58c7fca06688a6 \n", + "1 b0903af480266f27802fe5c38c277c9e \n", + "2 64ca12b7e26a65b90335c0702ea0faba \n", + "3 5ac2f8150aa9f3a6b1599df08cc2f0c7 \n", + "4 dfe30081bae020d12094279926136b9c \n", + "... ... \n", + "1826667 dba9aa428f843b79ae69dfacfe8fc579 \n", + "1826668 93f1fcfc6ba4fa68f92eb4b4a619fcf0 \n", + "1826669 c8bbbd25df2c158767ceef42c3237f23 \n", + "1826670 738f0a8b5088b5056bc3b32eff2dca1f \n", + "1826671 4c5a6195434377380b4e6ae63b2e9cf6 \n", + "\n", + "[1826672 rows x 11 columns]" + ] + }, + "execution_count": 6, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df1_tickets" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "id": "f3c35394-b586-4ae4-b5ab-b03bb01bb618", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "RangeIndex: 1826672 entries, 0 to 1826671\n", + "Data columns (total 11 columns):\n", + " # Column Dtype \n", + "--- ------ ----- \n", + " 0 id int64 \n", + " 1 number object \n", + " 2 created_at object \n", + " 3 updated_at object \n", + " 4 purchase_id int64 \n", + " 5 product_id int64 \n", + " 6 is_from_subscription bool \n", + " 7 type_of int64 \n", + " 8 supplier_id int64 \n", + " 9 barcode float64\n", + " 10 identifier object \n", + "dtypes: bool(1), float64(1), int64(5), object(4)\n", + "memory usage: 141.1+ MB\n" + ] + } + ], + "source": [ + "df1_tickets.info()" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "id": "c1b42769-03c7-4785-92ce-5e1e6b41908d", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "id 0.0\n", + "number 0.0\n", + "created_at 0.0\n", + "updated_at 0.0\n", + "purchase_id 0.0\n", + "product_id 0.0\n", + "is_from_subscription 0.0\n", + "type_of 0.0\n", + "supplier_id 0.0\n", + "barcode 100.0\n", + "identifier 0.0\n", + "dtype: float64" + ] + }, + "execution_count": 8, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df1_tickets.isna().sum()/len(df1_tickets)*100" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "id": "42896791-2d93-4725-a50b-6c7cbe535ec7", + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/tmp/ipykernel_15896/232847087.py:3: SettingWithCopyWarning: \n", + "A value is trying to be set on a copy of a slice from a DataFrame\n", + "\n", + "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n", + " df1_tickets_clean.rename(columns = {'id' : 'ticket_id'}, inplace = True)\n" + ] + } + ], + "source": [ + "# Selection des variables\n", + "df1_tickets_clean = df1_tickets[['id', 'purchase_id', 'product_id', 'is_from_subscription', 'type_of', 'supplier_id']]\n", + "df1_tickets_clean.rename(columns = {'id' : 'ticket_id'}, inplace = True)" + ] + }, { "cell_type": "markdown", "id": "78453f3c-4f89-44ed-a6c6-2a7443b72b52", @@ -134,7 +498,7 @@ }, { "cell_type": "code", - "execution_count": 18, + "execution_count": 10, "id": "2e0dada0-9457-484c-aa55-77e44613ecca", "metadata": {}, "outputs": [ @@ -318,19 +682,18 @@ "8 NaN 11c6d471fa4e354e62e684d293694202 " ] }, - "execution_count": 18, + "execution_count": 10, "metadata": {}, "output_type": "execute_result" } ], "source": [ - "# Restriction aux DataFrame : ticket, purchase, consumption, suppliers\n", "df1_suppliers" ] }, { "cell_type": "code", - "execution_count": 19, + "execution_count": 11, "id": "b583be02-ab60-4e14-9325-0204f203a1af", "metadata": {}, "outputs": [ @@ -363,21 +726,409 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 12, "id": "6d7f338e-e4d3-422b-9cdc-dec967c0b28e", "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "text/plain": [ + "id 0.0\n", + "name 0.0\n", + "manually_added 0.0\n", + "label 100.0\n", + "itr 100.0\n", + "updated_at 0.0\n", + "created_at 0.0\n", + "commission 100.0\n", + "identifier 0.0\n", + "dtype: float64" + ] + }, + "execution_count": 12, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ - "df1_suppliers.isna().mean()*100" + "df1_suppliers.isna().sum()/len(df1_suppliers)*100" ] }, { "cell_type": "code", - "execution_count": null, + "execution_count": 34, "id": "3c645ab7-16bf-4054-9ae2-15a8c32e29c6", "metadata": {}, - "outputs": [], - "source": [] + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/tmp/ipykernel_15896/302783287.py:3: SettingWithCopyWarning: \n", + "A value is trying to be set on a copy of a slice from a DataFrame\n", + "\n", + "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n", + " df1_suppliers_clean.rename(columns = {'name' : 'supplier_name'}, inplace = True)\n" + ] + } + ], + "source": [ + "# Selection des variables\n", + "df1_suppliers_clean = df1_suppliers[['id', 'name']]\n", + "df1_suppliers_clean.rename(columns = {'name' : 'supplier_name'}, inplace = True)" + ] + }, + { + "cell_type": "code", + "execution_count": 35, + "id": "4de7e2e2-6da4-4618-8444-b524399c5493", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
idsupplier_name
01617j4 administration
18non défini
24vad
31fort saint jean
42j4
55revendeur
63vente en ligne
76ccr
87dab
\n", + "
" + ], + "text/plain": [ + " id supplier_name\n", + "0 1617 j4 administration\n", + "1 8 non défini\n", + "2 4 vad\n", + "3 1 fort saint jean\n", + "4 2 j4\n", + "5 5 revendeur\n", + "6 3 vente en ligne\n", + "7 6 ccr\n", + "8 7 dab" + ] + }, + "execution_count": 35, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df1_suppliers_clean" + ] + }, + { + "cell_type": "markdown", + "id": "0a6df975-c7fc-45bc-92af-a0bdab17d795", + "metadata": {}, + "source": [ + "## type_ofs.csv" + ] + }, + { + "cell_type": "code", + "execution_count": 15, + "id": "a02f6594-3e91-4e87-bbb6-649c28d4f7e9", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
idnamechildrencreated_atupdated_atidentifier
01Atelierpricing_formula2021-01-05 11:55:51.188106+01:002021-01-05 11:55:51.188106+01:00623ec4067827558b28972cf39fe81ee7
12Billet en nombrepricing_formula2021-01-11 12:13:19.286301+01:002021-01-11 12:13:19.286301+01:00a53d313a97296ee37caa066dbfe7a45c
23Groupepricing_formula2021-01-11 12:19:22.842917+01:002021-01-11 12:19:22.842917+01:001ab143efc3b85acbbc752fe8eb2b0b86
34Revendeurpricing_formula2021-01-12 12:34:20.481236+01:002021-01-12 12:34:20.481236+01:008b332723366a07e1eef5f1c92f9ae067
45Cinéma scolairepricing_formula2021-01-25 19:16:05.141719+01:002021-01-25 19:16:05.141719+01:00a12e62cb4c4f47e7406bd8fbff2bfe30
56Musée famillepricing_formula2021-01-25 19:23:06.692627+01:002021-01-25 19:23:06.692627+01:001ec6c19283111ccb3ed67f52d414470e
67Spectacle famillepricing_formula2021-01-25 19:28:21.390016+01:002021-01-25 19:28:21.390016+01:0005e2104f1b74ced229c06847d6e91938
78Masterclasspricing_formula2021-01-25 19:31:05.076904+01:002021-01-25 19:31:05.076904+01:009cc946edfb25e11b4282f58db16e6ae9
89Spectaclepricing_formula2021-01-25 19:38:41.260535+01:002021-01-25 19:38:41.260535+01:00d88321c347f0e0ab101184cdf25c94bf
910Cinemapricing_formula2021-02-05 11:12:31.932576+01:002021-02-05 11:12:31.932576+01:000870fef2bfcd5b30a12e4f5c7f4aaba7
1011Museepricing_formula2021-02-05 11:52:05.468207+01:002021-02-05 11:52:05.468207+01:008ba8934454cc62c7cdb3eb6e1b39df0c
1112Tarifs pleincategory2023-03-13 11:31:50.528331+01:002023-03-13 11:31:50.528331+01:00a6969df76efc15d157be48e87a7bcf9a
\n", + "
" + ], + "text/plain": [ + " id name children created_at \\\n", + "0 1 Atelier pricing_formula 2021-01-05 11:55:51.188106+01:00 \n", + "1 2 Billet en nombre pricing_formula 2021-01-11 12:13:19.286301+01:00 \n", + "2 3 Groupe pricing_formula 2021-01-11 12:19:22.842917+01:00 \n", + "3 4 Revendeur pricing_formula 2021-01-12 12:34:20.481236+01:00 \n", + "4 5 Cinéma scolaire pricing_formula 2021-01-25 19:16:05.141719+01:00 \n", + "5 6 Musée famille pricing_formula 2021-01-25 19:23:06.692627+01:00 \n", + "6 7 Spectacle famille pricing_formula 2021-01-25 19:28:21.390016+01:00 \n", + "7 8 Masterclass pricing_formula 2021-01-25 19:31:05.076904+01:00 \n", + "8 9 Spectacle pricing_formula 2021-01-25 19:38:41.260535+01:00 \n", + "9 10 Cinema pricing_formula 2021-02-05 11:12:31.932576+01:00 \n", + "10 11 Musee pricing_formula 2021-02-05 11:52:05.468207+01:00 \n", + "11 12 Tarifs plein category 2023-03-13 11:31:50.528331+01:00 \n", + "\n", + " updated_at identifier \n", + "0 2021-01-05 11:55:51.188106+01:00 623ec4067827558b28972cf39fe81ee7 \n", + "1 2021-01-11 12:13:19.286301+01:00 a53d313a97296ee37caa066dbfe7a45c \n", + "2 2021-01-11 12:19:22.842917+01:00 1ab143efc3b85acbbc752fe8eb2b0b86 \n", + "3 2021-01-12 12:34:20.481236+01:00 8b332723366a07e1eef5f1c92f9ae067 \n", + "4 2021-01-25 19:16:05.141719+01:00 a12e62cb4c4f47e7406bd8fbff2bfe30 \n", + "5 2021-01-25 19:23:06.692627+01:00 1ec6c19283111ccb3ed67f52d414470e \n", + "6 2021-01-25 19:28:21.390016+01:00 05e2104f1b74ced229c06847d6e91938 \n", + "7 2021-01-25 19:31:05.076904+01:00 9cc946edfb25e11b4282f58db16e6ae9 \n", + "8 2021-01-25 19:38:41.260535+01:00 d88321c347f0e0ab101184cdf25c94bf \n", + "9 2021-02-05 11:12:31.932576+01:00 0870fef2bfcd5b30a12e4f5c7f4aaba7 \n", + "10 2021-02-05 11:52:05.468207+01:00 8ba8934454cc62c7cdb3eb6e1b39df0c \n", + "11 2023-03-13 11:31:50.528331+01:00 a6969df76efc15d157be48e87a7bcf9a " + ] + }, + "execution_count": 15, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df1_type_ofs" + ] + }, + { + "cell_type": "code", + "execution_count": 16, + "id": "e9c8d32b-22f4-4581-8af7-31cc1c31fa0e", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "RangeIndex: 12 entries, 0 to 11\n", + "Data columns (total 6 columns):\n", + " # Column Non-Null Count Dtype \n", + "--- ------ -------------- ----- \n", + " 0 id 12 non-null int64 \n", + " 1 name 12 non-null object\n", + " 2 children 12 non-null object\n", + " 3 created_at 12 non-null object\n", + " 4 updated_at 12 non-null object\n", + " 5 identifier 12 non-null object\n", + "dtypes: int64(1), object(5)\n", + "memory usage: 704.0+ bytes\n" + ] + } + ], + "source": [ + "df1_type_ofs.info()" + ] + }, + { + "cell_type": "code", + "execution_count": 19, + "id": "cbb5e614-1fe5-4da0-bca0-8a242e0885da", + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/tmp/ipykernel_15896/81842251.py:3: SettingWithCopyWarning: \n", + "A value is trying to be set on a copy of a slice from a DataFrame\n", + "\n", + "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n", + " df1_type_ofs_clean.rename(columns = {'name' : 'type_of_ticket_name'}, inplace = True)\n" + ] + } + ], + "source": [ + "# Selection des variables\n", + "df1_type_ofs_clean = df1_type_ofs[['id', 'name', 'children']]\n", + "df1_type_ofs_clean.rename(columns = {'name' : 'type_of_ticket_name'}, inplace = True)" + ] }, { "cell_type": "markdown", @@ -389,3195 +1140,11 @@ }, { "cell_type": "code", - "execution_count": null, - "id": "14f4158e-c9c0-4beb-826a-5e0f949434a4", - "metadata": {}, - "outputs": [], - "source": [] - }, - { - "cell_type": "markdown", - "id": "79c9eb43-002e-460d-acb2-206ebb2ab6dd", - "metadata": {}, - "source": [ - "## tickets.csv" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "f3c35394-b586-4ae4-b5ab-b03bb01bb618", - "metadata": {}, - "outputs": [], - "source": [ - "df1_purchases\n", - "df1_tickets" - ] - }, - { - "cell_type": "markdown", - "id": "355f5489-7904-4161-a85b-6eb70b3a4c89", - "metadata": { - "jp-MarkdownHeadingCollapsed": true - }, - "source": [ - "# Fusion et exploration" - ] - }, - { - "cell_type": "code", - "execution_count": 24, - "id": "afe548fe-d93c-4634-9f53-881404ec4c6c", - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
id_xpurchase_datetype_ofis_from_subscriptionamountis_full_pricestart_date_timeevent_name
09924232023-01-11 17:08:41+01:003False13.0False2023-02-06 20:00:00+01:00zaide
19924232023-01-11 17:08:41+01:003False13.0False2023-02-06 20:00:00+01:00zaide
210539342023-03-16 16:23:10+01:003False62.0False2023-03-19 16:00:00+01:00luisa miller
310539342023-03-16 16:23:10+01:003False62.0False2023-03-19 16:00:00+01:00luisa miller
411891412020-11-26 13:12:53+01:003False51.3False2020-12-01 20:00:00+01:00iphigenie en tauride
...........................
31896410908392019-05-19 21:18:36+02:001False4.5False2019-05-27 20:00:00+02:00entre femmes
31896510908392019-05-19 21:18:36+02:001False4.5False2019-05-27 20:00:00+02:00entre femmes
31896610908392019-05-19 21:18:36+02:001False4.5False2019-05-27 20:00:00+02:00entre femmes
31896712442772019-12-31 11:04:07+01:001False5.5False2020-02-03 20:00:00+01:00a boire et a manger
31896812442772019-12-31 11:04:07+01:001False5.5False2020-02-03 20:00:00+01:00a boire et a manger
\n", - "

318969 rows × 8 columns

\n", - "
" - ], - "text/plain": [ - " id_x purchase_date type_of is_from_subscription \\\n", - "0 992423 2023-01-11 17:08:41+01:00 3 False \n", - "1 992423 2023-01-11 17:08:41+01:00 3 False \n", - "2 1053934 2023-03-16 16:23:10+01:00 3 False \n", - "3 1053934 2023-03-16 16:23:10+01:00 3 False \n", - "4 1189141 2020-11-26 13:12:53+01:00 3 False \n", - "... ... ... ... ... \n", - "318964 1090839 2019-05-19 21:18:36+02:00 1 False \n", - "318965 1090839 2019-05-19 21:18:36+02:00 1 False \n", - "318966 1090839 2019-05-19 21:18:36+02:00 1 False \n", - "318967 1244277 2019-12-31 11:04:07+01:00 1 False \n", - "318968 1244277 2019-12-31 11:04:07+01:00 1 False \n", - "\n", - " amount is_full_price start_date_time event_name \n", - "0 13.0 False 2023-02-06 20:00:00+01:00 zaide \n", - "1 13.0 False 2023-02-06 20:00:00+01:00 zaide \n", - "2 62.0 False 2023-03-19 16:00:00+01:00 luisa miller \n", - "3 62.0 False 2023-03-19 16:00:00+01:00 luisa miller \n", - "4 51.3 False 2020-12-01 20:00:00+01:00 iphigenie en tauride \n", - "... ... ... ... ... \n", - "318964 4.5 False 2019-05-27 20:00:00+02:00 entre femmes \n", - "318965 4.5 False 2019-05-27 20:00:00+02:00 entre femmes \n", - "318966 4.5 False 2019-05-27 20:00:00+02:00 entre femmes \n", - "318967 5.5 False 2020-02-03 20:00:00+01:00 a boire et a manger \n", - "318968 5.5 False 2020-02-03 20:00:00+01:00 a boire et a manger \n", - "\n", - "[318969 rows x 8 columns]" - ] - }, - "execution_count": 24, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "# Jointure\n", - "var_choosed = ['id_x', 'customer_id','product_id', 'purchase_date', 'type_of', 'is_from_subscription']\n", - "merge_1 = pd.merge(purchases, tickets, left_on='id', right_on='purchase_id', how='inner')[var_choosed]\n", - "\n", - "var_choosed.extend(['amount', 'is_full_price', 'representation_id'])\n", - "merge_2 = pd.merge(products, merge_1, left_on='id', right_on='product_id', how='inner')[var_choosed]\n", - "\n", - "var_choosed.remove('representation_id')\n", - "var_choosed.extend(['start_date_time', 'event_id'])\n", - "merge_3 = pd.merge(representations, merge_2, left_on='id', right_on='representation_id', how='inner')[var_choosed]\n", - "\n", - "var_choosed.remove('event_id')\n", - "var_choosed.extend(['name', 'customer_id'])\n", - "merge_4 = pd.merge(events, merge_3, left_on='id', right_on='event_id', how='inner')[var_choosed]\n", - "\n", - "# Changement de nom\n", - "merge_4 = merge_4.rename(columns={'name': 'event_name'})\n", - "var_choosed[var_choosed.index('name')] = \"event_name\"\n", - "\n", - "# Base finale\n", - "var_choosed.extend(['age', 'gender', 'country', 'fidelity', 'profession'])\n", - "df_customer_event = pd.merge(customersplus, merge_4, left_on = 'id', right_on = 'customer_id', how = 'inner')[var_choosed]\n", - "df_customer_event" - ] - }, - { - "cell_type": "markdown", - "id": "779da86b-ac61-4c61-88d2-fa1c0c19efce", - "metadata": {}, - "source": [ - "## Type de client au globale" - ] - }, - { - "cell_type": "code", - "execution_count": 4, - "id": "7c89d25f-ee42-4478-9ff0-ee64b781d5c8", - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Index(['id', 'customer_id', 'target_id', 'created_at', 'updated_at', 'name',\n", - " 'extra_field'],\n", - " dtype='object')\n", - "(124302, 7)\n", - "\n", - "RangeIndex: 124302 entries, 0 to 124301\n", - "Data columns (total 7 columns):\n", - " # Column Non-Null Count Dtype \n", - "--- ------ -------------- ----- \n", - " 0 id 124302 non-null int64 \n", - " 1 customer_id 124302 non-null int64 \n", - " 2 target_id 124302 non-null int64 \n", - " 3 created_at 124296 non-null object \n", - " 4 updated_at 124296 non-null object \n", - " 5 name 0 non-null float64\n", - " 6 extra_field 0 non-null float64\n", - "dtypes: float64(2), int64(3), object(2)\n", - "memory usage: 6.6+ MB\n" - ] - } - ], - "source": [ - "# Client\n", - "print(customer_target_mappings.columns)\n", - "print(customer_target_mappings.shape)\n", - "customer_target_mappings.info()" - ] - }, - { - "cell_type": "code", - "execution_count": 26, - "id": "c4b6bdcc-9f13-449b-9a8b-c5ca794637be", - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "array([nan])" - ] - }, - "execution_count": 26, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "customer_target_mappings['extra_field'].unique()" - ] - }, - { - "cell_type": "code", - "execution_count": 27, - "id": "47bc8453-0693-4838-8bd8-4d800a82c496", - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "array([nan])" - ] - }, - "execution_count": 27, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "customer_target_mappings['name'].unique()" - ] - }, - { - "cell_type": "code", - "execution_count": 11, - "id": "ab3f937b-ef62-499a-8ee2-d47d1d988ace", - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Index(['id', 'is_import', 'name', 'created_at', 'updated_at', 'identifier'], dtype='object')\n", - "(4, 6)\n", - "\n", - "RangeIndex: 4 entries, 0 to 3\n", - "Data columns (total 6 columns):\n", - " # Column Non-Null Count Dtype \n", - "--- ------ -------------- ----- \n", - " 0 id 4 non-null int64 \n", - " 1 is_import 4 non-null bool \n", - " 2 name 4 non-null object\n", - " 3 created_at 4 non-null object\n", - " 4 updated_at 4 non-null object\n", - " 5 identifier 4 non-null object\n", - "dtypes: bool(1), int64(1), object(4)\n", - "memory usage: 292.0+ bytes\n" - ] - } - ], - "source": [ - "# Segmentation existante\n", - "print(target_types.columns)\n", - "print(target_types.shape)\n", - "target_types.info()" - ] - }, - { - "cell_type": "code", - "execution_count": 12, - "id": "b70488b9-38fc-40a8-9e2f-3330b3f9eef5", + "execution_count": 20, + "id": "f8d36b72-f8e7-45e5-b4fa-e0803493fd3c", "metadata": { "scrolled": true }, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
idis_importnamecreated_atupdated_atidentifier
01Falsemanual_static_filter2021-04-29 13:42:14.111085+02:002021-04-29 13:42:14.111085+02:00fb27e81baa4debc6a4e1a8639c20e808
13Truemanual_structure2021-05-07 15:20:00.626650+02:002021-05-07 15:20:00.626650+02:00382bca214204a2d3462f5ec2728d5d1e
26Falsemanual_dynamic_filter2021-09-09 14:27:47.641302+02:002021-09-09 14:27:47.641302+02:00e0f4b8693184850fefd6d2a38f10584e
32Truemanual_import2021-04-29 13:49:30.107110+02:002021-04-29 13:49:30.107110+02:0012213df2ce68a624e4c0070521437bac
\n", - "
" - ], - "text/plain": [ - " id is_import name created_at \\\n", - "0 1 False manual_static_filter 2021-04-29 13:42:14.111085+02:00 \n", - "1 3 True manual_structure 2021-05-07 15:20:00.626650+02:00 \n", - "2 6 False manual_dynamic_filter 2021-09-09 14:27:47.641302+02:00 \n", - "3 2 True manual_import 2021-04-29 13:49:30.107110+02:00 \n", - "\n", - " updated_at identifier \n", - "0 2021-04-29 13:42:14.111085+02:00 fb27e81baa4debc6a4e1a8639c20e808 \n", - "1 2021-05-07 15:20:00.626650+02:00 382bca214204a2d3462f5ec2728d5d1e \n", - "2 2021-09-09 14:27:47.641302+02:00 e0f4b8693184850fefd6d2a38f10584e \n", - "3 2021-04-29 13:49:30.107110+02:00 12213df2ce68a624e4c0070521437bac " - ] - }, - "execution_count": 12, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "target_types" - ] - }, - { - "cell_type": "code", - "execution_count": 17, - "id": "8dd74e87-97c2-493d-b19f-971b684078d3", - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Index(['id', 'name', 'created_at', 'updated_at', 'identifier'], dtype='object')\n", - "(20, 5)\n", - "\n", - "RangeIndex: 20 entries, 0 to 19\n", - "Data columns (total 5 columns):\n", - " # Column Non-Null Count Dtype \n", - "--- ------ -------------- ----- \n", - " 0 id 20 non-null int64 \n", - " 1 name 19 non-null object\n", - " 2 created_at 20 non-null object\n", - " 3 updated_at 20 non-null object\n", - " 4 identifier 20 non-null object\n", - "dtypes: int64(1), object(4)\n", - "memory usage: 928.0+ bytes\n" - ] - } - ], - "source": [ - "# Tags = clients\n", - "FILE_PATH_S3 = 'bdc2324-data/11/11tags.csv'\n", - "\n", - "with fs.open(FILE_PATH_S3, mode=\"rb\") as file_in:\n", - " tags = pd.read_csv(file_in, sep=\",\")\n", - "\n", - "print(tags.columns)\n", - "print(tags.shape)\n", - "tags.info()" - ] - }, - { - "cell_type": "code", - "execution_count": 18, - "id": "91d54732-666c-4250-ba91-5c9b83d4712a", - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
idnamecreated_atupdated_atidentifier
02ens-écoles2021-05-07 15:24:19.808501+02:002021-05-07 15:24:19.808501+02:00b6a360c5f84595940c5774f13fd39cc3
11NaN2021-05-07 15:24:19.805589+02:002021-05-07 15:24:19.805589+02:00d41d8cd98f00b204e9800998ecf8427e
24ecoles primaires rennes2021-05-07 15:29:06.388415+02:002021-05-07 15:29:06.388415+02:00ca8649dd64c240d118f60b07d11a7053
35Angers Nantes Opéra2023-01-27 15:59:58.187557+01:002023-01-27 15:59:58.187557+01:00f8f500f937fe312542399299cdc13f7e
46Opéras2023-01-27 16:03:59.654938+01:002023-01-27 16:03:59.654938+01:0022eb2c616983ec7b54a093f84b230505
57Ministère de la Culture2023-01-30 11:22:29.636813+01:002023-01-30 11:22:29.636813+01:001b8c5c08fde000d90905a3d14af7763d
68Orchestres2023-01-30 11:33:56.392799+01:002023-01-30 11:33:56.392799+01:007c2aee0c80642d7e325a450f2dec45e5
79Cooperative2023-01-31 14:44:38.471146+01:002023-01-31 14:44:38.471146+01:006c88c36ffaab88d255865aa3111d7686
810Théâtres2023-01-31 14:45:17.804428+01:002023-01-31 14:45:17.804428+01:00b2c19672df82021702b79482c8cda85a
911La co[opera]tive2023-02-16 17:11:35.004478+01:002023-02-16 17:11:35.004478+01:005dbaa3a1f278c0fcf981d447ad20957a
1012Ville de Rennes2023-02-16 17:37:13.816196+01:002023-02-16 17:37:13.816196+01:00bc483d04d9c3a08f167a3ce64366ca72
1113Ensembles en résidence2023-02-16 17:55:54.877374+01:002023-02-16 17:55:54.877374+01:00e70635e771de13268dccf02bb2abfaf9
1214Ministère2023-02-17 11:17:54.429462+01:002023-02-17 11:17:54.429462+01:00a3f0582853fd19f5b57e3651f8a20e7a
1315Rennes métropole2023-02-17 11:53:24.490786+01:002023-02-17 11:53:24.490786+01:00e98b8db5941b96c29c353b6f2f502055
1416Ville de Rennes - équipements culturels2023-02-17 12:00:10.649104+01:002023-02-17 12:00:10.649104+01:00a44edffc7edb852982efa7f4aa6d0e25
1517Structures culturelles rennaises2023-02-17 12:05:55.583016+01:002023-02-17 12:05:55.583016+01:00241550517e4e3b1c926e9aeab0f621cd
1618Université Rennes 22023-02-17 14:23:44.832959+01:002023-02-17 14:23:44.832959+01:004057c5cee51c4e10aa819f0cf48adc3f
1719Centres chorégraphiques nationaux2023-02-17 15:29:41.827321+01:002023-02-17 15:29:41.827321+01:0041e75941dfb766365498d917abe0102f
1820Télévision2023-02-17 15:46:13.746092+01:002023-02-17 15:46:13.746092+01:0036d6409c539dd79c1f3af8c5948603eb
1921structures culturelles nationales2023-02-17 15:56:00.555722+01:002023-02-17 15:56:00.555722+01:005311cf7e42aac53289e1c4a338d5cfa4
\n", - "
" - ], - "text/plain": [ - " id name \\\n", - "0 2 ens-écoles \n", - "1 1 NaN \n", - "2 4 ecoles primaires rennes \n", - "3 5 Angers Nantes Opéra \n", - "4 6 Opéras \n", - "5 7 Ministère de la Culture \n", - "6 8 Orchestres \n", - "7 9 Cooperative \n", - "8 10 Théâtres \n", - "9 11 La co[opera]tive \n", - "10 12 Ville de Rennes \n", - "11 13 Ensembles en résidence \n", - "12 14 Ministère \n", - "13 15 Rennes métropole \n", - "14 16 Ville de Rennes - équipements culturels \n", - "15 17 Structures culturelles rennaises \n", - "16 18 Université Rennes 2 \n", - "17 19 Centres chorégraphiques nationaux \n", - "18 20 Télévision \n", - "19 21 structures culturelles nationales \n", - "\n", - " created_at updated_at \\\n", - "0 2021-05-07 15:24:19.808501+02:00 2021-05-07 15:24:19.808501+02:00 \n", - "1 2021-05-07 15:24:19.805589+02:00 2021-05-07 15:24:19.805589+02:00 \n", - "2 2021-05-07 15:29:06.388415+02:00 2021-05-07 15:29:06.388415+02:00 \n", - "3 2023-01-27 15:59:58.187557+01:00 2023-01-27 15:59:58.187557+01:00 \n", - "4 2023-01-27 16:03:59.654938+01:00 2023-01-27 16:03:59.654938+01:00 \n", - "5 2023-01-30 11:22:29.636813+01:00 2023-01-30 11:22:29.636813+01:00 \n", - "6 2023-01-30 11:33:56.392799+01:00 2023-01-30 11:33:56.392799+01:00 \n", - "7 2023-01-31 14:44:38.471146+01:00 2023-01-31 14:44:38.471146+01:00 \n", - "8 2023-01-31 14:45:17.804428+01:00 2023-01-31 14:45:17.804428+01:00 \n", - "9 2023-02-16 17:11:35.004478+01:00 2023-02-16 17:11:35.004478+01:00 \n", - "10 2023-02-16 17:37:13.816196+01:00 2023-02-16 17:37:13.816196+01:00 \n", - "11 2023-02-16 17:55:54.877374+01:00 2023-02-16 17:55:54.877374+01:00 \n", - "12 2023-02-17 11:17:54.429462+01:00 2023-02-17 11:17:54.429462+01:00 \n", - "13 2023-02-17 11:53:24.490786+01:00 2023-02-17 11:53:24.490786+01:00 \n", - "14 2023-02-17 12:00:10.649104+01:00 2023-02-17 12:00:10.649104+01:00 \n", - "15 2023-02-17 12:05:55.583016+01:00 2023-02-17 12:05:55.583016+01:00 \n", - "16 2023-02-17 14:23:44.832959+01:00 2023-02-17 14:23:44.832959+01:00 \n", - "17 2023-02-17 15:29:41.827321+01:00 2023-02-17 15:29:41.827321+01:00 \n", - "18 2023-02-17 15:46:13.746092+01:00 2023-02-17 15:46:13.746092+01:00 \n", - "19 2023-02-17 15:56:00.555722+01:00 2023-02-17 15:56:00.555722+01:00 \n", - "\n", - " identifier \n", - "0 b6a360c5f84595940c5774f13fd39cc3 \n", - "1 d41d8cd98f00b204e9800998ecf8427e \n", - "2 ca8649dd64c240d118f60b07d11a7053 \n", - "3 f8f500f937fe312542399299cdc13f7e \n", - "4 22eb2c616983ec7b54a093f84b230505 \n", - "5 1b8c5c08fde000d90905a3d14af7763d \n", - "6 7c2aee0c80642d7e325a450f2dec45e5 \n", - "7 6c88c36ffaab88d255865aa3111d7686 \n", - "8 b2c19672df82021702b79482c8cda85a \n", - "9 5dbaa3a1f278c0fcf981d447ad20957a \n", - "10 bc483d04d9c3a08f167a3ce64366ca72 \n", - "11 e70635e771de13268dccf02bb2abfaf9 \n", - "12 a3f0582853fd19f5b57e3651f8a20e7a \n", - "13 e98b8db5941b96c29c353b6f2f502055 \n", - "14 a44edffc7edb852982efa7f4aa6d0e25 \n", - "15 241550517e4e3b1c926e9aeab0f621cd \n", - "16 4057c5cee51c4e10aa819f0cf48adc3f \n", - "17 41e75941dfb766365498d917abe0102f \n", - "18 36d6409c539dd79c1f3af8c5948603eb \n", - "19 5311cf7e42aac53289e1c4a338d5cfa4 " - ] - }, - "execution_count": 18, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "tags" - ] - }, - { - "cell_type": "code", - "execution_count": 19, - "id": "4cc9f444-b7e6-4ee5-8ce8-64c63ab7825a", - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Index(['id', 'structure_id', 'tag_id', 'created_at', 'updated_at'], dtype='object')\n", - "(179, 5)\n", - "\n", - "RangeIndex: 179 entries, 0 to 178\n", - "Data columns (total 5 columns):\n", - " # Column Non-Null Count Dtype \n", - "--- ------ -------------- ----- \n", - " 0 id 179 non-null int64 \n", - " 1 structure_id 179 non-null int64 \n", - " 2 tag_id 179 non-null int64 \n", - " 3 created_at 179 non-null object\n", - " 4 updated_at 179 non-null object\n", - "dtypes: int64(3), object(2)\n", - "memory usage: 7.1+ KB\n" - ] - } - ], - "source": [ - "# Structure = clients\n", - "FILE_PATH_S3 = 'bdc2324-data/11/11structure_tag_mappings.csv'\n", - "\n", - "with fs.open(FILE_PATH_S3, mode=\"rb\") as file_in:\n", - " structure_tag_mappings = pd.read_csv(file_in, sep=\",\")\n", - "\n", - "print(structure_tag_mappings.columns)\n", - "print(structure_tag_mappings.shape)\n", - "structure_tag_mappings.info()" - ] - }, - { - "cell_type": "code", - "execution_count": 20, - "id": "dcf776df-5c8e-4972-b2c1-b41291ba7e66", - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
idstructure_idtag_idcreated_atupdated_at
012318762023-01-27 16:03:59.680222+01:002023-01-27 16:03:59.680222+01:00
12222021-05-07 15:24:19.872895+02:002021-05-07 15:24:19.872895+02:00
23322021-05-07 15:24:19.873830+02:002021-05-07 15:24:19.873830+02:00
34422021-05-07 15:24:19.874628+02:002021-05-07 15:24:19.874628+02:00
45522021-05-07 15:24:19.875421+02:002021-05-07 15:24:19.875421+02:00
..................
174184236102023-02-17 16:35:25.041114+01:002023-02-17 16:35:25.041114+01:00
175185237172023-02-17 16:39:10.799478+01:002023-02-17 16:39:10.799478+01:00
176186238192023-02-17 16:53:21.098690+01:002023-02-17 16:53:21.098690+01:00
177187239102023-02-17 16:57:42.623481+01:002023-02-17 16:57:42.623481+01:00
178188240102023-02-17 16:59:22.067723+01:002023-02-17 16:59:22.067723+01:00
\n", - "

179 rows × 5 columns

\n", - "
" - ], - "text/plain": [ - " id structure_id tag_id created_at \\\n", - "0 123 187 6 2023-01-27 16:03:59.680222+01:00 \n", - "1 2 2 2 2021-05-07 15:24:19.872895+02:00 \n", - "2 3 3 2 2021-05-07 15:24:19.873830+02:00 \n", - "3 4 4 2 2021-05-07 15:24:19.874628+02:00 \n", - "4 5 5 2 2021-05-07 15:24:19.875421+02:00 \n", - ".. ... ... ... ... \n", - "174 184 236 10 2023-02-17 16:35:25.041114+01:00 \n", - "175 185 237 17 2023-02-17 16:39:10.799478+01:00 \n", - "176 186 238 19 2023-02-17 16:53:21.098690+01:00 \n", - "177 187 239 10 2023-02-17 16:57:42.623481+01:00 \n", - "178 188 240 10 2023-02-17 16:59:22.067723+01:00 \n", - "\n", - " updated_at \n", - "0 2023-01-27 16:03:59.680222+01:00 \n", - "1 2021-05-07 15:24:19.872895+02:00 \n", - "2 2021-05-07 15:24:19.873830+02:00 \n", - "3 2021-05-07 15:24:19.874628+02:00 \n", - "4 2021-05-07 15:24:19.875421+02:00 \n", - ".. ... \n", - "174 2023-02-17 16:35:25.041114+01:00 \n", - "175 2023-02-17 16:39:10.799478+01:00 \n", - "176 2023-02-17 16:53:21.098690+01:00 \n", - "177 2023-02-17 16:57:42.623481+01:00 \n", - "178 2023-02-17 16:59:22.067723+01:00 \n", - "\n", - "[179 rows x 5 columns]" - ] - }, - "execution_count": 20, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "structure_tag_mappings" - ] - }, - { - "cell_type": "code", - "execution_count": 24, - "id": "41bf1529-5a7c-409e-9791-2024c08c11f0", - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Index(['id', 'lastname', 'firstname', 'birthdate', 'email', 'street_id',\n", - " 'created_at', 'updated_at', 'civility', 'is_partner', 'extra',\n", - " 'deleted_at', 'reference', 'gender', 'is_email_true', 'extra_field',\n", - " 'identifier', 'opt_in', 'structure_id', 'note', 'profession',\n", - " 'language', 'mcp_contact_id', 'need_reload', 'last_buying_date',\n", - " 'max_price', 'ticket_sum', 'average_price', 'fidelity',\n", - " 'average_purchase_delay', 'average_price_basket',\n", - " 'average_ticket_basket', 'total_price', 'preferred_category',\n", - " 'preferred_supplier', 'preferred_formula', 'purchase_count',\n", - " 'first_buying_date', 'last_visiting_date', 'zipcode', 'country', 'age',\n", - " 'tenant_id'],\n", - " dtype='object')\n", - "(71307, 43)\n", - "\n", - "RangeIndex: 71307 entries, 0 to 71306\n", - "Data columns (total 43 columns):\n", - " # Column Non-Null Count Dtype \n", - "--- ------ -------------- ----- \n", - " 0 id 71307 non-null int64 \n", - " 1 lastname 41045 non-null object \n", - " 2 firstname 39140 non-null object \n", - " 3 birthdate 18174 non-null object \n", - " 4 email 58203 non-null object \n", - " 5 street_id 71307 non-null int64 \n", - " 6 created_at 71307 non-null object \n", - " 7 updated_at 71307 non-null object \n", - " 8 civility 0 non-null float64\n", - " 9 is_partner 71307 non-null bool \n", - " 10 extra 0 non-null float64\n", - " 11 deleted_at 0 non-null float64\n", - " 12 reference 0 non-null float64\n", - " 13 gender 71307 non-null int64 \n", - " 14 is_email_true 71307 non-null bool \n", - " 15 extra_field 0 non-null float64\n", - " 16 identifier 71307 non-null object \n", - " 17 opt_in 71307 non-null bool \n", - " 18 structure_id 616 non-null float64\n", - " 19 note 451 non-null object \n", - " 20 profession 812 non-null object \n", - " 21 language 0 non-null float64\n", - " 22 mcp_contact_id 22417 non-null float64\n", - " 23 need_reload 71307 non-null bool \n", - " 24 last_buying_date 34040 non-null object \n", - " 25 max_price 34040 non-null float64\n", - " 26 ticket_sum 71307 non-null int64 \n", - " 27 average_price 68694 non-null float64\n", - " 28 fidelity 71307 non-null int64 \n", - " 29 average_purchase_delay 34040 non-null float64\n", - " 30 average_price_basket 34040 non-null float64\n", - " 31 average_ticket_basket 34040 non-null float64\n", - " 32 total_price 36653 non-null float64\n", - " 33 preferred_category 0 non-null float64\n", - " 34 preferred_supplier 0 non-null float64\n", - " 35 preferred_formula 0 non-null float64\n", - " 36 purchase_count 71307 non-null int64 \n", - " 37 first_buying_date 34040 non-null object \n", - " 38 last_visiting_date 0 non-null float64\n", - " 39 zipcode 33756 non-null object \n", - " 40 country 39910 non-null object \n", - " 41 age 18174 non-null float64\n", - " 42 tenant_id 71307 non-null int64 \n", - "dtypes: bool(4), float64(19), int64(7), object(13)\n", - "memory usage: 21.5+ MB\n" - ] - } - ], - "source": [ - "# Tags = clients\n", - "FILE_PATH_S3 = 'bdc2324-data/11/11customersplus.csv'\n", - "\n", - "with fs.open(FILE_PATH_S3, mode=\"rb\") as file_in:\n", - " customersplus = pd.read_csv(file_in, sep=\",\")\n", - "\n", - "print(customersplus.columns)\n", - "print(customersplus.shape)\n", - "customersplus.info()" - ] - }, - { - "cell_type": "code", - "execution_count": 25, - "id": "948a0b2b-8d1c-4afb-802e-670d67dd8c20", - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
idlastnamefirstnamebirthdateemailstreet_idcreated_atupdated_atcivilityis_partner...preferred_categorypreferred_supplierpreferred_formulapurchase_countfirst_buying_datelast_visiting_datezipcodecountryagetenant_id
0286834lastname286834firstname286834NaNemail28683462022-05-19 10:09:09.361137+02:002022-05-19 10:09:09.361137+02:00NaNFalse...NaNNaNNaN0NaNNaNNaNfrNaN1556
1330695NaNNaNNaNemail33069512022-07-16 04:10:34.135134+02:002022-07-16 04:10:34.156704+02:00NaNFalse...NaNNaNNaN0NaNNaNNaNNaNNaN1556
2330978NaNNaNNaNemail33097812022-07-21 22:14:09.811721+02:002022-07-21 22:14:09.836051+02:00NaNFalse...NaNNaNNaN0NaNNaNNaNNaNNaN1556
3338697NaNNaNNaNemail33869712022-09-15 19:02:03.950536+02:002022-09-15 19:02:03.985642+02:00NaNFalse...NaNNaNNaN0NaNNaNNaNNaNNaN1556
4338726NaNNaNNaNemail33872612022-09-16 01:24:40.719882+02:002022-09-16 01:24:40.742753+02:00NaNFalse...NaNNaNNaN0NaNNaNNaNNaNNaN1556
..................................................................
7130227105lastname27105firstname271051957-01-26email271052050242021-04-22 15:12:59.986534+02:002023-09-12 18:59:31.613235+02:00NaNFalse...NaNNaNNaN22018-12-31 18:56:57+01:00NaN35700fr66.01556
7130327108lastname27108firstname27108NaNNaN2050242021-04-22 15:12:59.989197+02:002023-09-12 18:27:34.380843+02:00NaNFalse...NaNNaNNaN62015-12-29 14:51:46+01:00NaN35700frNaN1556
7130427110lastname27110firstname27110NaNNaN62021-04-22 15:12:59.991029+02:002022-04-14 11:41:33.738500+02:00NaNFalse...NaNNaNNaN12018-12-31 19:12:59+01:00NaNNaNfrNaN1556
7130510607lastname10607firstname106071963-01-04email106073133322021-04-22 14:56:45.742226+02:002023-09-12 17:55:17.723195+02:00NaNFalse...NaNNaNNaN262015-10-10 14:11:21+02:00NaN35850fr60.01556
7130619095lastname19095firstname190951979-07-16email1909562021-04-22 15:06:30.120537+02:002023-09-12 18:27:36.904104+02:00NaNFalse...NaNNaNNaN22019-05-19 21:18:36+02:00NaNNaNfr44.01556
\n", - "

71307 rows × 43 columns

\n", - "
" - ], - "text/plain": [ - " id lastname firstname birthdate email \\\n", - "0 286834 lastname286834 firstname286834 NaN email286834 \n", - "1 330695 NaN NaN NaN email330695 \n", - "2 330978 NaN NaN NaN email330978 \n", - "3 338697 NaN NaN NaN email338697 \n", - "4 338726 NaN NaN NaN email338726 \n", - "... ... ... ... ... ... \n", - "71302 27105 lastname27105 firstname27105 1957-01-26 email27105 \n", - "71303 27108 lastname27108 firstname27108 NaN NaN \n", - "71304 27110 lastname27110 firstname27110 NaN NaN \n", - "71305 10607 lastname10607 firstname10607 1963-01-04 email10607 \n", - "71306 19095 lastname19095 firstname19095 1979-07-16 email19095 \n", - "\n", - " street_id created_at \\\n", - "0 6 2022-05-19 10:09:09.361137+02:00 \n", - "1 1 2022-07-16 04:10:34.135134+02:00 \n", - "2 1 2022-07-21 22:14:09.811721+02:00 \n", - "3 1 2022-09-15 19:02:03.950536+02:00 \n", - "4 1 2022-09-16 01:24:40.719882+02:00 \n", - "... ... ... \n", - "71302 205024 2021-04-22 15:12:59.986534+02:00 \n", - "71303 205024 2021-04-22 15:12:59.989197+02:00 \n", - "71304 6 2021-04-22 15:12:59.991029+02:00 \n", - "71305 313332 2021-04-22 14:56:45.742226+02:00 \n", - "71306 6 2021-04-22 15:06:30.120537+02:00 \n", - "\n", - " updated_at civility is_partner ... \\\n", - "0 2022-05-19 10:09:09.361137+02:00 NaN False ... \n", - "1 2022-07-16 04:10:34.156704+02:00 NaN False ... \n", - "2 2022-07-21 22:14:09.836051+02:00 NaN False ... \n", - "3 2022-09-15 19:02:03.985642+02:00 NaN False ... \n", - "4 2022-09-16 01:24:40.742753+02:00 NaN False ... \n", - "... ... ... ... ... \n", - "71302 2023-09-12 18:59:31.613235+02:00 NaN False ... \n", - "71303 2023-09-12 18:27:34.380843+02:00 NaN False ... \n", - "71304 2022-04-14 11:41:33.738500+02:00 NaN False ... \n", - "71305 2023-09-12 17:55:17.723195+02:00 NaN False ... \n", - "71306 2023-09-12 18:27:36.904104+02:00 NaN False ... \n", - "\n", - " preferred_category preferred_supplier preferred_formula \\\n", - "0 NaN NaN NaN \n", - "1 NaN NaN NaN \n", - "2 NaN NaN NaN \n", - "3 NaN NaN NaN \n", - "4 NaN NaN NaN \n", - "... ... ... ... \n", - "71302 NaN NaN NaN \n", - "71303 NaN NaN NaN \n", - "71304 NaN NaN NaN \n", - "71305 NaN NaN NaN \n", - "71306 NaN NaN NaN \n", - "\n", - " purchase_count first_buying_date last_visiting_date zipcode \\\n", - "0 0 NaN NaN NaN \n", - "1 0 NaN NaN NaN \n", - "2 0 NaN NaN NaN \n", - "3 0 NaN NaN NaN \n", - "4 0 NaN NaN NaN \n", - "... ... ... ... ... \n", - "71302 2 2018-12-31 18:56:57+01:00 NaN 35700 \n", - "71303 6 2015-12-29 14:51:46+01:00 NaN 35700 \n", - "71304 1 2018-12-31 19:12:59+01:00 NaN NaN \n", - "71305 26 2015-10-10 14:11:21+02:00 NaN 35850 \n", - "71306 2 2019-05-19 21:18:36+02:00 NaN NaN \n", - "\n", - " country age tenant_id \n", - "0 fr NaN 1556 \n", - "1 NaN NaN 1556 \n", - "2 NaN NaN 1556 \n", - "3 NaN NaN 1556 \n", - "4 NaN NaN 1556 \n", - "... ... ... ... \n", - "71302 fr 66.0 1556 \n", - "71303 fr NaN 1556 \n", - "71304 fr NaN 1556 \n", - "71305 fr 60.0 1556 \n", - "71306 fr 44.0 1556 \n", - "\n", - "[71307 rows x 43 columns]" - ] - }, - "execution_count": 25, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "customersplus" - ] - }, - { - "cell_type": "code", - "execution_count": 6, - "id": "c40c44a0-e7c2-4ad1-b700-0d6ea05d62b2", - "metadata": {}, - "outputs": [], - "source": [ - "# But : lier les caractéristiques socio-demo et les comportements d'achat\n" - ] - }, - { - "cell_type": "code", - "execution_count": 29, - "id": "8259ae6c-353f-43a6-add3-f974fac6e5d4", - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Index(['id', 'number', 'created_at', 'updated_at', 'purchase_id', 'product_id',\n", - " 'is_from_subscription', 'type_of', 'supplier_id', 'barcode',\n", - " 'identifier'],\n", - " dtype='object')\n", - "(318969, 11)\n", - "\n", - "RangeIndex: 318969 entries, 0 to 318968\n", - "Data columns (total 11 columns):\n", - " # Column Non-Null Count Dtype \n", - "--- ------ -------------- ----- \n", - " 0 id 318969 non-null int64 \n", - " 1 number 318969 non-null object \n", - " 2 created_at 318969 non-null object \n", - " 3 updated_at 318969 non-null object \n", - " 4 purchase_id 318969 non-null int64 \n", - " 5 product_id 318969 non-null int64 \n", - " 6 is_from_subscription 318969 non-null bool \n", - " 7 type_of 318969 non-null int64 \n", - " 8 supplier_id 318969 non-null int64 \n", - " 9 barcode 0 non-null float64\n", - " 10 identifier 318969 non-null object \n", - "dtypes: bool(1), float64(1), int64(5), object(4)\n", - "memory usage: 24.6+ MB\n" - ] - } - ], - "source": [ - "# tickets\n", - "FILE_PATH_S3 = 'bdc2324-data/11/11tickets.csv'\n", - "\n", - "with fs.open(FILE_PATH_S3, mode=\"rb\") as file_in:\n", - " tickets = pd.read_csv(file_in, sep=\",\")\n", - "\n", - "print(tickets.columns)\n", - "print(tickets.shape)\n", - "tickets.info()" - ] - }, - { - "cell_type": "code", - "execution_count": 30, - "id": "f54830cb-1f95-4f71-9b04-358c745fb454", - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
idnumbercreated_atupdated_atpurchase_idproduct_idis_from_subscriptiontype_ofsupplier_idbarcodeidentifier
021190811433_136_212_683562023-09-12 17:42:45.396336+02:002023-09-12 17:42:45.396336+02:00861764209879False11702NaNf694c255855ce5643c6fcc7fed5e9237
121190821433_136_194_683562023-09-12 17:42:45.409056+02:002023-09-12 17:42:45.409056+02:00861763209879False11702NaN838d6101db2fc8bc80536d8b91b49859
2211908333158_158_343_683572023-09-12 17:42:45.409824+02:002023-09-12 17:42:45.409824+02:00861769209880False11702NaN8a8d938d66a4dc57bcb44c2773c6fdfa
3211908433158_158_297_683572023-09-12 17:42:45.410447+02:002023-09-12 17:42:45.410447+02:00861767209880False11702NaNb7a3dd0794c0957c942d45b8913e5b96
4211908533158_158_318_683572023-09-12 17:42:45.411059+02:002023-09-12 17:42:45.411059+02:00861768209880False11702NaNd7ea7e443581ebe520dd13f6cad31af7
....................................
318964256402144247_204_239_892782023-09-12 18:59:48.750953+02:002023-09-12 18:59:48.750953+02:001244281210158False11702NaN82c9af8b2167f7ac34a5e834242b0239
318965256402244247_204_299_892782023-09-12 18:59:48.751441+02:002023-09-12 18:59:48.751441+02:001244284210158False11702NaN235e8e608f066cb72949bbd397d0a76f
318966256402344247_204_259_892782023-09-12 18:59:48.751924+02:002023-09-12 18:59:48.751924+02:001244282210158False11702NaNec22fa828931f030f7e79a4cc5478c4b
318967256402444247_204_279_892782023-09-12 18:59:48.752425+02:002023-09-12 18:59:48.752425+02:001244283210158False11702NaN31ec4deaf718e04caf193e1ff8d621ef
31896825131564854_178_2847_891702023-09-12 18:52:20.331807+02:002023-09-12 18:59:48.752904+02:001244285261922False31702NaN48aef9efab29bfb1537656908863bcc1
\n", - "

318969 rows × 11 columns

\n", - "
" - ], - "text/plain": [ - " id number created_at \\\n", - "0 2119081 1433_136_212_68356 2023-09-12 17:42:45.396336+02:00 \n", - "1 2119082 1433_136_194_68356 2023-09-12 17:42:45.409056+02:00 \n", - "2 2119083 33158_158_343_68357 2023-09-12 17:42:45.409824+02:00 \n", - "3 2119084 33158_158_297_68357 2023-09-12 17:42:45.410447+02:00 \n", - "4 2119085 33158_158_318_68357 2023-09-12 17:42:45.411059+02:00 \n", - "... ... ... ... \n", - "318964 2564021 44247_204_239_89278 2023-09-12 18:59:48.750953+02:00 \n", - "318965 2564022 44247_204_299_89278 2023-09-12 18:59:48.751441+02:00 \n", - "318966 2564023 44247_204_259_89278 2023-09-12 18:59:48.751924+02:00 \n", - "318967 2564024 44247_204_279_89278 2023-09-12 18:59:48.752425+02:00 \n", - "318968 2513156 4854_178_2847_89170 2023-09-12 18:52:20.331807+02:00 \n", - "\n", - " updated_at purchase_id product_id \\\n", - "0 2023-09-12 17:42:45.396336+02:00 861764 209879 \n", - "1 2023-09-12 17:42:45.409056+02:00 861763 209879 \n", - "2 2023-09-12 17:42:45.409824+02:00 861769 209880 \n", - "3 2023-09-12 17:42:45.410447+02:00 861767 209880 \n", - "4 2023-09-12 17:42:45.411059+02:00 861768 209880 \n", - "... ... ... ... \n", - "318964 2023-09-12 18:59:48.750953+02:00 1244281 210158 \n", - "318965 2023-09-12 18:59:48.751441+02:00 1244284 210158 \n", - "318966 2023-09-12 18:59:48.751924+02:00 1244282 210158 \n", - "318967 2023-09-12 18:59:48.752425+02:00 1244283 210158 \n", - "318968 2023-09-12 18:59:48.752904+02:00 1244285 261922 \n", - "\n", - " is_from_subscription type_of supplier_id barcode \\\n", - "0 False 1 1702 NaN \n", - "1 False 1 1702 NaN \n", - "2 False 1 1702 NaN \n", - "3 False 1 1702 NaN \n", - "4 False 1 1702 NaN \n", - "... ... ... ... ... \n", - "318964 False 1 1702 NaN \n", - "318965 False 1 1702 NaN \n", - "318966 False 1 1702 NaN \n", - "318967 False 1 1702 NaN \n", - "318968 False 3 1702 NaN \n", - "\n", - " identifier \n", - "0 f694c255855ce5643c6fcc7fed5e9237 \n", - "1 838d6101db2fc8bc80536d8b91b49859 \n", - "2 8a8d938d66a4dc57bcb44c2773c6fdfa \n", - "3 b7a3dd0794c0957c942d45b8913e5b96 \n", - "4 d7ea7e443581ebe520dd13f6cad31af7 \n", - "... ... \n", - "318964 82c9af8b2167f7ac34a5e834242b0239 \n", - "318965 235e8e608f066cb72949bbd397d0a76f \n", - "318966 ec22fa828931f030f7e79a4cc5478c4b \n", - "318967 31ec4deaf718e04caf193e1ff8d621ef \n", - "318968 48aef9efab29bfb1537656908863bcc1 \n", - "\n", - "[318969 rows x 11 columns]" - ] - }, - "execution_count": 30, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "tickets" - ] - }, - { - "cell_type": "code", - "execution_count": 33, - "id": "ad743347-33d1-41f0-852d-f9e6354f82ed", - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "array([1, 3, 0])" - ] - }, - "execution_count": 33, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "tickets['type_of'].unique()" - ] - }, - { - "cell_type": "markdown", - "id": "b88808fe-3b4e-49ed-9885-d52910b6f211", - "metadata": {}, - "source": [ - "## Types d'évenement et client" - ] - }, - { - "cell_type": "code", - "execution_count": 6, - "id": "ecb03a47-1418-4fb1-8c78-cd222d38b7fd", - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Index(['id', 'created_at', 'updated_at', 'season_id', 'facility_id', 'name',\n", - " 'event_type_id', 'manual_added', 'is_display', 'event_type_key_id',\n", - " 'facility_key_id', 'identifier'],\n", - " dtype='object')\n", - "(403, 12)\n", - "\n", - "RangeIndex: 403 entries, 0 to 402\n", - "Data columns (total 12 columns):\n", - " # Column Non-Null Count Dtype \n", - "--- ------ -------------- ----- \n", - " 0 id 403 non-null int64 \n", - " 1 created_at 403 non-null object\n", - " 2 updated_at 403 non-null object\n", - " 3 season_id 403 non-null int64 \n", - " 4 facility_id 403 non-null int64 \n", - " 5 name 403 non-null object\n", - " 6 event_type_id 403 non-null int64 \n", - " 7 manual_added 403 non-null bool \n", - " 8 is_display 403 non-null bool \n", - " 9 event_type_key_id 403 non-null int64 \n", - " 10 facility_key_id 403 non-null int64 \n", - " 11 identifier 403 non-null object\n", - "dtypes: bool(2), int64(6), object(4)\n", - "memory usage: 32.4+ KB\n" - ] - } - ], - "source": [ - "# Evenement = events.csv\n", - "FILE_PATH_S3 = 'bdc2324-data/11/11events.csv'\n", - "\n", - "with fs.open(FILE_PATH_S3, mode=\"rb\") as file_in:\n", - " events = pd.read_csv(file_in, sep=\",\")\n", - "\n", - "print(events.columns)\n", - "print(events.shape)\n", - "events.info()" - ] - }, - { - "cell_type": "code", - "execution_count": 7, - "id": "19706610-9e90-4e6f-8bd0-da124b87cff7", - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
idcreated_atupdated_atseason_idfacility_idnameevent_type_idmanual_addedis_displayevent_type_key_idfacility_key_ididentifier
0203672023-09-13 03:42:45.214293+02:002023-09-13 03:54:30.086969+02:0018651054marelle1055FalseTrue1055105426d1e9a4acad18b9cf79244334c86c93
1203712023-09-13 03:42:45.218728+02:002023-09-13 03:54:30.103943+02:0018651054dialogues1055FalseTrue1055105460356fc5e8ed6c9c1be9c5ec67e77766
2205702023-10-05 04:48:29.374504+02:002023-10-05 04:48:36.562528+02:0018651054les grandes epopees1055FalseTrue10551054f8ab088e06252bf34e1b12ad2ce1a403
3207572023-11-01 03:55:20.846196+01:002023-11-01 03:55:28.412457+01:0018651054scolaire marelle1055FalseTrue10551054447fa80f9a793b7587bb85ebbda6442c
4203642023-09-13 03:42:45.196791+02:002023-09-13 03:54:30.075456+02:0018651054le couronnement de poppee1055FalseTrue105510543b37f5d2cd354cbc422868621ac7ebc2
.......................................
398156032023-09-12 17:42:25.327618+02:002023-09-12 19:00:00.893400+02:0017061054marelle1055FalseTrue10551054fde88b72fb82b1fe42fbbfbfc3d6b4d3
399156212023-09-12 17:42:25.335792+02:002023-09-12 19:00:00.899622+02:0017081054cartes d'adhesion1055FalseTrue10551054051b96aad2b720bad4450a59ed7dfbf6
400157402023-09-12 17:47:05.112101+02:002023-09-12 19:00:00.906123+02:0017111054repetition le medecin malgre lui1055FalseTrue10551054addd6885bea5ddf60ec3539dfc3e79e8
401155202023-09-12 17:42:25.290280+02:002023-09-12 19:00:00.835625+02:0017081054opera au village1055FalseTrue1055105494f250d10d4a56358ceab23b384439ff
402154392023-09-12 17:42:25.252747+02:002023-09-12 19:00:00.735990+02:0017081054florilege1055FalseTrue105510544f015946bcbd856aa573cadb7ac42b9f
\n", - "

403 rows × 12 columns

\n", - "
" - ], - "text/plain": [ - " id created_at \\\n", - "0 20367 2023-09-13 03:42:45.214293+02:00 \n", - "1 20371 2023-09-13 03:42:45.218728+02:00 \n", - "2 20570 2023-10-05 04:48:29.374504+02:00 \n", - "3 20757 2023-11-01 03:55:20.846196+01:00 \n", - "4 20364 2023-09-13 03:42:45.196791+02:00 \n", - ".. ... ... \n", - "398 15603 2023-09-12 17:42:25.327618+02:00 \n", - "399 15621 2023-09-12 17:42:25.335792+02:00 \n", - "400 15740 2023-09-12 17:47:05.112101+02:00 \n", - "401 15520 2023-09-12 17:42:25.290280+02:00 \n", - "402 15439 2023-09-12 17:42:25.252747+02:00 \n", - "\n", - " updated_at season_id facility_id \\\n", - "0 2023-09-13 03:54:30.086969+02:00 1865 1054 \n", - "1 2023-09-13 03:54:30.103943+02:00 1865 1054 \n", - "2 2023-10-05 04:48:36.562528+02:00 1865 1054 \n", - "3 2023-11-01 03:55:28.412457+01:00 1865 1054 \n", - "4 2023-09-13 03:54:30.075456+02:00 1865 1054 \n", - ".. ... ... ... \n", - "398 2023-09-12 19:00:00.893400+02:00 1706 1054 \n", - "399 2023-09-12 19:00:00.899622+02:00 1708 1054 \n", - "400 2023-09-12 19:00:00.906123+02:00 1711 1054 \n", - "401 2023-09-12 19:00:00.835625+02:00 1708 1054 \n", - "402 2023-09-12 19:00:00.735990+02:00 1708 1054 \n", - "\n", - " name event_type_id manual_added \\\n", - "0 marelle 1055 False \n", - "1 dialogues 1055 False \n", - "2 les grandes epopees 1055 False \n", - "3 scolaire marelle 1055 False \n", - "4 le couronnement de poppee 1055 False \n", - ".. ... ... ... \n", - "398 marelle 1055 False \n", - "399 cartes d'adhesion 1055 False \n", - "400 repetition le medecin malgre lui 1055 False \n", - "401 opera au village 1055 False \n", - "402 florilege 1055 False \n", - "\n", - " is_display event_type_key_id facility_key_id \\\n", - "0 True 1055 1054 \n", - "1 True 1055 1054 \n", - "2 True 1055 1054 \n", - "3 True 1055 1054 \n", - "4 True 1055 1054 \n", - ".. ... ... ... \n", - "398 True 1055 1054 \n", - "399 True 1055 1054 \n", - "400 True 1055 1054 \n", - "401 True 1055 1054 \n", - "402 True 1055 1054 \n", - "\n", - " identifier \n", - "0 26d1e9a4acad18b9cf79244334c86c93 \n", - "1 60356fc5e8ed6c9c1be9c5ec67e77766 \n", - "2 f8ab088e06252bf34e1b12ad2ce1a403 \n", - "3 447fa80f9a793b7587bb85ebbda6442c \n", - "4 3b37f5d2cd354cbc422868621ac7ebc2 \n", - ".. ... \n", - "398 fde88b72fb82b1fe42fbbfbfc3d6b4d3 \n", - "399 051b96aad2b720bad4450a59ed7dfbf6 \n", - "400 addd6885bea5ddf60ec3539dfc3e79e8 \n", - "401 94f250d10d4a56358ceab23b384439ff \n", - "402 4f015946bcbd856aa573cadb7ac42b9f \n", - "\n", - "[403 rows x 12 columns]" - ] - }, - "execution_count": 7, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "events" - ] - }, - { - "cell_type": "code", - "execution_count": 15, - "id": "6cb04679-26e7-4ed8-bfc1-42285da96374", - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "357" - ] - }, - "execution_count": 15, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "events['name'].nunique()" - ] - }, - { - "cell_type": "code", - "execution_count": 16, - "id": "c10297e8-a8f9-45f9-8553-17e3fdb6f8c1", - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Index(['id', 'serial', 'event_id', 'created_at', 'updated_at',\n", - " 'start_date_time', 'open', 'satisfaction', 'end_date_time', 'name',\n", - " 'is_display', 'representation_type_id', 'expected_filling',\n", - " 'max_filling', 'extra_field', 'identifier'],\n", - " dtype='object')\n", - "(996, 16)\n", - "\n", - "RangeIndex: 996 entries, 0 to 995\n", - "Data columns (total 16 columns):\n", - " # Column Non-Null Count Dtype \n", - "--- ------ -------------- ----- \n", - " 0 id 996 non-null int64 \n", - " 1 serial 0 non-null float64\n", - " 2 event_id 996 non-null int64 \n", - " 3 created_at 996 non-null object \n", - " 4 updated_at 996 non-null object \n", - " 5 start_date_time 996 non-null object \n", - " 6 open 996 non-null bool \n", - " 7 satisfaction 0 non-null float64\n", - " 8 end_date_time 996 non-null object \n", - " 9 name 0 non-null float64\n", - " 10 is_display 996 non-null bool \n", - " 11 representation_type_id 0 non-null float64\n", - " 12 expected_filling 24 non-null float64\n", - " 13 max_filling 24 non-null float64\n", - " 14 extra_field 0 non-null float64\n", - " 15 identifier 996 non-null object \n", - "dtypes: bool(2), float64(7), int64(2), object(5)\n", - "memory usage: 111.0+ KB\n" - ] - } - ], - "source": [ - "# Représentation des évenements = representations.csv\n", - "FILE_PATH_S3 = 'bdc2324-data/11/11representations.csv'\n", - "\n", - "with fs.open(FILE_PATH_S3, mode=\"rb\") as file_in:\n", - " representations = pd.read_csv(file_in, sep=\",\")\n", - "\n", - "print(representations.columns)\n", - "print(representations.shape)\n", - "representations.info()" - ] - }, - { - "cell_type": "code", - "execution_count": 17, - "id": "41ef6a1b-e99e-4c73-a2ae-ba7d438d90c2", - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
idserialevent_idcreated_atupdated_atstart_date_timeopensatisfactionend_date_timenameis_displayrepresentation_type_idexpected_fillingmax_fillingextra_fieldidentifier
044351NaN203712023-09-13 03:42:45.245879+02:002023-09-13 03:42:45.245879+02:002023-12-21 20:00:00+01:00TrueNaN1901-01-01 00:09:21+00:09NaNTrueNaN550.0550.0NaN33520762e8cc28982e3841cbc2be8ce2
145497NaN207572023-11-01 03:55:20.875712+01:002023-11-01 03:55:20.875712+01:002023-11-28 10:00:00+01:00TrueNaN1901-01-01 00:09:21+00:09NaNTrueNaNNaNNaNNaN5c34b84e3d11276e0995d984c94cd28d
244383NaN203832023-09-13 10:41:08.964302+02:002023-09-13 10:41:08.964302+02:002023-06-04 17:00:00+02:00TrueNaN1901-01-01 00:09:21+00:09NaNTrueNaNNaNNaNNaNbf3c65a1dfefbd747dcc2360e6887eac
344384NaN203832023-09-13 10:41:08.972401+02:002023-09-13 10:41:08.972401+02:002023-06-03 17:30:00+02:00TrueNaN1901-01-01 00:09:21+00:09NaNTrueNaNNaNNaNNaNb0e69ae8b78ebab3066aac83de22d239
444385NaN203842023-09-13 10:41:08.973290+02:002023-09-13 10:41:08.973290+02:002023-06-03 16:15:00+02:00TrueNaN1901-01-01 00:09:21+00:09NaNTrueNaNNaNNaNNaN9fb91c8b1cf9e444111c511e212ac5c1
...................................................
99133894NaN156472023-09-12 17:42:25.564297+02:002023-09-12 17:42:25.564297+02:002022-11-08 20:00:00+01:00TrueNaN1901-01-01 00:09:21+00:09NaNTrueNaNNaNNaNNaN44bbcecfd007ceaad05805391beccabb
99233873NaN156402023-09-12 17:42:25.554863+02:002023-09-12 17:42:25.554863+02:002022-11-14 20:00:00+01:00TrueNaN1901-01-01 00:09:21+00:09NaNTrueNaNNaNNaNNaN151edbec8e0a3cd80071038e857f3493
99333610NaN155202023-09-12 17:42:25.442979+02:002023-09-12 17:42:25.442979+02:002023-06-19 18:00:00+02:00TrueNaN1901-01-01 00:09:21+00:09NaNTrueNaNNaNNaNNaN9e9e38d527427e1b6f67e0c3f12b82fc
99433953NaN155202023-09-12 17:42:25.590746+02:002023-09-12 17:42:25.590746+02:002023-06-19 20:00:00+02:00TrueNaN1901-01-01 00:09:21+00:09NaNTrueNaNNaNNaNNaN7bf0978aabb6cac1bb4cd2784afb2b6b
99533639NaN155332023-09-12 17:42:25.455708+02:002023-09-12 17:42:25.455708+02:002023-04-15 17:30:00+02:00TrueNaN1901-01-01 00:09:21+00:09NaNTrueNaNNaNNaNNaNfae68f1e09710ec8747957af6e22f61d
\n", - "

996 rows × 16 columns

\n", - "
" - ], - "text/plain": [ - " id serial event_id created_at \\\n", - "0 44351 NaN 20371 2023-09-13 03:42:45.245879+02:00 \n", - "1 45497 NaN 20757 2023-11-01 03:55:20.875712+01:00 \n", - "2 44383 NaN 20383 2023-09-13 10:41:08.964302+02:00 \n", - "3 44384 NaN 20383 2023-09-13 10:41:08.972401+02:00 \n", - "4 44385 NaN 20384 2023-09-13 10:41:08.973290+02:00 \n", - ".. ... ... ... ... \n", - "991 33894 NaN 15647 2023-09-12 17:42:25.564297+02:00 \n", - "992 33873 NaN 15640 2023-09-12 17:42:25.554863+02:00 \n", - "993 33610 NaN 15520 2023-09-12 17:42:25.442979+02:00 \n", - "994 33953 NaN 15520 2023-09-12 17:42:25.590746+02:00 \n", - "995 33639 NaN 15533 2023-09-12 17:42:25.455708+02:00 \n", - "\n", - " updated_at start_date_time open \\\n", - "0 2023-09-13 03:42:45.245879+02:00 2023-12-21 20:00:00+01:00 True \n", - "1 2023-11-01 03:55:20.875712+01:00 2023-11-28 10:00:00+01:00 True \n", - "2 2023-09-13 10:41:08.964302+02:00 2023-06-04 17:00:00+02:00 True \n", - "3 2023-09-13 10:41:08.972401+02:00 2023-06-03 17:30:00+02:00 True \n", - "4 2023-09-13 10:41:08.973290+02:00 2023-06-03 16:15:00+02:00 True \n", - ".. ... ... ... \n", - "991 2023-09-12 17:42:25.564297+02:00 2022-11-08 20:00:00+01:00 True \n", - "992 2023-09-12 17:42:25.554863+02:00 2022-11-14 20:00:00+01:00 True \n", - "993 2023-09-12 17:42:25.442979+02:00 2023-06-19 18:00:00+02:00 True \n", - "994 2023-09-12 17:42:25.590746+02:00 2023-06-19 20:00:00+02:00 True \n", - "995 2023-09-12 17:42:25.455708+02:00 2023-04-15 17:30:00+02:00 True \n", - "\n", - " satisfaction end_date_time name is_display \\\n", - "0 NaN 1901-01-01 00:09:21+00:09 NaN True \n", - "1 NaN 1901-01-01 00:09:21+00:09 NaN True \n", - "2 NaN 1901-01-01 00:09:21+00:09 NaN True \n", - "3 NaN 1901-01-01 00:09:21+00:09 NaN True \n", - "4 NaN 1901-01-01 00:09:21+00:09 NaN True \n", - ".. ... ... ... ... \n", - "991 NaN 1901-01-01 00:09:21+00:09 NaN True \n", - "992 NaN 1901-01-01 00:09:21+00:09 NaN True \n", - "993 NaN 1901-01-01 00:09:21+00:09 NaN True \n", - "994 NaN 1901-01-01 00:09:21+00:09 NaN True \n", - "995 NaN 1901-01-01 00:09:21+00:09 NaN True \n", - "\n", - " representation_type_id expected_filling max_filling extra_field \\\n", - "0 NaN 550.0 550.0 NaN \n", - "1 NaN NaN NaN NaN \n", - "2 NaN NaN NaN NaN \n", - "3 NaN NaN NaN NaN \n", - "4 NaN NaN NaN NaN \n", - ".. ... ... ... ... \n", - "991 NaN NaN NaN NaN \n", - "992 NaN NaN NaN NaN \n", - "993 NaN NaN NaN NaN \n", - "994 NaN NaN NaN NaN \n", - "995 NaN NaN NaN NaN \n", - "\n", - " identifier \n", - "0 33520762e8cc28982e3841cbc2be8ce2 \n", - "1 5c34b84e3d11276e0995d984c94cd28d \n", - "2 bf3c65a1dfefbd747dcc2360e6887eac \n", - "3 b0e69ae8b78ebab3066aac83de22d239 \n", - "4 9fb91c8b1cf9e444111c511e212ac5c1 \n", - ".. ... \n", - "991 44bbcecfd007ceaad05805391beccabb \n", - "992 151edbec8e0a3cd80071038e857f3493 \n", - "993 9e9e38d527427e1b6f67e0c3f12b82fc \n", - "994 7bf0978aabb6cac1bb4cd2784afb2b6b \n", - "995 fae68f1e09710ec8747957af6e22f61d \n", - "\n", - "[996 rows x 16 columns]" - ] - }, - "execution_count": 17, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "representations" - ] - }, - { - "cell_type": "code", - "execution_count": 18, - "id": "ae6cdad3-2184-4ae7-928c-2f8bd7769a5b", - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Index(['id', 'amount', 'is_full_price', 'representation_id',\n", - " 'pricing_formula_id', 'created_at', 'updated_at', 'category_id',\n", - " 'apply_price', 'products_group_id', 'product_pack_id', 'extra_field',\n", - " 'amount_consumption', 'identifier'],\n", - " dtype='object')\n", - "(14648, 14)\n", - "\n", - "RangeIndex: 14648 entries, 0 to 14647\n", - "Data columns (total 14 columns):\n", - " # Column Non-Null Count Dtype \n", - "--- ------ -------------- ----- \n", - " 0 id 14648 non-null int64 \n", - " 1 amount 14648 non-null float64\n", - " 2 is_full_price 14648 non-null bool \n", - " 3 representation_id 14648 non-null int64 \n", - " 4 pricing_formula_id 14648 non-null int64 \n", - " 5 created_at 14648 non-null object \n", - " 6 updated_at 14648 non-null object \n", - " 7 category_id 14648 non-null int64 \n", - " 8 apply_price 14648 non-null float64\n", - " 9 products_group_id 14648 non-null int64 \n", - " 10 product_pack_id 14648 non-null int64 \n", - " 11 extra_field 0 non-null float64\n", - " 12 amount_consumption 0 non-null float64\n", - " 13 identifier 14648 non-null object \n", - "dtypes: bool(1), float64(4), int64(6), object(3)\n", - "memory usage: 1.5+ MB\n" - ] - } - ], - "source": [ - "# Produits vendues = products.csv\n", - "FILE_PATH_S3 = 'bdc2324-data/11/11products.csv'\n", - "\n", - "with fs.open(FILE_PATH_S3, mode=\"rb\") as file_in:\n", - " products = pd.read_csv(file_in, sep=\",\")\n", - "\n", - "print(products.columns)\n", - "print(products.shape)\n", - "products.info()" - ] - }, - { - "cell_type": "code", - "execution_count": 19, - "id": "34f1825d-148a-4a6e-88d6-61449fee3ee4", - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
idamountis_full_pricerepresentation_idpricing_formula_idcreated_atupdated_atcategory_idapply_priceproducts_group_idproduct_pack_idextra_fieldamount_consumptionidentifier
026832518.0False44332204772023-09-13 03:42:45.415594+02:002023-09-13 03:42:45.415594+02:0049720.02681081NaNNaNb823bbea3ba837da2ef8efaf1287272d
127411836.8False44340205022023-10-25 03:26:57.430694+02:002023-10-25 03:26:57.430694+02:0049690.02739011NaNNaN81e8b7991f6948e3ef7cfe5011d13532
226833839.1False44340204972023-09-13 03:42:45.430942+02:002023-09-13 03:42:45.430942+02:0049690.02681211NaNNaNbe8bc0399db4d04aefa9f44afd4d5efa
32098830.0False33443204752023-09-12 17:42:27.595998+02:002023-09-12 17:42:27.595998+02:0049700.02097061NaNNaN01a9eea5f8ad53491faa864bfac44183
426832663.0False44333204772023-09-13 03:42:45.417283+02:002023-09-13 03:42:45.417283+02:0049690.02681091NaNNaN781a917ecfdabb14169701d7b143bbe4
.............................................
1464321787833.6False33919204892023-09-12 17:51:11.572882+02:002023-09-12 17:51:11.572882+02:0049710.02176951NaNNaN82bba69321466069411b3023343b44a4
1464426831510.0False33919205042023-09-12 18:59:29.995176+02:002023-09-12 18:59:29.995176+02:0049690.02680981NaNNaNeae56a8eb0a4315c5713b2053103d595
146452101485.0False33531204732023-09-12 17:42:27.733260+02:002023-09-12 17:42:27.733260+02:0049750.02099711NaNNaN449f86c1ef2b478d3389f7d0e27d0e6b
1464621205430.0False33810204732023-09-12 17:42:28.724681+02:002023-09-12 17:42:28.724681+02:0049720.02118761NaNNaN2090203e2c0b58ea8f505089faee6d62
1464726192221.0False33766204882023-09-12 18:52:00.519838+02:002023-09-12 18:52:00.519838+02:0049720.02617091NaNNaN9139ee36a92bed766ae95372cca77336
\n", - "

14648 rows × 14 columns

\n", - "
" - ], - "text/plain": [ - " id amount is_full_price representation_id pricing_formula_id \\\n", - "0 268325 18.0 False 44332 20477 \n", - "1 274118 36.8 False 44340 20502 \n", - "2 268338 39.1 False 44340 20497 \n", - "3 209883 0.0 False 33443 20475 \n", - "4 268326 63.0 False 44333 20477 \n", - "... ... ... ... ... ... \n", - "14643 217878 33.6 False 33919 20489 \n", - "14644 268315 10.0 False 33919 20504 \n", - "14645 210148 5.0 False 33531 20473 \n", - "14646 212054 30.0 False 33810 20473 \n", - "14647 261922 21.0 False 33766 20488 \n", - "\n", - " created_at updated_at \\\n", - "0 2023-09-13 03:42:45.415594+02:00 2023-09-13 03:42:45.415594+02:00 \n", - "1 2023-10-25 03:26:57.430694+02:00 2023-10-25 03:26:57.430694+02:00 \n", - "2 2023-09-13 03:42:45.430942+02:00 2023-09-13 03:42:45.430942+02:00 \n", - "3 2023-09-12 17:42:27.595998+02:00 2023-09-12 17:42:27.595998+02:00 \n", - "4 2023-09-13 03:42:45.417283+02:00 2023-09-13 03:42:45.417283+02:00 \n", - "... ... ... \n", - "14643 2023-09-12 17:51:11.572882+02:00 2023-09-12 17:51:11.572882+02:00 \n", - "14644 2023-09-12 18:59:29.995176+02:00 2023-09-12 18:59:29.995176+02:00 \n", - "14645 2023-09-12 17:42:27.733260+02:00 2023-09-12 17:42:27.733260+02:00 \n", - "14646 2023-09-12 17:42:28.724681+02:00 2023-09-12 17:42:28.724681+02:00 \n", - "14647 2023-09-12 18:52:00.519838+02:00 2023-09-12 18:52:00.519838+02:00 \n", - "\n", - " category_id apply_price products_group_id product_pack_id \\\n", - "0 4972 0.0 268108 1 \n", - "1 4969 0.0 273901 1 \n", - "2 4969 0.0 268121 1 \n", - "3 4970 0.0 209706 1 \n", - "4 4969 0.0 268109 1 \n", - "... ... ... ... ... \n", - "14643 4971 0.0 217695 1 \n", - "14644 4969 0.0 268098 1 \n", - "14645 4975 0.0 209971 1 \n", - "14646 4972 0.0 211876 1 \n", - "14647 4972 0.0 261709 1 \n", - "\n", - " extra_field amount_consumption identifier \n", - "0 NaN NaN b823bbea3ba837da2ef8efaf1287272d \n", - "1 NaN NaN 81e8b7991f6948e3ef7cfe5011d13532 \n", - "2 NaN NaN be8bc0399db4d04aefa9f44afd4d5efa \n", - "3 NaN NaN 01a9eea5f8ad53491faa864bfac44183 \n", - "4 NaN NaN 781a917ecfdabb14169701d7b143bbe4 \n", - "... ... ... ... \n", - "14643 NaN NaN 82bba69321466069411b3023343b44a4 \n", - "14644 NaN NaN eae56a8eb0a4315c5713b2053103d595 \n", - "14645 NaN NaN 449f86c1ef2b478d3389f7d0e27d0e6b \n", - "14646 NaN NaN 2090203e2c0b58ea8f505089faee6d62 \n", - "14647 NaN NaN 9139ee36a92bed766ae95372cca77336 \n", - "\n", - "[14648 rows x 14 columns]" - ] - }, - "execution_count": 19, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "products" - ] - }, - { - "cell_type": "code", - "execution_count": 20, - "id": "6735b338-26b5-479d-825d-677ea533dad5", - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Index(['id', 'name', 'created_at', 'updated_at', 'street_id', 'fixed_capacity',\n", - " 'identifier'],\n", - " dtype='object')\n", - "(1, 7)\n", - "\n", - "RangeIndex: 1 entries, 0 to 0\n", - "Data columns (total 7 columns):\n", - " # Column Non-Null Count Dtype \n", - "--- ------ -------------- ----- \n", - " 0 id 1 non-null int64 \n", - " 1 name 0 non-null float64\n", - " 2 created_at 1 non-null object \n", - " 3 updated_at 1 non-null object \n", - " 4 street_id 1 non-null int64 \n", - " 5 fixed_capacity 0 non-null float64\n", - " 6 identifier 1 non-null object \n", - "dtypes: float64(2), int64(2), object(3)\n", - "memory usage: 184.0+ bytes\n" - ] - } - ], - "source": [ - "# Lieu = facilities.csv\n", - "FILE_PATH_S3 = 'bdc2324-data/11/11facilities.csv'\n", - "\n", - "with fs.open(FILE_PATH_S3, mode=\"rb\") as file_in:\n", - " facilities = pd.read_csv(file_in, sep=\",\")\n", - "\n", - "print(facilities.columns)\n", - "print(facilities.shape)\n", - "facilities.info()" - ] - }, - { - "cell_type": "code", - "execution_count": 21, - "id": "428b86c2-50f4-42a5-9bbb-a17ffe820bf9", - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
idnamecreated_atupdated_atstreet_idfixed_capacityidentifier
01054NaN2023-09-12 17:42:25.223064+02:002023-09-12 17:42:25.223064+02:001NaNd41d8cd98f00b204e9800998ecf8427e
\n", - "
" - ], - "text/plain": [ - " id name created_at \\\n", - "0 1054 NaN 2023-09-12 17:42:25.223064+02:00 \n", - "\n", - " updated_at street_id fixed_capacity \\\n", - "0 2023-09-12 17:42:25.223064+02:00 1 NaN \n", - "\n", - " identifier \n", - "0 d41d8cd98f00b204e9800998ecf8427e " - ] - }, - "execution_count": 21, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "facilities" - ] - }, - { - "cell_type": "code", - "execution_count": 22, - "id": "f6b26ad5-a4cc-4219-a0b0-406d9b025458", - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Index(['id', 'name', 'created_at', 'updated_at', 'start_date_time',\n", - " 'identifier'],\n", - " dtype='object')\n", - "(9, 6)\n", - "\n", - "RangeIndex: 9 entries, 0 to 8\n", - "Data columns (total 6 columns):\n", - " # Column Non-Null Count Dtype \n", - "--- ------ -------------- ----- \n", - " 0 id 9 non-null int64 \n", - " 1 name 9 non-null object \n", - " 2 created_at 9 non-null object \n", - " 3 updated_at 9 non-null object \n", - " 4 start_date_time 0 non-null float64\n", - " 5 identifier 9 non-null object \n", - "dtypes: float64(1), int64(1), object(4)\n", - "memory usage: 560.0+ bytes\n" - ] - } - ], - "source": [ - "# Saisons = seasons.csv période sur deux années consécutives\n", - "FILE_PATH_S3 = 'bdc2324-data/11/11seasons.csv'\n", - "\n", - "with fs.open(FILE_PATH_S3, mode=\"rb\") as file_in:\n", - " seasons = pd.read_csv(file_in, sep=\",\")\n", - "\n", - "print(seasons.columns)\n", - "print(seasons.shape)\n", - "seasons.info()" - ] - }, - { - "cell_type": "code", - "execution_count": 24, - "id": "75c8c0ef-4ff5-45b1-a791-8ba2e9a4437e", - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "array(['saison 2023-2024', 'saison 2021-2022', 'saison 2015-2016',\n", - " 'saison 2016-2017', 'saison 2017-2018', 'saison 2018-2019',\n", - " 'saison 2020-2021', 'saison 2019-2020', 'saison 2022-2023'],\n", - " dtype=object)" - ] - }, - "execution_count": 24, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "seasons['name'].unique()" - ] - }, - { - "cell_type": "code", - "execution_count": 25, - "id": "cd0d10df-10cc-4f75-8b88-35f676c91f5b", - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Index(['id', 'purchase_date', 'customer_id', 'created_at', 'updated_at',\n", - " 'number', 'identifier'],\n", - " dtype='object')\n", - "(410695, 7)\n", - "\n", - "RangeIndex: 410695 entries, 0 to 410694\n", - "Data columns (total 7 columns):\n", - " # Column Non-Null Count Dtype \n", - "--- ------ -------------- ----- \n", - " 0 id 410695 non-null int64 \n", - " 1 purchase_date 410695 non-null object \n", - " 2 customer_id 410695 non-null int64 \n", - " 3 created_at 410695 non-null object \n", - " 4 updated_at 410695 non-null object \n", - " 5 number 0 non-null float64\n", - " 6 identifier 410695 non-null object \n", - "dtypes: float64(1), int64(2), object(4)\n", - "memory usage: 21.9+ MB\n" - ] - } - ], - "source": [ - "# Achats = purchases.csv \n", - "FILE_PATH_S3 = 'bdc2324-data/11/11purchases.csv'\n", - "\n", - "with fs.open(FILE_PATH_S3, mode=\"rb\") as file_in:\n", - " purchases = pd.read_csv(file_in, sep=\",\")\n", - "\n", - "print(purchases.columns)\n", - "print(purchases.shape)\n", - "purchases.info()" - ] - }, - { - "cell_type": "code", - "execution_count": 28, - "id": "8f986fdb-ca37-4cbb-b526-2a6d0ce7ca2c", - "metadata": {}, "outputs": [ { "data": { @@ -3612,53 +1179,53 @@ " \n", " \n", " 0\n", - " 861761\n", - " 2019-03-01 16:28:49+01:00\n", - " 4966\n", - " 2023-09-12 17:42:37.564150+02:00\n", - " 2023-09-12 17:42:37.564150+02:00\n", - " NaN\n", - " d20eb0c3a7efec0bbe338dee40dc3378\n", + " 5145662\n", + " 2019-07-17 11:17:53+02:00\n", + " 6632\n", + " 2021-12-28 20:48:51.569237+01:00\n", + " 2021-12-28 20:48:51.569237+01:00\n", + " fa80c83b29a268b45728c910a8afcf79\n", + " 82877c41df26f832eb823a83acd1a172\n", " \n", " \n", " 1\n", - " 861762\n", - " 2019-03-01 16:29:11+01:00\n", - " 4966\n", - " 2023-09-12 17:42:37.571159+02:00\n", - " 2023-09-12 17:42:37.571159+02:00\n", - " NaN\n", - " cff3abfc018517bce5ccfc58f5cacf40\n", + " 4941642\n", + " 2018-10-31 11:59:00+01:00\n", + " 1\n", + " 2021-12-28 20:31:48.196681+01:00\n", + " 2022-03-03 17:52:21.958861+01:00\n", + " 597b6c06adfe6acc539b29b657b80da0\n", + " e7102ebe65526c427245533ebabe66e5\n", " \n", " \n", " 2\n", - " 861763\n", - " 2019-03-01 16:29:17+01:00\n", - " 4966\n", - " 2023-09-12 17:42:37.571646+02:00\n", - " 2023-09-12 17:42:37.571646+02:00\n", - " NaN\n", - " e1155cf26b34f792bdb23e49244d7264\n", + " 5088860\n", + " 2018-10-31 12:45:12+01:00\n", + " 1\n", + " 2021-12-28 20:46:34.703542+01:00\n", + " 2021-12-28 20:46:34.703542+01:00\n", + " 4a7f6baaf9be6a99e3fead7f7e981fa8\n", + " af75c4ae53d1b6957875538355b162e1\n", " \n", " \n", " 3\n", - " 861764\n", - " 2019-03-01 16:29:19+01:00\n", - " 4966\n", - " 2023-09-12 17:42:37.572063+02:00\n", - " 2023-09-12 17:42:37.572063+02:00\n", - " NaN\n", - " e8b95cc6a1a8b103ffa39755ce3bfc4d\n", + " 5088862\n", + " 2018-10-31 13:07:12+01:00\n", + " 1\n", + " 2021-12-28 20:46:34.704773+01:00\n", + " 2021-12-28 20:46:34.704773+01:00\n", + " 1d83dfad44b73070d1c6d5875d0edd2d\n", + " 4b2fe34659b177209b07270ae1043b40\n", " \n", " \n", " 4\n", - " 861765\n", - " 2019-03-01 16:32:08+01:00\n", - " 405994\n", - " 2023-09-12 17:42:37.572470+02:00\n", - " 2023-09-12 17:42:37.572470+02:00\n", - " NaN\n", - " 1b763278914f1309e357abe5033a3f0f\n", + " 5088863\n", + " 2018-10-31 13:08:50+01:00\n", + " 1\n", + " 2021-12-28 20:46:34.705453+01:00\n", + " 2021-12-28 20:46:34.705453+01:00\n", + " 7bfe2bc9c1670c973d0960e3fd408cf8\n", + " b115f04a99b94df9e4a32185844f0998\n", " \n", " \n", " ...\n", @@ -3671,111 +1238,448 @@ " ...\n", " \n", " \n", - " 410690\n", - " 1285964\n", - " 2023-10-21 21:46:41+02:00\n", - " 517309\n", - " 2023-10-23 03:43:16.457501+02:00\n", - " 2023-10-23 03:43:16.457501+02:00\n", - " NaN\n", - " 72c4e90c2b151dcffc87b19ea8a0c4f1\n", + " 742245\n", + " 8007695\n", + " 2023-11-08 17:51:19+01:00\n", + " 1256133\n", + " 2023-11-09 07:51:33.920187+01:00\n", + " 2023-11-09 07:51:33.920187+01:00\n", + " 99ad774dedbad43feb73514765d2f0ba\n", + " d68558180b4bf2e8a945724843655775\n", " \n", " \n", - " 410691\n", - " 1285965\n", - " 2023-10-21 21:47:07+02:00\n", - " 517309\n", - " 2023-10-23 03:43:16.458458+02:00\n", - " 2023-10-23 03:43:16.458458+02:00\n", - " NaN\n", - " ee65532087132145daa6154fbae050ea\n", + " 742246\n", + " 8007696\n", + " 2023-11-08 18:17:51+01:00\n", + " 1256134\n", + " 2023-11-09 07:51:33.921967+01:00\n", + " 2023-11-09 07:51:33.921967+01:00\n", + " c1511614c511c5f95980172690179102\n", + " f5102d910a7731091f239ad7b0df35b4\n", " \n", " \n", - " 410692\n", - " 1285966\n", - " 2023-10-21 21:47:20+02:00\n", - " 517309\n", - " 2023-10-23 03:43:16.458811+02:00\n", - " 2023-10-23 03:43:16.458811+02:00\n", - " NaN\n", - " 7e825dd352bc6a11ab81cb8068e325e6\n", + " 742247\n", + " 8007697\n", + " 2023-11-08 18:23:54+01:00\n", + " 1256135\n", + " 2023-11-09 07:51:33.923034+01:00\n", + " 2023-11-09 07:51:33.923034+01:00\n", + " 33b64b39cc53428b4f17d65ff5b93104\n", + " e2b917626be60cc2c3207cc037fe69e4\n", " \n", " \n", - " 410693\n", - " 1285967\n", - " 2023-10-21 23:07:06+02:00\n", - " 399969\n", - " 2023-10-23 03:43:16.459738+02:00\n", - " 2023-10-23 03:43:16.459738+02:00\n", - " NaN\n", - " fdb92627a48d6ba8fa817d60a83dbea8\n", + " 742248\n", + " 8007698\n", + " 2023-11-08 19:32:18+01:00\n", + " 1256136\n", + " 2023-11-09 07:51:33.924135+01:00\n", + " 2023-11-09 07:51:33.924135+01:00\n", + " 9ae0b129e704b3d9c093ce9c7c4e5039\n", + " 5bfa23236c31f8562c3a0233c1b53b31\n", " \n", " \n", - " 410694\n", - " 1285968\n", - " 2023-10-21 23:07:39+02:00\n", - " 399969\n", - " 2023-10-23 03:43:16.462409+02:00\n", - " 2023-10-23 03:43:16.462409+02:00\n", - " NaN\n", - " e9dbaff4f7037a5b0efa11263584dfad\n", + " 742249\n", + " 8007699\n", + " 2023-11-08 20:30:28+01:00\n", + " 1256137\n", + " 2023-11-09 07:51:33.925382+01:00\n", + " 2023-11-09 07:51:33.925382+01:00\n", + " d31ced089c2b1f90479257a4686f9306\n", + " d86b1e0de3ff01eaf04fbcd031ac5fef\n", " \n", " \n", "\n", - "

410695 rows × 7 columns

\n", + "

742250 rows × 7 columns

\n", "" ], "text/plain": [ " id purchase_date customer_id \\\n", - "0 861761 2019-03-01 16:28:49+01:00 4966 \n", - "1 861762 2019-03-01 16:29:11+01:00 4966 \n", - "2 861763 2019-03-01 16:29:17+01:00 4966 \n", - "3 861764 2019-03-01 16:29:19+01:00 4966 \n", - "4 861765 2019-03-01 16:32:08+01:00 405994 \n", + "0 5145662 2019-07-17 11:17:53+02:00 6632 \n", + "1 4941642 2018-10-31 11:59:00+01:00 1 \n", + "2 5088860 2018-10-31 12:45:12+01:00 1 \n", + "3 5088862 2018-10-31 13:07:12+01:00 1 \n", + "4 5088863 2018-10-31 13:08:50+01:00 1 \n", "... ... ... ... \n", - "410690 1285964 2023-10-21 21:46:41+02:00 517309 \n", - "410691 1285965 2023-10-21 21:47:07+02:00 517309 \n", - "410692 1285966 2023-10-21 21:47:20+02:00 517309 \n", - "410693 1285967 2023-10-21 23:07:06+02:00 399969 \n", - "410694 1285968 2023-10-21 23:07:39+02:00 399969 \n", + "742245 8007695 2023-11-08 17:51:19+01:00 1256133 \n", + "742246 8007696 2023-11-08 18:17:51+01:00 1256134 \n", + "742247 8007697 2023-11-08 18:23:54+01:00 1256135 \n", + "742248 8007698 2023-11-08 19:32:18+01:00 1256136 \n", + "742249 8007699 2023-11-08 20:30:28+01:00 1256137 \n", "\n", " created_at updated_at \\\n", - "0 2023-09-12 17:42:37.564150+02:00 2023-09-12 17:42:37.564150+02:00 \n", - "1 2023-09-12 17:42:37.571159+02:00 2023-09-12 17:42:37.571159+02:00 \n", - "2 2023-09-12 17:42:37.571646+02:00 2023-09-12 17:42:37.571646+02:00 \n", - "3 2023-09-12 17:42:37.572063+02:00 2023-09-12 17:42:37.572063+02:00 \n", - "4 2023-09-12 17:42:37.572470+02:00 2023-09-12 17:42:37.572470+02:00 \n", + "0 2021-12-28 20:48:51.569237+01:00 2021-12-28 20:48:51.569237+01:00 \n", + "1 2021-12-28 20:31:48.196681+01:00 2022-03-03 17:52:21.958861+01:00 \n", + "2 2021-12-28 20:46:34.703542+01:00 2021-12-28 20:46:34.703542+01:00 \n", + "3 2021-12-28 20:46:34.704773+01:00 2021-12-28 20:46:34.704773+01:00 \n", + "4 2021-12-28 20:46:34.705453+01:00 2021-12-28 20:46:34.705453+01:00 \n", "... ... ... \n", - "410690 2023-10-23 03:43:16.457501+02:00 2023-10-23 03:43:16.457501+02:00 \n", - "410691 2023-10-23 03:43:16.458458+02:00 2023-10-23 03:43:16.458458+02:00 \n", - "410692 2023-10-23 03:43:16.458811+02:00 2023-10-23 03:43:16.458811+02:00 \n", - "410693 2023-10-23 03:43:16.459738+02:00 2023-10-23 03:43:16.459738+02:00 \n", - "410694 2023-10-23 03:43:16.462409+02:00 2023-10-23 03:43:16.462409+02:00 \n", + "742245 2023-11-09 07:51:33.920187+01:00 2023-11-09 07:51:33.920187+01:00 \n", + "742246 2023-11-09 07:51:33.921967+01:00 2023-11-09 07:51:33.921967+01:00 \n", + "742247 2023-11-09 07:51:33.923034+01:00 2023-11-09 07:51:33.923034+01:00 \n", + "742248 2023-11-09 07:51:33.924135+01:00 2023-11-09 07:51:33.924135+01:00 \n", + "742249 2023-11-09 07:51:33.925382+01:00 2023-11-09 07:51:33.925382+01:00 \n", "\n", - " number identifier \n", - "0 NaN d20eb0c3a7efec0bbe338dee40dc3378 \n", - "1 NaN cff3abfc018517bce5ccfc58f5cacf40 \n", - "2 NaN e1155cf26b34f792bdb23e49244d7264 \n", - "3 NaN e8b95cc6a1a8b103ffa39755ce3bfc4d \n", - "4 NaN 1b763278914f1309e357abe5033a3f0f \n", - "... ... ... \n", - "410690 NaN 72c4e90c2b151dcffc87b19ea8a0c4f1 \n", - "410691 NaN ee65532087132145daa6154fbae050ea \n", - "410692 NaN 7e825dd352bc6a11ab81cb8068e325e6 \n", - "410693 NaN fdb92627a48d6ba8fa817d60a83dbea8 \n", - "410694 NaN e9dbaff4f7037a5b0efa11263584dfad \n", + " number identifier \n", + "0 fa80c83b29a268b45728c910a8afcf79 82877c41df26f832eb823a83acd1a172 \n", + "1 597b6c06adfe6acc539b29b657b80da0 e7102ebe65526c427245533ebabe66e5 \n", + "2 4a7f6baaf9be6a99e3fead7f7e981fa8 af75c4ae53d1b6957875538355b162e1 \n", + "3 1d83dfad44b73070d1c6d5875d0edd2d 4b2fe34659b177209b07270ae1043b40 \n", + "4 7bfe2bc9c1670c973d0960e3fd408cf8 b115f04a99b94df9e4a32185844f0998 \n", + "... ... ... \n", + "742245 99ad774dedbad43feb73514765d2f0ba d68558180b4bf2e8a945724843655775 \n", + "742246 c1511614c511c5f95980172690179102 f5102d910a7731091f239ad7b0df35b4 \n", + "742247 33b64b39cc53428b4f17d65ff5b93104 e2b917626be60cc2c3207cc037fe69e4 \n", + "742248 9ae0b129e704b3d9c093ce9c7c4e5039 5bfa23236c31f8562c3a0233c1b53b31 \n", + "742249 d31ced089c2b1f90479257a4686f9306 d86b1e0de3ff01eaf04fbcd031ac5fef \n", "\n", - "[410695 rows x 7 columns]" + "[742250 rows x 7 columns]" ] }, - "execution_count": 28, + "execution_count": 20, "metadata": {}, "output_type": "execute_result" } ], "source": [ - "purchases" + "df1_purchases" ] + }, + { + "cell_type": "code", + "execution_count": 21, + "id": "3f266a9d-6eee-4b27-b6cc-d401bc2fa0b8", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "RangeIndex: 742250 entries, 0 to 742249\n", + "Data columns (total 7 columns):\n", + " # Column Non-Null Count Dtype \n", + "--- ------ -------------- ----- \n", + " 0 id 742250 non-null int64 \n", + " 1 purchase_date 742250 non-null object\n", + " 2 customer_id 742250 non-null int64 \n", + " 3 created_at 742250 non-null object\n", + " 4 updated_at 742250 non-null object\n", + " 5 number 742250 non-null object\n", + " 6 identifier 742250 non-null object\n", + "dtypes: int64(2), object(5)\n", + "memory usage: 39.6+ MB\n" + ] + } + ], + "source": [ + "df1_purchases.info()" + ] + }, + { + "cell_type": "code", + "execution_count": 22, + "id": "8b24ccbc-ccf0-4722-8cd9-8ee8aa90d1fd", + "metadata": {}, + "outputs": [], + "source": [ + "# Nettoyage purchase_date\n", + "df1_purchases['purchase_date'] = pd.to_datetime(df1_purchases['purchase_date'], utc = True)\n", + "df1_purchases['purchase_date'] = pd.to_datetime(df1_purchases['purchase_date'], format = 'ISO8601')" + ] + }, + { + "cell_type": "code", + "execution_count": 23, + "id": "27d18584-228f-4698-85d6-4d23151ea5ed", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "RangeIndex: 742250 entries, 0 to 742249\n", + "Data columns (total 7 columns):\n", + " # Column Non-Null Count Dtype \n", + "--- ------ -------------- ----- \n", + " 0 id 742250 non-null int64 \n", + " 1 purchase_date 742250 non-null datetime64[ns, UTC]\n", + " 2 customer_id 742250 non-null int64 \n", + " 3 created_at 742250 non-null object \n", + " 4 updated_at 742250 non-null object \n", + " 5 number 742250 non-null object \n", + " 6 identifier 742250 non-null object \n", + "dtypes: datetime64[ns, UTC](1), int64(2), object(4)\n", + "memory usage: 39.6+ MB\n" + ] + } + ], + "source": [ + "df1_purchases.info()" + ] + }, + { + "cell_type": "code", + "execution_count": 24, + "id": "ea22e3a2-2b25-481d-8ebc-194e11a06cd9", + "metadata": {}, + "outputs": [], + "source": [ + "# Selection des variables\n", + "df1_purchases_clean = df1_purchases[['id', 'purchase_date', 'customer_id']]" + ] + }, + { + "cell_type": "markdown", + "id": "53227600-c1c5-48aa-9f5d-db5a23a8a22a", + "metadata": {}, + "source": [ + "## Fusion de l'ensemble des données billétiques" + ] + }, + { + "cell_type": "code", + "execution_count": 38, + "id": "e0b8b47a-b321-4a79-823c-36a131a78ac7", + "metadata": {}, + "outputs": [], + "source": [ + "# Fusion avec fournisseurs\n", + "df1_ticket_information = pd.merge(df1_tickets_clean, df1_suppliers_clean, left_on = 'supplier_id', right_on = 'id', how = 'inner')\n", + "df1_ticket_information.drop(['supplier_id', 'id'], axis = 1, inplace=True)\n", + "\n", + "# Fusion avec type de tickets\n", + "df1_ticket_information = pd.merge(df1_ticket_information, df1_type_ofs_clean, left_on = 'type_of', right_on = 'id', how = 'inner')\n", + "df1_ticket_information.drop(['type_of', 'id'], axis = 1, inplace=True)\n", + "\n", + "# Fusion avec achats\n", + "df1_ticket_information = pd.merge(df1_ticket_information, df1_purchases_clean, left_on = 'purchase_id', right_on = 'id', how = 'inner')\n", + "df1_ticket_information.drop(['purchase_id', 'id'], axis = 1, inplace=True)" + ] + }, + { + "cell_type": "code", + "execution_count": 39, + "id": "7572e6e7-f28d-43ba-b045-b9fa09e68e1d", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
ticket_idproduct_idis_from_subscriptionsupplier_nametype_of_ticket_namechildrenpurchase_datecustomer_id
013070859225251Falsevente en ligneAtelierpricing_formula2018-12-28 14:47:50+00:0048187
113070860224914Falsevente en ligneAtelierpricing_formula2018-12-28 14:47:50+00:0048187
213070861224914Falsevente en ligneAtelierpricing_formula2018-12-28 14:47:50+00:0048187
313070862224914Falsevente en ligneAtelierpricing_formula2018-12-28 14:47:50+00:0048187
413070863224914Falsevente en ligneAtelierpricing_formula2018-12-28 14:47:50+00:0048187
...........................
182666718643847350454FalsevadBillet en nombrepricing_formula2022-08-02 08:59:17+00:0041
182666819853111383564FalsevadBillet en nombrepricing_formula2022-11-04 14:25:42+00:0062763
182666919860514383751FalsevadBillet en nombrepricing_formula2022-11-18 10:47:26+00:001195566
182667019860515383751FalsevadBillet en nombrepricing_formula2022-11-18 10:47:26+00:001195566
182667119860516383751FalsevadBillet en nombrepricing_formula2022-11-18 10:47:26+00:001195566
\n", + "

1826672 rows × 8 columns

\n", + "
" + ], + "text/plain": [ + " ticket_id product_id is_from_subscription supplier_name \\\n", + "0 13070859 225251 False vente en ligne \n", + "1 13070860 224914 False vente en ligne \n", + "2 13070861 224914 False vente en ligne \n", + "3 13070862 224914 False vente en ligne \n", + "4 13070863 224914 False vente en ligne \n", + "... ... ... ... ... \n", + "1826667 18643847 350454 False vad \n", + "1826668 19853111 383564 False vad \n", + "1826669 19860514 383751 False vad \n", + "1826670 19860515 383751 False vad \n", + "1826671 19860516 383751 False vad \n", + "\n", + " type_of_ticket_name children purchase_date \\\n", + "0 Atelier pricing_formula 2018-12-28 14:47:50+00:00 \n", + "1 Atelier pricing_formula 2018-12-28 14:47:50+00:00 \n", + "2 Atelier pricing_formula 2018-12-28 14:47:50+00:00 \n", + "3 Atelier pricing_formula 2018-12-28 14:47:50+00:00 \n", + "4 Atelier pricing_formula 2018-12-28 14:47:50+00:00 \n", + "... ... ... ... \n", + "1826667 Billet en nombre pricing_formula 2022-08-02 08:59:17+00:00 \n", + "1826668 Billet en nombre pricing_formula 2022-11-04 14:25:42+00:00 \n", + "1826669 Billet en nombre pricing_formula 2022-11-18 10:47:26+00:00 \n", + "1826670 Billet en nombre pricing_formula 2022-11-18 10:47:26+00:00 \n", + "1826671 Billet en nombre pricing_formula 2022-11-18 10:47:26+00:00 \n", + "\n", + " customer_id \n", + "0 48187 \n", + "1 48187 \n", + "2 48187 \n", + "3 48187 \n", + "4 48187 \n", + "... ... \n", + "1826667 41 \n", + "1826668 62763 \n", + "1826669 1195566 \n", + "1826670 1195566 \n", + "1826671 1195566 \n", + "\n", + "[1826672 rows x 8 columns]" + ] + }, + "execution_count": 39, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df1_ticket_information" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "b95464b1-26bc-4aac-84b4-45da83b92251", + "metadata": {}, + "outputs": [], + "source": [] } ], "metadata": {