{ "cells": [ { "cell_type": "markdown", "id": "56b3d44e-1e3f-4726-9916-0f9af107860e", "metadata": {}, "source": [ "# Business Data Challenge - Team 1" ] }, { "cell_type": "code", "execution_count": 1, "id": "15103481-8d74-404c-aa09-7601fe7730da", "metadata": {}, "outputs": [], "source": [ "import pandas as pd\n", "import numpy as np\n", "import os\n", "import s3fs\n", "import re" ] }, { "cell_type": "markdown", "id": "c3bb0d13-34b2-4e1c-9985-468cd87c5a0e", "metadata": {}, "source": [ "Configuration de l'accès aux données" ] }, { "cell_type": "code", "execution_count": 2, "id": "5d83bb1a-d341-446e-91f6-1c428607f6d4", "metadata": {}, "outputs": [], "source": [ "# Create filesystem object\n", "S3_ENDPOINT_URL = \"https://\" + os.environ[\"AWS_S3_ENDPOINT\"]\n", "fs = s3fs.S3FileSystem(client_kwargs={'endpoint_url': S3_ENDPOINT_URL})" ] }, { "cell_type": "markdown", "id": "f99da24f-0d93-4618-92bc-3ba81dc0445c", "metadata": {}, "source": [ "# Exemple sur Company 1" ] }, { "cell_type": "markdown", "id": "9d74b68f-ba07-4a15-9a27-dae931762d70", "metadata": {}, "source": [ "## Chargement données" ] }, { "cell_type": "code", "execution_count": 3, "id": "699664b9-eee4-4f8d-a207-e524526560c5", "metadata": {}, "outputs": [], "source": [ "BUCKET = \"bdc2324-data/1\"\n", "liste_database = fs.ls(BUCKET)" ] }, { "cell_type": "code", "execution_count": 4, "id": "0cb92854-903b-4efd-ac1b-197e29f044b4", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "['bdc2324-data/1/1purchases.csv', 'bdc2324-data/1/1suppliers.csv', 'bdc2324-data/1/1tickets.csv', 'bdc2324-data/1/1type_ofs.csv']\n" ] } ], "source": [ "liste_database_select = ['suppliers', 'ticket', 'purchase', 'consumption', 'type_ofs']\n", "\n", "# Filtrer la liste pour les éléments contenant au moins un élément de la liste à tester\n", "liste_database_filtered = [element for element in liste_database if any(element_part in element for element_part in liste_database_select)]\n", "\n", "# Afficher le résultat\n", "print(liste_database_filtered)" ] }, { "cell_type": "code", "execution_count": 5, "id": "dd6a3518-b752-4a1e-b77b-9e03e853c3ed", "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "/tmp/ipykernel_864/4135596479.py:10: DtypeWarning: Columns (1) have mixed types. Specify dtype option on import or set low_memory=False.\n", " df = pd.read_csv(file_in)\n" ] } ], "source": [ "# loop to create dataframes from file 2\n", "files_path = liste_database_filtered\n", "\n", "client_number = files_path[0].split(\"/\")[1]\n", "df_prefix = \"df\" + str(client_number) + \"_\"\n", "\n", "for i in range(len(files_path)) :\n", " current_path = files_path[i]\n", " with fs.open(current_path, mode=\"rb\") as file_in:\n", " df = pd.read_csv(file_in)\n", " # the pattern of the name is df1xxx\n", " nom_dataframe = df_prefix + re.search(r'\\/(\\d+)\\/(\\d+)([a-zA-Z_]+)\\.csv$', current_path).group(3)\n", " globals()[nom_dataframe] = df" ] }, { "cell_type": "markdown", "id": "e908f516-2a74-45d6-8492-7dcdc3afbe1f", "metadata": {}, "source": [ "## tickets.csv" ] }, { "cell_type": "code", "execution_count": 6, "id": "14f4158e-c9c0-4beb-826a-5e0f949434a4", "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
idnumbercreated_atupdated_atpurchase_idproduct_idis_from_subscriptiontype_ofsupplier_idbarcodeidentifier
013070859135930026612882021-12-28 20:47:10.320641+01:002022-02-14 18:46:53.614229+01:005107462225251False13NaNb6ad7fc36f33b5e05f58c7fca06688a6
113070860135930026613992021-12-28 20:47:10.321037+01:002022-02-14 18:46:53.614761+01:005107462224914False13NaNb0903af480266f27802fe5c38c277c9e
213070861135930026614192021-12-28 20:47:10.321629+01:002022-02-14 18:46:53.615521+01:005107462224914False13NaN64ca12b7e26a65b90335c0702ea0faba
313070862135930026615082021-12-28 20:47:10.322029+01:002022-02-14 18:46:53.616000+01:005107462224914False13NaN5ac2f8150aa9f3a6b1599df08cc2f0c7
413070863135930026616892021-12-28 20:47:10.322449+01:002022-02-14 18:46:53.616447+01:005107462224914False13NaNdfe30081bae020d12094279926136b9c
....................................
182666720662815135930161543902023-11-09 07:51:34.935983+01:002023-11-09 07:51:34.935983+01:008007697405689False13NaNdba9aa428f843b79ae69dfacfe8fc579
182666820662816135930161545012023-11-09 07:51:34.937038+01:002023-11-09 07:51:34.937038+01:008007698403658False13NaN93f1fcfc6ba4fa68f92eb4b4a619fcf0
182666920662817135930161546802023-11-09 07:51:34.938224+01:002023-11-09 07:51:34.938224+01:008007698403658False13NaNc8bbbd25df2c158767ceef42c3237f23
182667020662818135930161548992023-11-09 07:51:34.939328+01:002023-11-09 07:51:34.939328+01:008007699403658False13NaN738f0a8b5088b5056bc3b32eff2dca1f
182667120662819135930161549882023-11-09 07:51:34.940680+01:002023-11-09 07:51:34.940680+01:008007699403658False13NaN4c5a6195434377380b4e6ae63b2e9cf6
\n", "

1826672 rows × 11 columns

\n", "
" ], "text/plain": [ " id number created_at \\\n", "0 13070859 13593002661288 2021-12-28 20:47:10.320641+01:00 \n", "1 13070860 13593002661399 2021-12-28 20:47:10.321037+01:00 \n", "2 13070861 13593002661419 2021-12-28 20:47:10.321629+01:00 \n", "3 13070862 13593002661508 2021-12-28 20:47:10.322029+01:00 \n", "4 13070863 13593002661689 2021-12-28 20:47:10.322449+01:00 \n", "... ... ... ... \n", "1826667 20662815 13593016154390 2023-11-09 07:51:34.935983+01:00 \n", "1826668 20662816 13593016154501 2023-11-09 07:51:34.937038+01:00 \n", "1826669 20662817 13593016154680 2023-11-09 07:51:34.938224+01:00 \n", "1826670 20662818 13593016154899 2023-11-09 07:51:34.939328+01:00 \n", "1826671 20662819 13593016154988 2023-11-09 07:51:34.940680+01:00 \n", "\n", " updated_at purchase_id product_id \\\n", "0 2022-02-14 18:46:53.614229+01:00 5107462 225251 \n", "1 2022-02-14 18:46:53.614761+01:00 5107462 224914 \n", "2 2022-02-14 18:46:53.615521+01:00 5107462 224914 \n", "3 2022-02-14 18:46:53.616000+01:00 5107462 224914 \n", "4 2022-02-14 18:46:53.616447+01:00 5107462 224914 \n", "... ... ... ... \n", "1826667 2023-11-09 07:51:34.935983+01:00 8007697 405689 \n", "1826668 2023-11-09 07:51:34.937038+01:00 8007698 403658 \n", "1826669 2023-11-09 07:51:34.938224+01:00 8007698 403658 \n", "1826670 2023-11-09 07:51:34.939328+01:00 8007699 403658 \n", "1826671 2023-11-09 07:51:34.940680+01:00 8007699 403658 \n", "\n", " is_from_subscription type_of supplier_id barcode \\\n", "0 False 1 3 NaN \n", "1 False 1 3 NaN \n", "2 False 1 3 NaN \n", "3 False 1 3 NaN \n", "4 False 1 3 NaN \n", "... ... ... ... ... \n", "1826667 False 1 3 NaN \n", "1826668 False 1 3 NaN \n", "1826669 False 1 3 NaN \n", "1826670 False 1 3 NaN \n", "1826671 False 1 3 NaN \n", "\n", " identifier \n", "0 b6ad7fc36f33b5e05f58c7fca06688a6 \n", "1 b0903af480266f27802fe5c38c277c9e \n", "2 64ca12b7e26a65b90335c0702ea0faba \n", "3 5ac2f8150aa9f3a6b1599df08cc2f0c7 \n", "4 dfe30081bae020d12094279926136b9c \n", "... ... \n", "1826667 dba9aa428f843b79ae69dfacfe8fc579 \n", "1826668 93f1fcfc6ba4fa68f92eb4b4a619fcf0 \n", "1826669 c8bbbd25df2c158767ceef42c3237f23 \n", "1826670 738f0a8b5088b5056bc3b32eff2dca1f \n", "1826671 4c5a6195434377380b4e6ae63b2e9cf6 \n", "\n", "[1826672 rows x 11 columns]" ] }, "execution_count": 6, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df1_tickets" ] }, { "cell_type": "code", "execution_count": 7, "id": "f3c35394-b586-4ae4-b5ab-b03bb01bb618", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "\n", "RangeIndex: 1826672 entries, 0 to 1826671\n", "Data columns (total 11 columns):\n", " # Column Dtype \n", "--- ------ ----- \n", " 0 id int64 \n", " 1 number object \n", " 2 created_at object \n", " 3 updated_at object \n", " 4 purchase_id int64 \n", " 5 product_id int64 \n", " 6 is_from_subscription bool \n", " 7 type_of int64 \n", " 8 supplier_id int64 \n", " 9 barcode float64\n", " 10 identifier object \n", "dtypes: bool(1), float64(1), int64(5), object(4)\n", "memory usage: 141.1+ MB\n" ] } ], "source": [ "df1_tickets.info()" ] }, { "cell_type": "code", "execution_count": 8, "id": "c1b42769-03c7-4785-92ce-5e1e6b41908d", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "id 0.0\n", "number 0.0\n", "created_at 0.0\n", "updated_at 0.0\n", "purchase_id 0.0\n", "product_id 0.0\n", "is_from_subscription 0.0\n", "type_of 0.0\n", "supplier_id 0.0\n", "barcode 100.0\n", "identifier 0.0\n", "dtype: float64" ] }, "execution_count": 8, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df1_tickets.isna().sum()/len(df1_tickets)*100" ] }, { "cell_type": "code", "execution_count": 9, "id": "42896791-2d93-4725-a50b-6c7cbe535ec7", "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "/tmp/ipykernel_864/232847087.py:3: SettingWithCopyWarning: \n", "A value is trying to be set on a copy of a slice from a DataFrame\n", "\n", "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n", " df1_tickets_clean.rename(columns = {'id' : 'ticket_id'}, inplace = True)\n" ] } ], "source": [ "# Selection des variables\n", "df1_tickets_clean = df1_tickets[['id', 'purchase_id', 'product_id', 'is_from_subscription', 'type_of', 'supplier_id']]\n", "df1_tickets_clean.rename(columns = {'id' : 'ticket_id'}, inplace = True)" ] }, { "cell_type": "markdown", "id": "78453f3c-4f89-44ed-a6c6-2a7443b72b52", "metadata": {}, "source": [ "## suppliers.csv" ] }, { "cell_type": "code", "execution_count": 10, "id": "2e0dada0-9457-484c-aa55-77e44613ecca", "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
idnamemanually_addedlabelitrupdated_atcreated_atcommissionidentifier
01617j4 administrationFalseNaNNaN2021-07-29 09:21:37.325772+02:002021-07-29 09:21:37.325772+02:00NaN5958b2a060ac3e31678b438892a1bd2e
18non définiFalseNaNNaN2020-09-03 13:16:35.329062+02:002020-09-03 13:16:35.329062+02:00NaN52ff3466787b4d538407372e5f7afe0f
24vadFalseNaNNaN2020-09-03 13:11:23.896992+02:002020-09-03 13:11:23.896992+02:00NaN1225483c97b36018cab2bea14ab78ea6
31fort saint jeanFalseNaNNaN2020-09-03 13:11:23.833073+02:002020-09-03 13:11:23.833073+02:00NaN001b9b4a524fe407150b8235b304d4ec
42j4FalseNaNNaN2020-09-03 13:11:23.888993+02:002020-09-03 13:11:23.888993+02:00NaN6a0cf6edf20060344b465706b61719aa
55revendeurFalseNaNNaN2020-09-03 13:11:23.900987+02:002020-09-03 13:11:23.900987+02:00NaN931239d4acb6214d7e5c98edecfb4916
63vente en ligneFalseNaNNaN2020-09-03 13:11:23.893097+02:002020-09-03 13:11:23.893097+02:00NaNbde8f2ccff510df8572d3214d86b837d
76ccrFalseNaNNaN2020-09-03 13:11:23.904974+02:002020-09-03 13:11:23.904974+02:00NaNb48ec279411f7dbbb68393c61a9724d9
87dabFalseNaNNaN2020-09-03 13:11:23.908970+02:002020-09-03 13:11:23.908970+02:00NaN11c6d471fa4e354e62e684d293694202
\n", "
" ], "text/plain": [ " id name manually_added label itr \\\n", "0 1617 j4 administration False NaN NaN \n", "1 8 non défini False NaN NaN \n", "2 4 vad False NaN NaN \n", "3 1 fort saint jean False NaN NaN \n", "4 2 j4 False NaN NaN \n", "5 5 revendeur False NaN NaN \n", "6 3 vente en ligne False NaN NaN \n", "7 6 ccr False NaN NaN \n", "8 7 dab False NaN NaN \n", "\n", " updated_at created_at \\\n", "0 2021-07-29 09:21:37.325772+02:00 2021-07-29 09:21:37.325772+02:00 \n", "1 2020-09-03 13:16:35.329062+02:00 2020-09-03 13:16:35.329062+02:00 \n", "2 2020-09-03 13:11:23.896992+02:00 2020-09-03 13:11:23.896992+02:00 \n", "3 2020-09-03 13:11:23.833073+02:00 2020-09-03 13:11:23.833073+02:00 \n", "4 2020-09-03 13:11:23.888993+02:00 2020-09-03 13:11:23.888993+02:00 \n", "5 2020-09-03 13:11:23.900987+02:00 2020-09-03 13:11:23.900987+02:00 \n", "6 2020-09-03 13:11:23.893097+02:00 2020-09-03 13:11:23.893097+02:00 \n", "7 2020-09-03 13:11:23.904974+02:00 2020-09-03 13:11:23.904974+02:00 \n", "8 2020-09-03 13:11:23.908970+02:00 2020-09-03 13:11:23.908970+02:00 \n", "\n", " commission identifier \n", "0 NaN 5958b2a060ac3e31678b438892a1bd2e \n", "1 NaN 52ff3466787b4d538407372e5f7afe0f \n", "2 NaN 1225483c97b36018cab2bea14ab78ea6 \n", "3 NaN 001b9b4a524fe407150b8235b304d4ec \n", "4 NaN 6a0cf6edf20060344b465706b61719aa \n", "5 NaN 931239d4acb6214d7e5c98edecfb4916 \n", "6 NaN bde8f2ccff510df8572d3214d86b837d \n", "7 NaN b48ec279411f7dbbb68393c61a9724d9 \n", "8 NaN 11c6d471fa4e354e62e684d293694202 " ] }, "execution_count": 10, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df1_suppliers" ] }, { "cell_type": "code", "execution_count": 11, "id": "b583be02-ab60-4e14-9325-0204f203a1af", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "\n", "RangeIndex: 9 entries, 0 to 8\n", "Data columns (total 9 columns):\n", " # Column Non-Null Count Dtype \n", "--- ------ -------------- ----- \n", " 0 id 9 non-null int64 \n", " 1 name 9 non-null object \n", " 2 manually_added 9 non-null bool \n", " 3 label 0 non-null float64\n", " 4 itr 0 non-null float64\n", " 5 updated_at 9 non-null object \n", " 6 created_at 9 non-null object \n", " 7 commission 0 non-null float64\n", " 8 identifier 9 non-null object \n", "dtypes: bool(1), float64(3), int64(1), object(4)\n", "memory usage: 713.0+ bytes\n" ] } ], "source": [ "df1_suppliers.info()" ] }, { "cell_type": "code", "execution_count": 12, "id": "6d7f338e-e4d3-422b-9cdc-dec967c0b28e", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "id 0.0\n", "name 0.0\n", "manually_added 0.0\n", "label 100.0\n", "itr 100.0\n", "updated_at 0.0\n", "created_at 0.0\n", "commission 100.0\n", "identifier 0.0\n", "dtype: float64" ] }, "execution_count": 12, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df1_suppliers.isna().sum()/len(df1_suppliers)*100" ] }, { "cell_type": "code", "execution_count": 13, "id": "3c645ab7-16bf-4054-9ae2-15a8c32e29c6", "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "/tmp/ipykernel_864/302783287.py:3: SettingWithCopyWarning: \n", "A value is trying to be set on a copy of a slice from a DataFrame\n", "\n", "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n", " df1_suppliers_clean.rename(columns = {'name' : 'supplier_name'}, inplace = True)\n" ] } ], "source": [ "# Selection des variables\n", "df1_suppliers_clean = df1_suppliers[['id', 'name']]\n", "df1_suppliers_clean.rename(columns = {'name' : 'supplier_name'}, inplace = True)" ] }, { "cell_type": "code", "execution_count": 14, "id": "4de7e2e2-6da4-4618-8444-b524399c5493", "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
idsupplier_name
01617j4 administration
18non défini
24vad
31fort saint jean
42j4
55revendeur
63vente en ligne
76ccr
87dab
\n", "
" ], "text/plain": [ " id supplier_name\n", "0 1617 j4 administration\n", "1 8 non défini\n", "2 4 vad\n", "3 1 fort saint jean\n", "4 2 j4\n", "5 5 revendeur\n", "6 3 vente en ligne\n", "7 6 ccr\n", "8 7 dab" ] }, "execution_count": 14, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df1_suppliers_clean" ] }, { "cell_type": "markdown", "id": "0a6df975-c7fc-45bc-92af-a0bdab17d795", "metadata": {}, "source": [ "## type_ofs.csv" ] }, { "cell_type": "code", "execution_count": 15, "id": "a02f6594-3e91-4e87-bbb6-649c28d4f7e9", "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
idnamechildrencreated_atupdated_atidentifier
01Atelierpricing_formula2021-01-05 11:55:51.188106+01:002021-01-05 11:55:51.188106+01:00623ec4067827558b28972cf39fe81ee7
12Billet en nombrepricing_formula2021-01-11 12:13:19.286301+01:002021-01-11 12:13:19.286301+01:00a53d313a97296ee37caa066dbfe7a45c
23Groupepricing_formula2021-01-11 12:19:22.842917+01:002021-01-11 12:19:22.842917+01:001ab143efc3b85acbbc752fe8eb2b0b86
34Revendeurpricing_formula2021-01-12 12:34:20.481236+01:002021-01-12 12:34:20.481236+01:008b332723366a07e1eef5f1c92f9ae067
45Cinéma scolairepricing_formula2021-01-25 19:16:05.141719+01:002021-01-25 19:16:05.141719+01:00a12e62cb4c4f47e7406bd8fbff2bfe30
56Musée famillepricing_formula2021-01-25 19:23:06.692627+01:002021-01-25 19:23:06.692627+01:001ec6c19283111ccb3ed67f52d414470e
67Spectacle famillepricing_formula2021-01-25 19:28:21.390016+01:002021-01-25 19:28:21.390016+01:0005e2104f1b74ced229c06847d6e91938
78Masterclasspricing_formula2021-01-25 19:31:05.076904+01:002021-01-25 19:31:05.076904+01:009cc946edfb25e11b4282f58db16e6ae9
89Spectaclepricing_formula2021-01-25 19:38:41.260535+01:002021-01-25 19:38:41.260535+01:00d88321c347f0e0ab101184cdf25c94bf
910Cinemapricing_formula2021-02-05 11:12:31.932576+01:002021-02-05 11:12:31.932576+01:000870fef2bfcd5b30a12e4f5c7f4aaba7
1011Museepricing_formula2021-02-05 11:52:05.468207+01:002021-02-05 11:52:05.468207+01:008ba8934454cc62c7cdb3eb6e1b39df0c
1112Tarifs pleincategory2023-03-13 11:31:50.528331+01:002023-03-13 11:31:50.528331+01:00a6969df76efc15d157be48e87a7bcf9a
\n", "
" ], "text/plain": [ " id name children created_at \\\n", "0 1 Atelier pricing_formula 2021-01-05 11:55:51.188106+01:00 \n", "1 2 Billet en nombre pricing_formula 2021-01-11 12:13:19.286301+01:00 \n", "2 3 Groupe pricing_formula 2021-01-11 12:19:22.842917+01:00 \n", "3 4 Revendeur pricing_formula 2021-01-12 12:34:20.481236+01:00 \n", "4 5 Cinéma scolaire pricing_formula 2021-01-25 19:16:05.141719+01:00 \n", "5 6 Musée famille pricing_formula 2021-01-25 19:23:06.692627+01:00 \n", "6 7 Spectacle famille pricing_formula 2021-01-25 19:28:21.390016+01:00 \n", "7 8 Masterclass pricing_formula 2021-01-25 19:31:05.076904+01:00 \n", "8 9 Spectacle pricing_formula 2021-01-25 19:38:41.260535+01:00 \n", "9 10 Cinema pricing_formula 2021-02-05 11:12:31.932576+01:00 \n", "10 11 Musee pricing_formula 2021-02-05 11:52:05.468207+01:00 \n", "11 12 Tarifs plein category 2023-03-13 11:31:50.528331+01:00 \n", "\n", " updated_at identifier \n", "0 2021-01-05 11:55:51.188106+01:00 623ec4067827558b28972cf39fe81ee7 \n", "1 2021-01-11 12:13:19.286301+01:00 a53d313a97296ee37caa066dbfe7a45c \n", "2 2021-01-11 12:19:22.842917+01:00 1ab143efc3b85acbbc752fe8eb2b0b86 \n", "3 2021-01-12 12:34:20.481236+01:00 8b332723366a07e1eef5f1c92f9ae067 \n", "4 2021-01-25 19:16:05.141719+01:00 a12e62cb4c4f47e7406bd8fbff2bfe30 \n", "5 2021-01-25 19:23:06.692627+01:00 1ec6c19283111ccb3ed67f52d414470e \n", "6 2021-01-25 19:28:21.390016+01:00 05e2104f1b74ced229c06847d6e91938 \n", "7 2021-01-25 19:31:05.076904+01:00 9cc946edfb25e11b4282f58db16e6ae9 \n", "8 2021-01-25 19:38:41.260535+01:00 d88321c347f0e0ab101184cdf25c94bf \n", "9 2021-02-05 11:12:31.932576+01:00 0870fef2bfcd5b30a12e4f5c7f4aaba7 \n", "10 2021-02-05 11:52:05.468207+01:00 8ba8934454cc62c7cdb3eb6e1b39df0c \n", "11 2023-03-13 11:31:50.528331+01:00 a6969df76efc15d157be48e87a7bcf9a " ] }, "execution_count": 15, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df1_type_ofs" ] }, { "cell_type": "code", "execution_count": 16, "id": "e9c8d32b-22f4-4581-8af7-31cc1c31fa0e", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "\n", "RangeIndex: 12 entries, 0 to 11\n", "Data columns (total 6 columns):\n", " # Column Non-Null Count Dtype \n", "--- ------ -------------- ----- \n", " 0 id 12 non-null int64 \n", " 1 name 12 non-null object\n", " 2 children 12 non-null object\n", " 3 created_at 12 non-null object\n", " 4 updated_at 12 non-null object\n", " 5 identifier 12 non-null object\n", "dtypes: int64(1), object(5)\n", "memory usage: 704.0+ bytes\n" ] } ], "source": [ "df1_type_ofs.info()" ] }, { "cell_type": "code", "execution_count": 17, "id": "cbb5e614-1fe5-4da0-bca0-8a242e0885da", "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "/tmp/ipykernel_864/81842251.py:3: SettingWithCopyWarning: \n", "A value is trying to be set on a copy of a slice from a DataFrame\n", "\n", "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n", " df1_type_ofs_clean.rename(columns = {'name' : 'type_of_ticket_name'}, inplace = True)\n" ] } ], "source": [ "# Selection des variables\n", "df1_type_ofs_clean = df1_type_ofs[['id', 'name', 'children']]\n", "df1_type_ofs_clean.rename(columns = {'name' : 'type_of_ticket_name'}, inplace = True)" ] }, { "cell_type": "markdown", "id": "676a9869-9a8b-4cd2-8b1c-0644b5229c72", "metadata": {}, "source": [ "## purchases.csv" ] }, { "cell_type": "code", "execution_count": 18, "id": "f8d36b72-f8e7-45e5-b4fa-e0803493fd3c", "metadata": { "scrolled": true }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
idpurchase_datecustomer_idcreated_atupdated_atnumberidentifier
051456622019-07-17 11:17:53+02:0066322021-12-28 20:48:51.569237+01:002021-12-28 20:48:51.569237+01:00fa80c83b29a268b45728c910a8afcf7982877c41df26f832eb823a83acd1a172
149416422018-10-31 11:59:00+01:0012021-12-28 20:31:48.196681+01:002022-03-03 17:52:21.958861+01:00597b6c06adfe6acc539b29b657b80da0e7102ebe65526c427245533ebabe66e5
250888602018-10-31 12:45:12+01:0012021-12-28 20:46:34.703542+01:002021-12-28 20:46:34.703542+01:004a7f6baaf9be6a99e3fead7f7e981fa8af75c4ae53d1b6957875538355b162e1
350888622018-10-31 13:07:12+01:0012021-12-28 20:46:34.704773+01:002021-12-28 20:46:34.704773+01:001d83dfad44b73070d1c6d5875d0edd2d4b2fe34659b177209b07270ae1043b40
450888632018-10-31 13:08:50+01:0012021-12-28 20:46:34.705453+01:002021-12-28 20:46:34.705453+01:007bfe2bc9c1670c973d0960e3fd408cf8b115f04a99b94df9e4a32185844f0998
........................
74224580076952023-11-08 17:51:19+01:0012561332023-11-09 07:51:33.920187+01:002023-11-09 07:51:33.920187+01:0099ad774dedbad43feb73514765d2f0bad68558180b4bf2e8a945724843655775
74224680076962023-11-08 18:17:51+01:0012561342023-11-09 07:51:33.921967+01:002023-11-09 07:51:33.921967+01:00c1511614c511c5f95980172690179102f5102d910a7731091f239ad7b0df35b4
74224780076972023-11-08 18:23:54+01:0012561352023-11-09 07:51:33.923034+01:002023-11-09 07:51:33.923034+01:0033b64b39cc53428b4f17d65ff5b93104e2b917626be60cc2c3207cc037fe69e4
74224880076982023-11-08 19:32:18+01:0012561362023-11-09 07:51:33.924135+01:002023-11-09 07:51:33.924135+01:009ae0b129e704b3d9c093ce9c7c4e50395bfa23236c31f8562c3a0233c1b53b31
74224980076992023-11-08 20:30:28+01:0012561372023-11-09 07:51:33.925382+01:002023-11-09 07:51:33.925382+01:00d31ced089c2b1f90479257a4686f9306d86b1e0de3ff01eaf04fbcd031ac5fef
\n", "

742250 rows × 7 columns

\n", "
" ], "text/plain": [ " id purchase_date customer_id \\\n", "0 5145662 2019-07-17 11:17:53+02:00 6632 \n", "1 4941642 2018-10-31 11:59:00+01:00 1 \n", "2 5088860 2018-10-31 12:45:12+01:00 1 \n", "3 5088862 2018-10-31 13:07:12+01:00 1 \n", "4 5088863 2018-10-31 13:08:50+01:00 1 \n", "... ... ... ... \n", "742245 8007695 2023-11-08 17:51:19+01:00 1256133 \n", "742246 8007696 2023-11-08 18:17:51+01:00 1256134 \n", "742247 8007697 2023-11-08 18:23:54+01:00 1256135 \n", "742248 8007698 2023-11-08 19:32:18+01:00 1256136 \n", "742249 8007699 2023-11-08 20:30:28+01:00 1256137 \n", "\n", " created_at updated_at \\\n", "0 2021-12-28 20:48:51.569237+01:00 2021-12-28 20:48:51.569237+01:00 \n", "1 2021-12-28 20:31:48.196681+01:00 2022-03-03 17:52:21.958861+01:00 \n", "2 2021-12-28 20:46:34.703542+01:00 2021-12-28 20:46:34.703542+01:00 \n", "3 2021-12-28 20:46:34.704773+01:00 2021-12-28 20:46:34.704773+01:00 \n", "4 2021-12-28 20:46:34.705453+01:00 2021-12-28 20:46:34.705453+01:00 \n", "... ... ... \n", "742245 2023-11-09 07:51:33.920187+01:00 2023-11-09 07:51:33.920187+01:00 \n", "742246 2023-11-09 07:51:33.921967+01:00 2023-11-09 07:51:33.921967+01:00 \n", "742247 2023-11-09 07:51:33.923034+01:00 2023-11-09 07:51:33.923034+01:00 \n", "742248 2023-11-09 07:51:33.924135+01:00 2023-11-09 07:51:33.924135+01:00 \n", "742249 2023-11-09 07:51:33.925382+01:00 2023-11-09 07:51:33.925382+01:00 \n", "\n", " number identifier \n", "0 fa80c83b29a268b45728c910a8afcf79 82877c41df26f832eb823a83acd1a172 \n", "1 597b6c06adfe6acc539b29b657b80da0 e7102ebe65526c427245533ebabe66e5 \n", "2 4a7f6baaf9be6a99e3fead7f7e981fa8 af75c4ae53d1b6957875538355b162e1 \n", "3 1d83dfad44b73070d1c6d5875d0edd2d 4b2fe34659b177209b07270ae1043b40 \n", "4 7bfe2bc9c1670c973d0960e3fd408cf8 b115f04a99b94df9e4a32185844f0998 \n", "... ... ... \n", "742245 99ad774dedbad43feb73514765d2f0ba d68558180b4bf2e8a945724843655775 \n", "742246 c1511614c511c5f95980172690179102 f5102d910a7731091f239ad7b0df35b4 \n", "742247 33b64b39cc53428b4f17d65ff5b93104 e2b917626be60cc2c3207cc037fe69e4 \n", "742248 9ae0b129e704b3d9c093ce9c7c4e5039 5bfa23236c31f8562c3a0233c1b53b31 \n", "742249 d31ced089c2b1f90479257a4686f9306 d86b1e0de3ff01eaf04fbcd031ac5fef \n", "\n", "[742250 rows x 7 columns]" ] }, "execution_count": 18, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df1_purchases" ] }, { "cell_type": "code", "execution_count": 19, "id": "3f266a9d-6eee-4b27-b6cc-d401bc2fa0b8", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "\n", "RangeIndex: 742250 entries, 0 to 742249\n", "Data columns (total 7 columns):\n", " # Column Non-Null Count Dtype \n", "--- ------ -------------- ----- \n", " 0 id 742250 non-null int64 \n", " 1 purchase_date 742250 non-null object\n", " 2 customer_id 742250 non-null int64 \n", " 3 created_at 742250 non-null object\n", " 4 updated_at 742250 non-null object\n", " 5 number 742250 non-null object\n", " 6 identifier 742250 non-null object\n", "dtypes: int64(2), object(5)\n", "memory usage: 39.6+ MB\n" ] } ], "source": [ "df1_purchases.info()" ] }, { "cell_type": "code", "execution_count": 20, "id": "8b24ccbc-ccf0-4722-8cd9-8ee8aa90d1fd", "metadata": {}, "outputs": [], "source": [ "# Nettoyage purchase_date\n", "df1_purchases['purchase_date'] = pd.to_datetime(df1_purchases['purchase_date'], utc = True)\n", "df1_purchases['purchase_date'] = pd.to_datetime(df1_purchases['purchase_date'], format = 'ISO8601')" ] }, { "cell_type": "code", "execution_count": 21, "id": "27d18584-228f-4698-85d6-4d23151ea5ed", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "\n", "RangeIndex: 742250 entries, 0 to 742249\n", "Data columns (total 7 columns):\n", " # Column Non-Null Count Dtype \n", "--- ------ -------------- ----- \n", " 0 id 742250 non-null int64 \n", " 1 purchase_date 742250 non-null datetime64[ns, UTC]\n", " 2 customer_id 742250 non-null int64 \n", " 3 created_at 742250 non-null object \n", " 4 updated_at 742250 non-null object \n", " 5 number 742250 non-null object \n", " 6 identifier 742250 non-null object \n", "dtypes: datetime64[ns, UTC](1), int64(2), object(4)\n", "memory usage: 39.6+ MB\n" ] } ], "source": [ "df1_purchases.info()" ] }, { "cell_type": "code", "execution_count": 22, "id": "ea22e3a2-2b25-481d-8ebc-194e11a06cd9", "metadata": {}, "outputs": [], "source": [ "# Selection des variables\n", "df1_purchases_clean = df1_purchases[['id', 'purchase_date', 'customer_id']]" ] }, { "cell_type": "markdown", "id": "53227600-c1c5-48aa-9f5d-db5a23a8a22a", "metadata": {}, "source": [ "## Fusion de l'ensemble des données billétiques" ] }, { "cell_type": "code", "execution_count": 23, "id": "e0b8b47a-b321-4a79-823c-36a131a78ac7", "metadata": {}, "outputs": [], "source": [ "# Fusion avec fournisseurs\n", "df1_ticket_information = pd.merge(df1_tickets_clean, df1_suppliers_clean, left_on = 'supplier_id', right_on = 'id', how = 'inner')\n", "df1_ticket_information.drop(['supplier_id', 'id'], axis = 1, inplace=True)\n", "\n", "# Fusion avec type de tickets\n", "df1_ticket_information = pd.merge(df1_ticket_information, df1_type_ofs_clean, left_on = 'type_of', right_on = 'id', how = 'inner')\n", "df1_ticket_information.drop(['type_of', 'id'], axis = 1, inplace=True)\n", "\n", "# Fusion avec achats\n", "df1_ticket_information = pd.merge(df1_ticket_information, df1_purchases_clean, left_on = 'purchase_id', right_on = 'id', how = 'inner')\n", "df1_ticket_information.drop(['purchase_id', 'id'], axis = 1, inplace=True)" ] }, { "cell_type": "code", "execution_count": 24, "id": "7572e6e7-f28d-43ba-b045-b9fa09e68e1d", "metadata": { "scrolled": true }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
ticket_idproduct_idis_from_subscriptionsupplier_nametype_of_ticket_namechildrenpurchase_datecustomer_id
013070859225251Falsevente en ligneAtelierpricing_formula2018-12-28 14:47:50+00:0048187
113070860224914Falsevente en ligneAtelierpricing_formula2018-12-28 14:47:50+00:0048187
213070861224914Falsevente en ligneAtelierpricing_formula2018-12-28 14:47:50+00:0048187
313070862224914Falsevente en ligneAtelierpricing_formula2018-12-28 14:47:50+00:0048187
413070863224914Falsevente en ligneAtelierpricing_formula2018-12-28 14:47:50+00:0048187
...........................
182666718643847350454FalsevadBillet en nombrepricing_formula2022-08-02 08:59:17+00:0041
182666819853111383564FalsevadBillet en nombrepricing_formula2022-11-04 14:25:42+00:0062763
182666919860514383751FalsevadBillet en nombrepricing_formula2022-11-18 10:47:26+00:001195566
182667019860515383751FalsevadBillet en nombrepricing_formula2022-11-18 10:47:26+00:001195566
182667119860516383751FalsevadBillet en nombrepricing_formula2022-11-18 10:47:26+00:001195566
\n", "

1826672 rows × 8 columns

\n", "
" ], "text/plain": [ " ticket_id product_id is_from_subscription supplier_name \\\n", "0 13070859 225251 False vente en ligne \n", "1 13070860 224914 False vente en ligne \n", "2 13070861 224914 False vente en ligne \n", "3 13070862 224914 False vente en ligne \n", "4 13070863 224914 False vente en ligne \n", "... ... ... ... ... \n", "1826667 18643847 350454 False vad \n", "1826668 19853111 383564 False vad \n", "1826669 19860514 383751 False vad \n", "1826670 19860515 383751 False vad \n", "1826671 19860516 383751 False vad \n", "\n", " type_of_ticket_name children purchase_date \\\n", "0 Atelier pricing_formula 2018-12-28 14:47:50+00:00 \n", "1 Atelier pricing_formula 2018-12-28 14:47:50+00:00 \n", "2 Atelier pricing_formula 2018-12-28 14:47:50+00:00 \n", "3 Atelier pricing_formula 2018-12-28 14:47:50+00:00 \n", "4 Atelier pricing_formula 2018-12-28 14:47:50+00:00 \n", "... ... ... ... \n", "1826667 Billet en nombre pricing_formula 2022-08-02 08:59:17+00:00 \n", "1826668 Billet en nombre pricing_formula 2022-11-04 14:25:42+00:00 \n", "1826669 Billet en nombre pricing_formula 2022-11-18 10:47:26+00:00 \n", "1826670 Billet en nombre pricing_formula 2022-11-18 10:47:26+00:00 \n", "1826671 Billet en nombre pricing_formula 2022-11-18 10:47:26+00:00 \n", "\n", " customer_id \n", "0 48187 \n", "1 48187 \n", "2 48187 \n", "3 48187 \n", "4 48187 \n", "... ... \n", "1826667 41 \n", "1826668 62763 \n", "1826669 1195566 \n", "1826670 1195566 \n", "1826671 1195566 \n", "\n", "[1826672 rows x 8 columns]" ] }, "execution_count": 24, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df1_ticket_information" ] }, { "cell_type": "markdown", "id": "ad2d0059-76d3-44b9-b0eb-0b0ca4d4ba75", "metadata": {}, "source": [ "# Utilisation de fonctions" ] }, { "cell_type": "markdown", "id": "27ecf058-23eb-4018-abbd-68c4ebe7c786", "metadata": {}, "source": [ "## Nettoyage, selection et fusion" ] }, { "cell_type": "code", "execution_count": 25, "id": "b95464b1-26bc-4aac-84b4-45da83b92251", "metadata": {}, "outputs": [], "source": [ "# Fonction de nettoyage et selection\n", "def preprocessing_tickets_area(tickets = None, purchases = None, suppliers = None, type_ofs = None):\n", " # Base des tickets\n", " tickets = tickets[['id', 'purchase_id', 'product_id', 'is_from_subscription', 'type_of', 'supplier_id']]\n", " tickets.rename(columns = {'id' : 'ticket_id'}, inplace = True)\n", "\n", " # Base des fournisseurs\n", " suppliers = suppliers[['id', 'name']]\n", " suppliers.rename(columns = {'name' : 'supplier_name'}, inplace = True)\n", "\n", " # Base des types de billets\n", " type_ofs = type_ofs[['id', 'name', 'children']]\n", " type_ofs.rename(columns = {'name' : 'type_of_ticket_name'}, inplace = True)\n", "\n", " # Base des achats\n", " # Nettoyage de la date d'achat\n", " purchases['purchase_date'] = pd.to_datetime(purchases['purchase_date'], utc = True)\n", " purchases['purchase_date'] = pd.to_datetime(purchases['purchase_date'], format = 'ISO8601')\n", " # Selection des variables\n", " purchases = purchases[['id', 'purchase_date', 'customer_id']]\n", "\n", " # Fusions \n", " # Fusion avec fournisseurs\n", " ticket_information = pd.merge(tickets, suppliers, left_on = 'supplier_id', right_on = 'id', how = 'inner')\n", " ticket_information.drop(['supplier_id', 'id'], axis = 1, inplace=True)\n", " \n", " # Fusion avec type de tickets\n", " ticket_information = pd.merge(ticket_information, type_ofs, left_on = 'type_of', right_on = 'id', how = 'inner')\n", " ticket_information.drop(['type_of', 'id'], axis = 1, inplace=True)\n", " \n", " # Fusion avec achats\n", " ticket_information = pd.merge(ticket_information, purchases, left_on = 'purchase_id', right_on = 'id', how = 'inner')\n", " ticket_information.drop(['purchase_id', 'id'], axis = 1, inplace=True)\n", "\n", " return ticket_information" ] }, { "cell_type": "code", "execution_count": 28, "id": "3e1d2ba7-ff4f-48eb-93a8-2bb648c70396", "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "/tmp/ipykernel_864/2452826288.py:5: SettingWithCopyWarning: \n", "A value is trying to be set on a copy of a slice from a DataFrame\n", "\n", "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n", " tickets_clean.rename(columns = {'id' : 'ticket_id'}, inplace = True)\n", "/tmp/ipykernel_864/2452826288.py:9: SettingWithCopyWarning: \n", "A value is trying to be set on a copy of a slice from a DataFrame\n", "\n", "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n", " suppliers_clean.rename(columns = {'name' : 'supplier_name'}, inplace = True)\n", "/tmp/ipykernel_864/2452826288.py:13: SettingWithCopyWarning: \n", "A value is trying to be set on a copy of a slice from a DataFrame\n", "\n", "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n", " type_ofs_clean.rename(columns = {'name' : 'type_of_ticket_name'}, inplace = True)\n" ] } ], "source": [ "df1_ticket_information_test = preprocessing_tickets_area(tickets = df1_tickets, purchases = df1_purchases, suppliers = df1_suppliers, type_ofs = df1_type_ofs)" ] }, { "cell_type": "code", "execution_count": 29, "id": "2877f3de-55d6-42d6-ad94-352d3e107862", "metadata": { "scrolled": true }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
ticket_idproduct_idis_from_subscriptionsupplier_nametype_of_ticket_namechildrenpurchase_datecustomer_id
013070859225251Falsevente en ligneAtelierpricing_formula2018-12-28 14:47:50+00:0048187
113070860224914Falsevente en ligneAtelierpricing_formula2018-12-28 14:47:50+00:0048187
213070861224914Falsevente en ligneAtelierpricing_formula2018-12-28 14:47:50+00:0048187
313070862224914Falsevente en ligneAtelierpricing_formula2018-12-28 14:47:50+00:0048187
413070863224914Falsevente en ligneAtelierpricing_formula2018-12-28 14:47:50+00:0048187
...........................
182666718643847350454FalsevadBillet en nombrepricing_formula2022-08-02 08:59:17+00:0041
182666819853111383564FalsevadBillet en nombrepricing_formula2022-11-04 14:25:42+00:0062763
182666919860514383751FalsevadBillet en nombrepricing_formula2022-11-18 10:47:26+00:001195566
182667019860515383751FalsevadBillet en nombrepricing_formula2022-11-18 10:47:26+00:001195566
182667119860516383751FalsevadBillet en nombrepricing_formula2022-11-18 10:47:26+00:001195566
\n", "

1826672 rows × 8 columns

\n", "
" ], "text/plain": [ " ticket_id product_id is_from_subscription supplier_name \\\n", "0 13070859 225251 False vente en ligne \n", "1 13070860 224914 False vente en ligne \n", "2 13070861 224914 False vente en ligne \n", "3 13070862 224914 False vente en ligne \n", "4 13070863 224914 False vente en ligne \n", "... ... ... ... ... \n", "1826667 18643847 350454 False vad \n", "1826668 19853111 383564 False vad \n", "1826669 19860514 383751 False vad \n", "1826670 19860515 383751 False vad \n", "1826671 19860516 383751 False vad \n", "\n", " type_of_ticket_name children purchase_date \\\n", "0 Atelier pricing_formula 2018-12-28 14:47:50+00:00 \n", "1 Atelier pricing_formula 2018-12-28 14:47:50+00:00 \n", "2 Atelier pricing_formula 2018-12-28 14:47:50+00:00 \n", "3 Atelier pricing_formula 2018-12-28 14:47:50+00:00 \n", "4 Atelier pricing_formula 2018-12-28 14:47:50+00:00 \n", "... ... ... ... \n", "1826667 Billet en nombre pricing_formula 2022-08-02 08:59:17+00:00 \n", "1826668 Billet en nombre pricing_formula 2022-11-04 14:25:42+00:00 \n", "1826669 Billet en nombre pricing_formula 2022-11-18 10:47:26+00:00 \n", "1826670 Billet en nombre pricing_formula 2022-11-18 10:47:26+00:00 \n", "1826671 Billet en nombre pricing_formula 2022-11-18 10:47:26+00:00 \n", "\n", " customer_id \n", "0 48187 \n", "1 48187 \n", "2 48187 \n", "3 48187 \n", "4 48187 \n", "... ... \n", "1826667 41 \n", "1826668 62763 \n", "1826669 1195566 \n", "1826670 1195566 \n", "1826671 1195566 \n", "\n", "[1826672 rows x 8 columns]" ] }, "execution_count": 29, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df1_ticket_information" ] }, { "cell_type": "markdown", "id": "96ea2523-38dc-47ef-a49e-2c2d9ad0b1c6", "metadata": {}, "source": [ "## Exploration variables" ] }, { "cell_type": "code", "execution_count": 41, "id": "aaa41688-ea7e-4dba-851c-1f0b0ec43c71", "metadata": {}, "outputs": [], "source": [ "# Fonction d'exploration pour suppliers.csv = label itr et commission inconnues\n", "def suppliers_exploration(suppliers = None) : \n", " \n", " # Taux de NaN pour ces colonnes\n", " label_na = suppliers['label'].isna().sum()/len(suppliers)*100\n", " itr_na = suppliers['itr'].isna().sum()/len(suppliers)*100\n", " commission_na = suppliers['commission'].isna().sum()/len(suppliers)*100\n", "\n", " suppliers_desc = pd.DataFrame({'nb_suppliers' : [suppliers['name'].nunique()],\n", " 'label_na' : [label_na],\n", " 'itr_na' : [itr_na],\n", " 'commission_na' : [commission_na]})\n", "\n", " return suppliers_desc" ] }, { "cell_type": "code", "execution_count": 42, "id": "2fecc2e1-113f-46ed-9065-0b9ee416166e", "metadata": {}, "outputs": [], "source": [ "df1_suppliers_desc = suppliers_exploration(suppliers = df1_suppliers)" ] }, { "cell_type": "code", "execution_count": 43, "id": "55f6170a-36fb-4efb-9810-f982883660cf", "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
nb_supplierslabel_naitr_nacommission_na
09100.0100.0100.0
\n", "
" ], "text/plain": [ " nb_suppliers label_na itr_na commission_na\n", "0 9 100.0 100.0 100.0" ] }, "execution_count": 43, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df1_suppliers_desc" ] }, { "cell_type": "code", "execution_count": 47, "id": "0030fd02-09e3-42f5-9c83-290458a38c29", "metadata": {}, "outputs": [], "source": [ "BUCKET = \"bdc2324-data\"\n", "liste_folders = fs.ls(BUCKET)\n", "\n", "liste_files = []\n", "for company_folder in liste_folders : \n", " liste_files.extend(fs.ls(company_folder))" ] }, { "cell_type": "code", "execution_count": 51, "id": "6b1736d1-8fd7-4fcc-9431-b8bf0c7b4f2b", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "['bdc2324-data/1/1suppliers.csv', 'bdc2324-data/10/10suppliers.csv', 'bdc2324-data/101/101suppliers.csv', 'bdc2324-data/11/11suppliers.csv', 'bdc2324-data/12/12suppliers.csv', 'bdc2324-data/13/13suppliers.csv', 'bdc2324-data/14/14suppliers.csv', 'bdc2324-data/2/2suppliers.csv', 'bdc2324-data/3/3suppliers.csv', 'bdc2324-data/4/4suppliers.csv', 'bdc2324-data/5/5suppliers.csv', 'bdc2324-data/6/6suppliers.csv', 'bdc2324-data/7/7suppliers.csv', 'bdc2324-data/8/8suppliers.csv', 'bdc2324-data/9/9suppliers.csv']\n" ] } ], "source": [ "liste_database_select = ['suppliers']\n", "\n", "# Filtrer la liste pour les éléments contenant au moins un élément de la liste à tester\n", "liste_suppliers = [element for element in liste_files if any(element_part in element for element_part in liste_database_select)]\n", "\n", "# Afficher le résultat\n", "print(liste_suppliers)" ] }, { "cell_type": "code", "execution_count": null, "id": "226b694b-0b00-4167-b69f-3178902254eb", "metadata": {}, "outputs": [], "source": [ "# loop to create dataframes from file 2\n", "def database_loading(database_name = None):\n", " files_path = database_name\n", " \n", " client_number = files_path[0].split(\"/\")[1]\n", " df_prefix = \"df\" + str(client_number) + \"_\"\n", " \n", " for i in range(len(files_path)) :\n", " current_path = files_path[i]\n", " with fs.open(current_path, mode=\"rb\") as file_in:\n", " df = pd.read_csv(file_in)\n", " # the pattern of the name is df1xxx\n", " nom_dataframe = df_prefix + re.search(r'\\/(\\d+)\\/(\\d+)([a-zA-Z_]+)\\.csv$', current_path).group(3)\n", " globals()[nom_dataframe] = df\n", "\n", " " ] } ], "metadata": { "kernelspec": { "display_name": "Python 3 (ipykernel)", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.10.13" } }, "nbformat": 4, "nbformat_minor": 5 }