2175 lines
78 KiB
Plaintext
2175 lines
78 KiB
Plaintext
{
|
||
"cells": [
|
||
{
|
||
"cell_type": "markdown",
|
||
"id": "56b3d44e-1e3f-4726-9916-0f9af107860e",
|
||
"metadata": {},
|
||
"source": [
|
||
"# Business Data Challenge - Team 1"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 1,
|
||
"id": "15103481-8d74-404c-aa09-7601fe7730da",
|
||
"metadata": {},
|
||
"outputs": [],
|
||
"source": [
|
||
"import pandas as pd\n",
|
||
"import numpy as np\n",
|
||
"import os\n",
|
||
"import s3fs\n",
|
||
"import re"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "markdown",
|
||
"id": "c3bb0d13-34b2-4e1c-9985-468cd87c5a0e",
|
||
"metadata": {},
|
||
"source": [
|
||
"Configuration de l'accès aux données"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 2,
|
||
"id": "5d83bb1a-d341-446e-91f6-1c428607f6d4",
|
||
"metadata": {},
|
||
"outputs": [],
|
||
"source": [
|
||
"# Create filesystem object\n",
|
||
"S3_ENDPOINT_URL = \"https://\" + os.environ[\"AWS_S3_ENDPOINT\"]\n",
|
||
"fs = s3fs.S3FileSystem(client_kwargs={'endpoint_url': S3_ENDPOINT_URL})"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "markdown",
|
||
"id": "f99da24f-0d93-4618-92bc-3ba81dc0445c",
|
||
"metadata": {},
|
||
"source": [
|
||
"# Exemple sur Company 1"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "markdown",
|
||
"id": "9d74b68f-ba07-4a15-9a27-dae931762d70",
|
||
"metadata": {},
|
||
"source": [
|
||
"## Chargement données"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 3,
|
||
"id": "699664b9-eee4-4f8d-a207-e524526560c5",
|
||
"metadata": {},
|
||
"outputs": [],
|
||
"source": [
|
||
"BUCKET = \"bdc2324-data/1\"\n",
|
||
"liste_database = fs.ls(BUCKET)"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 4,
|
||
"id": "0cb92854-903b-4efd-ac1b-197e29f044b4",
|
||
"metadata": {},
|
||
"outputs": [
|
||
{
|
||
"name": "stdout",
|
||
"output_type": "stream",
|
||
"text": [
|
||
"['bdc2324-data/1/1purchases.csv', 'bdc2324-data/1/1suppliers.csv', 'bdc2324-data/1/1tickets.csv', 'bdc2324-data/1/1type_ofs.csv']\n"
|
||
]
|
||
}
|
||
],
|
||
"source": [
|
||
"liste_database_select = ['suppliers', 'ticket', 'purchase', 'consumption', 'type_ofs']\n",
|
||
"\n",
|
||
"# Filtrer la liste pour les éléments contenant au moins un élément de la liste à tester\n",
|
||
"liste_database_filtered = [element for element in liste_database if any(element_part in element for element_part in liste_database_select)]\n",
|
||
"\n",
|
||
"# Afficher le résultat\n",
|
||
"print(liste_database_filtered)"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 5,
|
||
"id": "dd6a3518-b752-4a1e-b77b-9e03e853c3ed",
|
||
"metadata": {},
|
||
"outputs": [
|
||
{
|
||
"name": "stderr",
|
||
"output_type": "stream",
|
||
"text": [
|
||
"/tmp/ipykernel_864/4135596479.py:10: DtypeWarning: Columns (1) have mixed types. Specify dtype option on import or set low_memory=False.\n",
|
||
" df = pd.read_csv(file_in)\n"
|
||
]
|
||
}
|
||
],
|
||
"source": [
|
||
"# loop to create dataframes from file 2\n",
|
||
"files_path = liste_database_filtered\n",
|
||
"\n",
|
||
"client_number = files_path[0].split(\"/\")[1]\n",
|
||
"df_prefix = \"df\" + str(client_number) + \"_\"\n",
|
||
"\n",
|
||
"for i in range(len(files_path)) :\n",
|
||
" current_path = files_path[i]\n",
|
||
" with fs.open(current_path, mode=\"rb\") as file_in:\n",
|
||
" df = pd.read_csv(file_in)\n",
|
||
" # the pattern of the name is df1xxx\n",
|
||
" nom_dataframe = df_prefix + re.search(r'\\/(\\d+)\\/(\\d+)([a-zA-Z_]+)\\.csv$', current_path).group(3)\n",
|
||
" globals()[nom_dataframe] = df"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "markdown",
|
||
"id": "e908f516-2a74-45d6-8492-7dcdc3afbe1f",
|
||
"metadata": {},
|
||
"source": [
|
||
"## tickets.csv"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 6,
|
||
"id": "14f4158e-c9c0-4beb-826a-5e0f949434a4",
|
||
"metadata": {},
|
||
"outputs": [
|
||
{
|
||
"data": {
|
||
"text/html": [
|
||
"<div>\n",
|
||
"<style scoped>\n",
|
||
" .dataframe tbody tr th:only-of-type {\n",
|
||
" vertical-align: middle;\n",
|
||
" }\n",
|
||
"\n",
|
||
" .dataframe tbody tr th {\n",
|
||
" vertical-align: top;\n",
|
||
" }\n",
|
||
"\n",
|
||
" .dataframe thead th {\n",
|
||
" text-align: right;\n",
|
||
" }\n",
|
||
"</style>\n",
|
||
"<table border=\"1\" class=\"dataframe\">\n",
|
||
" <thead>\n",
|
||
" <tr style=\"text-align: right;\">\n",
|
||
" <th></th>\n",
|
||
" <th>id</th>\n",
|
||
" <th>number</th>\n",
|
||
" <th>created_at</th>\n",
|
||
" <th>updated_at</th>\n",
|
||
" <th>purchase_id</th>\n",
|
||
" <th>product_id</th>\n",
|
||
" <th>is_from_subscription</th>\n",
|
||
" <th>type_of</th>\n",
|
||
" <th>supplier_id</th>\n",
|
||
" <th>barcode</th>\n",
|
||
" <th>identifier</th>\n",
|
||
" </tr>\n",
|
||
" </thead>\n",
|
||
" <tbody>\n",
|
||
" <tr>\n",
|
||
" <th>0</th>\n",
|
||
" <td>13070859</td>\n",
|
||
" <td>13593002661288</td>\n",
|
||
" <td>2021-12-28 20:47:10.320641+01:00</td>\n",
|
||
" <td>2022-02-14 18:46:53.614229+01:00</td>\n",
|
||
" <td>5107462</td>\n",
|
||
" <td>225251</td>\n",
|
||
" <td>False</td>\n",
|
||
" <td>1</td>\n",
|
||
" <td>3</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>b6ad7fc36f33b5e05f58c7fca06688a6</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>1</th>\n",
|
||
" <td>13070860</td>\n",
|
||
" <td>13593002661399</td>\n",
|
||
" <td>2021-12-28 20:47:10.321037+01:00</td>\n",
|
||
" <td>2022-02-14 18:46:53.614761+01:00</td>\n",
|
||
" <td>5107462</td>\n",
|
||
" <td>224914</td>\n",
|
||
" <td>False</td>\n",
|
||
" <td>1</td>\n",
|
||
" <td>3</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>b0903af480266f27802fe5c38c277c9e</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>2</th>\n",
|
||
" <td>13070861</td>\n",
|
||
" <td>13593002661419</td>\n",
|
||
" <td>2021-12-28 20:47:10.321629+01:00</td>\n",
|
||
" <td>2022-02-14 18:46:53.615521+01:00</td>\n",
|
||
" <td>5107462</td>\n",
|
||
" <td>224914</td>\n",
|
||
" <td>False</td>\n",
|
||
" <td>1</td>\n",
|
||
" <td>3</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>64ca12b7e26a65b90335c0702ea0faba</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>3</th>\n",
|
||
" <td>13070862</td>\n",
|
||
" <td>13593002661508</td>\n",
|
||
" <td>2021-12-28 20:47:10.322029+01:00</td>\n",
|
||
" <td>2022-02-14 18:46:53.616000+01:00</td>\n",
|
||
" <td>5107462</td>\n",
|
||
" <td>224914</td>\n",
|
||
" <td>False</td>\n",
|
||
" <td>1</td>\n",
|
||
" <td>3</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>5ac2f8150aa9f3a6b1599df08cc2f0c7</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>4</th>\n",
|
||
" <td>13070863</td>\n",
|
||
" <td>13593002661689</td>\n",
|
||
" <td>2021-12-28 20:47:10.322449+01:00</td>\n",
|
||
" <td>2022-02-14 18:46:53.616447+01:00</td>\n",
|
||
" <td>5107462</td>\n",
|
||
" <td>224914</td>\n",
|
||
" <td>False</td>\n",
|
||
" <td>1</td>\n",
|
||
" <td>3</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>dfe30081bae020d12094279926136b9c</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>...</th>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>1826667</th>\n",
|
||
" <td>20662815</td>\n",
|
||
" <td>13593016154390</td>\n",
|
||
" <td>2023-11-09 07:51:34.935983+01:00</td>\n",
|
||
" <td>2023-11-09 07:51:34.935983+01:00</td>\n",
|
||
" <td>8007697</td>\n",
|
||
" <td>405689</td>\n",
|
||
" <td>False</td>\n",
|
||
" <td>1</td>\n",
|
||
" <td>3</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>dba9aa428f843b79ae69dfacfe8fc579</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>1826668</th>\n",
|
||
" <td>20662816</td>\n",
|
||
" <td>13593016154501</td>\n",
|
||
" <td>2023-11-09 07:51:34.937038+01:00</td>\n",
|
||
" <td>2023-11-09 07:51:34.937038+01:00</td>\n",
|
||
" <td>8007698</td>\n",
|
||
" <td>403658</td>\n",
|
||
" <td>False</td>\n",
|
||
" <td>1</td>\n",
|
||
" <td>3</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>93f1fcfc6ba4fa68f92eb4b4a619fcf0</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>1826669</th>\n",
|
||
" <td>20662817</td>\n",
|
||
" <td>13593016154680</td>\n",
|
||
" <td>2023-11-09 07:51:34.938224+01:00</td>\n",
|
||
" <td>2023-11-09 07:51:34.938224+01:00</td>\n",
|
||
" <td>8007698</td>\n",
|
||
" <td>403658</td>\n",
|
||
" <td>False</td>\n",
|
||
" <td>1</td>\n",
|
||
" <td>3</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>c8bbbd25df2c158767ceef42c3237f23</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>1826670</th>\n",
|
||
" <td>20662818</td>\n",
|
||
" <td>13593016154899</td>\n",
|
||
" <td>2023-11-09 07:51:34.939328+01:00</td>\n",
|
||
" <td>2023-11-09 07:51:34.939328+01:00</td>\n",
|
||
" <td>8007699</td>\n",
|
||
" <td>403658</td>\n",
|
||
" <td>False</td>\n",
|
||
" <td>1</td>\n",
|
||
" <td>3</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>738f0a8b5088b5056bc3b32eff2dca1f</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>1826671</th>\n",
|
||
" <td>20662819</td>\n",
|
||
" <td>13593016154988</td>\n",
|
||
" <td>2023-11-09 07:51:34.940680+01:00</td>\n",
|
||
" <td>2023-11-09 07:51:34.940680+01:00</td>\n",
|
||
" <td>8007699</td>\n",
|
||
" <td>403658</td>\n",
|
||
" <td>False</td>\n",
|
||
" <td>1</td>\n",
|
||
" <td>3</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>4c5a6195434377380b4e6ae63b2e9cf6</td>\n",
|
||
" </tr>\n",
|
||
" </tbody>\n",
|
||
"</table>\n",
|
||
"<p>1826672 rows × 11 columns</p>\n",
|
||
"</div>"
|
||
],
|
||
"text/plain": [
|
||
" id number created_at \\\n",
|
||
"0 13070859 13593002661288 2021-12-28 20:47:10.320641+01:00 \n",
|
||
"1 13070860 13593002661399 2021-12-28 20:47:10.321037+01:00 \n",
|
||
"2 13070861 13593002661419 2021-12-28 20:47:10.321629+01:00 \n",
|
||
"3 13070862 13593002661508 2021-12-28 20:47:10.322029+01:00 \n",
|
||
"4 13070863 13593002661689 2021-12-28 20:47:10.322449+01:00 \n",
|
||
"... ... ... ... \n",
|
||
"1826667 20662815 13593016154390 2023-11-09 07:51:34.935983+01:00 \n",
|
||
"1826668 20662816 13593016154501 2023-11-09 07:51:34.937038+01:00 \n",
|
||
"1826669 20662817 13593016154680 2023-11-09 07:51:34.938224+01:00 \n",
|
||
"1826670 20662818 13593016154899 2023-11-09 07:51:34.939328+01:00 \n",
|
||
"1826671 20662819 13593016154988 2023-11-09 07:51:34.940680+01:00 \n",
|
||
"\n",
|
||
" updated_at purchase_id product_id \\\n",
|
||
"0 2022-02-14 18:46:53.614229+01:00 5107462 225251 \n",
|
||
"1 2022-02-14 18:46:53.614761+01:00 5107462 224914 \n",
|
||
"2 2022-02-14 18:46:53.615521+01:00 5107462 224914 \n",
|
||
"3 2022-02-14 18:46:53.616000+01:00 5107462 224914 \n",
|
||
"4 2022-02-14 18:46:53.616447+01:00 5107462 224914 \n",
|
||
"... ... ... ... \n",
|
||
"1826667 2023-11-09 07:51:34.935983+01:00 8007697 405689 \n",
|
||
"1826668 2023-11-09 07:51:34.937038+01:00 8007698 403658 \n",
|
||
"1826669 2023-11-09 07:51:34.938224+01:00 8007698 403658 \n",
|
||
"1826670 2023-11-09 07:51:34.939328+01:00 8007699 403658 \n",
|
||
"1826671 2023-11-09 07:51:34.940680+01:00 8007699 403658 \n",
|
||
"\n",
|
||
" is_from_subscription type_of supplier_id barcode \\\n",
|
||
"0 False 1 3 NaN \n",
|
||
"1 False 1 3 NaN \n",
|
||
"2 False 1 3 NaN \n",
|
||
"3 False 1 3 NaN \n",
|
||
"4 False 1 3 NaN \n",
|
||
"... ... ... ... ... \n",
|
||
"1826667 False 1 3 NaN \n",
|
||
"1826668 False 1 3 NaN \n",
|
||
"1826669 False 1 3 NaN \n",
|
||
"1826670 False 1 3 NaN \n",
|
||
"1826671 False 1 3 NaN \n",
|
||
"\n",
|
||
" identifier \n",
|
||
"0 b6ad7fc36f33b5e05f58c7fca06688a6 \n",
|
||
"1 b0903af480266f27802fe5c38c277c9e \n",
|
||
"2 64ca12b7e26a65b90335c0702ea0faba \n",
|
||
"3 5ac2f8150aa9f3a6b1599df08cc2f0c7 \n",
|
||
"4 dfe30081bae020d12094279926136b9c \n",
|
||
"... ... \n",
|
||
"1826667 dba9aa428f843b79ae69dfacfe8fc579 \n",
|
||
"1826668 93f1fcfc6ba4fa68f92eb4b4a619fcf0 \n",
|
||
"1826669 c8bbbd25df2c158767ceef42c3237f23 \n",
|
||
"1826670 738f0a8b5088b5056bc3b32eff2dca1f \n",
|
||
"1826671 4c5a6195434377380b4e6ae63b2e9cf6 \n",
|
||
"\n",
|
||
"[1826672 rows x 11 columns]"
|
||
]
|
||
},
|
||
"execution_count": 6,
|
||
"metadata": {},
|
||
"output_type": "execute_result"
|
||
}
|
||
],
|
||
"source": [
|
||
"df1_tickets"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 7,
|
||
"id": "f3c35394-b586-4ae4-b5ab-b03bb01bb618",
|
||
"metadata": {},
|
||
"outputs": [
|
||
{
|
||
"name": "stdout",
|
||
"output_type": "stream",
|
||
"text": [
|
||
"<class 'pandas.core.frame.DataFrame'>\n",
|
||
"RangeIndex: 1826672 entries, 0 to 1826671\n",
|
||
"Data columns (total 11 columns):\n",
|
||
" # Column Dtype \n",
|
||
"--- ------ ----- \n",
|
||
" 0 id int64 \n",
|
||
" 1 number object \n",
|
||
" 2 created_at object \n",
|
||
" 3 updated_at object \n",
|
||
" 4 purchase_id int64 \n",
|
||
" 5 product_id int64 \n",
|
||
" 6 is_from_subscription bool \n",
|
||
" 7 type_of int64 \n",
|
||
" 8 supplier_id int64 \n",
|
||
" 9 barcode float64\n",
|
||
" 10 identifier object \n",
|
||
"dtypes: bool(1), float64(1), int64(5), object(4)\n",
|
||
"memory usage: 141.1+ MB\n"
|
||
]
|
||
}
|
||
],
|
||
"source": [
|
||
"df1_tickets.info()"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 8,
|
||
"id": "c1b42769-03c7-4785-92ce-5e1e6b41908d",
|
||
"metadata": {},
|
||
"outputs": [
|
||
{
|
||
"data": {
|
||
"text/plain": [
|
||
"id 0.0\n",
|
||
"number 0.0\n",
|
||
"created_at 0.0\n",
|
||
"updated_at 0.0\n",
|
||
"purchase_id 0.0\n",
|
||
"product_id 0.0\n",
|
||
"is_from_subscription 0.0\n",
|
||
"type_of 0.0\n",
|
||
"supplier_id 0.0\n",
|
||
"barcode 100.0\n",
|
||
"identifier 0.0\n",
|
||
"dtype: float64"
|
||
]
|
||
},
|
||
"execution_count": 8,
|
||
"metadata": {},
|
||
"output_type": "execute_result"
|
||
}
|
||
],
|
||
"source": [
|
||
"df1_tickets.isna().sum()/len(df1_tickets)*100"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 9,
|
||
"id": "42896791-2d93-4725-a50b-6c7cbe535ec7",
|
||
"metadata": {},
|
||
"outputs": [
|
||
{
|
||
"name": "stderr",
|
||
"output_type": "stream",
|
||
"text": [
|
||
"/tmp/ipykernel_864/232847087.py:3: SettingWithCopyWarning: \n",
|
||
"A value is trying to be set on a copy of a slice from a DataFrame\n",
|
||
"\n",
|
||
"See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n",
|
||
" df1_tickets_clean.rename(columns = {'id' : 'ticket_id'}, inplace = True)\n"
|
||
]
|
||
}
|
||
],
|
||
"source": [
|
||
"# Selection des variables\n",
|
||
"df1_tickets_clean = df1_tickets[['id', 'purchase_id', 'product_id', 'is_from_subscription', 'type_of', 'supplier_id']]\n",
|
||
"df1_tickets_clean.rename(columns = {'id' : 'ticket_id'}, inplace = True)"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "markdown",
|
||
"id": "78453f3c-4f89-44ed-a6c6-2a7443b72b52",
|
||
"metadata": {},
|
||
"source": [
|
||
"## suppliers.csv"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 10,
|
||
"id": "2e0dada0-9457-484c-aa55-77e44613ecca",
|
||
"metadata": {},
|
||
"outputs": [
|
||
{
|
||
"data": {
|
||
"text/html": [
|
||
"<div>\n",
|
||
"<style scoped>\n",
|
||
" .dataframe tbody tr th:only-of-type {\n",
|
||
" vertical-align: middle;\n",
|
||
" }\n",
|
||
"\n",
|
||
" .dataframe tbody tr th {\n",
|
||
" vertical-align: top;\n",
|
||
" }\n",
|
||
"\n",
|
||
" .dataframe thead th {\n",
|
||
" text-align: right;\n",
|
||
" }\n",
|
||
"</style>\n",
|
||
"<table border=\"1\" class=\"dataframe\">\n",
|
||
" <thead>\n",
|
||
" <tr style=\"text-align: right;\">\n",
|
||
" <th></th>\n",
|
||
" <th>id</th>\n",
|
||
" <th>name</th>\n",
|
||
" <th>manually_added</th>\n",
|
||
" <th>label</th>\n",
|
||
" <th>itr</th>\n",
|
||
" <th>updated_at</th>\n",
|
||
" <th>created_at</th>\n",
|
||
" <th>commission</th>\n",
|
||
" <th>identifier</th>\n",
|
||
" </tr>\n",
|
||
" </thead>\n",
|
||
" <tbody>\n",
|
||
" <tr>\n",
|
||
" <th>0</th>\n",
|
||
" <td>1617</td>\n",
|
||
" <td>j4 administration</td>\n",
|
||
" <td>False</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>2021-07-29 09:21:37.325772+02:00</td>\n",
|
||
" <td>2021-07-29 09:21:37.325772+02:00</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>5958b2a060ac3e31678b438892a1bd2e</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>1</th>\n",
|
||
" <td>8</td>\n",
|
||
" <td>non défini</td>\n",
|
||
" <td>False</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>2020-09-03 13:16:35.329062+02:00</td>\n",
|
||
" <td>2020-09-03 13:16:35.329062+02:00</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>52ff3466787b4d538407372e5f7afe0f</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>2</th>\n",
|
||
" <td>4</td>\n",
|
||
" <td>vad</td>\n",
|
||
" <td>False</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>2020-09-03 13:11:23.896992+02:00</td>\n",
|
||
" <td>2020-09-03 13:11:23.896992+02:00</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>1225483c97b36018cab2bea14ab78ea6</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>3</th>\n",
|
||
" <td>1</td>\n",
|
||
" <td>fort saint jean</td>\n",
|
||
" <td>False</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>2020-09-03 13:11:23.833073+02:00</td>\n",
|
||
" <td>2020-09-03 13:11:23.833073+02:00</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>001b9b4a524fe407150b8235b304d4ec</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>4</th>\n",
|
||
" <td>2</td>\n",
|
||
" <td>j4</td>\n",
|
||
" <td>False</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>2020-09-03 13:11:23.888993+02:00</td>\n",
|
||
" <td>2020-09-03 13:11:23.888993+02:00</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>6a0cf6edf20060344b465706b61719aa</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>5</th>\n",
|
||
" <td>5</td>\n",
|
||
" <td>revendeur</td>\n",
|
||
" <td>False</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>2020-09-03 13:11:23.900987+02:00</td>\n",
|
||
" <td>2020-09-03 13:11:23.900987+02:00</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>931239d4acb6214d7e5c98edecfb4916</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>6</th>\n",
|
||
" <td>3</td>\n",
|
||
" <td>vente en ligne</td>\n",
|
||
" <td>False</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>2020-09-03 13:11:23.893097+02:00</td>\n",
|
||
" <td>2020-09-03 13:11:23.893097+02:00</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>bde8f2ccff510df8572d3214d86b837d</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>7</th>\n",
|
||
" <td>6</td>\n",
|
||
" <td>ccr</td>\n",
|
||
" <td>False</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>2020-09-03 13:11:23.904974+02:00</td>\n",
|
||
" <td>2020-09-03 13:11:23.904974+02:00</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>b48ec279411f7dbbb68393c61a9724d9</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>8</th>\n",
|
||
" <td>7</td>\n",
|
||
" <td>dab</td>\n",
|
||
" <td>False</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>2020-09-03 13:11:23.908970+02:00</td>\n",
|
||
" <td>2020-09-03 13:11:23.908970+02:00</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>11c6d471fa4e354e62e684d293694202</td>\n",
|
||
" </tr>\n",
|
||
" </tbody>\n",
|
||
"</table>\n",
|
||
"</div>"
|
||
],
|
||
"text/plain": [
|
||
" id name manually_added label itr \\\n",
|
||
"0 1617 j4 administration False NaN NaN \n",
|
||
"1 8 non défini False NaN NaN \n",
|
||
"2 4 vad False NaN NaN \n",
|
||
"3 1 fort saint jean False NaN NaN \n",
|
||
"4 2 j4 False NaN NaN \n",
|
||
"5 5 revendeur False NaN NaN \n",
|
||
"6 3 vente en ligne False NaN NaN \n",
|
||
"7 6 ccr False NaN NaN \n",
|
||
"8 7 dab False NaN NaN \n",
|
||
"\n",
|
||
" updated_at created_at \\\n",
|
||
"0 2021-07-29 09:21:37.325772+02:00 2021-07-29 09:21:37.325772+02:00 \n",
|
||
"1 2020-09-03 13:16:35.329062+02:00 2020-09-03 13:16:35.329062+02:00 \n",
|
||
"2 2020-09-03 13:11:23.896992+02:00 2020-09-03 13:11:23.896992+02:00 \n",
|
||
"3 2020-09-03 13:11:23.833073+02:00 2020-09-03 13:11:23.833073+02:00 \n",
|
||
"4 2020-09-03 13:11:23.888993+02:00 2020-09-03 13:11:23.888993+02:00 \n",
|
||
"5 2020-09-03 13:11:23.900987+02:00 2020-09-03 13:11:23.900987+02:00 \n",
|
||
"6 2020-09-03 13:11:23.893097+02:00 2020-09-03 13:11:23.893097+02:00 \n",
|
||
"7 2020-09-03 13:11:23.904974+02:00 2020-09-03 13:11:23.904974+02:00 \n",
|
||
"8 2020-09-03 13:11:23.908970+02:00 2020-09-03 13:11:23.908970+02:00 \n",
|
||
"\n",
|
||
" commission identifier \n",
|
||
"0 NaN 5958b2a060ac3e31678b438892a1bd2e \n",
|
||
"1 NaN 52ff3466787b4d538407372e5f7afe0f \n",
|
||
"2 NaN 1225483c97b36018cab2bea14ab78ea6 \n",
|
||
"3 NaN 001b9b4a524fe407150b8235b304d4ec \n",
|
||
"4 NaN 6a0cf6edf20060344b465706b61719aa \n",
|
||
"5 NaN 931239d4acb6214d7e5c98edecfb4916 \n",
|
||
"6 NaN bde8f2ccff510df8572d3214d86b837d \n",
|
||
"7 NaN b48ec279411f7dbbb68393c61a9724d9 \n",
|
||
"8 NaN 11c6d471fa4e354e62e684d293694202 "
|
||
]
|
||
},
|
||
"execution_count": 10,
|
||
"metadata": {},
|
||
"output_type": "execute_result"
|
||
}
|
||
],
|
||
"source": [
|
||
"df1_suppliers"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 11,
|
||
"id": "b583be02-ab60-4e14-9325-0204f203a1af",
|
||
"metadata": {},
|
||
"outputs": [
|
||
{
|
||
"name": "stdout",
|
||
"output_type": "stream",
|
||
"text": [
|
||
"<class 'pandas.core.frame.DataFrame'>\n",
|
||
"RangeIndex: 9 entries, 0 to 8\n",
|
||
"Data columns (total 9 columns):\n",
|
||
" # Column Non-Null Count Dtype \n",
|
||
"--- ------ -------------- ----- \n",
|
||
" 0 id 9 non-null int64 \n",
|
||
" 1 name 9 non-null object \n",
|
||
" 2 manually_added 9 non-null bool \n",
|
||
" 3 label 0 non-null float64\n",
|
||
" 4 itr 0 non-null float64\n",
|
||
" 5 updated_at 9 non-null object \n",
|
||
" 6 created_at 9 non-null object \n",
|
||
" 7 commission 0 non-null float64\n",
|
||
" 8 identifier 9 non-null object \n",
|
||
"dtypes: bool(1), float64(3), int64(1), object(4)\n",
|
||
"memory usage: 713.0+ bytes\n"
|
||
]
|
||
}
|
||
],
|
||
"source": [
|
||
"df1_suppliers.info()"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 12,
|
||
"id": "6d7f338e-e4d3-422b-9cdc-dec967c0b28e",
|
||
"metadata": {},
|
||
"outputs": [
|
||
{
|
||
"data": {
|
||
"text/plain": [
|
||
"id 0.0\n",
|
||
"name 0.0\n",
|
||
"manually_added 0.0\n",
|
||
"label 100.0\n",
|
||
"itr 100.0\n",
|
||
"updated_at 0.0\n",
|
||
"created_at 0.0\n",
|
||
"commission 100.0\n",
|
||
"identifier 0.0\n",
|
||
"dtype: float64"
|
||
]
|
||
},
|
||
"execution_count": 12,
|
||
"metadata": {},
|
||
"output_type": "execute_result"
|
||
}
|
||
],
|
||
"source": [
|
||
"df1_suppliers.isna().sum()/len(df1_suppliers)*100"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 13,
|
||
"id": "3c645ab7-16bf-4054-9ae2-15a8c32e29c6",
|
||
"metadata": {},
|
||
"outputs": [
|
||
{
|
||
"name": "stderr",
|
||
"output_type": "stream",
|
||
"text": [
|
||
"/tmp/ipykernel_864/302783287.py:3: SettingWithCopyWarning: \n",
|
||
"A value is trying to be set on a copy of a slice from a DataFrame\n",
|
||
"\n",
|
||
"See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n",
|
||
" df1_suppliers_clean.rename(columns = {'name' : 'supplier_name'}, inplace = True)\n"
|
||
]
|
||
}
|
||
],
|
||
"source": [
|
||
"# Selection des variables\n",
|
||
"df1_suppliers_clean = df1_suppliers[['id', 'name']]\n",
|
||
"df1_suppliers_clean.rename(columns = {'name' : 'supplier_name'}, inplace = True)"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 14,
|
||
"id": "4de7e2e2-6da4-4618-8444-b524399c5493",
|
||
"metadata": {},
|
||
"outputs": [
|
||
{
|
||
"data": {
|
||
"text/html": [
|
||
"<div>\n",
|
||
"<style scoped>\n",
|
||
" .dataframe tbody tr th:only-of-type {\n",
|
||
" vertical-align: middle;\n",
|
||
" }\n",
|
||
"\n",
|
||
" .dataframe tbody tr th {\n",
|
||
" vertical-align: top;\n",
|
||
" }\n",
|
||
"\n",
|
||
" .dataframe thead th {\n",
|
||
" text-align: right;\n",
|
||
" }\n",
|
||
"</style>\n",
|
||
"<table border=\"1\" class=\"dataframe\">\n",
|
||
" <thead>\n",
|
||
" <tr style=\"text-align: right;\">\n",
|
||
" <th></th>\n",
|
||
" <th>id</th>\n",
|
||
" <th>supplier_name</th>\n",
|
||
" </tr>\n",
|
||
" </thead>\n",
|
||
" <tbody>\n",
|
||
" <tr>\n",
|
||
" <th>0</th>\n",
|
||
" <td>1617</td>\n",
|
||
" <td>j4 administration</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>1</th>\n",
|
||
" <td>8</td>\n",
|
||
" <td>non défini</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>2</th>\n",
|
||
" <td>4</td>\n",
|
||
" <td>vad</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>3</th>\n",
|
||
" <td>1</td>\n",
|
||
" <td>fort saint jean</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>4</th>\n",
|
||
" <td>2</td>\n",
|
||
" <td>j4</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>5</th>\n",
|
||
" <td>5</td>\n",
|
||
" <td>revendeur</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>6</th>\n",
|
||
" <td>3</td>\n",
|
||
" <td>vente en ligne</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>7</th>\n",
|
||
" <td>6</td>\n",
|
||
" <td>ccr</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>8</th>\n",
|
||
" <td>7</td>\n",
|
||
" <td>dab</td>\n",
|
||
" </tr>\n",
|
||
" </tbody>\n",
|
||
"</table>\n",
|
||
"</div>"
|
||
],
|
||
"text/plain": [
|
||
" id supplier_name\n",
|
||
"0 1617 j4 administration\n",
|
||
"1 8 non défini\n",
|
||
"2 4 vad\n",
|
||
"3 1 fort saint jean\n",
|
||
"4 2 j4\n",
|
||
"5 5 revendeur\n",
|
||
"6 3 vente en ligne\n",
|
||
"7 6 ccr\n",
|
||
"8 7 dab"
|
||
]
|
||
},
|
||
"execution_count": 14,
|
||
"metadata": {},
|
||
"output_type": "execute_result"
|
||
}
|
||
],
|
||
"source": [
|
||
"df1_suppliers_clean"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "markdown",
|
||
"id": "0a6df975-c7fc-45bc-92af-a0bdab17d795",
|
||
"metadata": {},
|
||
"source": [
|
||
"## type_ofs.csv"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 15,
|
||
"id": "a02f6594-3e91-4e87-bbb6-649c28d4f7e9",
|
||
"metadata": {},
|
||
"outputs": [
|
||
{
|
||
"data": {
|
||
"text/html": [
|
||
"<div>\n",
|
||
"<style scoped>\n",
|
||
" .dataframe tbody tr th:only-of-type {\n",
|
||
" vertical-align: middle;\n",
|
||
" }\n",
|
||
"\n",
|
||
" .dataframe tbody tr th {\n",
|
||
" vertical-align: top;\n",
|
||
" }\n",
|
||
"\n",
|
||
" .dataframe thead th {\n",
|
||
" text-align: right;\n",
|
||
" }\n",
|
||
"</style>\n",
|
||
"<table border=\"1\" class=\"dataframe\">\n",
|
||
" <thead>\n",
|
||
" <tr style=\"text-align: right;\">\n",
|
||
" <th></th>\n",
|
||
" <th>id</th>\n",
|
||
" <th>name</th>\n",
|
||
" <th>children</th>\n",
|
||
" <th>created_at</th>\n",
|
||
" <th>updated_at</th>\n",
|
||
" <th>identifier</th>\n",
|
||
" </tr>\n",
|
||
" </thead>\n",
|
||
" <tbody>\n",
|
||
" <tr>\n",
|
||
" <th>0</th>\n",
|
||
" <td>1</td>\n",
|
||
" <td>Atelier</td>\n",
|
||
" <td>pricing_formula</td>\n",
|
||
" <td>2021-01-05 11:55:51.188106+01:00</td>\n",
|
||
" <td>2021-01-05 11:55:51.188106+01:00</td>\n",
|
||
" <td>623ec4067827558b28972cf39fe81ee7</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>1</th>\n",
|
||
" <td>2</td>\n",
|
||
" <td>Billet en nombre</td>\n",
|
||
" <td>pricing_formula</td>\n",
|
||
" <td>2021-01-11 12:13:19.286301+01:00</td>\n",
|
||
" <td>2021-01-11 12:13:19.286301+01:00</td>\n",
|
||
" <td>a53d313a97296ee37caa066dbfe7a45c</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>2</th>\n",
|
||
" <td>3</td>\n",
|
||
" <td>Groupe</td>\n",
|
||
" <td>pricing_formula</td>\n",
|
||
" <td>2021-01-11 12:19:22.842917+01:00</td>\n",
|
||
" <td>2021-01-11 12:19:22.842917+01:00</td>\n",
|
||
" <td>1ab143efc3b85acbbc752fe8eb2b0b86</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>3</th>\n",
|
||
" <td>4</td>\n",
|
||
" <td>Revendeur</td>\n",
|
||
" <td>pricing_formula</td>\n",
|
||
" <td>2021-01-12 12:34:20.481236+01:00</td>\n",
|
||
" <td>2021-01-12 12:34:20.481236+01:00</td>\n",
|
||
" <td>8b332723366a07e1eef5f1c92f9ae067</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>4</th>\n",
|
||
" <td>5</td>\n",
|
||
" <td>Cinéma scolaire</td>\n",
|
||
" <td>pricing_formula</td>\n",
|
||
" <td>2021-01-25 19:16:05.141719+01:00</td>\n",
|
||
" <td>2021-01-25 19:16:05.141719+01:00</td>\n",
|
||
" <td>a12e62cb4c4f47e7406bd8fbff2bfe30</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>5</th>\n",
|
||
" <td>6</td>\n",
|
||
" <td>Musée famille</td>\n",
|
||
" <td>pricing_formula</td>\n",
|
||
" <td>2021-01-25 19:23:06.692627+01:00</td>\n",
|
||
" <td>2021-01-25 19:23:06.692627+01:00</td>\n",
|
||
" <td>1ec6c19283111ccb3ed67f52d414470e</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>6</th>\n",
|
||
" <td>7</td>\n",
|
||
" <td>Spectacle famille</td>\n",
|
||
" <td>pricing_formula</td>\n",
|
||
" <td>2021-01-25 19:28:21.390016+01:00</td>\n",
|
||
" <td>2021-01-25 19:28:21.390016+01:00</td>\n",
|
||
" <td>05e2104f1b74ced229c06847d6e91938</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>7</th>\n",
|
||
" <td>8</td>\n",
|
||
" <td>Masterclass</td>\n",
|
||
" <td>pricing_formula</td>\n",
|
||
" <td>2021-01-25 19:31:05.076904+01:00</td>\n",
|
||
" <td>2021-01-25 19:31:05.076904+01:00</td>\n",
|
||
" <td>9cc946edfb25e11b4282f58db16e6ae9</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>8</th>\n",
|
||
" <td>9</td>\n",
|
||
" <td>Spectacle</td>\n",
|
||
" <td>pricing_formula</td>\n",
|
||
" <td>2021-01-25 19:38:41.260535+01:00</td>\n",
|
||
" <td>2021-01-25 19:38:41.260535+01:00</td>\n",
|
||
" <td>d88321c347f0e0ab101184cdf25c94bf</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>9</th>\n",
|
||
" <td>10</td>\n",
|
||
" <td>Cinema</td>\n",
|
||
" <td>pricing_formula</td>\n",
|
||
" <td>2021-02-05 11:12:31.932576+01:00</td>\n",
|
||
" <td>2021-02-05 11:12:31.932576+01:00</td>\n",
|
||
" <td>0870fef2bfcd5b30a12e4f5c7f4aaba7</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>10</th>\n",
|
||
" <td>11</td>\n",
|
||
" <td>Musee</td>\n",
|
||
" <td>pricing_formula</td>\n",
|
||
" <td>2021-02-05 11:52:05.468207+01:00</td>\n",
|
||
" <td>2021-02-05 11:52:05.468207+01:00</td>\n",
|
||
" <td>8ba8934454cc62c7cdb3eb6e1b39df0c</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>11</th>\n",
|
||
" <td>12</td>\n",
|
||
" <td>Tarifs plein</td>\n",
|
||
" <td>category</td>\n",
|
||
" <td>2023-03-13 11:31:50.528331+01:00</td>\n",
|
||
" <td>2023-03-13 11:31:50.528331+01:00</td>\n",
|
||
" <td>a6969df76efc15d157be48e87a7bcf9a</td>\n",
|
||
" </tr>\n",
|
||
" </tbody>\n",
|
||
"</table>\n",
|
||
"</div>"
|
||
],
|
||
"text/plain": [
|
||
" id name children created_at \\\n",
|
||
"0 1 Atelier pricing_formula 2021-01-05 11:55:51.188106+01:00 \n",
|
||
"1 2 Billet en nombre pricing_formula 2021-01-11 12:13:19.286301+01:00 \n",
|
||
"2 3 Groupe pricing_formula 2021-01-11 12:19:22.842917+01:00 \n",
|
||
"3 4 Revendeur pricing_formula 2021-01-12 12:34:20.481236+01:00 \n",
|
||
"4 5 Cinéma scolaire pricing_formula 2021-01-25 19:16:05.141719+01:00 \n",
|
||
"5 6 Musée famille pricing_formula 2021-01-25 19:23:06.692627+01:00 \n",
|
||
"6 7 Spectacle famille pricing_formula 2021-01-25 19:28:21.390016+01:00 \n",
|
||
"7 8 Masterclass pricing_formula 2021-01-25 19:31:05.076904+01:00 \n",
|
||
"8 9 Spectacle pricing_formula 2021-01-25 19:38:41.260535+01:00 \n",
|
||
"9 10 Cinema pricing_formula 2021-02-05 11:12:31.932576+01:00 \n",
|
||
"10 11 Musee pricing_formula 2021-02-05 11:52:05.468207+01:00 \n",
|
||
"11 12 Tarifs plein category 2023-03-13 11:31:50.528331+01:00 \n",
|
||
"\n",
|
||
" updated_at identifier \n",
|
||
"0 2021-01-05 11:55:51.188106+01:00 623ec4067827558b28972cf39fe81ee7 \n",
|
||
"1 2021-01-11 12:13:19.286301+01:00 a53d313a97296ee37caa066dbfe7a45c \n",
|
||
"2 2021-01-11 12:19:22.842917+01:00 1ab143efc3b85acbbc752fe8eb2b0b86 \n",
|
||
"3 2021-01-12 12:34:20.481236+01:00 8b332723366a07e1eef5f1c92f9ae067 \n",
|
||
"4 2021-01-25 19:16:05.141719+01:00 a12e62cb4c4f47e7406bd8fbff2bfe30 \n",
|
||
"5 2021-01-25 19:23:06.692627+01:00 1ec6c19283111ccb3ed67f52d414470e \n",
|
||
"6 2021-01-25 19:28:21.390016+01:00 05e2104f1b74ced229c06847d6e91938 \n",
|
||
"7 2021-01-25 19:31:05.076904+01:00 9cc946edfb25e11b4282f58db16e6ae9 \n",
|
||
"8 2021-01-25 19:38:41.260535+01:00 d88321c347f0e0ab101184cdf25c94bf \n",
|
||
"9 2021-02-05 11:12:31.932576+01:00 0870fef2bfcd5b30a12e4f5c7f4aaba7 \n",
|
||
"10 2021-02-05 11:52:05.468207+01:00 8ba8934454cc62c7cdb3eb6e1b39df0c \n",
|
||
"11 2023-03-13 11:31:50.528331+01:00 a6969df76efc15d157be48e87a7bcf9a "
|
||
]
|
||
},
|
||
"execution_count": 15,
|
||
"metadata": {},
|
||
"output_type": "execute_result"
|
||
}
|
||
],
|
||
"source": [
|
||
"df1_type_ofs"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 16,
|
||
"id": "e9c8d32b-22f4-4581-8af7-31cc1c31fa0e",
|
||
"metadata": {},
|
||
"outputs": [
|
||
{
|
||
"name": "stdout",
|
||
"output_type": "stream",
|
||
"text": [
|
||
"<class 'pandas.core.frame.DataFrame'>\n",
|
||
"RangeIndex: 12 entries, 0 to 11\n",
|
||
"Data columns (total 6 columns):\n",
|
||
" # Column Non-Null Count Dtype \n",
|
||
"--- ------ -------------- ----- \n",
|
||
" 0 id 12 non-null int64 \n",
|
||
" 1 name 12 non-null object\n",
|
||
" 2 children 12 non-null object\n",
|
||
" 3 created_at 12 non-null object\n",
|
||
" 4 updated_at 12 non-null object\n",
|
||
" 5 identifier 12 non-null object\n",
|
||
"dtypes: int64(1), object(5)\n",
|
||
"memory usage: 704.0+ bytes\n"
|
||
]
|
||
}
|
||
],
|
||
"source": [
|
||
"df1_type_ofs.info()"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 17,
|
||
"id": "cbb5e614-1fe5-4da0-bca0-8a242e0885da",
|
||
"metadata": {},
|
||
"outputs": [
|
||
{
|
||
"name": "stderr",
|
||
"output_type": "stream",
|
||
"text": [
|
||
"/tmp/ipykernel_864/81842251.py:3: SettingWithCopyWarning: \n",
|
||
"A value is trying to be set on a copy of a slice from a DataFrame\n",
|
||
"\n",
|
||
"See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n",
|
||
" df1_type_ofs_clean.rename(columns = {'name' : 'type_of_ticket_name'}, inplace = True)\n"
|
||
]
|
||
}
|
||
],
|
||
"source": [
|
||
"# Keep only the useful columns and give 'name' an explicit label.\n",
"# rename() is chained without inplace: renaming in place on a column\n",
"# selection of df1_type_ofs triggers pandas' SettingWithCopyWarning.\n",
"df1_type_ofs_clean = (\n",
"    df1_type_ofs[['id', 'name', 'children']]\n",
"    .rename(columns = {'name' : 'type_of_ticket_name'})\n",
")"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "markdown",
|
||
"id": "676a9869-9a8b-4cd2-8b1c-0644b5229c72",
|
||
"metadata": {},
|
||
"source": [
|
||
"## purchases.csv"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 18,
|
||
"id": "f8d36b72-f8e7-45e5-b4fa-e0803493fd3c",
|
||
"metadata": {
|
||
"scrolled": true
|
||
},
|
||
"outputs": [
|
||
{
|
||
"data": {
|
||
"text/html": [
|
||
"<div>\n",
|
||
"<style scoped>\n",
|
||
" .dataframe tbody tr th:only-of-type {\n",
|
||
" vertical-align: middle;\n",
|
||
" }\n",
|
||
"\n",
|
||
" .dataframe tbody tr th {\n",
|
||
" vertical-align: top;\n",
|
||
" }\n",
|
||
"\n",
|
||
" .dataframe thead th {\n",
|
||
" text-align: right;\n",
|
||
" }\n",
|
||
"</style>\n",
|
||
"<table border=\"1\" class=\"dataframe\">\n",
|
||
" <thead>\n",
|
||
" <tr style=\"text-align: right;\">\n",
|
||
" <th></th>\n",
|
||
" <th>id</th>\n",
|
||
" <th>purchase_date</th>\n",
|
||
" <th>customer_id</th>\n",
|
||
" <th>created_at</th>\n",
|
||
" <th>updated_at</th>\n",
|
||
" <th>number</th>\n",
|
||
" <th>identifier</th>\n",
|
||
" </tr>\n",
|
||
" </thead>\n",
|
||
" <tbody>\n",
|
||
" <tr>\n",
|
||
" <th>0</th>\n",
|
||
" <td>5145662</td>\n",
|
||
" <td>2019-07-17 11:17:53+02:00</td>\n",
|
||
" <td>6632</td>\n",
|
||
" <td>2021-12-28 20:48:51.569237+01:00</td>\n",
|
||
" <td>2021-12-28 20:48:51.569237+01:00</td>\n",
|
||
" <td>fa80c83b29a268b45728c910a8afcf79</td>\n",
|
||
" <td>82877c41df26f832eb823a83acd1a172</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>1</th>\n",
|
||
" <td>4941642</td>\n",
|
||
" <td>2018-10-31 11:59:00+01:00</td>\n",
|
||
" <td>1</td>\n",
|
||
" <td>2021-12-28 20:31:48.196681+01:00</td>\n",
|
||
" <td>2022-03-03 17:52:21.958861+01:00</td>\n",
|
||
" <td>597b6c06adfe6acc539b29b657b80da0</td>\n",
|
||
" <td>e7102ebe65526c427245533ebabe66e5</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>2</th>\n",
|
||
" <td>5088860</td>\n",
|
||
" <td>2018-10-31 12:45:12+01:00</td>\n",
|
||
" <td>1</td>\n",
|
||
" <td>2021-12-28 20:46:34.703542+01:00</td>\n",
|
||
" <td>2021-12-28 20:46:34.703542+01:00</td>\n",
|
||
" <td>4a7f6baaf9be6a99e3fead7f7e981fa8</td>\n",
|
||
" <td>af75c4ae53d1b6957875538355b162e1</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>3</th>\n",
|
||
" <td>5088862</td>\n",
|
||
" <td>2018-10-31 13:07:12+01:00</td>\n",
|
||
" <td>1</td>\n",
|
||
" <td>2021-12-28 20:46:34.704773+01:00</td>\n",
|
||
" <td>2021-12-28 20:46:34.704773+01:00</td>\n",
|
||
" <td>1d83dfad44b73070d1c6d5875d0edd2d</td>\n",
|
||
" <td>4b2fe34659b177209b07270ae1043b40</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>4</th>\n",
|
||
" <td>5088863</td>\n",
|
||
" <td>2018-10-31 13:08:50+01:00</td>\n",
|
||
" <td>1</td>\n",
|
||
" <td>2021-12-28 20:46:34.705453+01:00</td>\n",
|
||
" <td>2021-12-28 20:46:34.705453+01:00</td>\n",
|
||
" <td>7bfe2bc9c1670c973d0960e3fd408cf8</td>\n",
|
||
" <td>b115f04a99b94df9e4a32185844f0998</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>...</th>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>742245</th>\n",
|
||
" <td>8007695</td>\n",
|
||
" <td>2023-11-08 17:51:19+01:00</td>\n",
|
||
" <td>1256133</td>\n",
|
||
" <td>2023-11-09 07:51:33.920187+01:00</td>\n",
|
||
" <td>2023-11-09 07:51:33.920187+01:00</td>\n",
|
||
" <td>99ad774dedbad43feb73514765d2f0ba</td>\n",
|
||
" <td>d68558180b4bf2e8a945724843655775</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>742246</th>\n",
|
||
" <td>8007696</td>\n",
|
||
" <td>2023-11-08 18:17:51+01:00</td>\n",
|
||
" <td>1256134</td>\n",
|
||
" <td>2023-11-09 07:51:33.921967+01:00</td>\n",
|
||
" <td>2023-11-09 07:51:33.921967+01:00</td>\n",
|
||
" <td>c1511614c511c5f95980172690179102</td>\n",
|
||
" <td>f5102d910a7731091f239ad7b0df35b4</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>742247</th>\n",
|
||
" <td>8007697</td>\n",
|
||
" <td>2023-11-08 18:23:54+01:00</td>\n",
|
||
" <td>1256135</td>\n",
|
||
" <td>2023-11-09 07:51:33.923034+01:00</td>\n",
|
||
" <td>2023-11-09 07:51:33.923034+01:00</td>\n",
|
||
" <td>33b64b39cc53428b4f17d65ff5b93104</td>\n",
|
||
" <td>e2b917626be60cc2c3207cc037fe69e4</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>742248</th>\n",
|
||
" <td>8007698</td>\n",
|
||
" <td>2023-11-08 19:32:18+01:00</td>\n",
|
||
" <td>1256136</td>\n",
|
||
" <td>2023-11-09 07:51:33.924135+01:00</td>\n",
|
||
" <td>2023-11-09 07:51:33.924135+01:00</td>\n",
|
||
" <td>9ae0b129e704b3d9c093ce9c7c4e5039</td>\n",
|
||
" <td>5bfa23236c31f8562c3a0233c1b53b31</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>742249</th>\n",
|
||
" <td>8007699</td>\n",
|
||
" <td>2023-11-08 20:30:28+01:00</td>\n",
|
||
" <td>1256137</td>\n",
|
||
" <td>2023-11-09 07:51:33.925382+01:00</td>\n",
|
||
" <td>2023-11-09 07:51:33.925382+01:00</td>\n",
|
||
" <td>d31ced089c2b1f90479257a4686f9306</td>\n",
|
||
" <td>d86b1e0de3ff01eaf04fbcd031ac5fef</td>\n",
|
||
" </tr>\n",
|
||
" </tbody>\n",
|
||
"</table>\n",
|
||
"<p>742250 rows × 7 columns</p>\n",
|
||
"</div>"
|
||
],
|
||
"text/plain": [
|
||
" id purchase_date customer_id \\\n",
|
||
"0 5145662 2019-07-17 11:17:53+02:00 6632 \n",
|
||
"1 4941642 2018-10-31 11:59:00+01:00 1 \n",
|
||
"2 5088860 2018-10-31 12:45:12+01:00 1 \n",
|
||
"3 5088862 2018-10-31 13:07:12+01:00 1 \n",
|
||
"4 5088863 2018-10-31 13:08:50+01:00 1 \n",
|
||
"... ... ... ... \n",
|
||
"742245 8007695 2023-11-08 17:51:19+01:00 1256133 \n",
|
||
"742246 8007696 2023-11-08 18:17:51+01:00 1256134 \n",
|
||
"742247 8007697 2023-11-08 18:23:54+01:00 1256135 \n",
|
||
"742248 8007698 2023-11-08 19:32:18+01:00 1256136 \n",
|
||
"742249 8007699 2023-11-08 20:30:28+01:00 1256137 \n",
|
||
"\n",
|
||
" created_at updated_at \\\n",
|
||
"0 2021-12-28 20:48:51.569237+01:00 2021-12-28 20:48:51.569237+01:00 \n",
|
||
"1 2021-12-28 20:31:48.196681+01:00 2022-03-03 17:52:21.958861+01:00 \n",
|
||
"2 2021-12-28 20:46:34.703542+01:00 2021-12-28 20:46:34.703542+01:00 \n",
|
||
"3 2021-12-28 20:46:34.704773+01:00 2021-12-28 20:46:34.704773+01:00 \n",
|
||
"4 2021-12-28 20:46:34.705453+01:00 2021-12-28 20:46:34.705453+01:00 \n",
|
||
"... ... ... \n",
|
||
"742245 2023-11-09 07:51:33.920187+01:00 2023-11-09 07:51:33.920187+01:00 \n",
|
||
"742246 2023-11-09 07:51:33.921967+01:00 2023-11-09 07:51:33.921967+01:00 \n",
|
||
"742247 2023-11-09 07:51:33.923034+01:00 2023-11-09 07:51:33.923034+01:00 \n",
|
||
"742248 2023-11-09 07:51:33.924135+01:00 2023-11-09 07:51:33.924135+01:00 \n",
|
||
"742249 2023-11-09 07:51:33.925382+01:00 2023-11-09 07:51:33.925382+01:00 \n",
|
||
"\n",
|
||
" number identifier \n",
|
||
"0 fa80c83b29a268b45728c910a8afcf79 82877c41df26f832eb823a83acd1a172 \n",
|
||
"1 597b6c06adfe6acc539b29b657b80da0 e7102ebe65526c427245533ebabe66e5 \n",
|
||
"2 4a7f6baaf9be6a99e3fead7f7e981fa8 af75c4ae53d1b6957875538355b162e1 \n",
|
||
"3 1d83dfad44b73070d1c6d5875d0edd2d 4b2fe34659b177209b07270ae1043b40 \n",
|
||
"4 7bfe2bc9c1670c973d0960e3fd408cf8 b115f04a99b94df9e4a32185844f0998 \n",
|
||
"... ... ... \n",
|
||
"742245 99ad774dedbad43feb73514765d2f0ba d68558180b4bf2e8a945724843655775 \n",
|
||
"742246 c1511614c511c5f95980172690179102 f5102d910a7731091f239ad7b0df35b4 \n",
|
||
"742247 33b64b39cc53428b4f17d65ff5b93104 e2b917626be60cc2c3207cc037fe69e4 \n",
|
||
"742248 9ae0b129e704b3d9c093ce9c7c4e5039 5bfa23236c31f8562c3a0233c1b53b31 \n",
|
||
"742249 d31ced089c2b1f90479257a4686f9306 d86b1e0de3ff01eaf04fbcd031ac5fef \n",
|
||
"\n",
|
||
"[742250 rows x 7 columns]"
|
||
]
|
||
},
|
||
"execution_count": 18,
|
||
"metadata": {},
|
||
"output_type": "execute_result"
|
||
}
|
||
],
|
||
"source": [
|
||
"df1_purchases"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 19,
|
||
"id": "3f266a9d-6eee-4b27-b6cc-d401bc2fa0b8",
|
||
"metadata": {},
|
||
"outputs": [
|
||
{
|
||
"name": "stdout",
|
||
"output_type": "stream",
|
||
"text": [
|
||
"<class 'pandas.core.frame.DataFrame'>\n",
|
||
"RangeIndex: 742250 entries, 0 to 742249\n",
|
||
"Data columns (total 7 columns):\n",
|
||
" # Column Non-Null Count Dtype \n",
|
||
"--- ------ -------------- ----- \n",
|
||
" 0 id 742250 non-null int64 \n",
|
||
" 1 purchase_date 742250 non-null object\n",
|
||
" 2 customer_id 742250 non-null int64 \n",
|
||
" 3 created_at 742250 non-null object\n",
|
||
" 4 updated_at 742250 non-null object\n",
|
||
" 5 number 742250 non-null object\n",
|
||
" 6 identifier 742250 non-null object\n",
|
||
"dtypes: int64(2), object(5)\n",
|
||
"memory usage: 39.6+ MB\n"
|
||
]
|
||
}
|
||
],
|
||
"source": [
|
||
"df1_purchases.info()"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 20,
|
||
"id": "8b24ccbc-ccf0-4722-8cd9-8ee8aa90d1fd",
|
||
"metadata": {},
|
||
"outputs": [],
|
||
"source": [
|
||
"# Clean purchase_date: parse the ISO 8601 strings (mixed UTC offsets) once and\n",
"# normalise them to UTC. A second to_datetime pass on the parsed column is a no-op.\n",
"df1_purchases['purchase_date'] = pd.to_datetime(df1_purchases['purchase_date'], utc = True, format = 'ISO8601')"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 21,
|
||
"id": "27d18584-228f-4698-85d6-4d23151ea5ed",
|
||
"metadata": {},
|
||
"outputs": [
|
||
{
|
||
"name": "stdout",
|
||
"output_type": "stream",
|
||
"text": [
|
||
"<class 'pandas.core.frame.DataFrame'>\n",
|
||
"RangeIndex: 742250 entries, 0 to 742249\n",
|
||
"Data columns (total 7 columns):\n",
|
||
" # Column Non-Null Count Dtype \n",
|
||
"--- ------ -------------- ----- \n",
|
||
" 0 id 742250 non-null int64 \n",
|
||
" 1 purchase_date 742250 non-null datetime64[ns, UTC]\n",
|
||
" 2 customer_id 742250 non-null int64 \n",
|
||
" 3 created_at 742250 non-null object \n",
|
||
" 4 updated_at 742250 non-null object \n",
|
||
" 5 number 742250 non-null object \n",
|
||
" 6 identifier 742250 non-null object \n",
|
||
"dtypes: datetime64[ns, UTC](1), int64(2), object(4)\n",
|
||
"memory usage: 39.6+ MB\n"
|
||
]
|
||
}
|
||
],
|
||
"source": [
|
||
"df1_purchases.info()"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 22,
|
||
"id": "ea22e3a2-2b25-481d-8ebc-194e11a06cd9",
|
||
"metadata": {},
|
||
"outputs": [],
|
||
"source": [
|
||
"# Keep only the purchase key, its date and the customer who made it\n",
"df1_purchases_clean = df1_purchases.loc[:, ['id', 'purchase_date', 'customer_id']]"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "markdown",
|
||
"id": "53227600-c1c5-48aa-9f5d-db5a23a8a22a",
|
||
"metadata": {},
|
||
"source": [
|
||
"## Fusion de l'ensemble des données billettiques"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 23,
|
||
"id": "e0b8b47a-b321-4a79-823c-36a131a78ac7",
|
||
"metadata": {},
|
||
"outputs": [],
|
||
"source": [
|
||
"# Enrich every ticket with its supplier, its ticket type and its purchase.\n",
"# Each lookup table is joined on its own 'id'; the two join keys are dropped\n",
"# right after each merge so the next table's 'id' never collides.\n",
"df1_ticket_information = (\n",
"    df1_tickets_clean\n",
"    # suppliers\n",
"    .merge(df1_suppliers_clean, left_on = 'supplier_id', right_on = 'id', how = 'inner')\n",
"    .drop(columns = ['supplier_id', 'id'])\n",
"    # ticket types\n",
"    .merge(df1_type_ofs_clean, left_on = 'type_of', right_on = 'id', how = 'inner')\n",
"    .drop(columns = ['type_of', 'id'])\n",
"    # purchases\n",
"    .merge(df1_purchases_clean, left_on = 'purchase_id', right_on = 'id', how = 'inner')\n",
"    .drop(columns = ['purchase_id', 'id'])\n",
")"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 24,
|
||
"id": "7572e6e7-f28d-43ba-b045-b9fa09e68e1d",
|
||
"metadata": {
|
||
"scrolled": true
|
||
},
|
||
"outputs": [
|
||
{
|
||
"data": {
|
||
"text/html": [
|
||
"<div>\n",
|
||
"<style scoped>\n",
|
||
" .dataframe tbody tr th:only-of-type {\n",
|
||
" vertical-align: middle;\n",
|
||
" }\n",
|
||
"\n",
|
||
" .dataframe tbody tr th {\n",
|
||
" vertical-align: top;\n",
|
||
" }\n",
|
||
"\n",
|
||
" .dataframe thead th {\n",
|
||
" text-align: right;\n",
|
||
" }\n",
|
||
"</style>\n",
|
||
"<table border=\"1\" class=\"dataframe\">\n",
|
||
" <thead>\n",
|
||
" <tr style=\"text-align: right;\">\n",
|
||
" <th></th>\n",
|
||
" <th>ticket_id</th>\n",
|
||
" <th>product_id</th>\n",
|
||
" <th>is_from_subscription</th>\n",
|
||
" <th>supplier_name</th>\n",
|
||
" <th>type_of_ticket_name</th>\n",
|
||
" <th>children</th>\n",
|
||
" <th>purchase_date</th>\n",
|
||
" <th>customer_id</th>\n",
|
||
" </tr>\n",
|
||
" </thead>\n",
|
||
" <tbody>\n",
|
||
" <tr>\n",
|
||
" <th>0</th>\n",
|
||
" <td>13070859</td>\n",
|
||
" <td>225251</td>\n",
|
||
" <td>False</td>\n",
|
||
" <td>vente en ligne</td>\n",
|
||
" <td>Atelier</td>\n",
|
||
" <td>pricing_formula</td>\n",
|
||
" <td>2018-12-28 14:47:50+00:00</td>\n",
|
||
" <td>48187</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>1</th>\n",
|
||
" <td>13070860</td>\n",
|
||
" <td>224914</td>\n",
|
||
" <td>False</td>\n",
|
||
" <td>vente en ligne</td>\n",
|
||
" <td>Atelier</td>\n",
|
||
" <td>pricing_formula</td>\n",
|
||
" <td>2018-12-28 14:47:50+00:00</td>\n",
|
||
" <td>48187</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>2</th>\n",
|
||
" <td>13070861</td>\n",
|
||
" <td>224914</td>\n",
|
||
" <td>False</td>\n",
|
||
" <td>vente en ligne</td>\n",
|
||
" <td>Atelier</td>\n",
|
||
" <td>pricing_formula</td>\n",
|
||
" <td>2018-12-28 14:47:50+00:00</td>\n",
|
||
" <td>48187</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>3</th>\n",
|
||
" <td>13070862</td>\n",
|
||
" <td>224914</td>\n",
|
||
" <td>False</td>\n",
|
||
" <td>vente en ligne</td>\n",
|
||
" <td>Atelier</td>\n",
|
||
" <td>pricing_formula</td>\n",
|
||
" <td>2018-12-28 14:47:50+00:00</td>\n",
|
||
" <td>48187</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>4</th>\n",
|
||
" <td>13070863</td>\n",
|
||
" <td>224914</td>\n",
|
||
" <td>False</td>\n",
|
||
" <td>vente en ligne</td>\n",
|
||
" <td>Atelier</td>\n",
|
||
" <td>pricing_formula</td>\n",
|
||
" <td>2018-12-28 14:47:50+00:00</td>\n",
|
||
" <td>48187</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>...</th>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>1826667</th>\n",
|
||
" <td>18643847</td>\n",
|
||
" <td>350454</td>\n",
|
||
" <td>False</td>\n",
|
||
" <td>vad</td>\n",
|
||
" <td>Billet en nombre</td>\n",
|
||
" <td>pricing_formula</td>\n",
|
||
" <td>2022-08-02 08:59:17+00:00</td>\n",
|
||
" <td>41</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>1826668</th>\n",
|
||
" <td>19853111</td>\n",
|
||
" <td>383564</td>\n",
|
||
" <td>False</td>\n",
|
||
" <td>vad</td>\n",
|
||
" <td>Billet en nombre</td>\n",
|
||
" <td>pricing_formula</td>\n",
|
||
" <td>2022-11-04 14:25:42+00:00</td>\n",
|
||
" <td>62763</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>1826669</th>\n",
|
||
" <td>19860514</td>\n",
|
||
" <td>383751</td>\n",
|
||
" <td>False</td>\n",
|
||
" <td>vad</td>\n",
|
||
" <td>Billet en nombre</td>\n",
|
||
" <td>pricing_formula</td>\n",
|
||
" <td>2022-11-18 10:47:26+00:00</td>\n",
|
||
" <td>1195566</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>1826670</th>\n",
|
||
" <td>19860515</td>\n",
|
||
" <td>383751</td>\n",
|
||
" <td>False</td>\n",
|
||
" <td>vad</td>\n",
|
||
" <td>Billet en nombre</td>\n",
|
||
" <td>pricing_formula</td>\n",
|
||
" <td>2022-11-18 10:47:26+00:00</td>\n",
|
||
" <td>1195566</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>1826671</th>\n",
|
||
" <td>19860516</td>\n",
|
||
" <td>383751</td>\n",
|
||
" <td>False</td>\n",
|
||
" <td>vad</td>\n",
|
||
" <td>Billet en nombre</td>\n",
|
||
" <td>pricing_formula</td>\n",
|
||
" <td>2022-11-18 10:47:26+00:00</td>\n",
|
||
" <td>1195566</td>\n",
|
||
" </tr>\n",
|
||
" </tbody>\n",
|
||
"</table>\n",
|
||
"<p>1826672 rows × 8 columns</p>\n",
|
||
"</div>"
|
||
],
|
||
"text/plain": [
|
||
" ticket_id product_id is_from_subscription supplier_name \\\n",
|
||
"0 13070859 225251 False vente en ligne \n",
|
||
"1 13070860 224914 False vente en ligne \n",
|
||
"2 13070861 224914 False vente en ligne \n",
|
||
"3 13070862 224914 False vente en ligne \n",
|
||
"4 13070863 224914 False vente en ligne \n",
|
||
"... ... ... ... ... \n",
|
||
"1826667 18643847 350454 False vad \n",
|
||
"1826668 19853111 383564 False vad \n",
|
||
"1826669 19860514 383751 False vad \n",
|
||
"1826670 19860515 383751 False vad \n",
|
||
"1826671 19860516 383751 False vad \n",
|
||
"\n",
|
||
" type_of_ticket_name children purchase_date \\\n",
|
||
"0 Atelier pricing_formula 2018-12-28 14:47:50+00:00 \n",
|
||
"1 Atelier pricing_formula 2018-12-28 14:47:50+00:00 \n",
|
||
"2 Atelier pricing_formula 2018-12-28 14:47:50+00:00 \n",
|
||
"3 Atelier pricing_formula 2018-12-28 14:47:50+00:00 \n",
|
||
"4 Atelier pricing_formula 2018-12-28 14:47:50+00:00 \n",
|
||
"... ... ... ... \n",
|
||
"1826667 Billet en nombre pricing_formula 2022-08-02 08:59:17+00:00 \n",
|
||
"1826668 Billet en nombre pricing_formula 2022-11-04 14:25:42+00:00 \n",
|
||
"1826669 Billet en nombre pricing_formula 2022-11-18 10:47:26+00:00 \n",
|
||
"1826670 Billet en nombre pricing_formula 2022-11-18 10:47:26+00:00 \n",
|
||
"1826671 Billet en nombre pricing_formula 2022-11-18 10:47:26+00:00 \n",
|
||
"\n",
|
||
" customer_id \n",
|
||
"0 48187 \n",
|
||
"1 48187 \n",
|
||
"2 48187 \n",
|
||
"3 48187 \n",
|
||
"4 48187 \n",
|
||
"... ... \n",
|
||
"1826667 41 \n",
|
||
"1826668 62763 \n",
|
||
"1826669 1195566 \n",
|
||
"1826670 1195566 \n",
|
||
"1826671 1195566 \n",
|
||
"\n",
|
||
"[1826672 rows x 8 columns]"
|
||
]
|
||
},
|
||
"execution_count": 24,
|
||
"metadata": {},
|
||
"output_type": "execute_result"
|
||
}
|
||
],
|
||
"source": [
|
||
"df1_ticket_information"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "markdown",
|
||
"id": "ad2d0059-76d3-44b9-b0eb-0b0ca4d4ba75",
|
||
"metadata": {},
|
||
"source": [
|
||
"# Utilisation de fonctions"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "markdown",
|
||
"id": "27ecf058-23eb-4018-abbd-68c4ebe7c786",
|
||
"metadata": {},
|
||
"source": [
|
||
"## Nettoyage, sélection et fusion"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 25,
|
||
"id": "b95464b1-26bc-4aac-84b4-45da83b92251",
|
||
"metadata": {},
|
||
"outputs": [],
|
||
"source": [
|
||
"# Cleaning, column selection and merge of the ticketing tables\n",
"def preprocessing_tickets_area(tickets = None, purchases = None, suppliers = None, type_ofs = None):\n",
"    '''Build one ticket-level table from the four raw ticketing tables.\n",
"\n",
"    Parameters\n",
"    ----------\n",
"    tickets : pandas.DataFrame\n",
"        Needs 'id', 'purchase_id', 'product_id', 'is_from_subscription',\n",
"        'type_of' and 'supplier_id'.\n",
"    purchases : pandas.DataFrame\n",
"        Needs 'id', 'purchase_date' and 'customer_id'.\n",
"    suppliers : pandas.DataFrame\n",
"        Needs 'id' and 'name'.\n",
"    type_ofs : pandas.DataFrame\n",
"        Needs 'id', 'name' and 'children'.\n",
"\n",
"    Returns\n",
"    -------\n",
"    pandas.DataFrame\n",
"        One row per ticket found in every table (inner joins), with columns\n",
"        'ticket_id', 'product_id', 'is_from_subscription', 'supplier_name',\n",
"        'type_of_ticket_name', 'children', 'purchase_date' (converted to UTC)\n",
"        and 'customer_id'.\n",
"\n",
"    The input frames are left untouched: every step works on a new frame.\n",
"    '''\n",
"    # Tickets: 'id' becomes 'ticket_id' so it cannot collide with the 'id'\n",
"    # column of the lookup tables merged below.\n",
"    # rename() is never called in place on a column selection (SettingWithCopyWarning).\n",
"    tickets = (\n",
"        tickets[['id', 'purchase_id', 'product_id', 'is_from_subscription', 'type_of', 'supplier_id']]\n",
"        .rename(columns = {'id' : 'ticket_id'})\n",
"    )\n",
"\n",
"    # Suppliers\n",
"    suppliers = suppliers[['id', 'name']].rename(columns = {'name' : 'supplier_name'})\n",
"\n",
"    # Ticket types\n",
"    type_ofs = type_ofs[['id', 'name', 'children']].rename(columns = {'name' : 'type_of_ticket_name'})\n",
"\n",
"    # Purchases: select first, then parse the date with assign() so the caller's\n",
"    # DataFrame is not modified. One ISO 8601 parse normalised to UTC is enough.\n",
"    purchases = (\n",
"        purchases[['id', 'purchase_date', 'customer_id']]\n",
"        .assign(purchase_date = lambda df : pd.to_datetime(df['purchase_date'], utc = True, format = 'ISO8601'))\n",
"    )\n",
"\n",
"    # Merges: each lookup table is joined on its 'id', then both keys are dropped\n",
"    ticket_information = (\n",
"        tickets\n",
"        .merge(suppliers, left_on = 'supplier_id', right_on = 'id', how = 'inner')\n",
"        .drop(columns = ['supplier_id', 'id'])\n",
"        .merge(type_ofs, left_on = 'type_of', right_on = 'id', how = 'inner')\n",
"        .drop(columns = ['type_of', 'id'])\n",
"        .merge(purchases, left_on = 'purchase_id', right_on = 'id', how = 'inner')\n",
"        .drop(columns = ['purchase_id', 'id'])\n",
"    )\n",
"\n",
"    return ticket_information"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 28,
|
||
"id": "3e1d2ba7-ff4f-48eb-93a8-2bb648c70396",
|
||
"metadata": {},
|
||
"outputs": [
|
||
{
|
||
"name": "stderr",
|
||
"output_type": "stream",
|
||
"text": [
|
||
"/tmp/ipykernel_864/2452826288.py:5: SettingWithCopyWarning: \n",
|
||
"A value is trying to be set on a copy of a slice from a DataFrame\n",
|
||
"\n",
|
||
"See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n",
|
||
" tickets_clean.rename(columns = {'id' : 'ticket_id'}, inplace = True)\n",
|
||
"/tmp/ipykernel_864/2452826288.py:9: SettingWithCopyWarning: \n",
|
||
"A value is trying to be set on a copy of a slice from a DataFrame\n",
|
||
"\n",
|
||
"See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n",
|
||
" suppliers_clean.rename(columns = {'name' : 'supplier_name'}, inplace = True)\n",
|
||
"/tmp/ipykernel_864/2452826288.py:13: SettingWithCopyWarning: \n",
|
||
"A value is trying to be set on a copy of a slice from a DataFrame\n",
|
||
"\n",
|
||
"See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n",
|
||
" type_ofs_clean.rename(columns = {'name' : 'type_of_ticket_name'}, inplace = True)\n"
|
||
]
|
||
}
|
||
],
|
||
"source": [
|
||
"df1_ticket_information_test = preprocessing_tickets_area(tickets = df1_tickets, purchases = df1_purchases, suppliers = df1_suppliers, type_ofs = df1_type_ofs)"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 29,
|
||
"id": "2877f3de-55d6-42d6-ad94-352d3e107862",
|
||
"metadata": {
|
||
"scrolled": true
|
||
},
|
||
"outputs": [
|
||
{
|
||
"data": {
|
||
"text/html": [
|
||
"<div>\n",
|
||
"<style scoped>\n",
|
||
" .dataframe tbody tr th:only-of-type {\n",
|
||
" vertical-align: middle;\n",
|
||
" }\n",
|
||
"\n",
|
||
" .dataframe tbody tr th {\n",
|
||
" vertical-align: top;\n",
|
||
" }\n",
|
||
"\n",
|
||
" .dataframe thead th {\n",
|
||
" text-align: right;\n",
|
||
" }\n",
|
||
"</style>\n",
|
||
"<table border=\"1\" class=\"dataframe\">\n",
|
||
" <thead>\n",
|
||
" <tr style=\"text-align: right;\">\n",
|
||
" <th></th>\n",
|
||
" <th>ticket_id</th>\n",
|
||
" <th>product_id</th>\n",
|
||
" <th>is_from_subscription</th>\n",
|
||
" <th>supplier_name</th>\n",
|
||
" <th>type_of_ticket_name</th>\n",
|
||
" <th>children</th>\n",
|
||
" <th>purchase_date</th>\n",
|
||
" <th>customer_id</th>\n",
|
||
" </tr>\n",
|
||
" </thead>\n",
|
||
" <tbody>\n",
|
||
" <tr>\n",
|
||
" <th>0</th>\n",
|
||
" <td>13070859</td>\n",
|
||
" <td>225251</td>\n",
|
||
" <td>False</td>\n",
|
||
" <td>vente en ligne</td>\n",
|
||
" <td>Atelier</td>\n",
|
||
" <td>pricing_formula</td>\n",
|
||
" <td>2018-12-28 14:47:50+00:00</td>\n",
|
||
" <td>48187</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>1</th>\n",
|
||
" <td>13070860</td>\n",
|
||
" <td>224914</td>\n",
|
||
" <td>False</td>\n",
|
||
" <td>vente en ligne</td>\n",
|
||
" <td>Atelier</td>\n",
|
||
" <td>pricing_formula</td>\n",
|
||
" <td>2018-12-28 14:47:50+00:00</td>\n",
|
||
" <td>48187</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>2</th>\n",
|
||
" <td>13070861</td>\n",
|
||
" <td>224914</td>\n",
|
||
" <td>False</td>\n",
|
||
" <td>vente en ligne</td>\n",
|
||
" <td>Atelier</td>\n",
|
||
" <td>pricing_formula</td>\n",
|
||
" <td>2018-12-28 14:47:50+00:00</td>\n",
|
||
" <td>48187</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>3</th>\n",
|
||
" <td>13070862</td>\n",
|
||
" <td>224914</td>\n",
|
||
" <td>False</td>\n",
|
||
" <td>vente en ligne</td>\n",
|
||
" <td>Atelier</td>\n",
|
||
" <td>pricing_formula</td>\n",
|
||
" <td>2018-12-28 14:47:50+00:00</td>\n",
|
||
" <td>48187</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>4</th>\n",
|
||
" <td>13070863</td>\n",
|
||
" <td>224914</td>\n",
|
||
" <td>False</td>\n",
|
||
" <td>vente en ligne</td>\n",
|
||
" <td>Atelier</td>\n",
|
||
" <td>pricing_formula</td>\n",
|
||
" <td>2018-12-28 14:47:50+00:00</td>\n",
|
||
" <td>48187</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>...</th>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>1826667</th>\n",
|
||
" <td>18643847</td>\n",
|
||
" <td>350454</td>\n",
|
||
" <td>False</td>\n",
|
||
" <td>vad</td>\n",
|
||
" <td>Billet en nombre</td>\n",
|
||
" <td>pricing_formula</td>\n",
|
||
" <td>2022-08-02 08:59:17+00:00</td>\n",
|
||
" <td>41</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>1826668</th>\n",
|
||
" <td>19853111</td>\n",
|
||
" <td>383564</td>\n",
|
||
" <td>False</td>\n",
|
||
" <td>vad</td>\n",
|
||
" <td>Billet en nombre</td>\n",
|
||
" <td>pricing_formula</td>\n",
|
||
" <td>2022-11-04 14:25:42+00:00</td>\n",
|
||
" <td>62763</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>1826669</th>\n",
|
||
" <td>19860514</td>\n",
|
||
" <td>383751</td>\n",
|
||
" <td>False</td>\n",
|
||
" <td>vad</td>\n",
|
||
" <td>Billet en nombre</td>\n",
|
||
" <td>pricing_formula</td>\n",
|
||
" <td>2022-11-18 10:47:26+00:00</td>\n",
|
||
" <td>1195566</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>1826670</th>\n",
|
||
" <td>19860515</td>\n",
|
||
" <td>383751</td>\n",
|
||
" <td>False</td>\n",
|
||
" <td>vad</td>\n",
|
||
" <td>Billet en nombre</td>\n",
|
||
" <td>pricing_formula</td>\n",
|
||
" <td>2022-11-18 10:47:26+00:00</td>\n",
|
||
" <td>1195566</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>1826671</th>\n",
|
||
" <td>19860516</td>\n",
|
||
" <td>383751</td>\n",
|
||
" <td>False</td>\n",
|
||
" <td>vad</td>\n",
|
||
" <td>Billet en nombre</td>\n",
|
||
" <td>pricing_formula</td>\n",
|
||
" <td>2022-11-18 10:47:26+00:00</td>\n",
|
||
" <td>1195566</td>\n",
|
||
" </tr>\n",
|
||
" </tbody>\n",
|
||
"</table>\n",
|
||
"<p>1826672 rows × 8 columns</p>\n",
|
||
"</div>"
|
||
],
|
||
"text/plain": [
|
||
" ticket_id product_id is_from_subscription supplier_name \\\n",
|
||
"0 13070859 225251 False vente en ligne \n",
|
||
"1 13070860 224914 False vente en ligne \n",
|
||
"2 13070861 224914 False vente en ligne \n",
|
||
"3 13070862 224914 False vente en ligne \n",
|
||
"4 13070863 224914 False vente en ligne \n",
|
||
"... ... ... ... ... \n",
|
||
"1826667 18643847 350454 False vad \n",
|
||
"1826668 19853111 383564 False vad \n",
|
||
"1826669 19860514 383751 False vad \n",
|
||
"1826670 19860515 383751 False vad \n",
|
||
"1826671 19860516 383751 False vad \n",
|
||
"\n",
|
||
" type_of_ticket_name children purchase_date \\\n",
|
||
"0 Atelier pricing_formula 2018-12-28 14:47:50+00:00 \n",
|
||
"1 Atelier pricing_formula 2018-12-28 14:47:50+00:00 \n",
|
||
"2 Atelier pricing_formula 2018-12-28 14:47:50+00:00 \n",
|
||
"3 Atelier pricing_formula 2018-12-28 14:47:50+00:00 \n",
|
||
"4 Atelier pricing_formula 2018-12-28 14:47:50+00:00 \n",
|
||
"... ... ... ... \n",
|
||
"1826667 Billet en nombre pricing_formula 2022-08-02 08:59:17+00:00 \n",
|
||
"1826668 Billet en nombre pricing_formula 2022-11-04 14:25:42+00:00 \n",
|
||
"1826669 Billet en nombre pricing_formula 2022-11-18 10:47:26+00:00 \n",
|
||
"1826670 Billet en nombre pricing_formula 2022-11-18 10:47:26+00:00 \n",
|
||
"1826671 Billet en nombre pricing_formula 2022-11-18 10:47:26+00:00 \n",
|
||
"\n",
|
||
" customer_id \n",
|
||
"0 48187 \n",
|
||
"1 48187 \n",
|
||
"2 48187 \n",
|
||
"3 48187 \n",
|
||
"4 48187 \n",
|
||
"... ... \n",
|
||
"1826667 41 \n",
|
||
"1826668 62763 \n",
|
||
"1826669 1195566 \n",
|
||
"1826670 1195566 \n",
|
||
"1826671 1195566 \n",
|
||
"\n",
|
||
"[1826672 rows x 8 columns]"
|
||
]
|
||
},
|
||
"execution_count": 29,
|
||
"metadata": {},
|
||
"output_type": "execute_result"
|
||
}
|
||
],
|
||
"source": [
|
||
"# Inspect the table returned by preprocessing_tickets_area (not the hand-built one)\n",
"df1_ticket_information_test"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "markdown",
|
||
"id": "96ea2523-38dc-47ef-a49e-2c2d9ad0b1c6",
|
||
"metadata": {},
|
||
"source": [
|
||
"## Exploration variables"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 41,
|
||
"id": "aaa41688-ea7e-4dba-851c-1f0b0ec43c71",
|
||
"metadata": {},
|
||
"outputs": [],
|
||
"source": [
|
||
"# Exploration helper for suppliers.csv: 'label', 'itr' and 'commission' are undocumented columns\n",
"def suppliers_exploration(suppliers = None) : \n",
"    '''Summarise a suppliers table in a single-row DataFrame.\n",
"\n",
"    Parameters\n",
"    ----------\n",
"    suppliers : pandas.DataFrame\n",
"        Needs the columns 'name', 'label', 'itr' and 'commission'.\n",
"\n",
"    Returns\n",
"    -------\n",
"    pandas.DataFrame\n",
"        One row with 'nb_suppliers' (number of distinct names) and\n",
"        'label_na', 'itr_na', 'commission_na' (percentage of missing values).\n",
"    '''\n",
"    # Percentage of missing values for each of the three columns\n",
"    na_rates = {\n",
"        column + '_na' : [suppliers[column].isna().sum()/len(suppliers)*100]\n",
"        for column in ('label', 'itr', 'commission')\n",
"    }\n",
"\n",
"    return pd.DataFrame({'nb_suppliers' : [suppliers['name'].nunique()], **na_rates})"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 42,
|
||
"id": "2fecc2e1-113f-46ed-9065-0b9ee416166e",
|
||
"metadata": {},
|
||
"outputs": [],
|
||
"source": [
|
||
"df1_suppliers_desc = suppliers_exploration(suppliers = df1_suppliers)"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 43,
|
||
"id": "55f6170a-36fb-4efb-9810-f982883660cf",
|
||
"metadata": {},
|
||
"outputs": [
|
||
{
|
||
"data": {
|
||
"text/html": [
|
||
"<div>\n",
|
||
"<style scoped>\n",
|
||
" .dataframe tbody tr th:only-of-type {\n",
|
||
" vertical-align: middle;\n",
|
||
" }\n",
|
||
"\n",
|
||
" .dataframe tbody tr th {\n",
|
||
" vertical-align: top;\n",
|
||
" }\n",
|
||
"\n",
|
||
" .dataframe thead th {\n",
|
||
" text-align: right;\n",
|
||
" }\n",
|
||
"</style>\n",
|
||
"<table border=\"1\" class=\"dataframe\">\n",
|
||
" <thead>\n",
|
||
" <tr style=\"text-align: right;\">\n",
|
||
" <th></th>\n",
|
||
" <th>nb_suppliers</th>\n",
|
||
" <th>label_na</th>\n",
|
||
" <th>itr_na</th>\n",
|
||
" <th>commission_na</th>\n",
|
||
" </tr>\n",
|
||
" </thead>\n",
|
||
" <tbody>\n",
|
||
" <tr>\n",
|
||
" <th>0</th>\n",
|
||
" <td>9</td>\n",
|
||
" <td>100.0</td>\n",
|
||
" <td>100.0</td>\n",
|
||
" <td>100.0</td>\n",
|
||
" </tr>\n",
|
||
" </tbody>\n",
|
||
"</table>\n",
|
||
"</div>"
|
||
],
|
||
"text/plain": [
|
||
" nb_suppliers label_na itr_na commission_na\n",
|
||
"0 9 100.0 100.0 100.0"
|
||
]
|
||
},
|
||
"execution_count": 43,
|
||
"metadata": {},
|
||
"output_type": "execute_result"
|
||
}
|
||
],
|
||
"source": [
|
||
"df1_suppliers_desc"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 47,
|
||
"id": "0030fd02-09e3-42f5-9c83-290458a38c29",
|
||
"metadata": {},
|
||
"outputs": [],
|
||
"source": [
|
||
"BUCKET = \"bdc2324-data\"\n",
|
||
"liste_folders = fs.ls(BUCKET)\n",
|
||
"\n",
|
||
"liste_files = []\n",
|
||
"for company_folder in liste_folders : \n",
|
||
" liste_files.extend(fs.ls(company_folder))"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 51,
|
||
"id": "6b1736d1-8fd7-4fcc-9431-b8bf0c7b4f2b",
|
||
"metadata": {},
|
||
"outputs": [
|
||
{
|
||
"name": "stdout",
|
||
"output_type": "stream",
|
||
"text": [
|
||
"['bdc2324-data/1/1suppliers.csv', 'bdc2324-data/10/10suppliers.csv', 'bdc2324-data/101/101suppliers.csv', 'bdc2324-data/11/11suppliers.csv', 'bdc2324-data/12/12suppliers.csv', 'bdc2324-data/13/13suppliers.csv', 'bdc2324-data/14/14suppliers.csv', 'bdc2324-data/2/2suppliers.csv', 'bdc2324-data/3/3suppliers.csv', 'bdc2324-data/4/4suppliers.csv', 'bdc2324-data/5/5suppliers.csv', 'bdc2324-data/6/6suppliers.csv', 'bdc2324-data/7/7suppliers.csv', 'bdc2324-data/8/8suppliers.csv', 'bdc2324-data/9/9suppliers.csv']\n"
|
||
]
|
||
}
|
||
],
|
||
"source": [
|
||
"liste_database_select = ['suppliers']\n",
|
||
"\n",
|
||
"# Filtrer la liste pour les éléments contenant au moins un élément de la liste à tester\n",
|
||
"liste_suppliers = [element for element in liste_files if any(element_part in element for element_part in liste_database_select)]\n",
|
||
"\n",
|
||
"# Afficher le résultat\n",
|
||
"print(liste_suppliers)"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": null,
|
||
"id": "226b694b-0b00-4167-b69f-3178902254eb",
|
||
"metadata": {},
|
||
"outputs": [],
|
||
"source": [
|
||
"# loop to create dataframes from file 2\n",
|
||
"def database_loading(database_name = None):\n",
|
||
" files_path = database_name\n",
|
||
" \n",
|
||
" client_number = files_path[0].split(\"/\")[1]\n",
|
||
" df_prefix = \"df\" + str(client_number) + \"_\"\n",
|
||
" \n",
|
||
" for i in range(len(files_path)) :\n",
|
||
" current_path = files_path[i]\n",
|
||
" with fs.open(current_path, mode=\"rb\") as file_in:\n",
|
||
" df = pd.read_csv(file_in)\n",
|
||
" # the pattern of the name is df1xxx\n",
|
||
" nom_dataframe = df_prefix + re.search(r'\\/(\\d+)\\/(\\d+)([a-zA-Z_]+)\\.csv$', current_path).group(3)\n",
|
||
" globals()[nom_dataframe] = df\n",
|
||
"\n",
|
||
" "
|
||
]
|
||
}
|
||
],
|
||
"metadata": {
|
||
"kernelspec": {
|
||
"display_name": "Python 3 (ipykernel)",
|
||
"language": "python",
|
||
"name": "python3"
|
||
},
|
||
"language_info": {
|
||
"codemirror_mode": {
|
||
"name": "ipython",
|
||
"version": 3
|
||
},
|
||
"file_extension": ".py",
|
||
"mimetype": "text/x-python",
|
||
"name": "python",
|
||
"nbconvert_exporter": "python",
|
||
"pygments_lexer": "ipython3",
|
||
"version": "3.10.13"
|
||
}
|
||
},
|
||
"nbformat": 4,
|
||
"nbformat_minor": 5
|
||
}
|