BDC-team-1/Exploration_billet_AJ.ipynb

2256 lines
79 KiB
Plaintext
Raw Normal View History

{
"cells": [
{
"cell_type": "markdown",
2024-02-10 22:46:56 +01:00
"id": "5bf5c226",
"metadata": {},
"source": [
"# Business Data Challenge - Team 1"
]
},
{
"cell_type": "code",
2024-01-13 14:14:11 +01:00
"execution_count": 1,
2024-02-10 22:46:56 +01:00
"id": "b1a5b9d3",
"metadata": {},
"outputs": [],
"source": [
"import pandas as pd\n",
2024-01-13 10:38:10 +01:00
"import numpy as np\n",
"import os\n",
"import s3fs\n",
"import re"
]
},
{
"cell_type": "markdown",
2024-02-10 22:46:56 +01:00
"id": "ecfa2219",
"metadata": {},
"source": [
"Configuration de l'accès aux données"
]
},
{
"cell_type": "code",
2024-01-13 14:14:11 +01:00
"execution_count": 2,
2024-02-10 22:46:56 +01:00
"id": "1a094277",
"metadata": {},
"outputs": [],
"source": [
"# Create filesystem object\n",
"S3_ENDPOINT_URL = \"https://\" + os.environ[\"AWS_S3_ENDPOINT\"]\n",
2024-01-13 10:38:10 +01:00
"fs = s3fs.S3FileSystem(client_kwargs={'endpoint_url': S3_ENDPOINT_URL})"
]
},
{
"cell_type": "markdown",
2024-02-25 18:33:24 +01:00
"id": "0294ce71-840e-458b-8ffa-cadabbc6da21",
"metadata": {
"jp-MarkdownHeadingCollapsed": true
},
"source": [
"# Début du travail du 25/02"
]
},
{
"cell_type": "code",
"execution_count": 11,
"id": "30d77451-2df6-4c07-8b15-66e0e990ff03",
"metadata": {},
"outputs": [],
"source": [
"import warnings\n",
"\n",
"# Create filesystem object\n",
"S3_ENDPOINT_URL = \"https://\" + os.environ[\"AWS_S3_ENDPOINT\"]\n",
"fs = s3fs.S3FileSystem(client_kwargs={'endpoint_url': S3_ENDPOINT_URL})\n",
"\n",
"# Load the cleaning and merge helpers into the notebook namespace.\n",
"# exec() is kept on purpose (instead of an import): the helpers presumably rely on\n",
"# names defined in this notebook, such as `fs` - TODO confirm against 0_KPI_functions.py.\n",
"# The context manager closes the script file once it has been read.\n",
"with open('0_KPI_functions.py') as kpi_file:\n",
"    exec(kpi_file.read())\n",
"\n",
"# Silence every warning for the rest of the session\n",
"warnings.filterwarnings('ignore')\n"
]
},
{
"cell_type": "code",
"execution_count": 12,
"id": "f1b44d3e-76bb-4860-b9db-a2840db7cf39",
"metadata": {},
"outputs": [],
"source": [
"def load_dataset_2(directory_path, file_name):\n",
"    \"\"\"\n",
"    Read one CSV table from the 'bdc2324-data' bucket and return it as a DataFrame.\n",
"\n",
"    The object read is 'bdc2324-data/<directory_path>/<directory_path><file_name>.csv'.\n",
"\n",
"    Parameters\n",
"    ----------\n",
"    directory_path : str\n",
"        Folder inside the bucket; it is also used as the prefix of the file name.\n",
"    file_name : str\n",
"        Table name, without the folder prefix and without the '.csv' extension.\n",
"\n",
"    Returns\n",
"    -------\n",
"    pandas.DataFrame\n",
"        Content of the file, without its 'identifier' column when it has one.\n",
"    \"\"\"\n",
"    csv_path = \"/\".join([\"bdc2324-data\", directory_path, directory_path + file_name + \".csv\"])\n",
"    with fs.open(csv_path, mode=\"rb\") as csv_stream:\n",
"        table = pd.read_csv(csv_stream, sep=\",\")\n",
"\n",
"    # Disabled step, kept for reference: table.dropna(axis=1, thresh=len(table))\n",
"\n",
"    # Nothing to remove when the table has no 'identifier' column\n",
"    if 'identifier' not in table.columns:\n",
"        return table\n",
"    return table.drop(columns='identifier')"
]
},
{
"cell_type": "code",
"execution_count": 13,
"id": "35da2e15-1e23-4653-a214-c6ff8f186e85",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"File path : projet-bdc2324-team1/0_Input/Company_5/customerplus_cleaned.csv\n"
]
},
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>customer_id</th>\n",
" <th>street_id</th>\n",
" <th>structure_id</th>\n",
" <th>mcp_contact_id</th>\n",
" <th>fidelity</th>\n",
" <th>tenant_id</th>\n",
" <th>is_partner</th>\n",
" <th>deleted_at</th>\n",
" <th>gender</th>\n",
" <th>is_email_true</th>\n",
" <th>...</th>\n",
" <th>max_price</th>\n",
" <th>ticket_sum</th>\n",
" <th>average_price</th>\n",
" <th>average_purchase_delay</th>\n",
" <th>average_price_basket</th>\n",
" <th>average_ticket_basket</th>\n",
" <th>total_price</th>\n",
" <th>purchase_count</th>\n",
" <th>first_buying_date</th>\n",
" <th>country</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>6009745</td>\n",
" <td>1372685</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>0</td>\n",
" <td>1771</td>\n",
" <td>False</td>\n",
" <td>NaN</td>\n",
" <td>2</td>\n",
" <td>True</td>\n",
" <td>...</td>\n",
" <td>NaN</td>\n",
" <td>0</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>0.0</td>\n",
" <td>0</td>\n",
" <td>NaN</td>\n",
" <td>af</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>6011228</td>\n",
" <td>1372685</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>0</td>\n",
" <td>1771</td>\n",
" <td>False</td>\n",
" <td>NaN</td>\n",
" <td>2</td>\n",
" <td>True</td>\n",
" <td>...</td>\n",
" <td>NaN</td>\n",
" <td>0</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>0.0</td>\n",
" <td>0</td>\n",
" <td>NaN</td>\n",
" <td>af</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>6058950</td>\n",
" <td>1372685</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>0</td>\n",
" <td>1771</td>\n",
" <td>False</td>\n",
" <td>NaN</td>\n",
" <td>2</td>\n",
" <td>True</td>\n",
" <td>...</td>\n",
" <td>NaN</td>\n",
" <td>0</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>0.0</td>\n",
" <td>0</td>\n",
" <td>NaN</td>\n",
" <td>af</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>6062404</td>\n",
" <td>1372685</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>0</td>\n",
" <td>1771</td>\n",
" <td>False</td>\n",
" <td>NaN</td>\n",
" <td>2</td>\n",
" <td>True</td>\n",
" <td>...</td>\n",
" <td>NaN</td>\n",
" <td>0</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>0.0</td>\n",
" <td>0</td>\n",
" <td>NaN</td>\n",
" <td>af</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>250217</td>\n",
" <td>78785</td>\n",
" <td>NaN</td>\n",
" <td>11035.0</td>\n",
" <td>0</td>\n",
" <td>1771</td>\n",
" <td>False</td>\n",
" <td>NaN</td>\n",
" <td>0</td>\n",
" <td>True</td>\n",
" <td>...</td>\n",
" <td>NaN</td>\n",
" <td>0</td>\n",
" <td>0.0</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>0</td>\n",
" <td>NaN</td>\n",
" <td>fr</td>\n",
" </tr>\n",
" <tr>\n",
" <th>...</th>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>471593</th>\n",
" <td>4976621</td>\n",
" <td>3</td>\n",
" <td>NaN</td>\n",
" <td>4732462.0</td>\n",
" <td>0</td>\n",
" <td>1771</td>\n",
" <td>False</td>\n",
" <td>NaN</td>\n",
" <td>0</td>\n",
" <td>True</td>\n",
" <td>...</td>\n",
" <td>NaN</td>\n",
" <td>0</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>0.0</td>\n",
" <td>0</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>471594</th>\n",
" <td>4976636</td>\n",
" <td>3</td>\n",
" <td>NaN</td>\n",
" <td>4731717.0</td>\n",
" <td>0</td>\n",
" <td>1771</td>\n",
" <td>False</td>\n",
" <td>NaN</td>\n",
" <td>2</td>\n",
" <td>True</td>\n",
" <td>...</td>\n",
" <td>NaN</td>\n",
" <td>0</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>0.0</td>\n",
" <td>0</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>471595</th>\n",
" <td>4976637</td>\n",
" <td>3</td>\n",
" <td>NaN</td>\n",
" <td>4731674.0</td>\n",
" <td>0</td>\n",
" <td>1771</td>\n",
" <td>False</td>\n",
" <td>NaN</td>\n",
" <td>0</td>\n",
" <td>True</td>\n",
" <td>...</td>\n",
" <td>NaN</td>\n",
" <td>0</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>0.0</td>\n",
" <td>0</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>471596</th>\n",
" <td>4976645</td>\n",
" <td>3</td>\n",
" <td>NaN</td>\n",
" <td>4731549.0</td>\n",
" <td>0</td>\n",
" <td>1771</td>\n",
" <td>False</td>\n",
" <td>NaN</td>\n",
" <td>2</td>\n",
" <td>True</td>\n",
" <td>...</td>\n",
" <td>NaN</td>\n",
" <td>0</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>0.0</td>\n",
" <td>0</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>471597</th>\n",
" <td>4976666</td>\n",
" <td>3</td>\n",
" <td>NaN</td>\n",
" <td>4731118.0</td>\n",
" <td>0</td>\n",
" <td>1771</td>\n",
" <td>False</td>\n",
" <td>NaN</td>\n",
" <td>0</td>\n",
" <td>True</td>\n",
" <td>...</td>\n",
" <td>NaN</td>\n",
" <td>0</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>0.0</td>\n",
" <td>0</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"<p>471598 rows × 22 columns</p>\n",
"</div>"
],
"text/plain": [
" customer_id street_id structure_id mcp_contact_id fidelity \\\n",
"0 6009745 1372685 NaN NaN 0 \n",
"1 6011228 1372685 NaN NaN 0 \n",
"2 6058950 1372685 NaN NaN 0 \n",
"3 6062404 1372685 NaN NaN 0 \n",
"4 250217 78785 NaN 11035.0 0 \n",
"... ... ... ... ... ... \n",
"471593 4976621 3 NaN 4732462.0 0 \n",
"471594 4976636 3 NaN 4731717.0 0 \n",
"471595 4976637 3 NaN 4731674.0 0 \n",
"471596 4976645 3 NaN 4731549.0 0 \n",
"471597 4976666 3 NaN 4731118.0 0 \n",
"\n",
" tenant_id is_partner deleted_at gender is_email_true ... \\\n",
"0 1771 False NaN 2 True ... \n",
"1 1771 False NaN 2 True ... \n",
"2 1771 False NaN 2 True ... \n",
"3 1771 False NaN 2 True ... \n",
"4 1771 False NaN 0 True ... \n",
"... ... ... ... ... ... ... \n",
"471593 1771 False NaN 0 True ... \n",
"471594 1771 False NaN 2 True ... \n",
"471595 1771 False NaN 0 True ... \n",
"471596 1771 False NaN 2 True ... \n",
"471597 1771 False NaN 0 True ... \n",
"\n",
" max_price ticket_sum average_price average_purchase_delay \\\n",
"0 NaN 0 NaN NaN \n",
"1 NaN 0 NaN NaN \n",
"2 NaN 0 NaN NaN \n",
"3 NaN 0 NaN NaN \n",
"4 NaN 0 0.0 NaN \n",
"... ... ... ... ... \n",
"471593 NaN 0 NaN NaN \n",
"471594 NaN 0 NaN NaN \n",
"471595 NaN 0 NaN NaN \n",
"471596 NaN 0 NaN NaN \n",
"471597 NaN 0 NaN NaN \n",
"\n",
" average_price_basket average_ticket_basket total_price \\\n",
"0 NaN NaN 0.0 \n",
"1 NaN NaN 0.0 \n",
"2 NaN NaN 0.0 \n",
"3 NaN NaN 0.0 \n",
"4 NaN NaN NaN \n",
"... ... ... ... \n",
"471593 NaN NaN 0.0 \n",
"471594 NaN NaN 0.0 \n",
"471595 NaN NaN 0.0 \n",
"471596 NaN NaN 0.0 \n",
"471597 NaN NaN 0.0 \n",
"\n",
" purchase_count first_buying_date country \n",
"0 0 NaN af \n",
"1 0 NaN af \n",
"2 0 NaN af \n",
"3 0 NaN af \n",
"4 0 NaN fr \n",
"... ... ... ... \n",
"471593 0 NaN NaN \n",
"471594 0 NaN NaN \n",
"471595 0 NaN NaN \n",
"471596 0 NaN NaN \n",
"471597 0 NaN NaN \n",
"\n",
"[471598 rows x 22 columns]"
]
},
"execution_count": 13,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"display_databases(\"5\", \"customerplus_cleaned\")"
]
},
{
"cell_type": "code",
"execution_count": 28,
"id": "6c8ad8c3-25df-4fe4-9ad0-ee5f9498bc14",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>id</th>\n",
" <th>name</th>\n",
" <th>code</th>\n",
" <th>created_at</th>\n",
" <th>updated_at</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>101</td>\n",
" <td>hongrie</td>\n",
" <td>hu</td>\n",
" <td>2023-06-13 11:17:40.600622+02:00</td>\n",
" <td>2023-06-13 11:17:40.600622+02:00</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>2</td>\n",
" <td>albanie</td>\n",
" <td>al</td>\n",
" <td>2023-06-13 11:17:40.540652+02:00</td>\n",
" <td>2023-06-13 11:17:40.540652+02:00</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>3</td>\n",
" <td>antarctique</td>\n",
" <td>aq</td>\n",
" <td>2023-06-13 11:17:40.541315+02:00</td>\n",
" <td>2023-06-13 11:17:40.541315+02:00</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>12</td>\n",
" <td>autriche</td>\n",
" <td>at</td>\n",
" <td>2023-06-13 11:17:40.546711+02:00</td>\n",
" <td>2023-06-13 11:17:40.546711+02:00</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>5</td>\n",
" <td>samoa américaines</td>\n",
" <td>as</td>\n",
" <td>2023-06-13 11:17:40.542569+02:00</td>\n",
" <td>2023-06-13 11:17:40.542569+02:00</td>\n",
" </tr>\n",
" <tr>\n",
" <th>...</th>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>238</th>\n",
" <td>228</td>\n",
" <td>royaume-uni</td>\n",
" <td>gb</td>\n",
" <td>2023-06-13 11:17:40.678023+02:00</td>\n",
" <td>2023-06-13 11:17:40.678023+02:00</td>\n",
" </tr>\n",
" <tr>\n",
" <th>239</th>\n",
" <td>25</td>\n",
" <td>brésil</td>\n",
" <td>br</td>\n",
" <td>2023-06-13 11:17:40.554209+02:00</td>\n",
" <td>2023-06-13 11:17:40.554209+02:00</td>\n",
" </tr>\n",
" <tr>\n",
" <th>240</th>\n",
" <td>10</td>\n",
" <td>argentine</td>\n",
" <td>ar</td>\n",
" <td>2023-06-13 11:17:40.545489+02:00</td>\n",
" <td>2023-06-13 11:17:40.545489+02:00</td>\n",
" </tr>\n",
" <tr>\n",
" <th>241</th>\n",
" <td>203</td>\n",
" <td>espagne</td>\n",
" <td>es</td>\n",
" <td>2023-06-13 11:17:40.662472+02:00</td>\n",
" <td>2023-06-13 11:17:40.662472+02:00</td>\n",
" </tr>\n",
" <tr>\n",
" <th>242</th>\n",
" <td>192</td>\n",
" <td>arabie saoudite</td>\n",
" <td>sa</td>\n",
" <td>2023-06-13 11:17:40.656154+02:00</td>\n",
" <td>2023-06-13 11:17:40.656154+02:00</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"<p>243 rows × 5 columns</p>\n",
"</div>"
],
"text/plain": [
" id name code created_at \\\n",
"0 101 hongrie hu 2023-06-13 11:17:40.600622+02:00 \n",
"1 2 albanie al 2023-06-13 11:17:40.540652+02:00 \n",
"2 3 antarctique aq 2023-06-13 11:17:40.541315+02:00 \n",
"3 12 autriche at 2023-06-13 11:17:40.546711+02:00 \n",
"4 5 samoa américaines as 2023-06-13 11:17:40.542569+02:00 \n",
".. ... ... ... ... \n",
"238 228 royaume-uni gb 2023-06-13 11:17:40.678023+02:00 \n",
"239 25 brésil br 2023-06-13 11:17:40.554209+02:00 \n",
"240 10 argentine ar 2023-06-13 11:17:40.545489+02:00 \n",
"241 203 espagne es 2023-06-13 11:17:40.662472+02:00 \n",
"242 192 arabie saoudite sa 2023-06-13 11:17:40.656154+02:00 \n",
"\n",
" updated_at \n",
"0 2023-06-13 11:17:40.600622+02:00 \n",
"1 2023-06-13 11:17:40.540652+02:00 \n",
"2 2023-06-13 11:17:40.541315+02:00 \n",
"3 2023-06-13 11:17:40.546711+02:00 \n",
"4 2023-06-13 11:17:40.542569+02:00 \n",
".. ... \n",
"238 2023-06-13 11:17:40.678023+02:00 \n",
"239 2023-06-13 11:17:40.554209+02:00 \n",
"240 2023-06-13 11:17:40.545489+02:00 \n",
"241 2023-06-13 11:17:40.662472+02:00 \n",
"242 2023-06-13 11:17:40.656154+02:00 \n",
"\n",
"[243 rows x 5 columns]"
]
},
"execution_count": 28,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"load_dataset_2(\"7\", \"countries\")"
]
},
{
"cell_type": "markdown",
"id": "ca2c8b6a-4965-422e-ba7c-66423a464fc1",
"metadata": {
"jp-MarkdownHeadingCollapsed": true
},
"source": [
"## Bases communes aux compagnies de type Musée"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "f8f988fb-5aab-4b57-80d1-e242f7e5b384",
"metadata": {},
"outputs": [],
"source": [
"# Company ids grouped by type (museum, sport, music).\n",
"# Each id is also the name of the company's folder in the data bucket.\n",
"companies = {\n",
"    'musee': ['1', '2', '3', '4', '101'],\n",
"    'sport': ['5', '6', '7', '8', '9'],\n",
"    'musique': ['10', '11', '12', '13', '14'],\n",
"}"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "dbce1124-9a22-4502-a47a-fc3d0e2db70b",
"metadata": {},
"outputs": [],
"source": [
"companies['musee']"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "5080f66e-f779-410a-876d-b4fe2795e17e",
"metadata": {},
"outputs": [],
"source": [
"# List the table names available in each company's folder.\n",
"# Every company type is scanned (not only 'musique') so that all the `base_<id>`\n",
"# lists read by the intersection cells below exist after a fresh Restart & Run All.\n",
"tables_by_company = {}\n",
"for company_ids in companies.values():\n",
"    for i in company_ids:\n",
"        BUCKET = \"bdc2324-data/\" + i\n",
"        liste_base = []\n",
"        for base in fs.ls(BUCKET):\n",
"            # Paths ending in '/<digits>/<digits><table_name>.csv': keep <table_name> only\n",
"            match = re.search(r'\\/(\\d+)\\/(\\d+)([a-zA-Z_]+)\\.csv$', base)\n",
"            if match:\n",
"                liste_base.append(match.group(3))\n",
"        tables_by_company[i] = liste_base\n",
"        # Kept for backward compatibility: later cells read base_1, base_10, ...\n",
"        globals()['base_' + i] = liste_base\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "abd477e1-7479-4c88-a5aa-f987af3f5b79",
"metadata": {},
"outputs": [],
"source": [
"# Trouver l'intersection entre les cinq listes\n",
"intersection = set(base_1).intersection(base_2, base_3, base_4, base_101)\n",
"\n",
"# Convertir le résultat en liste si nécessaire\n",
"intersection_liste = list(intersection)\n",
"\n",
"print(intersection_liste)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "8d93888f-a511-4ee5-8bc3-d5173a7f119e",
"metadata": {},
"outputs": [],
"source": [
"# Trouver l'intersection entre les cinq listes\n",
"intersection = set(base_10).intersection(base_12, base_13, base_14, base_11)\n",
"\n",
"# Convertir le résultat en liste si nécessaire\n",
"intersection_liste = list(intersection)\n",
"\n",
"print(intersection_liste)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "10e89669-42bb-4652-a4bc-1a3d1caf4d1a",
"metadata": {},
"outputs": [],
"source": [
"len(intersection_liste)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "e0aa8976-1487-4ef5-898e-0d6a88183e67",
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": null,
"id": "67f02868-b16a-41d5-a0f9-b31ce09278db",
"metadata": {},
"outputs": [],
"source": [
"base_101"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "7d058b21-a538-4f59-aefb-ef7966f73fdc",
"metadata": {},
2024-02-25 18:33:24 +01:00
"outputs": [],
"source": [
"df1_tags = load_dataset_2(\"1\", \"tags\")"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "aa441f99-733c-4675-8676-bed4682d3324",
"metadata": {},
"outputs": [],
"source": [
"df1_structure_tag_mappings = load_dataset_2(\"1\", 'structure_tag_mappings')"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "6767a750-14a4-4c05-903e-d2f07170825b",
"metadata": {},
"outputs": [],
"source": [
"df1_customersplus = load_dataset_2(\"1\", \"customersplus\")"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "125e9145-a815-46fd-bdf4-07589508b259",
"metadata": {},
"outputs": [],
"source": [
"df1_customersplus.groupby('structure_id')['id'].count().reset_index().sort_values('id', ascending=False).head(20)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "c17a6976-792f-474d-bcff-c89396eddb3f",
"metadata": {},
"outputs": [],
"source": [
"df1_customersplus['structure_id'].isna().sum() / len(df1_customersplus['structure_id'])"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "ecfc155a-cb42-46ec-8da5-33fdcd087355",
"metadata": {},
"outputs": [],
"source": [
"len(df1_structure_tag_mappings)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "071410b8-950d-4fcc-b2b9-57415253c286",
"metadata": {},
"outputs": [],
"source": [
"df1_structure_tag_mappings.groupby('tag_id')['structure_id'].count().reset_index().sort_values('structure_id', ascending=False).head(20)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "f48d27a9-14e4-4bb9-a60a-73e9438b58fc",
"metadata": {},
"outputs": [],
"source": [
"# sort_values is a pandas method (numpy has none); IPython help takes the bare name, without ()\n",
"?pd.DataFrame.sort_values"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "14eaa0ea-02cc-430b-ab9b-38e6637810c3",
"metadata": {},
"outputs": [],
"source": [
"def info_colonnes_dataframe(df):\n",
"    \"\"\"\n",
"    Summarise each column of a DataFrame: name, dtype and share of missing values.\n",
"\n",
"    Parameters\n",
"    ----------\n",
"    df : pandas.DataFrame\n",
"        Table to describe.\n",
"\n",
"    Returns\n",
"    -------\n",
"    pandas.DataFrame\n",
"        One row per column of `df`, with the columns 'Nom_colonne' (column name),\n",
"        'Type_colonne' (dtype as a string) and 'Taux_NA' (percentage of missing\n",
"        values, from 0 to 100). These three columns are present even when `df`\n",
"        has no column at all.\n",
"    \"\"\"\n",
"    # One record per column; items() is used instead of the older iteritems()\n",
"    infos_colonnes = [\n",
"        {\n",
"            'Nom_colonne': nom_colonne,\n",
"            'Type_colonne': str(serie.dtype),\n",
"            'Taux_NA': serie.isna().mean() * 100,\n",
"        }\n",
"        for nom_colonne, serie in df.items()\n",
"    ]\n",
"\n",
"    # Naming the columns explicitly keeps the output schema stable for an empty input\n",
"    return pd.DataFrame(infos_colonnes, columns=['Nom_colonne', 'Type_colonne', 'Taux_NA'])"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "6b031c32-d4c8-42a5-9a71-a7810f9bf8d8",
"metadata": {
"scrolled": true
},
"outputs": [],
"source": [
"info_colonnes_dataframe(df1_tags)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "e1a87f27-c4d4-4832-ac20-0c3c54aa4980",
"metadata": {},
"outputs": [],
"source": [
"info_colonnes_dataframe(df1_structure_tag_mappings)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "fa5c65a8-2f74-4f3f-85fc-9ac91e0bb361",
"metadata": {},
"outputs": [],
"source": [
"pd.set_option('display.max_colwidth', None)\n",
"\n",
"print(df1_tags['name'])"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "a59bf932-5b54-4600-81f5-c55ac93ae510",
"metadata": {},
"outputs": [],
"source": [
"pd.set_option('display.max_rows', None)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "a4ab298e-2cae-4865-9f00-4caff5f75ea1",
"metadata": {
"scrolled": true
},
"outputs": [],
"source": [
"print(df1_tags['name'])"
]
},
{
"cell_type": "markdown",
"id": "76bffba1-5f7e-4308-9224-437ca66148f8",
"metadata": {},
"source": [
"## KPI sur target_type"
]
},
{
"cell_type": "code",
"execution_count": 74,
"id": "622752ed-b565-4188-86d6-38f1f333fcbe",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"File path : projet-bdc2324-team1/0_Input/Company_1/target_information.csv\n"
]
},
{
"ename": "PermissionError",
"evalue": "Forbidden",
"output_type": "error",
"traceback": [
"\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
"\u001b[0;31mClientError\u001b[0m Traceback (most recent call last)",
"File \u001b[0;32m/opt/mamba/lib/python3.11/site-packages/s3fs/core.py:529\u001b[0m, in \u001b[0;36mS3FileSystem.info\u001b[0;34m(self, path, version_id, refresh)\u001b[0m\n\u001b[1;32m 528\u001b[0m \u001b[38;5;28;01mtry\u001b[39;00m:\n\u001b[0;32m--> 529\u001b[0m out \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_call_s3\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43ms3\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mhead_object\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mkwargs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mBucket\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mbucket\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 530\u001b[0m \u001b[43m \u001b[49m\u001b[43mKey\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mkey\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mversion_id_kw\u001b[49m\u001b[43m(\u001b[49m\u001b[43mversion_id\u001b[49m\u001b[43m)\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mreq_kw\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 531\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m {\n\u001b[1;32m 532\u001b[0m \u001b[38;5;124m'\u001b[39m\u001b[38;5;124mETag\u001b[39m\u001b[38;5;124m'\u001b[39m: out[\u001b[38;5;124m'\u001b[39m\u001b[38;5;124mETag\u001b[39m\u001b[38;5;124m'\u001b[39m],\n\u001b[1;32m 533\u001b[0m \u001b[38;5;124m'\u001b[39m\u001b[38;5;124mKey\u001b[39m\u001b[38;5;124m'\u001b[39m: \u001b[38;5;124m'\u001b[39m\u001b[38;5;124m/\u001b[39m\u001b[38;5;124m'\u001b[39m\u001b[38;5;241m.\u001b[39mjoin([bucket, key]),\n\u001b[0;32m (...)\u001b[0m\n\u001b[1;32m 540\u001b[0m 
\u001b[38;5;124m'\u001b[39m\u001b[38;5;124mVersionId\u001b[39m\u001b[38;5;124m'\u001b[39m: out\u001b[38;5;241m.\u001b[39mget(\u001b[38;5;124m'\u001b[39m\u001b[38;5;124mVersionId\u001b[39m\u001b[38;5;124m'\u001b[39m)\n\u001b[1;32m 541\u001b[0m }\n",
"File \u001b[0;32m/opt/mamba/lib/python3.11/site-packages/s3fs/core.py:200\u001b[0m, in \u001b[0;36mS3FileSystem._call_s3\u001b[0;34m(self, method, *akwarglist, **kwargs)\u001b[0m\n\u001b[1;32m 198\u001b[0m additional_kwargs \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_get_s3_method_kwargs(method, \u001b[38;5;241m*\u001b[39makwarglist,\n\u001b[1;32m 199\u001b[0m \u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39mkwargs)\n\u001b[0;32m--> 200\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43mmethod\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43madditional_kwargs\u001b[49m\u001b[43m)\u001b[49m\n",
"File \u001b[0;32m/opt/mamba/lib/python3.11/site-packages/botocore/client.py:553\u001b[0m, in \u001b[0;36mClientCreator._create_api_method.<locals>._api_call\u001b[0;34m(self, *args, **kwargs)\u001b[0m\n\u001b[1;32m 552\u001b[0m \u001b[38;5;66;03m# The \"self\" in this scope is referring to the BaseClient.\u001b[39;00m\n\u001b[0;32m--> 553\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_make_api_call\u001b[49m\u001b[43m(\u001b[49m\u001b[43moperation_name\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n",
"File \u001b[0;32m/opt/mamba/lib/python3.11/site-packages/botocore/client.py:1009\u001b[0m, in \u001b[0;36mBaseClient._make_api_call\u001b[0;34m(self, operation_name, api_params)\u001b[0m\n\u001b[1;32m 1008\u001b[0m error_class \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mexceptions\u001b[38;5;241m.\u001b[39mfrom_code(error_code)\n\u001b[0;32m-> 1009\u001b[0m \u001b[38;5;28;01mraise\u001b[39;00m error_class(parsed_response, operation_name)\n\u001b[1;32m 1010\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n",
"\u001b[0;31mClientError\u001b[0m: An error occurred (403) when calling the HeadObject operation: Forbidden",
"\nDuring handling of the above exception, another exception occurred:\n",
"\u001b[0;31mPermissionError\u001b[0m Traceback (most recent call last)",
"Cell \u001b[0;32mIn[74], line 1\u001b[0m\n\u001b[0;32m----> 1\u001b[0m \u001b[43mdisplay_databases\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;124;43m'\u001b[39;49m\u001b[38;5;124;43m1\u001b[39;49m\u001b[38;5;124;43m'\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;124;43m'\u001b[39;49m\u001b[38;5;124;43mtarget_information\u001b[39;49m\u001b[38;5;124;43m'\u001b[39;49m\u001b[43m)\u001b[49m\n",
"File \u001b[0;32m<string>:12\u001b[0m, in \u001b[0;36mdisplay_databases\u001b[0;34m(directory_path, file_name, datetime_col)\u001b[0m\n",
"File \u001b[0;32m/opt/mamba/lib/python3.11/site-packages/fsspec/spec.py:1295\u001b[0m, in \u001b[0;36mAbstractFileSystem.open\u001b[0;34m(self, path, mode, block_size, cache_options, compression, **kwargs)\u001b[0m\n\u001b[1;32m 1293\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[1;32m 1294\u001b[0m ac \u001b[38;5;241m=\u001b[39m kwargs\u001b[38;5;241m.\u001b[39mpop(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mautocommit\u001b[39m\u001b[38;5;124m\"\u001b[39m, \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_intrans)\n\u001b[0;32m-> 1295\u001b[0m f \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_open\u001b[49m\u001b[43m(\u001b[49m\n\u001b[1;32m 1296\u001b[0m \u001b[43m \u001b[49m\u001b[43mpath\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 1297\u001b[0m \u001b[43m \u001b[49m\u001b[43mmode\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mmode\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 1298\u001b[0m \u001b[43m \u001b[49m\u001b[43mblock_size\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mblock_size\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 1299\u001b[0m \u001b[43m \u001b[49m\u001b[43mautocommit\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mac\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 1300\u001b[0m \u001b[43m \u001b[49m\u001b[43mcache_options\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mcache_options\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 1301\u001b[0m \u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwargs\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 1302\u001b[0m \u001b[43m \u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 1303\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m compression \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m:\n\u001b[1;32m 1304\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m 
\u001b[38;5;21;01mfsspec\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mcompression\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m compr\n",
"File \u001b[0;32m/opt/mamba/lib/python3.11/site-packages/s3fs/core.py:375\u001b[0m, in \u001b[0;36mS3FileSystem._open\u001b[0;34m(self, path, mode, block_size, acl, version_id, fill_cache, cache_type, autocommit, requester_pays, **kwargs)\u001b[0m\n\u001b[1;32m 372\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m cache_type \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m:\n\u001b[1;32m 373\u001b[0m cache_type \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mdefault_cache_type\n\u001b[0;32m--> 375\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43mS3File\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mpath\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mmode\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mblock_size\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mblock_size\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43macl\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43macl\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 376\u001b[0m \u001b[43m \u001b[49m\u001b[43mversion_id\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mversion_id\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mfill_cache\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mfill_cache\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 377\u001b[0m \u001b[43m \u001b[49m\u001b[43ms3_additional_kwargs\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mkw\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mcache_type\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mcache_type\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 378\u001b[0m \u001b[43m \u001b[49m\u001b[43mautocommit\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mautocommit\u001b[49m\u001b[43m,\u001b[49m\u001b[43m 
\u001b[49m\u001b[43mrequester_pays\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mrequester_pays\u001b[49m\u001b[43m)\u001b[49m\n",
"File \u001b[0;32m/opt/mamba/lib/python3.11/site-packages/s3fs/core.py:1096\u001b[0m, in \u001b[0;36mS3File.__init__\u001b[0;34m(self, s3, path, mode, block_size, acl, version_id, fill_cache, s3_additional_kwargs, autocommit, cache_type, requester_pays)\u001b[0m\n\u001b[1;32m 1094\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39ms3_additional_kwargs \u001b[38;5;241m=\u001b[39m s3_additional_kwargs \u001b[38;5;129;01mor\u001b[39;00m {}\n\u001b[1;32m 1095\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mreq_kw \u001b[38;5;241m=\u001b[39m {\u001b[38;5;124m'\u001b[39m\u001b[38;5;124mRequestPayer\u001b[39m\u001b[38;5;124m'\u001b[39m: \u001b[38;5;124m'\u001b[39m\u001b[38;5;124mrequester\u001b[39m\u001b[38;5;124m'\u001b[39m} \u001b[38;5;28;01mif\u001b[39;00m requester_pays \u001b[38;5;28;01melse\u001b[39;00m {}\n\u001b[0;32m-> 1096\u001b[0m \u001b[38;5;28;43msuper\u001b[39;49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[38;5;21;43m__init__\u001b[39;49m\u001b[43m(\u001b[49m\u001b[43ms3\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mpath\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mmode\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mblock_size\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mautocommit\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mautocommit\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 1097\u001b[0m \u001b[43m \u001b[49m\u001b[43mcache_type\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mcache_type\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 1098\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39ms3 \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mfs \u001b[38;5;66;03m# compatibility\u001b[39;00m\n\u001b[1;32m 1099\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mwritable():\n",
"File \u001b[0;32m/opt/mamba/lib/python3.11/site-packages/fsspec/spec.py:1651\u001b[0m, in \u001b[0;36mAbstractBufferedFile.__init__\u001b[0;34m(self, fs, path, mode, block_size, autocommit, cache_type, cache_options, size, **kwargs)\u001b[0m\n\u001b[1;32m 1649\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39msize \u001b[38;5;241m=\u001b[39m size\n\u001b[1;32m 1650\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[0;32m-> 1651\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39msize \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mdetails\u001b[49m[\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124msize\u001b[39m\u001b[38;5;124m\"\u001b[39m]\n\u001b[1;32m 1652\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mcache \u001b[38;5;241m=\u001b[39m caches[cache_type](\n\u001b[1;32m 1653\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mblocksize, \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_fetch_range, \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39msize, \u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39mcache_options\n\u001b[1;32m 1654\u001b[0m )\n\u001b[1;32m 1655\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n",
"File \u001b[0;32m/opt/mamba/lib/python3.11/site-packages/fsspec/spec.py:1664\u001b[0m, in \u001b[0;36mAbstractBufferedFile.details\u001b[0;34m(self)\u001b[0m\n\u001b[1;32m 1661\u001b[0m \u001b[38;5;129m@property\u001b[39m\n\u001b[1;32m 1662\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m \u001b[38;5;21mdetails\u001b[39m(\u001b[38;5;28mself\u001b[39m):\n\u001b[1;32m 1663\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_details \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m:\n\u001b[0;32m-> 1664\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_details \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mfs\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43minfo\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mpath\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 1665\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_details\n",
"File \u001b[0;32m/opt/mamba/lib/python3.11/site-packages/s3fs/core.py:548\u001b[0m, in \u001b[0;36mS3FileSystem.info\u001b[0;34m(self, path, version_id, refresh)\u001b[0m\n\u001b[1;32m 546\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28msuper\u001b[39m(S3FileSystem, \u001b[38;5;28mself\u001b[39m)\u001b[38;5;241m.\u001b[39minfo(path)\n\u001b[1;32m 547\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[0;32m--> 548\u001b[0m \u001b[38;5;28;01mraise\u001b[39;00m ee\n\u001b[1;32m 549\u001b[0m \u001b[38;5;28;01mexcept\u001b[39;00m ParamValidationError \u001b[38;5;28;01mas\u001b[39;00m e:\n\u001b[1;32m 550\u001b[0m \u001b[38;5;28;01mraise\u001b[39;00m \u001b[38;5;167;01mValueError\u001b[39;00m(\u001b[38;5;124m'\u001b[39m\u001b[38;5;124mFailed to head path \u001b[39m\u001b[38;5;132;01m%r\u001b[39;00m\u001b[38;5;124m: \u001b[39m\u001b[38;5;132;01m%s\u001b[39;00m\u001b[38;5;124m'\u001b[39m \u001b[38;5;241m%\u001b[39m (path, e))\n",
"\u001b[0;31mPermissionError\u001b[0m: Forbidden"
]
}
],
"source": [
"display_databases('1', 'target_information')"
]
},
{
"cell_type": "markdown",
"id": "1ede9eaa-7f0a-4856-9349-b2747d6a4901",
"metadata": {},
"source": [
"# Fin travail 25/02"
]
},
{
"cell_type": "markdown",
"id": "c437eaec",
"metadata": {
"jp-MarkdownHeadingCollapsed": true
},
"source": [
2024-01-13 10:38:10 +01:00
"# Exemple sur Company 1"
]
},
{
"cell_type": "markdown",
2024-02-10 22:46:56 +01:00
"id": "a1c1fc39",
2024-01-13 10:38:10 +01:00
"metadata": {},
"source": [
"## Chargement données"
]
},
{
"cell_type": "code",
2024-01-13 14:14:11 +01:00
"execution_count": 3,
2024-02-10 22:46:56 +01:00
"id": "66f8c17b",
"metadata": {},
2024-01-13 10:38:10 +01:00
"outputs": [],
"source": [
"BUCKET = \"bdc2324-data/1\"\n",
"liste_database = fs.ls(BUCKET)"
]
},
{
"cell_type": "code",
2024-02-05 22:04:02 +01:00
"execution_count": 5,
2024-02-10 22:46:56 +01:00
"id": "c08e6798",
2024-01-13 10:38:10 +01:00
"metadata": {},
2024-02-10 22:46:56 +01:00
"outputs": [],
2024-01-13 10:38:10 +01:00
"source": [
2024-01-13 14:14:11 +01:00
"liste_database_select = ['suppliers', 'ticket', 'purchase', 'consumption', 'type_ofs']\n",
2024-01-13 10:38:10 +01:00
"\n",
"# Filtrer la liste pour les éléments contenant au moins un élément de la liste à tester\n",
"liste_database_filtered = [element for element in liste_database if any(element_part in element for element_part in liste_database_select)]\n",
"\n",
"# Afficher le résultat\n",
"print(liste_database_filtered)"
]
},
{
"cell_type": "code",
2024-02-10 13:23:44 +01:00
"execution_count": 6,
2024-02-10 22:46:56 +01:00
"id": "675f518d",
2024-01-13 10:38:10 +01:00
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
2024-02-10 22:46:56 +01:00
"<<<<<<< local <modified: >\n",
2024-02-10 13:23:44 +01:00
"/tmp/ipykernel_445/4081512283.py:10: DtypeWarning: Columns (1) have mixed types. Specify dtype option on import or set low_memory=False.\n",
2024-02-10 22:46:56 +01:00
" df = pd.read_csv(file_in)\n",
"=======\n",
2024-02-07 23:28:55 +01:00
"/tmp/ipykernel_15285/4081512283.py:10: DtypeWarning: Columns (1) have mixed types. Specify dtype option on import or set low_memory=False.\n",
2024-02-10 22:46:56 +01:00
" df = pd.read_csv(file_in)\n",
">>>>>>> remote <modified: >\n"
2024-01-13 10:38:10 +01:00
]
}
],
"source": [
2024-02-04 16:02:01 +01:00
"# loop to create dataframes from liste\n",
"files_path = liste_database\n",
2024-01-13 10:38:10 +01:00
"\n",
"client_number = files_path[0].split(\"/\")[1]\n",
"df_prefix = \"df\" + str(client_number) + \"_\"\n",
"\n",
"# Matches '/<digits>/<digits><letters or underscores>.csv' at the end of a path;\n",
"# group 3 (the alphabetic part of the file name) is used as the dataframe suffix.\n",
"table_name_pattern = re.compile(r'\\/(\\d+)\\/(\\d+)([a-zA-Z_]+)\\.csv$')\n",
"\n",
"for current_path in files_path:\n",
"    with fs.open(current_path, mode=\"rb\") as file_in:\n",
"        df = pd.read_csv(file_in)\n",
"    # Each table is published as a global named df_prefix + <table name>, e.g. df1_xxx\n",
"    nom_dataframe = df_prefix + table_name_pattern.search(current_path).group(3)\n",
"    globals()[nom_dataframe] = df"
]
},
2024-02-05 22:04:02 +01:00
{
"cell_type": "markdown",
2024-02-10 22:46:56 +01:00
"id": "e855f403",
2024-02-19 23:11:28 +01:00
"metadata": {},
2024-02-05 22:04:02 +01:00
"source": [
"## customersplus.csv"
]
},
{
"cell_type": "code",
"execution_count": 22,
2024-02-10 22:46:56 +01:00
"id": "91a8f8c4",
2024-02-05 22:04:02 +01:00
"metadata": {},
2024-02-10 22:46:56 +01:00
"outputs": [],
2024-02-05 22:04:02 +01:00
"source": [
"# DataFrame.info() prints its report and returns None, so wrapping it in pd.DataFrame() only built an empty frame\n",
"df1_customersplus.info()"
]
},
{
"cell_type": "code",
"execution_count": 31,
2024-02-10 22:46:56 +01:00
"id": "2fda171d",
2024-02-05 22:04:02 +01:00
"metadata": {},
"outputs": [],
"source": [
"def info_colonnes_dataframe(df):\n",
"    \"\"\"\n",
"    Summarise every column of a DataFrame.\n",
"\n",
"    Parameters:\n",
"    - df: DataFrame\n",
"        The DataFrame to describe.\n",
"\n",
"    Returns:\n",
"    - DataFrame\n",
"        One row per column of `df`, with 'Nom_colonne' (column label),\n",
"        'Type_colonne' (dtype rendered as a string) and 'Taux_NA'\n",
"        (share of missing values, expressed as a percentage).\n",
"    \"\"\"\n",
"    # items() (rather than iteritems()) yields (label, Series) pairs\n",
"    return pd.DataFrame([\n",
"        {\n",
"            'Nom_colonne': label,\n",
"            'Type_colonne': str(values.dtype),\n",
"            'Taux_NA': values.isna().mean() * 100,\n",
"        }\n",
"        for label, values in df.items()\n",
"    ])"
]
},
{
"cell_type": "code",
"execution_count": 35,
2024-02-10 22:46:56 +01:00
"id": "205eeeab",
2024-02-05 22:04:02 +01:00
"metadata": {},
"outputs": [],
"source": [
"def cleaning_date(df, column_name):\n",
"    \"\"\"\n",
"    Convert one column of a DataFrame to UTC datetimes, parsing the values as ISO8601.\n",
"\n",
"    The converted values are written back into `df` itself, so the caller's DataFrame is modified.\n",
"\n",
"    Parameters:\n",
"    - df: DataFrame\n",
"        The DataFrame holding the column to clean.\n",
"    - column_name: str\n",
"        The name of the column to clean.\n",
"\n",
"    Returns:\n",
"    - DataFrame\n",
"        The same DataFrame object, with the column converted.\n",
"    \"\"\"\n",
"    parsed_dates = pd.to_datetime(df[column_name], format = 'ISO8601', utc = True)\n",
"    df[column_name] = parsed_dates\n",
"    return df"
]
},
{
"cell_type": "code",
"execution_count": 32,
2024-02-10 22:46:56 +01:00
"id": "634282c5",
2024-02-05 22:04:02 +01:00
"metadata": {},
"outputs": [],
"source": [
"a = info_colonnes_dataframe(df1_customersplus)"
]
},
{
"cell_type": "code",
"execution_count": 33,
2024-02-10 22:46:56 +01:00
"id": "0e8d4133",
2024-02-05 22:04:02 +01:00
"metadata": {},
2024-02-10 22:46:56 +01:00
"outputs": [],
2024-02-05 22:04:02 +01:00
"source": [
"a"
]
},
{
"cell_type": "code",
"execution_count": 16,
2024-02-10 22:46:56 +01:00
"id": "1268ad5a",
2024-02-05 22:04:02 +01:00
"metadata": {},
"outputs": [],
"source": [
"a = pd.DataFrame(df1_customersplus.isna().sum()/len(df1_customersplus)*100)"
]
},
{
"cell_type": "code",
"execution_count": 40,
2024-02-10 22:46:56 +01:00
"id": "bd41dc80",
2024-02-05 22:04:02 +01:00
"metadata": {},
2024-02-10 22:46:56 +01:00
"outputs": [],
2024-02-05 22:04:02 +01:00
"source": [
"# Variable selection\n",
"# Work on a copy so the raw df1_customersplus frame is never modified\n",
"df1_customersplus_clean = df1_customersplus.copy()\n",
"\n",
"# Only first_buying_date is parsed: last_visiting_date is dropped just below,\n",
"# so converting it first was wasted work.\n",
"cleaning_date(df1_customersplus_clean, 'first_buying_date')\n",
"\n",
"# drop() and rename() are chained (not in place) so re-running the cell always starts from the copy above\n",
"df1_customersplus_clean = (\n",
"    df1_customersplus_clean\n",
"    .drop(columns = [\n",
"        'lastname', 'firstname', 'email', 'civility', 'note', 'created_at', 'updated_at', 'deleted_at',\n",
"        'extra', 'reference', 'extra_field', 'identifier', 'need_reload', 'preferred_category',\n",
"        'preferred_supplier', 'preferred_formula', 'zipcode', 'last_visiting_date',\n",
"    ])\n",
"    .rename(columns = {'id' : 'customer_id'})\n",
")\n",
"\n"
]
},
{
"cell_type": "markdown",
2024-02-10 22:46:56 +01:00
"id": "64d0f76b",
2024-02-04 16:02:01 +01:00
"metadata": {
"jp-MarkdownHeadingCollapsed": true
},
"source": [
"## tickets.csv"
]
},
{
"cell_type": "code",
"execution_count": 6,
2024-02-10 22:46:56 +01:00
"id": "7e683711",
"metadata": {},
2024-02-10 22:46:56 +01:00
"outputs": [],
"source": [
"df1_tickets"
]
},
{
"cell_type": "code",
"execution_count": 7,
2024-02-10 22:46:56 +01:00
"id": "e7b9a52e",
"metadata": {},
2024-02-10 22:46:56 +01:00
"outputs": [],
"source": [
"df1_tickets.info()"
]
},
{
"cell_type": "code",
"execution_count": 8,
2024-02-10 22:46:56 +01:00
"id": "568280e8",
"metadata": {},
2024-02-10 22:46:56 +01:00
"outputs": [],
"source": [
"df1_tickets.isna().sum()/len(df1_tickets)*100"
]
},
{
"cell_type": "code",
"execution_count": 9,
2024-02-10 22:46:56 +01:00
"id": "29ecec90",
"metadata": {},
2024-02-10 22:46:56 +01:00
"outputs": [],
"source": [
"# Selection des variables\n",
2024-02-05 22:04:02 +01:00
"# drop(..., inplace=True) returns None, which made df1_tickets_clean None and the rename below fail;\n",
"# the non-inplace chain returns the cleaned frame and leaves the raw df1_tickets untouched.\n",
"# NOTE(review): the dropped columns and the 'id' -> 'customer_id' rename look copied from the customersplus cell - confirm they match the tickets schema\n",
"df1_tickets_clean = (\n",
"    df1_tickets\n",
"    .drop(['lastname', 'firstname', 'email', 'created_at', 'updated_at', 'extra', 'reference', 'extra_field', 'identifier', 'need_reload', 'preferred_category', 'preferred_supplier', 'preferred_formula', 'zipcode'], axis = 1)\n",
"    .rename(columns = {'id' : 'customer_id'})\n",
")"
]
},
2024-01-13 10:38:10 +01:00
{
"cell_type": "markdown",
2024-02-10 22:46:56 +01:00
"id": "22bb5de4",
2024-02-10 13:23:44 +01:00
"metadata": {
"jp-MarkdownHeadingCollapsed": true
},
2024-01-13 10:38:10 +01:00
"source": [
"## suppliers.csv"
]
},
{
"cell_type": "code",
"execution_count": 10,
2024-02-10 22:46:56 +01:00
"id": "6a9a91f4",
2024-01-13 10:38:10 +01:00
"metadata": {},
2024-02-10 22:46:56 +01:00
"outputs": [],
2024-01-13 10:38:10 +01:00
"source": [
"df1_suppliers"
]
},
{
"cell_type": "code",
"execution_count": 11,
2024-02-10 22:46:56 +01:00
"id": "bab4758a",
2024-01-13 10:38:10 +01:00
"metadata": {},
2024-02-10 22:46:56 +01:00
"outputs": [],
2024-01-13 10:38:10 +01:00
"source": [
"df1_suppliers.info()"
]
},
{
"cell_type": "code",
"execution_count": 12,
2024-02-10 22:46:56 +01:00
"id": "b5fff251",
2024-01-13 10:38:10 +01:00
"metadata": {},
2024-02-10 22:46:56 +01:00
"outputs": [],
"source": [
"df1_suppliers.isna().sum()/len(df1_suppliers)*100"
2024-01-13 10:38:10 +01:00
]
},
2024-01-13 14:14:11 +01:00
{
"cell_type": "code",
"execution_count": 13,
2024-02-10 22:46:56 +01:00
"id": "8b09e2a3",
2024-01-13 14:14:11 +01:00
"metadata": {},
2024-02-10 22:46:56 +01:00
"outputs": [],
"source": [
"# Variable selection\n",
"# rename() is chained so it returns a new frame instead of writing in place on a slice of df1_suppliers\n",
"# (the in-place form is the pattern that triggers SettingWithCopyWarning)\n",
"df1_suppliers_clean = df1_suppliers[['id', 'name']].rename(columns = {'name' : 'supplier_name'})"
2024-01-13 10:38:10 +01:00
]
},
{
"cell_type": "code",
"execution_count": 14,
2024-02-10 22:46:56 +01:00
"id": "ecee7cdc",
"metadata": {},
2024-02-10 22:46:56 +01:00
"outputs": [],
"source": [
"df1_suppliers_clean"
]
},
{
"cell_type": "markdown",
2024-02-10 22:46:56 +01:00
"id": "c8e6e69b",
2024-02-04 16:02:01 +01:00
"metadata": {
"jp-MarkdownHeadingCollapsed": true
},
"source": [
"## type_ofs.csv"
]
},
{
"cell_type": "code",
"execution_count": 15,
2024-02-10 22:46:56 +01:00
"id": "1a6cff1f",
"metadata": {},
2024-02-10 22:46:56 +01:00
"outputs": [],
"source": [
"df1_type_ofs"
]
},
{
"cell_type": "code",
"execution_count": 16,
2024-02-10 22:46:56 +01:00
"id": "93630b41",
"metadata": {},
2024-02-10 22:46:56 +01:00
"outputs": [],
"source": [
"df1_type_ofs.info()"
]
},
{
"cell_type": "code",
"execution_count": 17,
2024-02-10 22:46:56 +01:00
"id": "4f94481a",
"metadata": {},
2024-02-10 22:46:56 +01:00
"outputs": [],
"source": [
"# Variable selection\n",
"# rename() is chained so it returns a new frame instead of writing in place on a slice of df1_type_ofs\n",
"# (the in-place form is the pattern that triggers SettingWithCopyWarning)\n",
"df1_type_ofs_clean = df1_type_ofs[['id', 'name', 'children']].rename(columns = {'name' : 'type_of_ticket_name'})"
]
},
{
"cell_type": "markdown",
2024-02-10 22:46:56 +01:00
"id": "1b2811e2",
2024-02-04 16:02:01 +01:00
"metadata": {
"jp-MarkdownHeadingCollapsed": true
},
"source": [
"## purchases.csv"
]
},
{
"cell_type": "code",
"execution_count": 18,
2024-02-10 22:46:56 +01:00
"id": "2455d2e1",
"metadata": {
"scrolled": true
},
2024-02-10 22:46:56 +01:00
"outputs": [],
"source": [
"df1_purchases"
]
},
{
"cell_type": "code",
"execution_count": 19,
2024-02-10 22:46:56 +01:00
"id": "5f9a159d",
"metadata": {},
2024-02-10 22:46:56 +01:00
"outputs": [],
"source": [
"df1_purchases.info()"
]
},
{
"cell_type": "code",
"execution_count": 20,
2024-02-10 22:46:56 +01:00
"id": "db201bf7",
"metadata": {},
"outputs": [],
"source": [
"# purchase_date cleaning\n",
"# One conversion with both options, as in cleaning_date: the former first call had no explicit format,\n",
"# and the former second call ran on values that were already datetimes.\n",
"df1_purchases['purchase_date'] = pd.to_datetime(df1_purchases['purchase_date'], utc = True, format = 'ISO8601')"
]
},
{
"cell_type": "code",
"execution_count": 21,
2024-02-10 22:46:56 +01:00
"id": "bd436fca",
"metadata": {},
2024-02-10 22:46:56 +01:00
"outputs": [],
"source": [
"df1_purchases.info()"
]
},
{
"cell_type": "code",
"execution_count": 22,
2024-02-10 22:46:56 +01:00
"id": "83435862",
"metadata": {},
"outputs": [],
"source": [
"# Selection des variables\n",
"df1_purchases_clean = df1_purchases[['id', 'purchase_date', 'customer_id']]"
]
},
{
"cell_type": "markdown",
2024-02-10 22:46:56 +01:00
"id": "f210e730",
2024-02-10 13:23:44 +01:00
"metadata": {
"jp-MarkdownHeadingCollapsed": true
},
"source": [
"## Fusion de l'ensemble des données billétiques"
]
},
{
"cell_type": "code",
"execution_count": 23,
2024-02-10 22:46:56 +01:00
"id": "1f8b3aa7",
"metadata": {},
"outputs": [],
"source": [
"# Merge with suppliers, then discard the join keys\n",
"df1_ticket_information = (\n",
"    df1_tickets_clean\n",
"    .merge(df1_suppliers_clean, how = 'inner', left_on = 'supplier_id', right_on = 'id')\n",
"    .drop(columns = ['supplier_id', 'id'])\n",
")\n",
"\n",
"# Merge with ticket types, then discard the join keys\n",
"df1_ticket_information = (\n",
"    df1_ticket_information\n",
"    .merge(df1_type_ofs_clean, how = 'inner', left_on = 'type_of', right_on = 'id')\n",
"    .drop(columns = ['type_of', 'id'])\n",
")\n",
"\n",
"# Merge with purchases, then discard the join keys\n",
"df1_ticket_information = (\n",
"    df1_ticket_information\n",
"    .merge(df1_purchases_clean, how = 'inner', left_on = 'purchase_id', right_on = 'id')\n",
"    .drop(columns = ['purchase_id', 'id'])\n",
")"
]
},
{
"cell_type": "code",
"execution_count": 24,
2024-02-10 22:46:56 +01:00
"id": "83a4d021",
"metadata": {
"scrolled": true
},
2024-02-10 22:46:56 +01:00
"outputs": [],
"source": [
"df1_ticket_information"
]
},
{
"cell_type": "markdown",
2024-02-10 22:46:56 +01:00
"id": "56e6ebd1",
2024-02-10 13:23:44 +01:00
"metadata": {
"jp-MarkdownHeadingCollapsed": true
},
"source": [
"# Utilisation de fonctions"
]
},
{
"cell_type": "code",
2024-02-04 16:02:01 +01:00
"execution_count": 51,
2024-02-10 22:46:56 +01:00
"id": "88fcde4b",
"metadata": {},
2024-02-10 22:46:56 +01:00
"outputs": [],
"source": [
2024-02-04 16:02:01 +01:00
"# Build a small sample DataFrame from the first 20 'opened_at' values\n",
"df_not_clean = df1_campaign_stats[['opened_at']].head(20)\n",
"\n",
"# Apply the cleaning function to the 'opened_at' column (vectorised conversion).\n",
"# NOTE: cleaning_date writes into the frame it receives and returns that same frame,\n",
"# so df_clean and df_not_clean are one object and the rename below affects both names.\n",
"df_clean = cleaning_date(df_not_clean, 'opened_at')\n",
"df_clean.rename(columns = {'opened_at' : 'opened_at_clean'}, inplace = True)\n",
"\n",
"# Put the raw and converted columns side by side to compare their dtypes\n",
"test = pd.concat([df1_campaign_stats[['opened_at']].head(20), df_clean], axis=1)\n",
"\n",
"test.info()"
]
},
{
"cell_type": "markdown",
2024-02-10 22:46:56 +01:00
"id": "818f69db",
2024-02-04 16:02:01 +01:00
"metadata": {},
"source": [
"## Nettoyage, selection et fusion"
]
},
{
"cell_type": "code",
2024-02-04 16:02:01 +01:00
"execution_count": 23,
2024-02-10 22:46:56 +01:00
"id": "c9654eda",
2024-02-04 16:02:01 +01:00
"metadata": {},
2024-02-10 22:46:56 +01:00
"outputs": [],
"source": [
"df1_ticket_information"
]
},
2024-02-04 16:02:01 +01:00
{
"cell_type": "code",
"execution_count": 14,
2024-02-10 22:46:56 +01:00
"id": "7f2b620c",
2024-02-04 16:02:01 +01:00
"metadata": {},
2024-02-10 22:46:56 +01:00
"outputs": [],
2024-02-04 16:02:01 +01:00
"source": [
"df1_ticket_information.info()"
]
},
{
"cell_type": "markdown",
2024-02-10 22:46:56 +01:00
"id": "637bdb72",
"metadata": {},
"source": [
2024-02-04 16:02:01 +01:00
"# Customer information"
]
},
{
2024-02-04 16:02:01 +01:00
"cell_type": "markdown",
2024-02-10 22:46:56 +01:00
"id": "14c52894",
2024-02-10 13:23:44 +01:00
"metadata": {
"jp-MarkdownHeadingCollapsed": true
},
"source": [
2024-02-04 16:02:01 +01:00
"## Target area"
]
},
{
"cell_type": "code",
2024-02-07 23:28:55 +01:00
"execution_count": 8,
2024-02-10 22:46:56 +01:00
"id": "d83abfbf",
"metadata": {},
2024-02-04 16:02:01 +01:00
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
2024-02-07 23:28:55 +01:00
"/tmp/ipykernel_15285/2625134041.py:3: SettingWithCopyWarning: \n",
2024-02-04 16:02:01 +01:00
"A value is trying to be set on a copy of a slice from a DataFrame\n",
"\n",
"See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n",
" df1_targets_clean.rename(columns = {'id' : 'target_id' , 'name' : 'target_name'}, inplace = True)\n"
]
}
],
"source": [
2024-02-04 16:02:01 +01:00
"# targets.csv cleaning\n",
"# rename() is chained (not in place) so nothing is written on a slice of df1_targets;\n",
"# the in-place form made this cell emit SettingWithCopyWarning.\n",
"df1_targets_clean = df1_targets[[\"id\", \"target_type_id\", \"name\"]].rename(columns = {'id' : 'target_id' , 'name' : 'target_name'})\n",
"\n",
"# target_types cleaning: every kept column gets the 'target_type_' prefix\n",
"df1_target_types_clean = df1_target_types[[\"id\",\"is_import\",\"name\"]].add_prefix(\"target_type_\")\n",
"\n",
"# customer_target_mappings cleaning\n",
"df1_customer_target_mappings_clean = df1_customer_target_mappings[[\"id\", \"customer_id\", \"target_id\"]]\n",
"\n",
"# Merge targets with their target type, then discard the join key\n",
"df1_targets_full = pd.merge(df1_targets_clean, df1_target_types_clean, left_on='target_type_id', right_on='target_type_id', how='inner')\n",
"df1_targets_full = df1_targets_full.drop(columns = ['target_type_id'])\n",
"\n",
"# Merge the customer/target mapping with the enriched targets, then discard the join key\n",
"df1_targets_full = pd.merge(df1_customer_target_mappings_clean, df1_targets_full, left_on='target_id', right_on='target_id', how='inner')\n",
"df1_targets_full = df1_targets_full.drop(columns = ['target_id'])"
]
},
{
"cell_type": "code",
2024-02-04 16:02:01 +01:00
"execution_count": 62,
2024-02-10 22:46:56 +01:00
"id": "90d71b2c",
"metadata": {},
2024-02-10 22:46:56 +01:00
"outputs": [],
2024-02-04 16:02:01 +01:00
"source": [
"df1_targets_test = df1_targets_full[['id', 'customer_id']].groupby(['customer_id']).count()\n",
"len(df1_targets_test[df1_targets_test['id'] > 1]) / len(df1_targets_test)\n",
"\n",
"# 99,6% des 151 000 client visés sont catégorisés plusieurs fois et en moyenne 5 fois... \n",
"df1_targets_test.mean()\n"
]
},
{
"cell_type": "code",
2024-02-07 23:28:55 +01:00
"execution_count": 10,
2024-02-10 22:46:56 +01:00
"id": "2301de1e",
2024-02-07 23:28:55 +01:00
"metadata": {},
2024-02-04 16:02:01 +01:00
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>id</th>\n",
" <th>customer_id</th>\n",
" <th>target_name</th>\n",
" <th>target_type_is_import</th>\n",
" <th>target_type_name</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
2024-02-07 23:28:55 +01:00
" <th>0</th>\n",
" <td>1184824</td>\n",
" <td>645400</td>\n",
" <td>DDCP PROMO Réseau livres</td>\n",
2024-02-04 16:02:01 +01:00
" <td>False</td>\n",
" <td>manual_static_filter</td>\n",
" </tr>\n",
" <tr>\n",
2024-02-07 23:28:55 +01:00
" <th>1</th>\n",
" <td>210571</td>\n",
" <td>2412</td>\n",
" <td>DDCP PROMO Réseau livres</td>\n",
2024-02-04 16:02:01 +01:00
" <td>False</td>\n",
" <td>manual_static_filter</td>\n",
" </tr>\n",
" <tr>\n",
2024-02-07 23:28:55 +01:00
" <th>2</th>\n",
" <td>210572</td>\n",
" <td>4536</td>\n",
" <td>DDCP PROMO Réseau livres</td>\n",
2024-02-04 16:02:01 +01:00
" <td>False</td>\n",
" <td>manual_static_filter</td>\n",
" </tr>\n",
" <tr>\n",
2024-02-07 23:28:55 +01:00
" <th>3</th>\n",
" <td>210573</td>\n",
" <td>6736</td>\n",
" <td>DDCP PROMO Réseau livres</td>\n",
2024-02-04 16:02:01 +01:00
" <td>False</td>\n",
" <td>manual_static_filter</td>\n",
" </tr>\n",
" <tr>\n",
2024-02-07 23:28:55 +01:00
" <th>4</th>\n",
" <td>210574</td>\n",
" <td>38210</td>\n",
" <td>DDCP PROMO Réseau livres</td>\n",
2024-02-04 16:02:01 +01:00
" <td>False</td>\n",
" <td>manual_static_filter</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
2024-02-07 23:28:55 +01:00
" id customer_id target_name target_type_is_import \\\n",
"0 1184824 645400 DDCP PROMO Réseau livres False \n",
"1 210571 2412 DDCP PROMO Réseau livres False \n",
"2 210572 4536 DDCP PROMO Réseau livres False \n",
"3 210573 6736 DDCP PROMO Réseau livres False \n",
"4 210574 38210 DDCP PROMO Réseau livres False \n",
2024-02-04 16:02:01 +01:00
"\n",
2024-02-07 23:28:55 +01:00
" target_type_name \n",
"0 manual_static_filter \n",
"1 manual_static_filter \n",
"2 manual_static_filter \n",
"3 manual_static_filter \n",
"4 manual_static_filter "
2024-02-04 16:02:01 +01:00
]
},
2024-02-07 23:28:55 +01:00
"execution_count": 10,
2024-02-04 16:02:01 +01:00
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
2024-02-07 23:28:55 +01:00
"df1_targets_full.head()"
2024-02-04 16:02:01 +01:00
]
},
{
2024-02-07 23:28:55 +01:00
"cell_type": "code",
"execution_count": 14,
2024-02-10 22:46:56 +01:00
"id": "75fbc2f7",
2024-02-07 23:28:55 +01:00
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"[nltk_data] Downloading package punkt to /home/onyxia/nltk_data...\n",
"[nltk_data] Package punkt is already up-to-date!\n",
"[nltk_data] Downloading package stopwords to /home/onyxia/nltk_data...\n",
"[nltk_data] Package stopwords is already up-to-date!\n",
"[nltk_data] Downloading package wordnet to /home/onyxia/nltk_data...\n",
"[nltk_data] Package wordnet is already up-to-date!\n"
]
},
{
"data": {
"text/plain": [
"True"
]
},
"execution_count": 14,
"metadata": {},
"output_type": "execute_result"
}
],
2024-02-04 16:02:01 +01:00
"source": [
2024-02-07 23:28:55 +01:00
"# Catégorisation des target_name\n",
"import pandas as pd\n",
"import nltk\n",
"from nltk.tokenize import word_tokenize\n",
"from nltk.corpus import stopwords\n",
"from nltk.stem import WordNetLemmatizer\n",
"from nltk.probability import FreqDist\n",
"\n",
"# Téléchargement des ressources nécessaires\n",
"nltk.download('punkt')\n",
"nltk.download('stopwords')\n",
"nltk.download('wordnet')\n",
"\n"
2024-02-04 16:02:01 +01:00
]
},
{
"cell_type": "code",
2024-02-07 23:28:55 +01:00
"execution_count": 19,
2024-02-10 22:46:56 +01:00
"id": "55cddf92",
2024-02-04 16:02:01 +01:00
"metadata": {},
"outputs": [
{
2024-02-07 23:28:55 +01:00
"name": "stdout",
2024-02-04 16:02:01 +01:00
"output_type": "stream",
"text": [
2024-02-07 23:28:55 +01:00
"Mots les plus fréquents:\n",
"consentement: 550777\n",
"optin: 463579\n",
"jeune: 155103\n",
"public: 155103\n",
"mediation: 150001\n"
2024-02-04 16:02:01 +01:00
]
}
],
"source": [
2024-02-07 23:28:55 +01:00
"# Définition des fonctions de tokenisation, suppression des mots vides et lemmatisation\n",
"def preprocess_text(texte):\n",
"    \"\"\"\n",
"    Tokenise a text, remove French stopwords and lemmatise the remaining tokens.\n",
"\n",
"    Parameters:\n",
"    - texte: str or iterable of str\n",
"        A single text, or several text fragments that are joined with spaces before processing.\n",
"\n",
"    Returns:\n",
"    - list of str\n",
"        The lower-cased tokens left after stopword removal and lemmatisation.\n",
"    \"\"\"\n",
"    # ' '.join() on a plain string puts a space between every character, so the tokenizer\n",
"    # returned single letters; fragments are only joined when several of them are given.\n",
"    texte_concat = texte if isinstance(texte, str) else ' '.join(texte)\n",
"\n",
"    # Word tokenisation\n",
"    tokens = word_tokenize(texte_concat.lower())\n",
"\n",
"    # Stopword removal\n",
"    stop_words = set(stopwords.words('french'))\n",
"    filtered_tokens = [word for word in tokens if word not in stop_words]\n",
"\n",
"    # Lemmatisation\n",
"    # NOTE(review): WordNetLemmatizer is backed by the English WordNet, so French words are presumably left unchanged - confirm\n",
"    lemmatizer = WordNetLemmatizer()\n",
"    lemmatized_tokens = [lemmatizer.lemmatize(word) for word in filtered_tokens]\n",
"\n",
"    return lemmatized_tokens\n",
2024-02-04 16:02:01 +01:00
"\n",
"\n",
2024-02-07 23:28:55 +01:00
"# Appliquer le prétraitement à la colonne de texte\n",
"df1_targets_full['target_name_tokened'] = df1_targets_full['target_name'].apply(preprocess_text)\n",
"\n",
"# Concaténer les listes de mots pour obtenir une liste de tous les mots dans le corpus\n",
"all_words = [word for tokens in df1_targets_full['target_name_tokened'] for word in tokens]\n",
"\n",
"# Calculer la fréquence des mots\n",
"freq_dist = FreqDist(all_words)\n",
"\n",
"\n"
2024-02-04 16:02:01 +01:00
]
},
{
"cell_type": "code",
2024-02-07 23:28:55 +01:00
"execution_count": 22,
2024-02-10 22:46:56 +01:00
"id": "7fd98a85",
2024-02-04 16:02:01 +01:00
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
2024-02-07 23:28:55 +01:00
"Mots les plus fréquents:\n",
"consentement: 550777\n",
"optin: 463579\n",
"jeune: 155103\n",
"public: 155103\n",
"mediation: 150001\n",
"specialisee: 150001\n",
"b2c: 143432\n",
"optout: 97683\n",
"newsletter: 56022\n",
"(: 46084\n",
"): 46084\n",
"inscrits: 42296\n",
"nl: 42294\n",
"générale: 41037\n",
"generale: 40950\n"
2024-02-04 16:02:01 +01:00
]
}
],
"source": [
2024-02-07 23:28:55 +01:00
"# Affichage des mots les plus fréquents\n",
"print(\"Mots les plus fréquents:\")\n",
"for mot, freq in freq_dist.most_common(15):\n",
" print(f\"{mot}: {freq}\")"
2024-02-04 16:02:01 +01:00
]
},
{
"cell_type": "code",
2024-02-07 23:28:55 +01:00
"execution_count": 18,
2024-02-10 22:46:56 +01:00
"id": "cf94bb1d",
2024-02-04 16:02:01 +01:00
"metadata": {},
"outputs": [
{
2024-02-07 23:28:55 +01:00
"name": "stdout",
"output_type": "stream",
"text": [
" texte \\\n",
"0 Le chat noir mange une souris. \n",
"1 Le chien blanc aboie. \n",
"\n",
" texte_preprocessed \n",
"0 [e, h, a, o, i, r, a, g, e, u, e, o, u, r, i, .] \n",
"1 [e, h, i, e, b, a, a, b, o, i, e, .] \n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"[nltk_data] Downloading package punkt to /home/onyxia/nltk_data...\n",
"[nltk_data] Package punkt is already up-to-date!\n",
"[nltk_data] Downloading package stopwords to /home/onyxia/nltk_data...\n",
"[nltk_data] Package stopwords is already up-to-date!\n",
"[nltk_data] Downloading package wordnet to /home/onyxia/nltk_data...\n",
"[nltk_data] Package wordnet is already up-to-date!\n"
]
2024-02-04 16:02:01 +01:00
}
],
2024-02-07 23:28:55 +01:00
"source": [
"import pandas as pd\n",
"import nltk\n",
"from nltk.tokenize import word_tokenize\n",
"from nltk.corpus import stopwords\n",
"from nltk.stem import WordNetLemmatizer\n",
"\n",
"# Téléchargement des ressources nécessaires\n",
"nltk.download('punkt')\n",
"nltk.download('stopwords')\n",
"nltk.download('wordnet')\n",
"\n",
"# Création de la DataFrame d'exemple\n",
"data = {'texte': [\"Le chat noir mange une souris.\", \"Le chien blanc aboie.\"]}\n",
"df = pd.DataFrame(data)\n",
"\n",
"# Fonction pour prétraiter le texte\n",
"def preprocess_text(texte):\n",
"    \"\"\"\n",
"    Tokenise a text, remove French stopwords and lemmatise the remaining tokens.\n",
"\n",
"    Parameters:\n",
"    - texte: str or iterable of str\n",
"        A single text, or several text fragments that are joined with spaces before processing.\n",
"\n",
"    Returns:\n",
"    - list of str\n",
"        The lower-cased tokens left after stopword removal and lemmatisation.\n",
"    \"\"\"\n",
"    # ' '.join() on a plain string puts a space between every character, so the tokenizer\n",
"    # returned single letters; fragments are only joined when several of them are given.\n",
"    texte_concat = texte if isinstance(texte, str) else ' '.join(texte)\n",
"\n",
"    # Word tokenisation\n",
"    tokens = word_tokenize(texte_concat.lower())\n",
"\n",
"    # Stopword removal\n",
"    stop_words = set(stopwords.words('french'))\n",
"    filtered_tokens = [word for word in tokens if word not in stop_words]\n",
"\n",
"    # Lemmatisation\n",
"    # NOTE(review): WordNetLemmatizer is backed by the English WordNet, so French words are presumably left unchanged - confirm\n",
"    lemmatizer = WordNetLemmatizer()\n",
"    lemmatized_tokens = [lemmatizer.lemmatize(word) for word in filtered_tokens]\n",
"\n",
"    return lemmatized_tokens\n",
"\n",
2024-02-10 22:46:56 +01:00
"# Appliquer la fonction de prétraitement à la colonne de texte\n",
"df['texte_preprocessed'] = df['texte'].apply(preprocess_text)\n",
"\n",
"# Afficher le résultat\n",
"print(df)\n"
]
},
{
"cell_type": "markdown",
"id": "711d3884",
"metadata": {
"jp-MarkdownHeadingCollapsed": true
},
"source": [
"## Campaign area"
]
},
{
"cell_type": "code",
"execution_count": 52,
"id": "c25b5295",
"metadata": {},
"outputs": [],
"source": [
"# campaign_stats cleaning\n",
"# .copy() gives cleaning_date an independent frame to write into, rather than a slice of df1_campaign_stats\n",
"df1_campaign_stats_clean = df1_campaign_stats[[\"id\", \"campaign_id\", \"customer_id\", \"opened_at\", \"sent_at\", \"delivered_at\"]].copy()\n",
"cleaning_date(df1_campaign_stats_clean, 'opened_at')\n",
"cleaning_date(df1_campaign_stats_clean, 'sent_at')\n",
"cleaning_date(df1_campaign_stats_clean, 'delivered_at')\n",
"\n",
"# campaigns cleaning: every kept column gets the 'campaign_' prefix, so 'id' becomes the 'campaign_id' join key\n",
"df1_campaigns_clean = df1_campaigns[[\"id\", \"name\", \"service_id\", \"sent_at\"]].add_prefix(\"campaign_\")\n",
"cleaning_date(df1_campaigns_clean, 'campaign_sent_at')\n",
"\n",
"# Left merge keeps every campaign_stats row, then the join key is discarded\n",
"df1_campaigns_full = pd.merge(df1_campaign_stats_clean, df1_campaigns_clean, on = \"campaign_id\", how = \"left\")\n",
"df1_campaigns_full = df1_campaigns_full.drop(columns = ['campaign_id'])"
]
},
{
"cell_type": "code",
"execution_count": 53,
"id": "2a3de6a5",
"metadata": {},
"outputs": [],
"source": [
"df1_campaigns_full.info()"
]
},
{
"cell_type": "code",
"execution_count": 56,
"id": "3fc1f446",
"metadata": {},
"outputs": [],
2024-02-04 16:02:01 +01:00
"source": [
"df1_campaigns_information"
]
},
{
"cell_type": "markdown",
2024-02-10 22:46:56 +01:00
"id": "20e69ee3",
2024-02-04 16:02:01 +01:00
"metadata": {
"jp-MarkdownHeadingCollapsed": true
},
"source": [
"## Link area"
]
},
{
"cell_type": "code",
"execution_count": 37,
2024-02-10 22:46:56 +01:00
"id": "d9cbdbce",
2024-02-04 16:02:01 +01:00
"metadata": {},
2024-02-10 22:46:56 +01:00
"outputs": [],
2024-02-04 16:02:01 +01:00
"source": [
"df1_campaigns"
]
},
{
"cell_type": "code",
"execution_count": 38,
2024-02-10 22:46:56 +01:00
"id": "c07459f0",
2024-02-04 16:02:01 +01:00
"metadata": {},
2024-02-10 22:46:56 +01:00
"outputs": [],
2024-02-04 16:02:01 +01:00
"source": [
"df1_link_stats"
]
},
{
"cell_type": "markdown",
2024-02-10 22:46:56 +01:00
"id": "80ae4c42",
2024-02-10 13:23:44 +01:00
"metadata": {},
2024-02-04 16:02:01 +01:00
"source": [
"## Exploration variables"
]
},
{
"cell_type": "code",
2024-02-10 13:23:44 +01:00
"execution_count": 7,
2024-02-10 22:46:56 +01:00
"id": "b50b8f95",
2024-02-04 16:02:01 +01:00
"metadata": {},
"outputs": [],
"source": [
"# Fonction d'exploration pour suppliers.csv = label itr et commission inconnues\n",
"def suppliers_exploration(suppliers = None) : \n",
" \n",
" # Taux de NaN pour ces colonnes\n",
" label_na = suppliers['label'].isna().sum()/len(suppliers)*100\n",
" itr_na = suppliers['itr'].isna().sum()/len(suppliers)*100\n",
" commission_na = suppliers['commission'].isna().sum()/len(suppliers)*100\n",
"\n",
" suppliers_desc = pd.DataFrame({'nb_suppliers' : [suppliers['name'].nunique()],\n",
" 'label_na' : [label_na],\n",
" 'itr_na' : [itr_na],\n",
" 'commission_na' : [commission_na]})\n",
"\n",
" return suppliers_desc"
]
},
{
"cell_type": "code",
2024-02-10 13:23:44 +01:00
"execution_count": 8,
2024-02-10 22:46:56 +01:00
"id": "7e292935",
2024-02-04 16:02:01 +01:00
"metadata": {},
"outputs": [],
"source": [
"df1_suppliers_desc = suppliers_exploration(suppliers = df1_suppliers)"
]
},
{
"cell_type": "code",
2024-02-10 13:23:44 +01:00
"execution_count": 9,
2024-02-10 22:46:56 +01:00
"id": "05b6f2b0",
2024-02-04 16:02:01 +01:00
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>nb_suppliers</th>\n",
" <th>label_na</th>\n",
" <th>itr_na</th>\n",
" <th>commission_na</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>9</td>\n",
" <td>100.0</td>\n",
" <td>100.0</td>\n",
" <td>100.0</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" nb_suppliers label_na itr_na commission_na\n",
"0 9 100.0 100.0 100.0"
]
},
2024-02-10 13:23:44 +01:00
"execution_count": 9,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df1_suppliers_desc"
]
},
{
"cell_type": "code",
2024-02-10 13:23:44 +01:00
"execution_count": 10,
2024-02-10 22:46:56 +01:00
"id": "c9324d80",
"metadata": {},
"outputs": [],
"source": [
"BUCKET = \"bdc2324-data\"\n",
"\n",
"# Top-level entries of the bucket (one folder per company)\n",
"liste_folders = fs.ls(BUCKET)\n",
"\n",
"# Flatten the content of every company folder into a single list of paths\n",
"liste_files = [file_path for folder in liste_folders for file_path in fs.ls(folder)]"
]
},
{
"cell_type": "code",
2024-02-10 13:23:44 +01:00
"execution_count": 11,
2024-02-10 22:46:56 +01:00
"id": "10304058",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"['bdc2324-data/1/1suppliers.csv', 'bdc2324-data/10/10suppliers.csv', 'bdc2324-data/101/101suppliers.csv', 'bdc2324-data/11/11suppliers.csv', 'bdc2324-data/12/12suppliers.csv', 'bdc2324-data/13/13suppliers.csv', 'bdc2324-data/14/14suppliers.csv', 'bdc2324-data/2/2suppliers.csv', 'bdc2324-data/3/3suppliers.csv', 'bdc2324-data/4/4suppliers.csv', 'bdc2324-data/5/5suppliers.csv', 'bdc2324-data/6/6suppliers.csv', 'bdc2324-data/7/7suppliers.csv', 'bdc2324-data/8/8suppliers.csv', 'bdc2324-data/9/9suppliers.csv']\n"
]
}
],
"source": [
"liste_database_select = ['suppliers']\n",
"\n",
"# Keep the paths whose text contains at least one of the selected database names\n",
"liste_suppliers = []\n",
"for file_path in liste_files:\n",
"    for database_name in liste_database_select:\n",
"        if database_name in file_path:\n",
"            liste_suppliers.append(file_path)\n",
"            break\n",
"\n",
"# Show the result\n",
"print(liste_suppliers)"
]
},
{
"cell_type": "code",
2024-02-10 13:23:44 +01:00
"execution_count": 32,
2024-02-10 22:46:56 +01:00
"id": "ffa423e5",
"metadata": {},
"outputs": [],
"source": [
"def database_loading(database_name = None):\n",
"    \"\"\"\n",
"    Load one csv file of the bucket into a DataFrame.\n",
"\n",
"    database_name is the full path of the file on the filesystem object fs,\n",
"    e.g. 'bdc2324-data/1/1suppliers.csv'. The second component of that path\n",
"    is returned as the client number (a string, '1' in the example).\n",
"\n",
"    Returns the tuple (df, client_number).\n",
"    \"\"\"\n",
"    # Paths look like <bucket>/<client_number>/<file>.csv\n",
"    client_number = database_name.split(\"/\")[1]\n",
"\n",
"    with fs.open(database_name, mode=\"rb\") as file_in:\n",
"        df = pd.read_csv(file_in)\n",
"\n",
"    return df, client_number"
]
},
{
"cell_type": "code",
"execution_count": null,
2024-02-10 22:46:56 +01:00
"id": "70bdc88d",
2024-02-10 13:23:44 +01:00
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": 45,
2024-02-10 22:46:56 +01:00
"id": "6a0f567d",
2024-02-10 13:23:44 +01:00
"metadata": {},
"outputs": [],
"source": [
"# Load every suppliers file and tag its rows with the tenant it belongs to\n",
"supplier_frames = []\n",
"\n",
"for link in liste_suppliers:\n",
"\n",
"    df_supplier, tenant_id = database_loading(link)\n",
"\n",
"    df_supplier['tenant_id'] = int(tenant_id)\n",
"\n",
"    supplier_frames.append(df_supplier)\n",
"\n",
"# Concatenate once: calling pd.concat inside the loop copies the accumulated rows at every step\n",
"# (pd.concat raises on an empty list, hence the fallback to an empty frame)\n",
"df_all = pd.concat(supplier_frames, axis = 0) if supplier_frames else pd.DataFrame()"
]
2024-02-10 13:23:44 +01:00
},
{
"cell_type": "code",
"execution_count": 63,
2024-02-10 22:46:56 +01:00
"id": "1522d8cd",
2024-02-10 13:23:44 +01:00
"metadata": {},
"outputs": [],
"source": [
"# df_all[df_all['tenant_id'] == 101]['name'].unique()"
]
},
{
"cell_type": "code",
"execution_count": 66,
2024-02-10 22:46:56 +01:00
"id": "b0e42a61",
2024-02-10 13:23:44 +01:00
"metadata": {},
"outputs": [],
"source": [
"# Keywords that flag an online sales channel in a supplier name\n",
"# (vad = \"vente à distance\", i.e. distance selling)\n",
"liste_mots = ['en ligne', 'internet', 'web', 'net', 'vad', 'online']\n",
"\n",
"# Missing names become empty strings so that they never match\n",
"df_all['name'] = df_all['name'].fillna('')\n",
"\n",
"# 1 when the name contains any keyword (case-insensitive), 0 otherwise\n",
"motif_internet = '|'.join(liste_mots)\n",
"est_internet = df_all['name'].str.contains(motif_internet, case=False)\n",
"df_all['canal_vente_internet'] = est_internet.astype(int)"
]
},
{
"cell_type": "code",
"execution_count": 68,
2024-02-10 22:46:56 +01:00
"id": "d299ae91",
2024-02-10 13:23:44 +01:00
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"tenant_id\n",
"1 1\n",
"2 1\n",
"3 1\n",
"4 1\n",
"5 1\n",
"6 1\n",
"7 1\n",
"8 1\n",
"9 1\n",
"10 1\n",
"11 1\n",
"12 1\n",
"13 1\n",
"14 1\n",
"101 1\n",
"Name: canal_vente_internet, dtype: int64"
]
},
"execution_count": 68,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df_all.groupby('tenant_id')['canal_vente_internet'].max()"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
2024-02-19 23:11:28 +01:00
"version": "3.11.6"
}
},
"nbformat": 4,
"nbformat_minor": 5
}