BDC-team-1/Exploration_billet_AJ.ipynb

3362 lines
100 KiB
Plaintext
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

{
"cells": [
{
"cell_type": "markdown",
"id": "5bf5c226",
"metadata": {},
"source": [
"# Business Data Challenge - Team 1"
]
},
{
"cell_type": "code",
"execution_count": 1,
"id": "b1a5b9d3",
"metadata": {},
"outputs": [],
"source": [
"import pandas as pd\n",
"import numpy as np\n",
"import os\n",
"import s3fs\n",
"import re\n",
"import warnings"
]
},
{
"cell_type": "markdown",
"id": "ecfa2219",
"metadata": {},
"source": [
"Configuration de l'accès aux données"
]
},
{
"cell_type": "code",
"execution_count": 2,
"id": "1a094277",
"metadata": {},
"outputs": [],
"source": [
"# Build the S3 endpoint URL from the environment, then open a filesystem client on it\n",
"S3_ENDPOINT_URL = f\"https://{os.environ['AWS_S3_ENDPOINT']}\"\n",
"fs = s3fs.S3FileSystem(client_kwargs=dict(endpoint_url=S3_ENDPOINT_URL))"
]
},
{
"cell_type": "markdown",
"id": "0294ce71-840e-458b-8ffa-cadabbc6da21",
"metadata": {},
"source": [
"# Début travail 25/02"
]
},
{
"cell_type": "code",
"execution_count": 3,
"id": "30d77451-2df6-4c07-8b15-66e0e990ff03",
"metadata": {},
"outputs": [],
"source": [
"# `fs` and S3_ENDPOINT_URL are already created by the data-access configuration cell above,\n",
"# so they are not rebuilt here.\n",
"\n",
"# Import cleaning and merge functions into the notebook namespace.\n",
"# The helper file is read through a context manager so its handle is closed, and it is\n",
"# compiled with its file name so tracebacks point at 0_KPI_functions.py, not '<string>'.\n",
"with open('0_KPI_functions.py', encoding='utf-8') as kpi_file:\n",
"    exec(compile(kpi_file.read(), '0_KPI_functions.py', 'exec'))\n",
"\n",
"# NOTE(review): this silences every warning for the rest of the session, including\n",
"# pandas deprecation and dtype warnings - consider narrowing the filter.\n",
"warnings.filterwarnings('ignore')\n"
]
},
{
"cell_type": "code",
"execution_count": 4,
"id": "f1b44d3e-76bb-4860-b9db-a2840db7cf39",
"metadata": {},
"outputs": [],
"source": [
"def load_dataset_2(directory_path, file_name, bucket='bdc2324-data'):\n",
"    \"\"\"\n",
"    Load one CSV table of a company from S3 into a DataFrame.\n",
"\n",
"    The object read is `<bucket>/<directory_path>/<directory_path><file_name>.csv`;\n",
"    it is opened through the notebook-level `fs` filesystem object.\n",
"\n",
"    Parameters\n",
"    ----------\n",
"    directory_path : str or int\n",
"        Company folder name; it is also used as the prefix of the file name.\n",
"    file_name : str\n",
"        Table name, without the company prefix and without the `.csv` extension.\n",
"    bucket : str, optional\n",
"        Root bucket of the raw data, `bdc2324-data` by default.\n",
"\n",
"    Returns\n",
"    -------\n",
"    pandas.DataFrame\n",
"        Content of the file, without the `identifier` column when the file has one.\n",
"    \"\"\"\n",
"    # str() lets callers pass the company number as an int as well as a string\n",
"    directory = str(directory_path)\n",
"    file_path = f'{bucket}/{directory}/{directory}{file_name}.csv'\n",
"    with fs.open(file_path, mode='rb') as file_in:\n",
"        df = pd.read_csv(file_in, sep=',')\n",
"\n",
"    # errors='ignore' turns the drop into a no-op when the column is absent\n",
"    return df.drop(columns='identifier', errors='ignore')"
]
},
{
"cell_type": "code",
"execution_count": 7,
"id": "35da2e15-1e23-4653-a214-c6ff8f186e85",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"File path : projet-bdc2324-team1/0_Input/Company_4/customerplus_cleaned.csv\n"
]
},
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>customer_id</th>\n",
" <th>street_id</th>\n",
" <th>structure_id</th>\n",
" <th>mcp_contact_id</th>\n",
" <th>fidelity</th>\n",
" <th>tenant_id</th>\n",
" <th>is_partner</th>\n",
" <th>deleted_at</th>\n",
" <th>gender</th>\n",
" <th>is_email_true</th>\n",
" <th>...</th>\n",
" <th>max_price</th>\n",
" <th>ticket_sum</th>\n",
" <th>average_price</th>\n",
" <th>average_purchase_delay</th>\n",
" <th>average_price_basket</th>\n",
" <th>average_ticket_basket</th>\n",
" <th>total_price</th>\n",
" <th>purchase_count</th>\n",
" <th>first_buying_date</th>\n",
" <th>country</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>479734</td>\n",
" <td>3587</td>\n",
" <td>NaN</td>\n",
" <td>184801.0</td>\n",
" <td>0</td>\n",
" <td>1342</td>\n",
" <td>False</td>\n",
" <td>NaN</td>\n",
" <td>0</td>\n",
" <td>True</td>\n",
" <td>...</td>\n",
" <td>NaN</td>\n",
" <td>0</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>0.0</td>\n",
" <td>0</td>\n",
" <td>NaN</td>\n",
" <td>fr</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>1537</td>\n",
" <td>1352</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>0</td>\n",
" <td>1342</td>\n",
" <td>False</td>\n",
" <td>NaN</td>\n",
" <td>0</td>\n",
" <td>True</td>\n",
" <td>...</td>\n",
" <td>NaN</td>\n",
" <td>0</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>0.0</td>\n",
" <td>0</td>\n",
" <td>NaN</td>\n",
" <td>fr</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>504615</td>\n",
" <td>3587</td>\n",
" <td>NaN</td>\n",
" <td>152176.0</td>\n",
" <td>0</td>\n",
" <td>1342</td>\n",
" <td>False</td>\n",
" <td>NaN</td>\n",
" <td>0</td>\n",
" <td>True</td>\n",
" <td>...</td>\n",
" <td>NaN</td>\n",
" <td>0</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>0.0</td>\n",
" <td>0</td>\n",
" <td>NaN</td>\n",
" <td>fr</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>3832780</td>\n",
" <td>3587</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>0</td>\n",
" <td>1342</td>\n",
" <td>False</td>\n",
" <td>NaN</td>\n",
" <td>2</td>\n",
" <td>True</td>\n",
" <td>...</td>\n",
" <td>NaN</td>\n",
" <td>0</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>0.0</td>\n",
" <td>0</td>\n",
" <td>NaN</td>\n",
" <td>fr</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>3096540</td>\n",
" <td>3587</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>0</td>\n",
" <td>1342</td>\n",
" <td>False</td>\n",
" <td>NaN</td>\n",
" <td>2</td>\n",
" <td>True</td>\n",
" <td>...</td>\n",
" <td>NaN</td>\n",
" <td>0</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>0.0</td>\n",
" <td>0</td>\n",
" <td>NaN</td>\n",
" <td>fr</td>\n",
" </tr>\n",
" <tr>\n",
" <th>...</th>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>320804</th>\n",
" <td>2637745</td>\n",
" <td>406842</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>1</td>\n",
" <td>1342</td>\n",
" <td>False</td>\n",
" <td>NaN</td>\n",
" <td>0</td>\n",
" <td>True</td>\n",
" <td>...</td>\n",
" <td>0.0</td>\n",
" <td>2</td>\n",
" <td>0.000000</td>\n",
" <td>2.0</td>\n",
" <td>0.000000</td>\n",
" <td>2.000000</td>\n",
" <td>0.0</td>\n",
" <td>1</td>\n",
" <td>2021-12-08 20:30:11+00:00</td>\n",
" <td>fr</td>\n",
" </tr>\n",
" <tr>\n",
" <th>320805</th>\n",
" <td>23334</td>\n",
" <td>22677</td>\n",
" <td>NaN</td>\n",
" <td>185203.0</td>\n",
" <td>4</td>\n",
" <td>1342</td>\n",
" <td>False</td>\n",
" <td>NaN</td>\n",
" <td>0</td>\n",
" <td>True</td>\n",
" <td>...</td>\n",
" <td>13.0</td>\n",
" <td>13</td>\n",
" <td>11.692308</td>\n",
" <td>0.0</td>\n",
" <td>25.333333</td>\n",
" <td>2.166667</td>\n",
" <td>152.0</td>\n",
" <td>6</td>\n",
" <td>2018-05-02 07:47:40+00:00</td>\n",
" <td>fr</td>\n",
" </tr>\n",
" <tr>\n",
" <th>320806</th>\n",
" <td>2641373</td>\n",
" <td>408068</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>1</td>\n",
" <td>1342</td>\n",
" <td>False</td>\n",
" <td>NaN</td>\n",
" <td>0</td>\n",
" <td>True</td>\n",
" <td>...</td>\n",
" <td>12.0</td>\n",
" <td>4</td>\n",
" <td>12.000000</td>\n",
" <td>0.0</td>\n",
" <td>48.000000</td>\n",
" <td>4.000000</td>\n",
" <td>48.0</td>\n",
" <td>1</td>\n",
" <td>2021-12-09 11:46:23+00:00</td>\n",
" <td>fr</td>\n",
" </tr>\n",
" <tr>\n",
" <th>320807</th>\n",
" <td>2641469</td>\n",
" <td>408160</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>1</td>\n",
" <td>1342</td>\n",
" <td>False</td>\n",
" <td>NaN</td>\n",
" <td>0</td>\n",
" <td>True</td>\n",
" <td>...</td>\n",
" <td>12.0</td>\n",
" <td>1</td>\n",
" <td>12.000000</td>\n",
" <td>0.0</td>\n",
" <td>12.000000</td>\n",
" <td>1.000000</td>\n",
" <td>12.0</td>\n",
" <td>1</td>\n",
" <td>2021-12-09 18:50:55+00:00</td>\n",
" <td>fr</td>\n",
" </tr>\n",
" <tr>\n",
" <th>320808</th>\n",
" <td>2641474</td>\n",
" <td>408165</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>1</td>\n",
" <td>1342</td>\n",
" <td>False</td>\n",
" <td>NaN</td>\n",
" <td>0</td>\n",
" <td>True</td>\n",
" <td>...</td>\n",
" <td>12.0</td>\n",
" <td>1</td>\n",
" <td>12.000000</td>\n",
" <td>0.0</td>\n",
" <td>12.000000</td>\n",
" <td>1.000000</td>\n",
" <td>12.0</td>\n",
" <td>1</td>\n",
" <td>2021-12-09 19:02:42+00:00</td>\n",
" <td>fr</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"<p>320809 rows × 22 columns</p>\n",
"</div>"
],
"text/plain": [
" customer_id street_id structure_id mcp_contact_id fidelity \\\n",
"0 479734 3587 NaN 184801.0 0 \n",
"1 1537 1352 NaN NaN 0 \n",
"2 504615 3587 NaN 152176.0 0 \n",
"3 3832780 3587 NaN NaN 0 \n",
"4 3096540 3587 NaN NaN 0 \n",
"... ... ... ... ... ... \n",
"320804 2637745 406842 NaN NaN 1 \n",
"320805 23334 22677 NaN 185203.0 4 \n",
"320806 2641373 408068 NaN NaN 1 \n",
"320807 2641469 408160 NaN NaN 1 \n",
"320808 2641474 408165 NaN NaN 1 \n",
"\n",
" tenant_id is_partner deleted_at gender is_email_true ... \\\n",
"0 1342 False NaN 0 True ... \n",
"1 1342 False NaN 0 True ... \n",
"2 1342 False NaN 0 True ... \n",
"3 1342 False NaN 2 True ... \n",
"4 1342 False NaN 2 True ... \n",
"... ... ... ... ... ... ... \n",
"320804 1342 False NaN 0 True ... \n",
"320805 1342 False NaN 0 True ... \n",
"320806 1342 False NaN 0 True ... \n",
"320807 1342 False NaN 0 True ... \n",
"320808 1342 False NaN 0 True ... \n",
"\n",
" max_price ticket_sum average_price average_purchase_delay \\\n",
"0 NaN 0 NaN NaN \n",
"1 NaN 0 NaN NaN \n",
"2 NaN 0 NaN NaN \n",
"3 NaN 0 NaN NaN \n",
"4 NaN 0 NaN NaN \n",
"... ... ... ... ... \n",
"320804 0.0 2 0.000000 2.0 \n",
"320805 13.0 13 11.692308 0.0 \n",
"320806 12.0 4 12.000000 0.0 \n",
"320807 12.0 1 12.000000 0.0 \n",
"320808 12.0 1 12.000000 0.0 \n",
"\n",
" average_price_basket average_ticket_basket total_price \\\n",
"0 NaN NaN 0.0 \n",
"1 NaN NaN 0.0 \n",
"2 NaN NaN 0.0 \n",
"3 NaN NaN 0.0 \n",
"4 NaN NaN 0.0 \n",
"... ... ... ... \n",
"320804 0.000000 2.000000 0.0 \n",
"320805 25.333333 2.166667 152.0 \n",
"320806 48.000000 4.000000 48.0 \n",
"320807 12.000000 1.000000 12.0 \n",
"320808 12.000000 1.000000 12.0 \n",
"\n",
" purchase_count first_buying_date country \n",
"0 0 NaN fr \n",
"1 0 NaN fr \n",
"2 0 NaN fr \n",
"3 0 NaN fr \n",
"4 0 NaN fr \n",
"... ... ... ... \n",
"320804 1 2021-12-08 20:30:11+00:00 fr \n",
"320805 6 2018-05-02 07:47:40+00:00 fr \n",
"320806 1 2021-12-09 11:46:23+00:00 fr \n",
"320807 1 2021-12-09 18:50:55+00:00 fr \n",
"320808 1 2021-12-09 19:02:42+00:00 fr \n",
"\n",
"[320809 rows x 22 columns]"
]
},
"execution_count": 7,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"display_databases(\"4\", \"customerplus_cleaned\")"
]
},
{
"cell_type": "code",
"execution_count": 26,
"id": "6c8ad8c3-25df-4fe4-9ad0-ee5f9498bc14",
"metadata": {},
"outputs": [],
"source": [
"pd.reset_option('display.max_rows')"
]
},
{
"cell_type": "code",
"execution_count": 27,
"id": "c897916c",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>id</th>\n",
" <th>name</th>\n",
" <th>code</th>\n",
" <th>created_at</th>\n",
" <th>updated_at</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>101</td>\n",
" <td>hongrie</td>\n",
" <td>hu</td>\n",
" <td>2023-06-13 11:17:40.600622+02:00</td>\n",
" <td>2023-06-13 11:17:40.600622+02:00</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>2</td>\n",
" <td>albanie</td>\n",
" <td>al</td>\n",
" <td>2023-06-13 11:17:40.540652+02:00</td>\n",
" <td>2023-06-13 11:17:40.540652+02:00</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>3</td>\n",
" <td>antarctique</td>\n",
" <td>aq</td>\n",
" <td>2023-06-13 11:17:40.541315+02:00</td>\n",
" <td>2023-06-13 11:17:40.541315+02:00</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>12</td>\n",
" <td>autriche</td>\n",
" <td>at</td>\n",
" <td>2023-06-13 11:17:40.546711+02:00</td>\n",
" <td>2023-06-13 11:17:40.546711+02:00</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>5</td>\n",
" <td>samoa américaines</td>\n",
" <td>as</td>\n",
" <td>2023-06-13 11:17:40.542569+02:00</td>\n",
" <td>2023-06-13 11:17:40.542569+02:00</td>\n",
" </tr>\n",
" <tr>\n",
" <th>...</th>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>238</th>\n",
" <td>228</td>\n",
" <td>royaume-uni</td>\n",
" <td>gb</td>\n",
" <td>2023-06-13 11:17:40.678023+02:00</td>\n",
" <td>2023-06-13 11:17:40.678023+02:00</td>\n",
" </tr>\n",
" <tr>\n",
" <th>239</th>\n",
" <td>25</td>\n",
" <td>brésil</td>\n",
" <td>br</td>\n",
" <td>2023-06-13 11:17:40.554209+02:00</td>\n",
" <td>2023-06-13 11:17:40.554209+02:00</td>\n",
" </tr>\n",
" <tr>\n",
" <th>240</th>\n",
" <td>10</td>\n",
" <td>argentine</td>\n",
" <td>ar</td>\n",
" <td>2023-06-13 11:17:40.545489+02:00</td>\n",
" <td>2023-06-13 11:17:40.545489+02:00</td>\n",
" </tr>\n",
" <tr>\n",
" <th>241</th>\n",
" <td>203</td>\n",
" <td>espagne</td>\n",
" <td>es</td>\n",
" <td>2023-06-13 11:17:40.662472+02:00</td>\n",
" <td>2023-06-13 11:17:40.662472+02:00</td>\n",
" </tr>\n",
" <tr>\n",
" <th>242</th>\n",
" <td>192</td>\n",
" <td>arabie saoudite</td>\n",
" <td>sa</td>\n",
" <td>2023-06-13 11:17:40.656154+02:00</td>\n",
" <td>2023-06-13 11:17:40.656154+02:00</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"<p>243 rows × 5 columns</p>\n",
"</div>"
],
"text/plain": [
" id name code created_at \\\n",
"0 101 hongrie hu 2023-06-13 11:17:40.600622+02:00 \n",
"1 2 albanie al 2023-06-13 11:17:40.540652+02:00 \n",
"2 3 antarctique aq 2023-06-13 11:17:40.541315+02:00 \n",
"3 12 autriche at 2023-06-13 11:17:40.546711+02:00 \n",
"4 5 samoa américaines as 2023-06-13 11:17:40.542569+02:00 \n",
".. ... ... ... ... \n",
"238 228 royaume-uni gb 2023-06-13 11:17:40.678023+02:00 \n",
"239 25 brésil br 2023-06-13 11:17:40.554209+02:00 \n",
"240 10 argentine ar 2023-06-13 11:17:40.545489+02:00 \n",
"241 203 espagne es 2023-06-13 11:17:40.662472+02:00 \n",
"242 192 arabie saoudite sa 2023-06-13 11:17:40.656154+02:00 \n",
"\n",
" updated_at \n",
"0 2023-06-13 11:17:40.600622+02:00 \n",
"1 2023-06-13 11:17:40.540652+02:00 \n",
"2 2023-06-13 11:17:40.541315+02:00 \n",
"3 2023-06-13 11:17:40.546711+02:00 \n",
"4 2023-06-13 11:17:40.542569+02:00 \n",
".. ... \n",
"238 2023-06-13 11:17:40.678023+02:00 \n",
"239 2023-06-13 11:17:40.554209+02:00 \n",
"240 2023-06-13 11:17:40.545489+02:00 \n",
"241 2023-06-13 11:17:40.662472+02:00 \n",
"242 2023-06-13 11:17:40.656154+02:00 \n",
"\n",
"[243 rows x 5 columns]"
]
},
"execution_count": 27,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"\n",
"load_dataset_2(\"7\", \"countries\")"
]
},
{
"cell_type": "markdown",
"id": "ca2c8b6a-4965-422e-ba7c-66423a464fc1",
"metadata": {
"jp-MarkdownHeadingCollapsed": true
},
"source": [
"## Bases communes aux types Musée"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "f8f988fb-5aab-4b57-80d1-e242f7e5b384",
"metadata": {},
"outputs": [],
"source": [
"companies = {'musee' : ['1', '2', '3', '4', '101'],\n",
" 'sport': ['5', '6', '7', '8', '9'],\n",
" 'musique' : ['10', '11', '12', '13', '14']}"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "dbce1124-9a22-4502-a47a-fc3d0e2db70b",
"metadata": {},
"outputs": [],
"source": [
"companies['musee']"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "5080f66e-f779-410a-876d-b4fe2795e17e",
"metadata": {},
"outputs": [],
"source": [
"# List the tables available for every company, whatever its type, so that the\n",
"# intersection cells below find base_1 ... base_101 (musee) as well as\n",
"# base_10 ... base_14 (musique) after a fresh Restart and Run All.\n",
"motif_base = re.compile(r'\\/(\\d+)\\/(\\d+)([a-zA-Z_]+)\\.csv$')\n",
"bases_par_company = {}\n",
"for type_company, liste_companies in companies.items():\n",
"    for i in liste_companies:\n",
"        BUCKET = 'bdc2324-data/' + i\n",
"        liste_base = []\n",
"        for base in fs.ls(BUCKET):\n",
"            match = motif_base.search(base)\n",
"            if match:\n",
"                # group(3) is the table name, after the numeric company prefix\n",
"                nom_base = match.group(3)\n",
"                liste_base.append(nom_base)\n",
"        bases_par_company[i] = liste_base\n",
"        # Kept because the cells below read the lists through base_<company> globals\n",
"        globals()['base_' + i] = liste_base\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "abd477e1-7479-4c88-a5aa-f987af3f5b79",
"metadata": {},
"outputs": [],
"source": [
"# Trouver l'intersection entre les cinq listes\n",
"intersection = set(base_1).intersection(base_2, base_3, base_4, base_101)\n",
"\n",
"# Convertir le résultat en liste si nécessaire\n",
"intersection_liste = list(intersection)\n",
"\n",
"print(intersection_liste)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "8d93888f-a511-4ee5-8bc3-d5173a7f119e",
"metadata": {},
"outputs": [],
"source": [
"# Trouver l'intersection entre les cinq listes\n",
"intersection = set(base_10).intersection(base_12, base_13, base_14, base_11)\n",
"\n",
"# Convertir le résultat en liste si nécessaire\n",
"intersection_liste = list(intersection)\n",
"\n",
"print(intersection_liste)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "10e89669-42bb-4652-a4bc-1a3d1caf4d1a",
"metadata": {},
"outputs": [],
"source": [
"len(intersection_liste)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "7d058b21-a538-4f59-aefb-ef7966f73fdc",
"metadata": {},
"outputs": [],
"source": [
"df1_tags = load_dataset_2(\"1\", \"tags\")"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "aa441f99-733c-4675-8676-bed4682d3324",
"metadata": {},
"outputs": [],
"source": [
"df1_structure_tag_mappings = load_dataset_2(\"1\", 'structure_tag_mappings')"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "6767a750-14a4-4c05-903e-d2f07170825b",
"metadata": {},
"outputs": [],
"source": [
"df1_customersplus = load_dataset_2(\"1\", \"customersplus\")"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "125e9145-a815-46fd-bdf4-07589508b259",
"metadata": {},
"outputs": [],
"source": [
"df1_customersplus.groupby('structure_id')['id'].count().reset_index().sort_values('id', ascending=False).head(20)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "c17a6976-792f-474d-bcff-c89396eddb3f",
"metadata": {},
"outputs": [],
"source": [
"df1_customersplus['structure_id'].isna().sum() / len(df1_customersplus['structure_id'])"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "ecfc155a-cb42-46ec-8da5-33fdcd087355",
"metadata": {},
"outputs": [],
"source": [
"len(df1_structure_tag_mappings)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "071410b8-950d-4fcc-b2b9-57415253c286",
"metadata": {},
"outputs": [],
"source": [
"df1_structure_tag_mappings.groupby('tag_id')['structure_id'].count().reset_index().sort_values('structure_id', ascending=False).head(20)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "f48d27a9-14e4-4bb9-a60a-73e9438b58fc",
"metadata": {},
"outputs": [],
"source": [
"pd.DataFrame.sort_values?"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "14eaa0ea-02cc-430b-ab9b-38e6637810c3",
"metadata": {},
"outputs": [],
"source": [
"def info_colonnes_dataframe(df):\n",
"    \"\"\"\n",
"    Summarise the columns of a DataFrame.\n",
"\n",
"    Returns a new DataFrame with one row per column of `df` and three fields:\n",
"    'Nom_colonne' (column label), 'Type_colonne' (dtype as a string) and\n",
"    'Taux_NA' (percentage of missing values, between 0 and 100).\n",
"    \"\"\"\n",
"    # One record per column; isna().mean() is the share of missing values\n",
"    fiches = [\n",
"        {\n",
"            'Nom_colonne': etiquette,\n",
"            'Type_colonne': str(valeurs.dtype),\n",
"            'Taux_NA': valeurs.isna().mean() * 100,\n",
"        }\n",
"        for etiquette, valeurs in df.items()\n",
"    ]\n",
"\n",
"    return pd.DataFrame(fiches)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "6b031c32-d4c8-42a5-9a71-a7810f9bf8d8",
"metadata": {
"scrolled": true
},
"outputs": [],
"source": [
"info_colonnes_dataframe(df1_tags)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "e1a87f27-c4d4-4832-ac20-0c3c54aa4980",
"metadata": {},
"outputs": [],
"source": [
"info_colonnes_dataframe(df1_structure_tag_mappings)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "fa5c65a8-2f74-4f3f-85fc-9ac91e0bb361",
"metadata": {},
"outputs": [],
"source": [
"pd.set_option('display.max_colwidth', None)\n",
"\n",
"print(df1_tags['name'])"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "a59bf932-5b54-4600-81f5-c55ac93ae510",
"metadata": {},
"outputs": [],
"source": [
"pd.set_option('display.max_rows', None)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "a4ab298e-2cae-4865-9f00-4caff5f75ea1",
"metadata": {
"scrolled": true
},
"outputs": [],
"source": [
"print(df1_tags['name'])"
]
},
{
"cell_type": "markdown",
"id": "76bffba1-5f7e-4308-9224-437ca66148f8",
"metadata": {
"jp-MarkdownHeadingCollapsed": true
},
"source": [
"## KPI sur target_type"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "d91d5895",
"metadata": {},
"outputs": [],
"source": [
"pd.set_option('display.max_colwidth', None)\n"
]
},
{
"cell_type": "markdown",
"id": "c58b17d3",
"metadata": {},
"source": [
"Raisonnement : on prend les target_type qui représentent 90 % des clients et on en fait des catégories."
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "d74426b3",
"metadata": {},
"outputs": [],
"source": [
"targets = load_dataset_2(\"3\", \"targets\")\n",
"target_types = load_dataset_2(\"3\", \"target_types\")\n",
"\n",
"# target_all = pd.merge(targets, target_types, left_on= 'target_type_id', right_on= 'id' ,how = 'inner')\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "6930bff5",
"metadata": {},
"outputs": [],
"source": [
"def print_main_target(tenant_id, nb_print = 40):\n",
"    \"\"\"\n",
"    Print targeting volumes for a company and return its most used targets.\n",
"\n",
"    The data comes from display_databases(tenant_id, 'target_information'), a helper\n",
"    defined outside this cell (presumably in 0_KPI_functions.py - to confirm). The\n",
"    code reads its 'customer_id' and 'target_name' columns.\n",
"\n",
"    Returns the first `nb_print` rows of a table sorted by decreasing size, with:\n",
"    - target_name : name of the target,\n",
"    - customer_id : rows of the target divided by the number of distinct customers,\n",
"    - cumulative_customers : running total of rows divided by the total row count.\n",
"    \"\"\"\n",
"    target_info = display_databases(tenant_id, \"target_information\")\n",
"    total_rows = len(target_info)\n",
"\n",
"    print('Nombre de ciblage : ', total_rows)\n",
"    nb_customers = target_info['customer_id'].nunique()\n",
"    print('Nombre de client avec étiquette target : ', nb_customers)\n",
"\n",
"    # Rows per target, largest first\n",
"    per_target = (\n",
"        target_info.groupby(\"target_name\")['customer_id']\n",
"        .count()\n",
"        .reset_index()\n",
"        .sort_values('customer_id', ascending=False)\n",
"    )\n",
"    # The cumulative share must be computed before the counts are turned into ratios\n",
"    per_target['cumulative_customers'] = per_target['customer_id'].cumsum() / total_rows\n",
"    per_target['customer_id'] = per_target['customer_id'] / nb_customers\n",
"\n",
"    return per_target.head(nb_print)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "1e7ee1a0",
"metadata": {},
"outputs": [],
"source": [
"print_main_target('1')"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "b57a28ac",
"metadata": {},
"outputs": [],
"source": [
"print_main_target('2', 25)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "9a65991f",
"metadata": {},
"outputs": [],
"source": [
"print_main_target('3', 40)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "c66a4dc1",
"metadata": {},
"outputs": [],
"source": [
"pd.set_option('display.max_rows', None)\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "5f34b8bf",
"metadata": {},
"outputs": [],
"source": [
"print_main_target('4', 80)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "40fe3676",
"metadata": {},
"outputs": [],
"source": [
"print_main_target('101', 100)"
]
},
{
"cell_type": "markdown",
"id": "605cced5-052f-4a99-ac26-020c5d2ab633",
"metadata": {},
"source": [
"## KPI sur tags"
]
},
{
"cell_type": "code",
"execution_count": 28,
"id": "916c3e2b-04d3-4877-b894-8f26f10d926e",
"metadata": {},
"outputs": [],
"source": [
"customersplus = load_dataset_2(\"4\", \"customersplus\")[['id', 'structure_id']]"
]
},
{
"cell_type": "code",
"execution_count": 35,
"id": "46847b24-15a4-464e-969f-f16ed3653f1f",
"metadata": {},
"outputs": [],
"source": [
"structure_tag_mappings = load_dataset_2('4', \"structure_tag_mappings\")[['structure_id', 'tag_id']]"
]
},
{
"cell_type": "code",
"execution_count": 34,
"id": "3c10c69d-735f-453e-96bf-750697d965d0",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"19427"
]
},
"execution_count": 34,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"customersplus[customersplus['structure_id'].notna()]['structure_id'].nunique()"
]
},
{
"cell_type": "code",
"execution_count": 32,
"id": "9b0e77b3-5f16-4484-9564-7d3826583418",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"33645"
]
},
"execution_count": 32,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"len(customersplus[customersplus['structure_id'].notna()])"
]
},
{
"cell_type": "code",
"execution_count": 36,
"id": "dfa27722-37f9-435a-8221-8aa6f9a4a107",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"3431"
]
},
"execution_count": 36,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"structure_tag_mappings['structure_id'].nunique()"
]
},
{
"cell_type": "code",
"execution_count": 26,
"id": "2daabdd5-31e3-4918-9856-9bbc30cde602",
"metadata": {},
"outputs": [],
"source": [
"def tags_information(tenant_id, first_tags):\n",
"    \"\"\"\n",
"    Print tag coverage statistics for a company and return its most frequent tags.\n",
"\n",
"    Customers are linked to tags through their structure:\n",
"    customersplus.structure_id -> structure_tag_mappings.structure_id -> tags.id.\n",
"\n",
"    Parameters\n",
"    ----------\n",
"    tenant_id : str\n",
"        Company folder, passed as is to load_dataset_2.\n",
"    first_tags : int\n",
"        Number of tags kept in the returned table.\n",
"\n",
"    Returns\n",
"    -------\n",
"    pandas.DataFrame\n",
"        Columns tag_id, tag_name and customer_id, where customer_id holds the number\n",
"        of customer rows carrying the tag, sorted in decreasing order. Tags whose\n",
"        name is missing are kept, with a NaN tag_name.\n",
"    \"\"\"\n",
"    customersplus = load_dataset_2(tenant_id, 'customersplus')[['id', 'structure_id']]\n",
"    customersplus = customersplus.rename(columns={'id': 'customer_id'})\n",
"    tags = load_dataset_2(tenant_id, 'tags')[['id', 'name']]\n",
"    tags = tags.rename(columns={'id': 'tag_id', 'name': 'tag_name'})\n",
"    structure_tag_mappings = load_dataset_2(tenant_id, 'structure_tag_mappings')[['structure_id', 'tag_id']]\n",
"\n",
"    # Inner joins only: customers without any tag were discarded by the second join\n",
"    # anyway, and skipping the left join keeps tag_id from being upcast to float.\n",
"    customer_tags = pd.merge(customersplus, structure_tag_mappings, on='structure_id', how='inner')\n",
"    customer_tags = pd.merge(customer_tags, tags, on='tag_id', how='inner')\n",
"\n",
"    nb_customers = len(customersplus)\n",
"    nb_customers_with_tag = customer_tags['customer_id'].nunique()\n",
"    # Guard the ratios so a company without customers or without tags does not raise\n",
"    proportion_with_tag = nb_customers_with_tag / nb_customers if nb_customers else float('nan')\n",
"    mean_tags = len(customer_tags) / nb_customers_with_tag if nb_customers_with_tag else float('nan')\n",
"\n",
"    print('Nombre de client avec tag : ', nb_customers_with_tag)\n",
"    print('Proportion de clients avec tags : ', proportion_with_tag)\n",
"    print('Moyenne de tags par client : ', mean_tags)\n",
"\n",
"    # dropna=False: without it, tags whose name is NaN vanish from the table even\n",
"    # though their customers are counted in the statistics printed above.\n",
"    info = (\n",
"        customer_tags.groupby(['tag_id', 'tag_name'], dropna=False)['customer_id']\n",
"        .count()\n",
"        .reset_index()\n",
"        .sort_values('customer_id', ascending=False)\n",
"        .head(first_tags)\n",
"    )\n",
"\n",
"    return info"
]
},
{
"cell_type": "code",
"execution_count": 37,
"id": "0b9f5f71-a927-4cc8-bb0c-9538e28d3553",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Nombre de client avec tag : 13320\n",
"Proportion de clients avec tags : 0.0877089012682233\n",
"Moyenne de tags par client : 2.1725975975975977\n"
]
},
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>tag_id</th>\n",
" <th>tag_name</th>\n",
" <th>customer_id</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>11029.0</td>\n",
" <td>individuels</td>\n",
" <td>3270</td>\n",
" </tr>\n",
" <tr>\n",
" <th>18</th>\n",
" <td>11047.0</td>\n",
" <td>groupes scolaires</td>\n",
" <td>2417</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>11033.0</td>\n",
" <td>association</td>\n",
" <td>2308</td>\n",
" </tr>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>11028.0</td>\n",
" <td>structures culturelles</td>\n",
" <td>2011</td>\n",
" </tr>\n",
" <tr>\n",
" <th>22</th>\n",
" <td>11051.0</td>\n",
" <td>etablissement ens scolaire</td>\n",
" <td>1732</td>\n",
" </tr>\n",
" <tr>\n",
" <th>7</th>\n",
" <td>11036.0</td>\n",
" <td>champ social</td>\n",
" <td>1603</td>\n",
" </tr>\n",
" <tr>\n",
" <th>43</th>\n",
" <td>11072.0</td>\n",
" <td>etab d'enseignement</td>\n",
" <td>1036</td>\n",
" </tr>\n",
" <tr>\n",
" <th>14</th>\n",
" <td>11043.0</td>\n",
" <td>etablissement public</td>\n",
" <td>935</td>\n",
" </tr>\n",
" <tr>\n",
" <th>6</th>\n",
" <td>11035.0</td>\n",
" <td>organisme de tourisme</td>\n",
" <td>892</td>\n",
" </tr>\n",
" <tr>\n",
" <th>16</th>\n",
" <td>11045.0</td>\n",
" <td>centre de loisirs</td>\n",
" <td>864</td>\n",
" </tr>\n",
" <tr>\n",
" <th>44</th>\n",
" <td>11073.0</td>\n",
" <td>musée, site &amp; fondation</td>\n",
" <td>786</td>\n",
" </tr>\n",
" <tr>\n",
" <th>24</th>\n",
" <td>11053.0</td>\n",
" <td>groupes etudiants</td>\n",
" <td>758</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>11032.0</td>\n",
" <td>entreprise</td>\n",
" <td>750</td>\n",
" </tr>\n",
" <tr>\n",
" <th>10</th>\n",
" <td>11039.0</td>\n",
" <td>etablissement d'enseignement</td>\n",
" <td>741</td>\n",
" </tr>\n",
" <tr>\n",
" <th>5</th>\n",
" <td>11034.0</td>\n",
" <td>asso. culturelle</td>\n",
" <td>692</td>\n",
" </tr>\n",
" <tr>\n",
" <th>15</th>\n",
" <td>11044.0</td>\n",
" <td>administration et collectivité</td>\n",
" <td>676</td>\n",
" </tr>\n",
" <tr>\n",
" <th>17</th>\n",
" <td>11046.0</td>\n",
" <td>tour opérateur</td>\n",
" <td>642</td>\n",
" </tr>\n",
" <tr>\n",
" <th>19</th>\n",
" <td>11048.0</td>\n",
" <td>entreprises</td>\n",
" <td>515</td>\n",
" </tr>\n",
" <tr>\n",
" <th>72</th>\n",
" <td>11619.0</td>\n",
" <td>structures culturelles;musée, site &amp; fondation</td>\n",
" <td>427</td>\n",
" </tr>\n",
" <tr>\n",
" <th>8</th>\n",
" <td>11037.0</td>\n",
" <td>handicap</td>\n",
" <td>426</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" tag_id tag_name customer_id\n",
"1 11029.0 individuels 3270\n",
"18 11047.0 groupes scolaires 2417\n",
"4 11033.0 association 2308\n",
"0 11028.0 structures culturelles 2011\n",
"22 11051.0 etablissement ens scolaire 1732\n",
"7 11036.0 champ social 1603\n",
"43 11072.0 etab d'enseignement 1036\n",
"14 11043.0 etablissement public 935\n",
"6 11035.0 organisme de tourisme 892\n",
"16 11045.0 centre de loisirs 864\n",
"44 11073.0 musée, site & fondation 786\n",
"24 11053.0 groupes etudiants 758\n",
"3 11032.0 entreprise 750\n",
"10 11039.0 etablissement d'enseignement 741\n",
"5 11034.0 asso. culturelle 692\n",
"15 11044.0 administration et collectivité 676\n",
"17 11046.0 tour opérateur 642\n",
"19 11048.0 entreprises 515\n",
"72 11619.0 structures culturelles;musée, site & fondation 427\n",
"8 11037.0 handicap 426"
]
},
"execution_count": 37,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"tags_information(\"1\", 20)"
]
},
{
"cell_type": "code",
"execution_count": 43,
"id": "bd5bef41-1774-4601-86b5-b7c1aea8f1d2",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Nombre de client avec tag : 5953\n",
"Proportion de clients avec tags : 0.021598421025897787\n",
"Moyenne de tags par client : 1.0\n"
]
},
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>tag_id</th>\n",
" <th>tag_name</th>\n",
" <th>customer_id</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>1.0</td>\n",
" <td>training-sb-ax</td>\n",
" <td>5</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" tag_id tag_name customer_id\n",
"0 1.0 training-sb-ax 5"
]
},
"execution_count": 43,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"tags_information(\"2\", 20)"
]
},
{
"cell_type": "code",
"execution_count": 39,
"id": "7c2dc3e6-1418-44db-a8c0-4a9d59ec5232",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>id</th>\n",
" <th>name</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>1</td>\n",
" <td>training-sb-ax</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>2</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" id name\n",
"0 1 training-sb-ax\n",
"1 2 NaN"
]
},
"execution_count": 39,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"load_dataset_2(\"2\", \"tags\")[['id', 'name']]"
]
},
{
"cell_type": "code",
"execution_count": 42,
"id": "c7b2c670-7122-4f67-b1aa-8c80a10f16d8",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Nombre de client avec tag : 23659\n",
"Proportion de clients avec tags : 0.09207484608139978\n",
"Moyenne de tags par client : 3.0620482691576143\n"
]
},
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>tag_id</th>\n",
" <th>tag_name</th>\n",
" <th>customer_id</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>164</th>\n",
" <td>44539.0</td>\n",
" <td>*individuel/particulier</td>\n",
" <td>13148</td>\n",
" </tr>\n",
" <tr>\n",
" <th>30</th>\n",
" <td>26926.0</td>\n",
" <td>ce</td>\n",
" <td>3216</td>\n",
" </tr>\n",
" <tr>\n",
" <th>14</th>\n",
" <td>6995.0</td>\n",
" <td>college</td>\n",
" <td>2126</td>\n",
" </tr>\n",
" <tr>\n",
" <th>16</th>\n",
" <td>7028.0</td>\n",
" <td>lycee</td>\n",
" <td>1577</td>\n",
" </tr>\n",
" <tr>\n",
" <th>154</th>\n",
" <td>44524.0</td>\n",
" <td>iraiser</td>\n",
" <td>1453</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>6714.0</td>\n",
" <td>ecole primaire</td>\n",
" <td>1200</td>\n",
" </tr>\n",
" <tr>\n",
" <th>155</th>\n",
" <td>44525.0</td>\n",
" <td>bp</td>\n",
" <td>1094</td>\n",
" </tr>\n",
" <tr>\n",
" <th>15</th>\n",
" <td>7024.0</td>\n",
" <td>centre de loisirs</td>\n",
" <td>1080</td>\n",
" </tr>\n",
" <tr>\n",
" <th>153</th>\n",
" <td>44515.0</td>\n",
" <td>entreprise</td>\n",
" <td>998</td>\n",
" </tr>\n",
" <tr>\n",
" <th>126</th>\n",
" <td>44039.0</td>\n",
" <td>ca fondation d'aumale</td>\n",
" <td>891</td>\n",
" </tr>\n",
" <tr>\n",
" <th>152</th>\n",
" <td>44514.0</td>\n",
" <td>particulier</td>\n",
" <td>838</td>\n",
" </tr>\n",
" <tr>\n",
" <th>36</th>\n",
" <td>43663.0</td>\n",
" <td>président</td>\n",
" <td>816</td>\n",
" </tr>\n",
" <tr>\n",
" <th>76</th>\n",
" <td>43703.0</td>\n",
" <td>directeur</td>\n",
" <td>812</td>\n",
" </tr>\n",
" <tr>\n",
" <th>158</th>\n",
" <td>44528.0</td>\n",
" <td>dc</td>\n",
" <td>807</td>\n",
" </tr>\n",
" <tr>\n",
" <th>54</th>\n",
" <td>43681.0</td>\n",
" <td>présidente</td>\n",
" <td>805</td>\n",
" </tr>\n",
" <tr>\n",
" <th>149</th>\n",
" <td>44511.0</td>\n",
" <td>entreprise (financier)</td>\n",
" <td>805</td>\n",
" </tr>\n",
" <tr>\n",
" <th>90</th>\n",
" <td>43718.0</td>\n",
" <td>conseillère régionale déléguée titulaire</td>\n",
" <td>804</td>\n",
" </tr>\n",
" <tr>\n",
" <th>40</th>\n",
" <td>43667.0</td>\n",
" <td>directeur de l'agence</td>\n",
" <td>801</td>\n",
" </tr>\n",
" <tr>\n",
" <th>78</th>\n",
" <td>43705.0</td>\n",
" <td>sous-préfet</td>\n",
" <td>798</td>\n",
" </tr>\n",
" <tr>\n",
" <th>100</th>\n",
" <td>43728.0</td>\n",
" <td>chargée de mission paysage</td>\n",
" <td>797</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" tag_id tag_name customer_id\n",
"164 44539.0 *individuel/particulier 13148\n",
"30 26926.0 ce 3216\n",
"14 6995.0 college 2126\n",
"16 7028.0 lycee 1577\n",
"154 44524.0 iraiser 1453\n",
"2 6714.0 ecole primaire 1200\n",
"155 44525.0 bp 1094\n",
"15 7024.0 centre de loisirs 1080\n",
"153 44515.0 entreprise 998\n",
"126 44039.0 ca fondation d'aumale 891\n",
"152 44514.0 particulier 838\n",
"36 43663.0 président 816\n",
"76 43703.0 directeur 812\n",
"158 44528.0 dc 807\n",
"54 43681.0 présidente 805\n",
"149 44511.0 entreprise (financier) 805\n",
"90 43718.0 conseillère régionale déléguée titulaire 804\n",
"40 43667.0 directeur de l'agence 801\n",
"78 43705.0 sous-préfet 798\n",
"100 43728.0 chargée de mission paysage 797"
]
},
"execution_count": 42,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"tags_information(\"3\", 20)"
]
},
{
"cell_type": "code",
"execution_count": 40,
"id": "76639995-252d-4a58-83d8-c0c00900c3a9",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Nombre de client avec tag : 10495\n",
"Proportion de clients avec tags : 0.03271416949025744\n",
"Moyenne de tags par client : 5.298427822772749\n"
]
},
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>tag_id</th>\n",
" <th>tag_name</th>\n",
" <th>customer_id</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>147</th>\n",
" <td>298.0</td>\n",
" <td>jhima</td>\n",
" <td>4219</td>\n",
" </tr>\n",
" <tr>\n",
" <th>146</th>\n",
" <td>297.0</td>\n",
" <td>colloque algérie</td>\n",
" <td>3851</td>\n",
" </tr>\n",
" <tr>\n",
" <th>142</th>\n",
" <td>292.0</td>\n",
" <td>i&amp;ma</td>\n",
" <td>3826</td>\n",
" </tr>\n",
" <tr>\n",
" <th>154</th>\n",
" <td>305.0</td>\n",
" <td>mardis de la philo</td>\n",
" <td>3674</td>\n",
" </tr>\n",
" <tr>\n",
" <th>150</th>\n",
" <td>301.0</td>\n",
" <td>le grand continant</td>\n",
" <td>3670</td>\n",
" </tr>\n",
" <tr>\n",
" <th>144</th>\n",
" <td>295.0</td>\n",
" <td>araborama</td>\n",
" <td>3669</td>\n",
" </tr>\n",
" <tr>\n",
" <th>155</th>\n",
" <td>306.0</td>\n",
" <td>marie descourtieux</td>\n",
" <td>3669</td>\n",
" </tr>\n",
" <tr>\n",
" <th>145</th>\n",
" <td>296.0</td>\n",
" <td>c'était la guerre d'algérie</td>\n",
" <td>3669</td>\n",
" </tr>\n",
" <tr>\n",
" <th>141</th>\n",
" <td>291.0</td>\n",
" <td>araborama 3</td>\n",
" <td>3669</td>\n",
" </tr>\n",
" <tr>\n",
" <th>102</th>\n",
" <td>198.0</td>\n",
" <td>association de collectivités territoriales spé...</td>\n",
" <td>3669</td>\n",
" </tr>\n",
" <tr>\n",
" <th>143</th>\n",
" <td>294.0</td>\n",
" <td>arabofolies</td>\n",
" <td>3669</td>\n",
" </tr>\n",
" <tr>\n",
" <th>103</th>\n",
" <td>199.0</td>\n",
" <td>rassemble les 11 000 élus de toute la france a...</td>\n",
" <td>3669</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>50.0</td>\n",
" <td>association</td>\n",
" <td>463</td>\n",
" </tr>\n",
" <tr>\n",
" <th>6</th>\n",
" <td>54.0</td>\n",
" <td>collège</td>\n",
" <td>446</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>49.0</td>\n",
" <td>ecole</td>\n",
" <td>374</td>\n",
" </tr>\n",
" <tr>\n",
" <th>7</th>\n",
" <td>55.0</td>\n",
" <td>lycée</td>\n",
" <td>275</td>\n",
" </tr>\n",
" <tr>\n",
" <th>5</th>\n",
" <td>53.0</td>\n",
" <td>centre social</td>\n",
" <td>200</td>\n",
" </tr>\n",
" <tr>\n",
" <th>53</th>\n",
" <td>130.0</td>\n",
" <td>cultures et arts</td>\n",
" <td>141</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>51.0</td>\n",
" <td>mairie</td>\n",
" <td>136</td>\n",
" </tr>\n",
" <tr>\n",
" <th>13</th>\n",
" <td>64.0</td>\n",
" <td>formation_ima_ax</td>\n",
" <td>87</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" tag_id tag_name customer_id\n",
"147 298.0 jhima 4219\n",
"146 297.0 colloque algérie 3851\n",
"142 292.0 i&ma 3826\n",
"154 305.0 mardis de la philo 3674\n",
"150 301.0 le grand continant 3670\n",
"144 295.0 araborama 3669\n",
"155 306.0 marie descourtieux 3669\n",
"145 296.0 c'était la guerre d'algérie 3669\n",
"141 291.0 araborama 3 3669\n",
"102 198.0 association de collectivités territoriales spé... 3669\n",
"143 294.0 arabofolies 3669\n",
"103 199.0 rassemble les 11 000 élus de toute la france a... 3669\n",
"2 50.0 association 463\n",
"6 54.0 collège 446\n",
"1 49.0 ecole 374\n",
"7 55.0 lycée 275\n",
"5 53.0 centre social 200\n",
"53 130.0 cultures et arts 141\n",
"3 51.0 mairie 136\n",
"13 64.0 formation_ima_ax 87"
]
},
"execution_count": 40,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"tags_information(\"4\", 20)"
]
},
{
"cell_type": "code",
"execution_count": 41,
"id": "07e91791-d4d4-42b1-ac18-22d3b0b9f7bd",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Nombre de client avec tag : 532342\n",
"Proportion de clients avec tags : 0.18660686931118298\n",
"Moyenne de tags par client : 24.114082676174338\n"
]
},
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>tag_id</th>\n",
" <th>tag_name</th>\n",
" <th>customer_id</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>20</th>\n",
" <td>349.0</td>\n",
" <td>clients internet</td>\n",
" <td>517491</td>\n",
" </tr>\n",
" <tr>\n",
" <th>24</th>\n",
" <td>356.0</td>\n",
" <td>associations / clubs</td>\n",
" <td>495520</td>\n",
" </tr>\n",
" <tr>\n",
" <th>5</th>\n",
" <td>10.0</td>\n",
" <td>agence de voyages</td>\n",
" <td>493774</td>\n",
" </tr>\n",
" <tr>\n",
" <th>32</th>\n",
" <td>410.0</td>\n",
" <td>guides conférenciers</td>\n",
" <td>493378</td>\n",
" </tr>\n",
" <tr>\n",
" <th>26</th>\n",
" <td>360.0</td>\n",
" <td>groupe amis ou famille</td>\n",
" <td>493021</td>\n",
" </tr>\n",
" <tr>\n",
" <th>23</th>\n",
" <td>354.0</td>\n",
" <td>ce / entreprises</td>\n",
" <td>493016</td>\n",
" </tr>\n",
" <tr>\n",
" <th>8</th>\n",
" <td>17.0</td>\n",
" <td>association/club</td>\n",
" <td>493008</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>3.0</td>\n",
" <td>c.e. / entreprise</td>\n",
" <td>492656</td>\n",
" </tr>\n",
" <tr>\n",
" <th>6</th>\n",
" <td>11.0</td>\n",
" <td>college</td>\n",
" <td>492552</td>\n",
" </tr>\n",
" <tr>\n",
" <th>13</th>\n",
" <td>69.0</td>\n",
" <td>tour operator</td>\n",
" <td>492549</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>9.0</td>\n",
" <td>ecole primaire</td>\n",
" <td>492540</td>\n",
" </tr>\n",
" <tr>\n",
" <th>31</th>\n",
" <td>379.0</td>\n",
" <td>parent goûter anniversaire</td>\n",
" <td>492468</td>\n",
" </tr>\n",
" <tr>\n",
" <th>30</th>\n",
" <td>364.0</td>\n",
" <td>institutions</td>\n",
" <td>492364</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>6.0</td>\n",
" <td>institution</td>\n",
" <td>492321</td>\n",
" </tr>\n",
" <tr>\n",
" <th>18</th>\n",
" <td>186.0</td>\n",
" <td>autocaristes</td>\n",
" <td>492153</td>\n",
" </tr>\n",
" <tr>\n",
" <th>7</th>\n",
" <td>13.0</td>\n",
" <td>enseignement superieur</td>\n",
" <td>492131</td>\n",
" </tr>\n",
" <tr>\n",
" <th>25</th>\n",
" <td>359.0</td>\n",
" <td>hotels / campings</td>\n",
" <td>492078</td>\n",
" </tr>\n",
" <tr>\n",
" <th>42</th>\n",
" <td>7186.0</td>\n",
" <td>individuel</td>\n",
" <td>491913</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>7.0</td>\n",
" <td>groupe amis / famille</td>\n",
" <td>491900</td>\n",
" </tr>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>2.0</td>\n",
" <td>client internet</td>\n",
" <td>491896</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" tag_id tag_name customer_id\n",
"20 349.0 clients internet 517491\n",
"24 356.0 associations / clubs 495520\n",
"5 10.0 agence de voyages 493774\n",
"32 410.0 guides conférenciers 493378\n",
"26 360.0 groupe amis ou famille 493021\n",
"23 354.0 ce / entreprises 493016\n",
"8 17.0 association/club 493008\n",
"1 3.0 c.e. / entreprise 492656\n",
"6 11.0 college 492552\n",
"13 69.0 tour operator 492549\n",
"4 9.0 ecole primaire 492540\n",
"31 379.0 parent goûter anniversaire 492468\n",
"30 364.0 institutions 492364\n",
"2 6.0 institution 492321\n",
"18 186.0 autocaristes 492153\n",
"7 13.0 enseignement superieur 492131\n",
"25 359.0 hotels / campings 492078\n",
"42 7186.0 individuel 491913\n",
"3 7.0 groupe amis / famille 491900\n",
"0 2.0 client internet 491896"
]
},
"execution_count": 41,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"tags_information(\"101\", 20)"
]
},
{
"cell_type": "markdown",
"id": "1ede9eaa-7f0a-4856-9349-b2747d6a4901",
"metadata": {},
"source": [
"# Fin travail 25/02"
]
},
{
"cell_type": "markdown",
"id": "c437eaec",
"metadata": {
"jp-MarkdownHeadingCollapsed": true
},
"source": [
"# Exemple sur Company 1"
]
},
{
"cell_type": "markdown",
"id": "a1c1fc39",
"metadata": {},
"source": [
"## Chargement données"
]
},
{
"cell_type": "code",
"execution_count": 3,
"id": "66f8c17b",
"metadata": {},
"outputs": [],
"source": [
"BUCKET = \"bdc2324-data/1\"\n",
"liste_database = fs.ls(BUCKET)"
]
},
{
"cell_type": "code",
"execution_count": 5,
"id": "c08e6798",
"metadata": {},
"outputs": [],
"source": [
"liste_database_select = ['suppliers', 'ticket', 'purchase', 'consumption', 'type_ofs']\n",
"\n",
"# Filtrer la liste pour les éléments contenant au moins un élément de la liste à tester\n",
"liste_database_filtered = [element for element in liste_database if any(element_part in element for element_part in liste_database_select)]\n",
"\n",
"# Afficher le résultat\n",
"print(liste_database_filtered)"
]
},
{
"cell_type": "code",
"execution_count": 6,
"id": "675f518d",
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"/tmp/ipykernel_15285/4081512283.py:10: DtypeWarning: Columns (1) have mixed types. Specify dtype option on import or set low_memory=False.\n",
"  df = pd.read_csv(file_in)\n"
]
}
],
"source": [
"# Load every CSV listed for the client into a global DataFrame whose name is the prefix below plus the table name.\n",
"files_path = liste_database\n",
"\n",
"# The client number is taken from the second component of the first listed path.\n",
"client_number = files_path[0].split(\"/\")[1]\n",
"df_prefix = \"df\" + str(client_number) + \"_\"\n",
"\n",
"# Compiled once; group(3) is the alphabetic table name that follows the numeric prefix of the file name.\n",
"table_name_pattern = re.compile(r'\\/(\\d+)\\/(\\d+)([a-zA-Z_]+)\\.csv$')\n",
"\n",
"for current_path in files_path:\n",
"    name_match = table_name_pattern.search(current_path)\n",
"    if name_match is None:\n",
"        # The path does not look like a client CSV: skip it instead of failing on .group() of None.\n",
"        print(f\"Skipping unexpected path: {current_path}\")\n",
"        continue\n",
"    with fs.open(current_path, mode=\"rb\") as file_in:\n",
"        # low_memory=False parses the file in one pass, which avoids the mixed-types DtypeWarning.\n",
"        df = pd.read_csv(file_in, low_memory=False)\n",
"    # Expose the frame as a notebook-level variable (e.g. prefix + \"tickets\").\n",
"    globals()[df_prefix + name_match.group(3)] = df"
]
},
{
"cell_type": "markdown",
"id": "e855f403",
"metadata": {},
"source": [
"## customersplus.csv"
]
},
{
"cell_type": "code",
"execution_count": 22,
"id": "91a8f8c4",
"metadata": {},
"outputs": [],
"source": [
"# DataFrame.info() prints its summary and returns None, so there is nothing to wrap in a DataFrame.\n",
"df1_customersplus.info()"
]
},
{
"cell_type": "code",
"execution_count": 31,
"id": "2fda171d",
"metadata": {},
"outputs": [],
"source": [
"def info_colonnes_dataframe(df):\n",
"    \"\"\"\n",
"    Summarise every column of a DataFrame.\n",
"\n",
"    Parameters:\n",
"    - df: DataFrame\n",
"        The DataFrame whose columns are described.\n",
"\n",
"    Returns:\n",
"    - DataFrame\n",
"        One row per column of df, giving its name ('Nom_colonne'), its dtype as a string\n",
"        ('Type_colonne') and its percentage of missing values ('Taux_NA').\n",
"    \"\"\"\n",
"    column_summaries = [\n",
"        {\n",
"            'Nom_colonne': column_label,\n",
"            'Type_colonne': str(column_values.dtype),\n",
"            # mean() of the boolean NA mask is the NA share; scale it to a percentage\n",
"            'Taux_NA': column_values.isna().mean() * 100,\n",
"        }\n",
"        for column_label, column_values in df.items()\n",
"    ]\n",
"    return pd.DataFrame(column_summaries)"
]
},
{
"cell_type": "code",
"execution_count": 35,
"id": "205eeeab",
"metadata": {},
"outputs": [],
"source": [
"def cleaning_date(df, column_name):\n",
"    \"\"\"\n",
"    Convert one column of a DataFrame to UTC datetimes, parsing the values with the ISO8601 format.\n",
"\n",
"    Parameters:\n",
"    - df: DataFrame\n",
"        The DataFrame holding the column; it is modified in place.\n",
"    - column_name: str\n",
"        Name of the column to convert.\n",
"\n",
"    Returns:\n",
"    - DataFrame\n",
"        The same DataFrame object, with the column replaced by its parsed version.\n",
"    \"\"\"\n",
"    parsed_dates = pd.to_datetime(df[column_name], format = 'ISO8601', utc = True)\n",
"    df[column_name] = parsed_dates\n",
"    return df"
]
},
{
"cell_type": "code",
"execution_count": 32,
"id": "634282c5",
"metadata": {},
"outputs": [],
"source": [
"a = info_colonnes_dataframe(df1_customersplus)"
]
},
{
"cell_type": "code",
"execution_count": 33,
"id": "0e8d4133",
"metadata": {},
"outputs": [],
"source": [
"a"
]
},
{
"cell_type": "code",
"execution_count": 16,
"id": "1268ad5a",
"metadata": {},
"outputs": [],
"source": [
"a = pd.DataFrame(df1_customersplus.isna().sum()/len(df1_customersplus)*100)"
]
},
{
"cell_type": "code",
"execution_count": 40,
"id": "bd41dc80",
"metadata": {},
"outputs": [],
"source": [
"# Variable selection: work on a copy so the raw customersplus frame stays untouched.\n",
"df1_customersplus_clean = df1_customersplus.copy()\n",
"\n",
"# Parse the date columns (cleaning_date converts the column in place).\n",
"# NOTE(review): 'last_visiting_date' is parsed here and then dropped just below - confirm which of the two is intended.\n",
"for date_column in ['first_buying_date', 'last_visiting_date']:\n",
"    cleaning_date(df1_customersplus_clean, date_column)\n",
"\n",
"# Columns removed from the cleaned frame.\n",
"customersplus_unused_columns = ['lastname', 'firstname', 'email', 'civility', 'note', 'created_at', 'updated_at', 'deleted_at', 'extra', 'reference', 'extra_field', 'identifier', 'need_reload', 'preferred_category', 'preferred_supplier', 'preferred_formula', 'zipcode', 'last_visiting_date']\n",
"df1_customersplus_clean = (\n",
"    df1_customersplus_clean\n",
"    .drop(customersplus_unused_columns, axis = 1)\n",
"    .rename(columns = {'id' : 'customer_id'})\n",
")\n",
"\n"
]
},
{
"cell_type": "markdown",
"id": "64d0f76b",
"metadata": {
"jp-MarkdownHeadingCollapsed": true
},
"source": [
"## tickets.csv"
]
},
{
"cell_type": "code",
"execution_count": 6,
"id": "7e683711",
"metadata": {},
"outputs": [],
"source": [
"df1_tickets"
]
},
{
"cell_type": "code",
"execution_count": 7,
"id": "e7b9a52e",
"metadata": {},
"outputs": [],
"source": [
"df1_tickets.info()"
]
},
{
"cell_type": "code",
"execution_count": 8,
"id": "568280e8",
"metadata": {},
"outputs": [],
"source": [
"df1_tickets.isna().sum()/len(df1_tickets)*100"
]
},
{
"cell_type": "code",
"execution_count": 9,
"id": "29ecec90",
"metadata": {},
"outputs": [],
"source": [
"# Variable selection\n",
"# drop() is called without inplace=True so that it returns the new frame (with inplace=True it returns None,\n",
"# and the rename would then fail). The raw df1_tickets frame is left untouched, so the cell can be re-run.\n",
"# NOTE(review): this column list largely overlaps the one used for customersplus - confirm these columns exist in tickets.\n",
"df1_tickets_clean = (\n",
"    df1_tickets\n",
"    .drop(['lastname', 'firstname', 'email', 'created_at', 'updated_at', 'extra', 'reference', 'extra_field', 'identifier', 'need_reload', 'preferred_category', 'preferred_supplier', 'preferred_formula', 'zipcode'], axis = 1)\n",
"    .rename(columns = {'id' : 'customer_id'})\n",
")"
]
},
{
"cell_type": "markdown",
"id": "22bb5de4",
"metadata": {
"jp-MarkdownHeadingCollapsed": true
},
"source": [
"## suppliers.csv"
]
},
{
"cell_type": "code",
"execution_count": 10,
"id": "6a9a91f4",
"metadata": {},
"outputs": [],
"source": [
"df1_suppliers"
]
},
{
"cell_type": "code",
"execution_count": 11,
"id": "bab4758a",
"metadata": {},
"outputs": [],
"source": [
"df1_suppliers.info()"
]
},
{
"cell_type": "code",
"execution_count": 12,
"id": "b5fff251",
"metadata": {},
"outputs": [],
"source": [
"df1_suppliers.isna().sum()/len(df1_suppliers)*100"
]
},
{
"cell_type": "code",
"execution_count": 13,
"id": "8b09e2a3",
"metadata": {},
"outputs": [],
"source": [
"# Variable selection\n",
"# Selecting and renaming in one chain builds a new frame, which avoids the SettingWithCopyWarning\n",
"# raised by an in-place rename on a column slice.\n",
"df1_suppliers_clean = df1_suppliers[['id', 'name']].rename(columns = {'name' : 'supplier_name'})"
]
},
{
"cell_type": "code",
"execution_count": 14,
"id": "ecee7cdc",
"metadata": {},
"outputs": [],
"source": [
"df1_suppliers_clean"
]
},
{
"cell_type": "markdown",
"id": "c8e6e69b",
"metadata": {
"jp-MarkdownHeadingCollapsed": true
},
"source": [
"## type_ofs.csv"
]
},
{
"cell_type": "code",
"execution_count": 15,
"id": "1a6cff1f",
"metadata": {},
"outputs": [],
"source": [
"df1_type_ofs"
]
},
{
"cell_type": "code",
"execution_count": 16,
"id": "93630b41",
"metadata": {},
"outputs": [],
"source": [
"df1_type_ofs.info()"
]
},
{
"cell_type": "code",
"execution_count": 17,
"id": "4f94481a",
"metadata": {},
"outputs": [],
"source": [
"# Variable selection\n",
"# Selecting and renaming in one chain builds a new frame, which avoids the SettingWithCopyWarning\n",
"# raised by an in-place rename on a column slice.\n",
"df1_type_ofs_clean = df1_type_ofs[['id', 'name', 'children']].rename(columns = {'name' : 'type_of_ticket_name'})"
]
},
{
"cell_type": "markdown",
"id": "1b2811e2",
"metadata": {
"jp-MarkdownHeadingCollapsed": true
},
"source": [
"## purchases.csv"
]
},
{
"cell_type": "code",
"execution_count": 18,
"id": "2455d2e1",
"metadata": {
"scrolled": true
},
"outputs": [],
"source": [
"df1_purchases"
]
},
{
"cell_type": "code",
"execution_count": 19,
"id": "5f9a159d",
"metadata": {},
"outputs": [],
"source": [
"df1_purchases.info()"
]
},
{
"cell_type": "code",
"execution_count": 20,
"id": "db201bf7",
"metadata": {},
"outputs": [],
"source": [
"# Clean purchase_date: a single call parses the values with the ISO8601 format and converts them to UTC.\n",
"# (Calling to_datetime a second time on the already-converted column adds nothing.)\n",
"df1_purchases['purchase_date'] = pd.to_datetime(df1_purchases['purchase_date'], utc = True, format = 'ISO8601')"
]
},
{
"cell_type": "code",
"execution_count": 21,
"id": "bd436fca",
"metadata": {},
"outputs": [],
"source": [
"df1_purchases.info()"
]
},
{
"cell_type": "code",
"execution_count": 22,
"id": "83435862",
"metadata": {},
"outputs": [],
"source": [
"# Selection des variables\n",
"df1_purchases_clean = df1_purchases[['id', 'purchase_date', 'customer_id']]"
]
},
{
"cell_type": "markdown",
"id": "f210e730",
"metadata": {
"jp-MarkdownHeadingCollapsed": true
},
"source": [
"## Fusion de l'ensemble des données billétiques"
]
},
{
"cell_type": "code",
"execution_count": 23,
"id": "1f8b3aa7",
"metadata": {},
"outputs": [],
"source": [
"# Merge with suppliers, then drop the join keys.\n",
"df1_ticket_information = (\n",
"    df1_tickets_clean\n",
"    .merge(df1_suppliers_clean, left_on = 'supplier_id', right_on = 'id', how = 'inner')\n",
"    .drop(['supplier_id', 'id'], axis = 1)\n",
")\n",
"\n",
"# Merge with ticket types, then drop the join keys.\n",
"df1_ticket_information = (\n",
"    df1_ticket_information\n",
"    .merge(df1_type_ofs_clean, left_on = 'type_of', right_on = 'id', how = 'inner')\n",
"    .drop(['type_of', 'id'], axis = 1)\n",
")\n",
"\n",
"# Merge with purchases, then drop the join keys.\n",
"df1_ticket_information = (\n",
"    df1_ticket_information\n",
"    .merge(df1_purchases_clean, left_on = 'purchase_id', right_on = 'id', how = 'inner')\n",
"    .drop(['purchase_id', 'id'], axis = 1)\n",
")"
]
},
{
"cell_type": "code",
"execution_count": 24,
"id": "83a4d021",
"metadata": {
"scrolled": true
},
"outputs": [],
"source": [
"df1_ticket_information"
]
},
{
"cell_type": "markdown",
"id": "56e6ebd1",
"metadata": {
"jp-MarkdownHeadingCollapsed": true
},
"source": [
"# Utilisation de fonctions"
]
},
{
"cell_type": "code",
"execution_count": 51,
"id": "88fcde4b",
"metadata": {},
"outputs": [],
"source": [
"# Build a small example DataFrame; copy() makes it independent from df1_campaign_stats.\n",
"df_not_clean = df1_campaign_stats[['opened_at']].head(20).copy()\n",
"\n",
"# Apply the cleaning function to the 'opened_at' column.\n",
"# cleaning_date modifies the frame it receives, so it is given a copy: df_not_clean keeps the raw values\n",
"# and df_clean is a separate object holding the parsed ones.\n",
"df_clean = cleaning_date(df_not_clean.copy(), 'opened_at').rename(columns = {'opened_at' : 'opened_at_clean'})\n",
"\n",
"# Raw and cleaned columns side by side, to compare their dtypes.\n",
"test = pd.concat([df_not_clean, df_clean], axis=1)\n",
"\n",
"test.info()"
]
},
{
"cell_type": "markdown",
"id": "818f69db",
"metadata": {},
"source": [
"## Nettoyage, selection et fusion"
]
},
{
"cell_type": "code",
"execution_count": 23,
"id": "c9654eda",
"metadata": {},
"outputs": [],
"source": [
"df1_ticket_information"
]
},
{
"cell_type": "code",
"execution_count": 14,
"id": "7f2b620c",
"metadata": {},
"outputs": [],
"source": [
"df1_ticket_information.info()"
]
},
{
"cell_type": "markdown",
"id": "637bdb72",
"metadata": {},
"source": [
"# Customer information"
]
},
{
"cell_type": "markdown",
"id": "14c52894",
"metadata": {
"jp-MarkdownHeadingCollapsed": true
},
"source": [
"## Target area"
]
},
{
"cell_type": "code",
"execution_count": 8,
"id": "d83abfbf",
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"/tmp/ipykernel_15285/2625134041.py:3: SettingWithCopyWarning: \n",
"A value is trying to be set on a copy of a slice from a DataFrame\n",
"\n",
"See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n",
" df1_targets_clean.rename(columns = {'id' : 'target_id' , 'name' : 'target_name'}, inplace = True)\n"
]
}
],
"source": [
"# targets cleaning: select and rename in one chain so that a new frame is built\n",
"# (an in-place rename on the column slice raises SettingWithCopyWarning).\n",
"df1_targets_clean = df1_targets[[\"id\", \"target_type_id\", \"name\"]].rename(columns = {'id' : 'target_id' , 'name' : 'target_name'})\n",
"\n",
"# target_types cleaning: every kept column gets the \"target_type_\" prefix (id becomes target_type_id).\n",
"df1_target_types_clean = df1_target_types[[\"id\",\"is_import\",\"name\"]].add_prefix(\"target_type_\")\n",
"\n",
"# customer_target_mappings cleaning\n",
"df1_customer_target_mappings_clean = df1_customer_target_mappings[[\"id\", \"customer_id\", \"target_id\"]]\n",
"\n",
"# Merge targets with their type, then drop the join key.\n",
"df1_targets_full = (\n",
"    pd.merge(df1_targets_clean, df1_target_types_clean, on='target_type_id', how='inner')\n",
"    .drop(['target_type_id'], axis = 1)\n",
")\n",
"\n",
"# Attach the target information to each customer/target mapping, then drop the join key.\n",
"df1_targets_full = (\n",
"    pd.merge(df1_customer_target_mappings_clean, df1_targets_full, on='target_id', how='inner')\n",
"    .drop(['target_id'], axis = 1)\n",
")"
]
},
{
"cell_type": "code",
"execution_count": 62,
"id": "90d71b2c",
"metadata": {},
"outputs": [],
"source": [
"# Number of target mappings per customer.\n",
"df1_targets_test = df1_targets_full[['id', 'customer_id']].groupby(['customer_id']).count()\n",
"\n",
"# Share of customers mapped to more than one target.\n",
"# Only the last expression of a cell is displayed, so this value is printed explicitly.\n",
"print(len(df1_targets_test[df1_targets_test['id'] > 1]) / len(df1_targets_test))\n",
"\n",
"# Author's finding: 99.6% of the 151,000 targeted customers are categorised several times, 5 times on average.\n",
"df1_targets_test.mean()\n"
]
},
{
"cell_type": "code",
"execution_count": 10,
"id": "2301de1e",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>id</th>\n",
" <th>customer_id</th>\n",
" <th>target_name</th>\n",
" <th>target_type_is_import</th>\n",
" <th>target_type_name</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>1184824</td>\n",
" <td>645400</td>\n",
" <td>DDCP PROMO Réseau livres</td>\n",
" <td>False</td>\n",
" <td>manual_static_filter</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>210571</td>\n",
" <td>2412</td>\n",
" <td>DDCP PROMO Réseau livres</td>\n",
" <td>False</td>\n",
" <td>manual_static_filter</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>210572</td>\n",
" <td>4536</td>\n",
" <td>DDCP PROMO Réseau livres</td>\n",
" <td>False</td>\n",
" <td>manual_static_filter</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>210573</td>\n",
" <td>6736</td>\n",
" <td>DDCP PROMO Réseau livres</td>\n",
" <td>False</td>\n",
" <td>manual_static_filter</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>210574</td>\n",
" <td>38210</td>\n",
" <td>DDCP PROMO Réseau livres</td>\n",
" <td>False</td>\n",
" <td>manual_static_filter</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" id customer_id target_name target_type_is_import \\\n",
"0 1184824 645400 DDCP PROMO Réseau livres False \n",
"1 210571 2412 DDCP PROMO Réseau livres False \n",
"2 210572 4536 DDCP PROMO Réseau livres False \n",
"3 210573 6736 DDCP PROMO Réseau livres False \n",
"4 210574 38210 DDCP PROMO Réseau livres False \n",
"\n",
" target_type_name \n",
"0 manual_static_filter \n",
"1 manual_static_filter \n",
"2 manual_static_filter \n",
"3 manual_static_filter \n",
"4 manual_static_filter "
]
},
"execution_count": 10,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df1_targets_full.head()"
]
},
{
"cell_type": "code",
"execution_count": 14,
"id": "75fbc2f7",
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"[nltk_data] Downloading package punkt to /home/onyxia/nltk_data...\n",
"[nltk_data] Package punkt is already up-to-date!\n",
"[nltk_data] Downloading package stopwords to /home/onyxia/nltk_data...\n",
"[nltk_data] Package stopwords is already up-to-date!\n",
"[nltk_data] Downloading package wordnet to /home/onyxia/nltk_data...\n",
"[nltk_data] Package wordnet is already up-to-date!\n"
]
},
{
"data": {
"text/plain": [
"True"
]
},
"execution_count": 14,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Catégorisation des target_name\n",
"import pandas as pd\n",
"import nltk\n",
"from nltk.tokenize import word_tokenize\n",
"from nltk.corpus import stopwords\n",
"from nltk.stem import WordNetLemmatizer\n",
"from nltk.probability import FreqDist\n",
"\n",
"# Téléchargement des ressources nécessaires\n",
"nltk.download('punkt')\n",
"nltk.download('stopwords')\n",
"nltk.download('wordnet')\n",
"\n"
]
},
{
"cell_type": "code",
"execution_count": 19,
"id": "55cddf92",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Mots les plus fréquents:\n",
"consentement: 550777\n",
"optin: 463579\n",
"jeune: 155103\n",
"public: 155103\n",
"mediation: 150001\n"
]
}
],
"source": [
"# Tokenisation, stop-word removal and lemmatisation helper\n",
"def preprocess_text(texte):\n",
"    \"\"\"\n",
"    Turn a text into a list of lower-cased, stop-word-free, lemmatised tokens.\n",
"\n",
"    Parameters:\n",
"    - texte: str or iterable of str\n",
"        A single text, or several text fragments that are joined with spaces.\n",
"\n",
"    Returns:\n",
"    - list of str\n",
"        The remaining tokens, in their original order.\n",
"    \"\"\"\n",
"    # A plain string must be used as is: ' '.join(\"chat\") would give \"c h a t\" and every token\n",
"    # would then be a single character.\n",
"    if isinstance(texte, str):\n",
"        texte_concat = texte\n",
"    else:\n",
"        texte_concat = ' '.join(texte)\n",
"\n",
"    # Word tokenisation\n",
"    tokens = word_tokenize(texte_concat.lower())\n",
"\n",
"    # Stop-word removal (French list)\n",
"    stop_words = set(stopwords.words('french'))\n",
"    filtered_tokens = [word for word in tokens if word not in stop_words]\n",
"\n",
"    # Lemmatisation\n",
"    # NOTE(review): WordNetLemmatizer relies on WordNet, presumably English-only - confirm it is useful on French text.\n",
"    lemmatizer = WordNetLemmatizer()\n",
"    lemmatized_tokens = [lemmatizer.lemmatize(word) for word in filtered_tokens]\n",
"\n",
"    return lemmatized_tokens\n",
"\n",
"\n",
"# Appliquer le prétraitement à la colonne de texte\n",
"df1_targets_full['target_name_tokened'] = df1_targets_full['target_name'].apply(preprocess_text)\n",
"\n",
"# Concaténer les listes de mots pour obtenir une liste de tous les mots dans le corpus\n",
"all_words = [word for tokens in df1_targets_full['target_name_tokened'] for word in tokens]\n",
"\n",
"# Calculer la fréquence des mots\n",
"freq_dist = FreqDist(all_words)\n",
"\n",
"\n"
]
},
{
"cell_type": "code",
"execution_count": 22,
"id": "7fd98a85",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Mots les plus fréquents:\n",
"consentement: 550777\n",
"optin: 463579\n",
"jeune: 155103\n",
"public: 155103\n",
"mediation: 150001\n",
"specialisee: 150001\n",
"b2c: 143432\n",
"optout: 97683\n",
"newsletter: 56022\n",
"(: 46084\n",
"): 46084\n",
"inscrits: 42296\n",
"nl: 42294\n",
"générale: 41037\n",
"generale: 40950\n"
]
}
],
"source": [
"# Affichage des mots les plus fréquents\n",
"print(\"Mots les plus fréquents:\")\n",
"for mot, freq in freq_dist.most_common(15):\n",
" print(f\"{mot}: {freq}\")"
]
},
{
"cell_type": "code",
"execution_count": 18,
"id": "cf94bb1d",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
" texte \\\n",
"0 Le chat noir mange une souris. \n",
"1 Le chien blanc aboie. \n",
"\n",
" texte_preprocessed \n",
"0 [e, h, a, o, i, r, a, g, e, u, e, o, u, r, i, .] \n",
"1 [e, h, i, e, b, a, a, b, o, i, e, .] \n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"[nltk_data] Downloading package punkt to /home/onyxia/nltk_data...\n",
"[nltk_data] Package punkt is already up-to-date!\n",
"[nltk_data] Downloading package stopwords to /home/onyxia/nltk_data...\n",
"[nltk_data] Package stopwords is already up-to-date!\n",
"[nltk_data] Downloading package wordnet to /home/onyxia/nltk_data...\n",
"[nltk_data] Package wordnet is already up-to-date!\n"
]
}
],
"source": [
"import pandas as pd\n",
"import nltk\n",
"from nltk.tokenize import word_tokenize\n",
"from nltk.corpus import stopwords\n",
"from nltk.stem import WordNetLemmatizer\n",
"\n",
"# Téléchargement des ressources nécessaires\n",
"nltk.download('punkt')\n",
"nltk.download('stopwords')\n",
"nltk.download('wordnet')\n",
"\n",
"# Création de la DataFrame d'exemple\n",
"data = {'texte': [\"Le chat noir mange une souris.\", \"Le chien blanc aboie.\"]}\n",
"df = pd.DataFrame(data)\n",
"\n",
"# Fonction pour prétraiter le texte\n",
"def preprocess_text(texte):\n",
" # Concaténation des éléments de la liste en une seule chaîne de caractères\n",
" texte_concat = ' '.join(texte)\n",
" \n",
" # Tokenisation des mots\n",
" tokens = word_tokenize(texte_concat.lower())\n",
" \n",
" # Suppression des mots vides (stopwords)\n",
" stop_words = set(stopwords.words('french'))\n",
" filtered_tokens = [word for word in tokens if word not in stop_words]\n",
" \n",
" # Lemmatisation des mots\n",
" lemmatizer = WordNetLemmatizer()\n",
" lemmatized_tokens = [lemmatizer.lemmatize(word) for word in filtered_tokens]\n",
" \n",
" return lemmatized_tokens\n",
"\n",
"# Appliquer la fonction de prétraitement à la colonne de texte\n",
"df['texte_preprocessed'] = df['texte'].apply(preprocess_text)\n",
"\n",
"# Afficher le résultat\n",
"print(df)\n"
]
},
{
"cell_type": "markdown",
"id": "711d3884",
"metadata": {
"jp-MarkdownHeadingCollapsed": true
},
"source": [
"## Campaign area"
]
},
{
"cell_type": "code",
"execution_count": 52,
"id": "c25b5295",
"metadata": {},
"outputs": [],
"source": [
"# campaign_stats cleaning \n",
"df1_campaign_stats_clean = df1_campaign_stats[[\"id\", \"campaign_id\", \"customer_id\", \"opened_at\", \"sent_at\", \"delivered_at\"]]\n",
"cleaning_date(df1_campaign_stats_clean, 'opened_at')\n",
"cleaning_date(df1_campaign_stats_clean, 'sent_at')\n",
"cleaning_date(df1_campaign_stats_clean, 'delivered_at')\n",
"\n",
"# campaigns cleaning\n",
"df1_campaigns_clean = df1_campaigns[[\"id\", \"name\", \"service_id\", \"sent_at\"]].add_prefix(\"campaign_\")\n",
"cleaning_date(df1_campaigns_clean, 'campaign_sent_at')\n",
"\n",
"# Merge \n",
"df1_campaigns_full = pd.merge(df1_campaign_stats_clean, df1_campaigns_clean, on = \"campaign_id\", how = \"left\")\n",
"df1_campaigns_full.drop(['campaign_id'], axis = 1, inplace=True)"
]
},
{
"cell_type": "code",
"execution_count": 53,
"id": "2a3de6a5",
"metadata": {},
"outputs": [],
"source": [
"df1_campaigns_full.info()"
]
},
{
"cell_type": "code",
"execution_count": 56,
"id": "3fc1f446",
"metadata": {},
"outputs": [],
"source": [
"df1_campaigns_information"
]
},
{
"cell_type": "markdown",
"id": "20e69ee3",
"metadata": {
"jp-MarkdownHeadingCollapsed": true
},
"source": [
"## Link area"
]
},
{
"cell_type": "code",
"execution_count": 37,
"id": "d9cbdbce",
"metadata": {},
"outputs": [],
"source": [
"df1_campaigns"
]
},
{
"cell_type": "code",
"execution_count": 38,
"id": "c07459f0",
"metadata": {},
"outputs": [],
"source": [
"df1_link_stats"
]
},
{
"cell_type": "markdown",
"id": "80ae4c42",
"metadata": {},
"source": [
"## Exploration variables"
]
},
{
"cell_type": "code",
"execution_count": 7,
"id": "b50b8f95",
"metadata": {},
"outputs": [],
"source": [
"# Fonction d'exploration pour suppliers.csv = label itr et commission inconnues\n",
"def suppliers_exploration(suppliers = None) : \n",
" \n",
" # Taux de NaN pour ces colonnes\n",
" label_na = suppliers['label'].isna().sum()/len(suppliers)*100\n",
" itr_na = suppliers['itr'].isna().sum()/len(suppliers)*100\n",
" commission_na = suppliers['commission'].isna().sum()/len(suppliers)*100\n",
"\n",
" suppliers_desc = pd.DataFrame({'nb_suppliers' : [suppliers['name'].nunique()],\n",
" 'label_na' : [label_na],\n",
" 'itr_na' : [itr_na],\n",
" 'commission_na' : [commission_na]})\n",
"\n",
" return suppliers_desc"
]
},
{
"cell_type": "code",
"execution_count": 8,
"id": "7e292935",
"metadata": {},
"outputs": [],
"source": [
"df1_suppliers_desc = suppliers_exploration(suppliers = df1_suppliers)"
]
},
{
"cell_type": "code",
"execution_count": 9,
"id": "05b6f2b0",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>nb_suppliers</th>\n",
" <th>label_na</th>\n",
" <th>itr_na</th>\n",
" <th>commission_na</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>9</td>\n",
" <td>100.0</td>\n",
" <td>100.0</td>\n",
" <td>100.0</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" nb_suppliers label_na itr_na commission_na\n",
"0 9 100.0 100.0 100.0"
]
},
"execution_count": 9,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df1_suppliers_desc"
]
},
{
"cell_type": "code",
"execution_count": 10,
"id": "c9324d80",
"metadata": {},
"outputs": [],
"source": [
"BUCKET = \"bdc2324-data\"\n",
"liste_folders = fs.ls(BUCKET)\n",
"\n",
"liste_files = []\n",
"for company_folder in liste_folders : \n",
" liste_files.extend(fs.ls(company_folder))"
]
},
{
"cell_type": "code",
"execution_count": 11,
"id": "10304058",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"['bdc2324-data/1/1suppliers.csv', 'bdc2324-data/10/10suppliers.csv', 'bdc2324-data/101/101suppliers.csv', 'bdc2324-data/11/11suppliers.csv', 'bdc2324-data/12/12suppliers.csv', 'bdc2324-data/13/13suppliers.csv', 'bdc2324-data/14/14suppliers.csv', 'bdc2324-data/2/2suppliers.csv', 'bdc2324-data/3/3suppliers.csv', 'bdc2324-data/4/4suppliers.csv', 'bdc2324-data/5/5suppliers.csv', 'bdc2324-data/6/6suppliers.csv', 'bdc2324-data/7/7suppliers.csv', 'bdc2324-data/8/8suppliers.csv', 'bdc2324-data/9/9suppliers.csv']\n"
]
}
],
"source": [
"liste_database_select = ['suppliers']\n",
"\n",
"# Filtrer la liste pour les éléments contenant au moins un élément de la liste à tester\n",
"liste_suppliers = [element for element in liste_files if any(element_part in element for element_part in liste_database_select)]\n",
"\n",
"# Afficher le résultat\n",
"print(liste_suppliers)"
]
},
{
"cell_type": "code",
"execution_count": 32,
"id": "ffa423e5",
"metadata": {},
"outputs": [],
"source": [
"# loop to create dataframes from file 2\n",
"def database_loading(database_name = None):\n",
" files_path = database_name\n",
" \n",
" client_number = files_path.split(\"/\")[1]\n",
" df_prefix = \"df\" + str(client_number) + \"_\"\n",
" \n",
" current_path = files_path\n",
" with fs.open(current_path, mode=\"rb\") as file_in:\n",
" df = pd.read_csv(file_in)\n",
"\n",
" return df, client_number"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "70bdc88d",
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": 45,
"id": "6a0f567d",
"metadata": {},
"outputs": [],
"source": [
"df_all = pd.DataFrame()\n",
"\n",
"for link in liste_suppliers:\n",
" \n",
" df_supplier, tenant_id = database_loading(link)\n",
" \n",
" df_supplier['tenant_id'] = int(tenant_id)\n",
"\n",
" df_all = pd.concat([df_all, df_supplier], axis = 0)\n",
" "
]
},
{
"cell_type": "code",
"execution_count": 63,
"id": "1522d8cd",
"metadata": {},
"outputs": [],
"source": [
"# df_all[df_all['tenant_id'] == 101]['name'].unique()"
]
},
{
"cell_type": "code",
"execution_count": 66,
"id": "b0e42a61",
"metadata": {},
"outputs": [],
"source": [
"liste_mots = ['en ligne', 'internet', 'web', 'net', 'vad', 'online'] \n",
"# vad = vente à distance\n",
"df_all['name'] = df_all['name'].fillna('')\n",
"\n",
"df_all['canal_vente_internet'] = df_all['name'].str.contains('|'.join(liste_mots), case=False).astype(int)\n"
]
},
{
"cell_type": "code",
"execution_count": 68,
"id": "d299ae91",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"tenant_id\n",
"1 1\n",
"2 1\n",
"3 1\n",
"4 1\n",
"5 1\n",
"6 1\n",
"7 1\n",
"8 1\n",
"9 1\n",
"10 1\n",
"11 1\n",
"12 1\n",
"13 1\n",
"14 1\n",
"101 1\n",
"Name: canal_vente_internet, dtype: int64"
]
},
"execution_count": 68,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df_all.groupby('tenant_id')['canal_vente_internet'].max()"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.11.6"
}
},
"nbformat": 4,
"nbformat_minor": 5
}