3362 lines
100 KiB
Plaintext
3362 lines
100 KiB
Plaintext
{
|
||
"cells": [
|
||
{
|
||
"cell_type": "markdown",
|
||
"id": "5bf5c226",
|
||
"metadata": {},
|
||
"source": [
|
||
"# Business Data Challenge - Team 1"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 1,
|
||
"id": "b1a5b9d3",
|
||
"metadata": {},
|
||
"outputs": [],
|
||
"source": [
|
||
"import pandas as pd\n",
|
||
"import numpy as np\n",
|
||
"import os\n",
|
||
"import s3fs\n",
|
||
"import re\n",
|
||
"import warnings"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "markdown",
|
||
"id": "ecfa2219",
|
||
"metadata": {},
|
||
"source": [
|
||
"Configuration de l'accès aux données"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 2,
|
||
"id": "1a094277",
|
||
"metadata": {},
|
||
"outputs": [],
|
||
"source": [
|
||
"# Create filesystem object\n",
|
||
"S3_ENDPOINT_URL = \"https://\" + os.environ[\"AWS_S3_ENDPOINT\"]\n",
|
||
"fs = s3fs.S3FileSystem(client_kwargs={'endpoint_url': S3_ENDPOINT_URL})"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "markdown",
|
||
"id": "0294ce71-840e-458b-8ffa-cadabbc6da21",
|
||
"metadata": {},
|
||
"source": [
"# Début du travail : 25/02"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 3,
|
||
"id": "30d77451-2df6-4c07-8b15-66e0e990ff03",
|
||
"metadata": {},
|
||
"outputs": [],
|
||
"source": [
"# Create filesystem object\n",
"S3_ENDPOINT_URL = \"https://\" + os.environ[\"AWS_S3_ENDPOINT\"]\n",
"fs = s3fs.S3FileSystem(client_kwargs={'endpoint_url': S3_ENDPOINT_URL})\n",
"\n",
"\n",
"# Import cleaning and merge functions: the helper script is executed in the\n",
"# notebook namespace; the with-block closes the file handle once it is read\n",
"with open('0_KPI_functions.py') as kpi_functions_file:\n",
"    exec(kpi_functions_file.read())\n",
"\n",
"# Ignore warning\n",
"warnings.filterwarnings('ignore')\n"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 4,
|
||
"id": "f1b44d3e-76bb-4860-b9db-a2840db7cf39",
|
||
"metadata": {},
|
||
"outputs": [],
|
||
"source": [
"def load_dataset_2(directory_path, file_name):\n",
"    \"\"\"\n",
"    Read one CSV table of a company from the bdc2324-data bucket.\n",
"\n",
"    The file read is bdc2324-data/<directory_path>/<directory_path><file_name>.csv,\n",
"    opened through the notebook-level s3fs filesystem `fs`.\n",
"    An 'identifier' column, when present, is removed from the returned DataFrame.\n",
"    \"\"\"\n",
"    file_path = \"/\".join([\"bdc2324-data\", directory_path, directory_path + file_name + \".csv\"])\n",
"    with fs.open(file_path, mode=\"rb\") as csv_stream:\n",
"        table = pd.read_csv(csv_stream, sep=\",\")\n",
"\n",
"    # Hand the table back without its 'identifier' column when it has one\n",
"    has_identifier = 'identifier' in table.columns\n",
"    return table.drop(columns = 'identifier') if has_identifier else table"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 7,
|
||
"id": "35da2e15-1e23-4653-a214-c6ff8f186e85",
|
||
"metadata": {},
|
||
"outputs": [
|
||
{
|
||
"name": "stdout",
|
||
"output_type": "stream",
|
||
"text": [
|
||
"File path : projet-bdc2324-team1/0_Input/Company_4/customerplus_cleaned.csv\n"
|
||
]
|
||
},
|
||
{
|
||
"data": {
|
||
"text/html": [
|
||
"<div>\n",
|
||
"<style scoped>\n",
|
||
" .dataframe tbody tr th:only-of-type {\n",
|
||
" vertical-align: middle;\n",
|
||
" }\n",
|
||
"\n",
|
||
" .dataframe tbody tr th {\n",
|
||
" vertical-align: top;\n",
|
||
" }\n",
|
||
"\n",
|
||
" .dataframe thead th {\n",
|
||
" text-align: right;\n",
|
||
" }\n",
|
||
"</style>\n",
|
||
"<table border=\"1\" class=\"dataframe\">\n",
|
||
" <thead>\n",
|
||
" <tr style=\"text-align: right;\">\n",
|
||
" <th></th>\n",
|
||
" <th>customer_id</th>\n",
|
||
" <th>street_id</th>\n",
|
||
" <th>structure_id</th>\n",
|
||
" <th>mcp_contact_id</th>\n",
|
||
" <th>fidelity</th>\n",
|
||
" <th>tenant_id</th>\n",
|
||
" <th>is_partner</th>\n",
|
||
" <th>deleted_at</th>\n",
|
||
" <th>gender</th>\n",
|
||
" <th>is_email_true</th>\n",
|
||
" <th>...</th>\n",
|
||
" <th>max_price</th>\n",
|
||
" <th>ticket_sum</th>\n",
|
||
" <th>average_price</th>\n",
|
||
" <th>average_purchase_delay</th>\n",
|
||
" <th>average_price_basket</th>\n",
|
||
" <th>average_ticket_basket</th>\n",
|
||
" <th>total_price</th>\n",
|
||
" <th>purchase_count</th>\n",
|
||
" <th>first_buying_date</th>\n",
|
||
" <th>country</th>\n",
|
||
" </tr>\n",
|
||
" </thead>\n",
|
||
" <tbody>\n",
|
||
" <tr>\n",
|
||
" <th>0</th>\n",
|
||
" <td>479734</td>\n",
|
||
" <td>3587</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>184801.0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>1342</td>\n",
|
||
" <td>False</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>True</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>fr</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>1</th>\n",
|
||
" <td>1537</td>\n",
|
||
" <td>1352</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>1342</td>\n",
|
||
" <td>False</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>True</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>fr</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>2</th>\n",
|
||
" <td>504615</td>\n",
|
||
" <td>3587</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>152176.0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>1342</td>\n",
|
||
" <td>False</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>True</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>fr</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>3</th>\n",
|
||
" <td>3832780</td>\n",
|
||
" <td>3587</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>1342</td>\n",
|
||
" <td>False</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>2</td>\n",
|
||
" <td>True</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>fr</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>4</th>\n",
|
||
" <td>3096540</td>\n",
|
||
" <td>3587</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>1342</td>\n",
|
||
" <td>False</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>2</td>\n",
|
||
" <td>True</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>fr</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>...</th>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>320804</th>\n",
|
||
" <td>2637745</td>\n",
|
||
" <td>406842</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>1</td>\n",
|
||
" <td>1342</td>\n",
|
||
" <td>False</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>True</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>2</td>\n",
|
||
" <td>0.000000</td>\n",
|
||
" <td>2.0</td>\n",
|
||
" <td>0.000000</td>\n",
|
||
" <td>2.000000</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>1</td>\n",
|
||
" <td>2021-12-08 20:30:11+00:00</td>\n",
|
||
" <td>fr</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>320805</th>\n",
|
||
" <td>23334</td>\n",
|
||
" <td>22677</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>185203.0</td>\n",
|
||
" <td>4</td>\n",
|
||
" <td>1342</td>\n",
|
||
" <td>False</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>True</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>13.0</td>\n",
|
||
" <td>13</td>\n",
|
||
" <td>11.692308</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>25.333333</td>\n",
|
||
" <td>2.166667</td>\n",
|
||
" <td>152.0</td>\n",
|
||
" <td>6</td>\n",
|
||
" <td>2018-05-02 07:47:40+00:00</td>\n",
|
||
" <td>fr</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>320806</th>\n",
|
||
" <td>2641373</td>\n",
|
||
" <td>408068</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>1</td>\n",
|
||
" <td>1342</td>\n",
|
||
" <td>False</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>True</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>12.0</td>\n",
|
||
" <td>4</td>\n",
|
||
" <td>12.000000</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>48.000000</td>\n",
|
||
" <td>4.000000</td>\n",
|
||
" <td>48.0</td>\n",
|
||
" <td>1</td>\n",
|
||
" <td>2021-12-09 11:46:23+00:00</td>\n",
|
||
" <td>fr</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>320807</th>\n",
|
||
" <td>2641469</td>\n",
|
||
" <td>408160</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>1</td>\n",
|
||
" <td>1342</td>\n",
|
||
" <td>False</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>True</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>12.0</td>\n",
|
||
" <td>1</td>\n",
|
||
" <td>12.000000</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>12.000000</td>\n",
|
||
" <td>1.000000</td>\n",
|
||
" <td>12.0</td>\n",
|
||
" <td>1</td>\n",
|
||
" <td>2021-12-09 18:50:55+00:00</td>\n",
|
||
" <td>fr</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>320808</th>\n",
|
||
" <td>2641474</td>\n",
|
||
" <td>408165</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>1</td>\n",
|
||
" <td>1342</td>\n",
|
||
" <td>False</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>True</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>12.0</td>\n",
|
||
" <td>1</td>\n",
|
||
" <td>12.000000</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>12.000000</td>\n",
|
||
" <td>1.000000</td>\n",
|
||
" <td>12.0</td>\n",
|
||
" <td>1</td>\n",
|
||
" <td>2021-12-09 19:02:42+00:00</td>\n",
|
||
" <td>fr</td>\n",
|
||
" </tr>\n",
|
||
" </tbody>\n",
|
||
"</table>\n",
|
||
"<p>320809 rows × 22 columns</p>\n",
|
||
"</div>"
|
||
],
|
||
"text/plain": [
|
||
" customer_id street_id structure_id mcp_contact_id fidelity \\\n",
|
||
"0 479734 3587 NaN 184801.0 0 \n",
|
||
"1 1537 1352 NaN NaN 0 \n",
|
||
"2 504615 3587 NaN 152176.0 0 \n",
|
||
"3 3832780 3587 NaN NaN 0 \n",
|
||
"4 3096540 3587 NaN NaN 0 \n",
|
||
"... ... ... ... ... ... \n",
|
||
"320804 2637745 406842 NaN NaN 1 \n",
|
||
"320805 23334 22677 NaN 185203.0 4 \n",
|
||
"320806 2641373 408068 NaN NaN 1 \n",
|
||
"320807 2641469 408160 NaN NaN 1 \n",
|
||
"320808 2641474 408165 NaN NaN 1 \n",
|
||
"\n",
|
||
" tenant_id is_partner deleted_at gender is_email_true ... \\\n",
|
||
"0 1342 False NaN 0 True ... \n",
|
||
"1 1342 False NaN 0 True ... \n",
|
||
"2 1342 False NaN 0 True ... \n",
|
||
"3 1342 False NaN 2 True ... \n",
|
||
"4 1342 False NaN 2 True ... \n",
|
||
"... ... ... ... ... ... ... \n",
|
||
"320804 1342 False NaN 0 True ... \n",
|
||
"320805 1342 False NaN 0 True ... \n",
|
||
"320806 1342 False NaN 0 True ... \n",
|
||
"320807 1342 False NaN 0 True ... \n",
|
||
"320808 1342 False NaN 0 True ... \n",
|
||
"\n",
|
||
" max_price ticket_sum average_price average_purchase_delay \\\n",
|
||
"0 NaN 0 NaN NaN \n",
|
||
"1 NaN 0 NaN NaN \n",
|
||
"2 NaN 0 NaN NaN \n",
|
||
"3 NaN 0 NaN NaN \n",
|
||
"4 NaN 0 NaN NaN \n",
|
||
"... ... ... ... ... \n",
|
||
"320804 0.0 2 0.000000 2.0 \n",
|
||
"320805 13.0 13 11.692308 0.0 \n",
|
||
"320806 12.0 4 12.000000 0.0 \n",
|
||
"320807 12.0 1 12.000000 0.0 \n",
|
||
"320808 12.0 1 12.000000 0.0 \n",
|
||
"\n",
|
||
" average_price_basket average_ticket_basket total_price \\\n",
|
||
"0 NaN NaN 0.0 \n",
|
||
"1 NaN NaN 0.0 \n",
|
||
"2 NaN NaN 0.0 \n",
|
||
"3 NaN NaN 0.0 \n",
|
||
"4 NaN NaN 0.0 \n",
|
||
"... ... ... ... \n",
|
||
"320804 0.000000 2.000000 0.0 \n",
|
||
"320805 25.333333 2.166667 152.0 \n",
|
||
"320806 48.000000 4.000000 48.0 \n",
|
||
"320807 12.000000 1.000000 12.0 \n",
|
||
"320808 12.000000 1.000000 12.0 \n",
|
||
"\n",
|
||
" purchase_count first_buying_date country \n",
|
||
"0 0 NaN fr \n",
|
||
"1 0 NaN fr \n",
|
||
"2 0 NaN fr \n",
|
||
"3 0 NaN fr \n",
|
||
"4 0 NaN fr \n",
|
||
"... ... ... ... \n",
|
||
"320804 1 2021-12-08 20:30:11+00:00 fr \n",
|
||
"320805 6 2018-05-02 07:47:40+00:00 fr \n",
|
||
"320806 1 2021-12-09 11:46:23+00:00 fr \n",
|
||
"320807 1 2021-12-09 18:50:55+00:00 fr \n",
|
||
"320808 1 2021-12-09 19:02:42+00:00 fr \n",
|
||
"\n",
|
||
"[320809 rows x 22 columns]"
|
||
]
|
||
},
|
||
"execution_count": 7,
|
||
"metadata": {},
|
||
"output_type": "execute_result"
|
||
}
|
||
],
|
||
"source": [
|
||
"display_databases(\"4\", \"customerplus_cleaned\")"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 26,
|
||
"id": "6c8ad8c3-25df-4fe4-9ad0-ee5f9498bc14",
|
||
"metadata": {},
|
||
"outputs": [],
|
||
"source": [
|
||
"pd.reset_option('display.max_rows')"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 27,
|
||
"id": "c897916c",
|
||
"metadata": {},
|
||
"outputs": [
|
||
{
|
||
"data": {
|
||
"text/html": [
|
||
"<div>\n",
|
||
"<style scoped>\n",
|
||
" .dataframe tbody tr th:only-of-type {\n",
|
||
" vertical-align: middle;\n",
|
||
" }\n",
|
||
"\n",
|
||
" .dataframe tbody tr th {\n",
|
||
" vertical-align: top;\n",
|
||
" }\n",
|
||
"\n",
|
||
" .dataframe thead th {\n",
|
||
" text-align: right;\n",
|
||
" }\n",
|
||
"</style>\n",
|
||
"<table border=\"1\" class=\"dataframe\">\n",
|
||
" <thead>\n",
|
||
" <tr style=\"text-align: right;\">\n",
|
||
" <th></th>\n",
|
||
" <th>id</th>\n",
|
||
" <th>name</th>\n",
|
||
" <th>code</th>\n",
|
||
" <th>created_at</th>\n",
|
||
" <th>updated_at</th>\n",
|
||
" </tr>\n",
|
||
" </thead>\n",
|
||
" <tbody>\n",
|
||
" <tr>\n",
|
||
" <th>0</th>\n",
|
||
" <td>101</td>\n",
|
||
" <td>hongrie</td>\n",
|
||
" <td>hu</td>\n",
|
||
" <td>2023-06-13 11:17:40.600622+02:00</td>\n",
|
||
" <td>2023-06-13 11:17:40.600622+02:00</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>1</th>\n",
|
||
" <td>2</td>\n",
|
||
" <td>albanie</td>\n",
|
||
" <td>al</td>\n",
|
||
" <td>2023-06-13 11:17:40.540652+02:00</td>\n",
|
||
" <td>2023-06-13 11:17:40.540652+02:00</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>2</th>\n",
|
||
" <td>3</td>\n",
|
||
" <td>antarctique</td>\n",
|
||
" <td>aq</td>\n",
|
||
" <td>2023-06-13 11:17:40.541315+02:00</td>\n",
|
||
" <td>2023-06-13 11:17:40.541315+02:00</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>3</th>\n",
|
||
" <td>12</td>\n",
|
||
" <td>autriche</td>\n",
|
||
" <td>at</td>\n",
|
||
" <td>2023-06-13 11:17:40.546711+02:00</td>\n",
|
||
" <td>2023-06-13 11:17:40.546711+02:00</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>4</th>\n",
|
||
" <td>5</td>\n",
|
||
" <td>samoa américaines</td>\n",
|
||
" <td>as</td>\n",
|
||
" <td>2023-06-13 11:17:40.542569+02:00</td>\n",
|
||
" <td>2023-06-13 11:17:40.542569+02:00</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>...</th>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>238</th>\n",
|
||
" <td>228</td>\n",
|
||
" <td>royaume-uni</td>\n",
|
||
" <td>gb</td>\n",
|
||
" <td>2023-06-13 11:17:40.678023+02:00</td>\n",
|
||
" <td>2023-06-13 11:17:40.678023+02:00</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>239</th>\n",
|
||
" <td>25</td>\n",
|
||
" <td>brésil</td>\n",
|
||
" <td>br</td>\n",
|
||
" <td>2023-06-13 11:17:40.554209+02:00</td>\n",
|
||
" <td>2023-06-13 11:17:40.554209+02:00</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>240</th>\n",
|
||
" <td>10</td>\n",
|
||
" <td>argentine</td>\n",
|
||
" <td>ar</td>\n",
|
||
" <td>2023-06-13 11:17:40.545489+02:00</td>\n",
|
||
" <td>2023-06-13 11:17:40.545489+02:00</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>241</th>\n",
|
||
" <td>203</td>\n",
|
||
" <td>espagne</td>\n",
|
||
" <td>es</td>\n",
|
||
" <td>2023-06-13 11:17:40.662472+02:00</td>\n",
|
||
" <td>2023-06-13 11:17:40.662472+02:00</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>242</th>\n",
|
||
" <td>192</td>\n",
|
||
" <td>arabie saoudite</td>\n",
|
||
" <td>sa</td>\n",
|
||
" <td>2023-06-13 11:17:40.656154+02:00</td>\n",
|
||
" <td>2023-06-13 11:17:40.656154+02:00</td>\n",
|
||
" </tr>\n",
|
||
" </tbody>\n",
|
||
"</table>\n",
|
||
"<p>243 rows × 5 columns</p>\n",
|
||
"</div>"
|
||
],
|
||
"text/plain": [
|
||
" id name code created_at \\\n",
|
||
"0 101 hongrie hu 2023-06-13 11:17:40.600622+02:00 \n",
|
||
"1 2 albanie al 2023-06-13 11:17:40.540652+02:00 \n",
|
||
"2 3 antarctique aq 2023-06-13 11:17:40.541315+02:00 \n",
|
||
"3 12 autriche at 2023-06-13 11:17:40.546711+02:00 \n",
|
||
"4 5 samoa américaines as 2023-06-13 11:17:40.542569+02:00 \n",
|
||
".. ... ... ... ... \n",
|
||
"238 228 royaume-uni gb 2023-06-13 11:17:40.678023+02:00 \n",
|
||
"239 25 brésil br 2023-06-13 11:17:40.554209+02:00 \n",
|
||
"240 10 argentine ar 2023-06-13 11:17:40.545489+02:00 \n",
|
||
"241 203 espagne es 2023-06-13 11:17:40.662472+02:00 \n",
|
||
"242 192 arabie saoudite sa 2023-06-13 11:17:40.656154+02:00 \n",
|
||
"\n",
|
||
" updated_at \n",
|
||
"0 2023-06-13 11:17:40.600622+02:00 \n",
|
||
"1 2023-06-13 11:17:40.540652+02:00 \n",
|
||
"2 2023-06-13 11:17:40.541315+02:00 \n",
|
||
"3 2023-06-13 11:17:40.546711+02:00 \n",
|
||
"4 2023-06-13 11:17:40.542569+02:00 \n",
|
||
".. ... \n",
|
||
"238 2023-06-13 11:17:40.678023+02:00 \n",
|
||
"239 2023-06-13 11:17:40.554209+02:00 \n",
|
||
"240 2023-06-13 11:17:40.545489+02:00 \n",
|
||
"241 2023-06-13 11:17:40.662472+02:00 \n",
|
||
"242 2023-06-13 11:17:40.656154+02:00 \n",
|
||
"\n",
|
||
"[243 rows x 5 columns]"
|
||
]
|
||
},
|
||
"execution_count": 27,
|
||
"metadata": {},
|
||
"output_type": "execute_result"
|
||
}
|
||
],
|
||
"source": [
|
||
"\n",
|
||
"load_dataset_2(\"7\", \"countries\")"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "markdown",
|
||
"id": "ca2c8b6a-4965-422e-ba7c-66423a464fc1",
|
||
"metadata": {
|
||
"jp-MarkdownHeadingCollapsed": true
|
||
},
|
||
"source": [
"## Bases communes au type Musée"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": null,
|
||
"id": "f8f988fb-5aab-4b57-80d1-e242f7e5b384",
|
||
"metadata": {},
|
||
"outputs": [],
|
||
"source": [
|
||
"companies = {'musee' : ['1', '2', '3', '4', '101'],\n",
|
||
" 'sport': ['5', '6', '7', '8', '9'],\n",
|
||
" 'musique' : ['10', '11', '12', '13', '14']}"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": null,
|
||
"id": "dbce1124-9a22-4502-a47a-fc3d0e2db70b",
|
||
"metadata": {},
|
||
"outputs": [],
|
||
"source": [
|
||
"companies['musee']"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": null,
|
||
"id": "5080f66e-f779-410a-876d-b4fe2795e17e",
|
||
"metadata": {},
|
||
"outputs": [],
|
||
"source": [
"# List, for every company of every type, the table names found in its S3 folder.\n",
"# All types are scanned (not only 'musique') because the cells below read\n",
"# base_1 ... base_101 as well as base_10 ... base_14.\n",
"for company_ids in companies.values():\n",
"    for i in company_ids:\n",
"        BUCKET = \"bdc2324-data/\"+i\n",
"        liste_base = []\n",
"        for base in fs.ls(BUCKET):\n",
"            # Keep the <name> part of paths ending in /<digits>/<digits><name>.csv\n",
"            match = re.search(r'\\/(\\d+)\\/(\\d+)([a-zA-Z_]+)\\.csv$', base)\n",
"            if match:\n",
"                liste_base.append(match.group(3))\n",
"        # Published as a global named base_<id>, the name the following cells use\n",
"        globals()['base_'+i] = liste_base\n"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": null,
|
||
"id": "abd477e1-7479-4c88-a5aa-f987af3f5b79",
|
||
"metadata": {},
|
||
"outputs": [],
|
||
"source": [
"# Table names available for all five museum companies (1, 2, 3, 4 and 101)\n",
"intersection = set.intersection(set(base_1), base_2, base_3, base_4, base_101)\n",
"\n",
"# Convert the resulting set to a list\n",
"intersection_liste = [*intersection]\n",
"\n",
"print(intersection_liste)"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": null,
|
||
"id": "8d93888f-a511-4ee5-8bc3-d5173a7f119e",
|
||
"metadata": {},
|
||
"outputs": [],
|
||
"source": [
"# Table names available for all five music companies (10 to 14)\n",
"intersection = set.intersection(set(base_10), base_12, base_13, base_14, base_11)\n",
"\n",
"# Convert the resulting set to a list\n",
"intersection_liste = [*intersection]\n",
"\n",
"print(intersection_liste)"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": null,
|
||
"id": "10e89669-42bb-4652-a4bc-1a3d1caf4d1a",
|
||
"metadata": {},
|
||
"outputs": [],
|
||
"source": [
|
||
"len(intersection_liste)"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": null,
|
||
"id": "7d058b21-a538-4f59-aefb-ef7966f73fdc",
|
||
"metadata": {},
|
||
"outputs": [],
|
||
"source": [
|
||
"df1_tags = load_dataset_2(\"1\", \"tags\")"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": null,
|
||
"id": "aa441f99-733c-4675-8676-bed4682d3324",
|
||
"metadata": {},
|
||
"outputs": [],
|
||
"source": [
|
||
"df1_structure_tag_mappings = load_dataset_2(\"1\", 'structure_tag_mappings')"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": null,
|
||
"id": "6767a750-14a4-4c05-903e-d2f07170825b",
|
||
"metadata": {},
|
||
"outputs": [],
|
||
"source": [
|
||
"df1_customersplus = load_dataset_2(\"1\", \"customersplus\")"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": null,
|
||
"id": "125e9145-a815-46fd-bdf4-07589508b259",
|
||
"metadata": {},
|
||
"outputs": [],
|
||
"source": [
|
||
"df1_customersplus.groupby('structure_id')['id'].count().reset_index().sort_values('id', ascending=False).head(20)"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": null,
|
||
"id": "c17a6976-792f-474d-bcff-c89396eddb3f",
|
||
"metadata": {},
|
||
"outputs": [],
|
||
"source": [
|
||
"df1_customersplus['structure_id'].isna().sum() / len(df1_customersplus['structure_id'])"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": null,
|
||
"id": "ecfc155a-cb42-46ec-8da5-33fdcd087355",
|
||
"metadata": {},
|
||
"outputs": [],
|
||
"source": [
|
||
"len(df1_structure_tag_mappings)"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": null,
|
||
"id": "071410b8-950d-4fcc-b2b9-57415253c286",
|
||
"metadata": {},
|
||
"outputs": [],
|
||
"source": [
|
||
"df1_structure_tag_mappings.groupby('tag_id')['structure_id'].count().reset_index().sort_values('structure_id', ascending=False).head(20)"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": null,
|
||
"id": "f48d27a9-14e4-4bb9-a60a-73e9438b58fc",
|
||
"metadata": {},
|
||
"outputs": [],
|
||
"source": [
"pd.DataFrame.sort_values?"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": null,
|
||
"id": "14eaa0ea-02cc-430b-ab9b-38e6637810c3",
|
||
"metadata": {},
|
||
"outputs": [],
|
||
"source": [
"def info_colonnes_dataframe(df):\n",
"    \"\"\"\n",
"    Describe every column of a DataFrame.\n",
"\n",
"    Returns a new DataFrame with one row per column of `df` and three fields:\n",
"    'Nom_colonne' (column label), 'Type_colonne' (dtype rendered as a string)\n",
"    and 'Taux_NA' (share of missing values, expressed as a percentage).\n",
"    \"\"\"\n",
"    lignes = [\n",
"        {\n",
"            'Nom_colonne': etiquette,\n",
"            'Type_colonne': str(valeurs.dtype),\n",
"            # isna().mean() is the fraction of missing values; scale it to percent\n",
"            'Taux_NA': valeurs.isna().mean() * 100,\n",
"        }\n",
"        for etiquette, valeurs in df.items()\n",
"    ]\n",
"    return pd.DataFrame(lignes)"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": null,
|
||
"id": "6b031c32-d4c8-42a5-9a71-a7810f9bf8d8",
|
||
"metadata": {
|
||
"scrolled": true
|
||
},
|
||
"outputs": [],
|
||
"source": [
|
||
"info_colonnes_dataframe(df1_tags)"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": null,
|
||
"id": "e1a87f27-c4d4-4832-ac20-0c3c54aa4980",
|
||
"metadata": {},
|
||
"outputs": [],
|
||
"source": [
|
||
"info_colonnes_dataframe(df1_structure_tag_mappings)"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": null,
|
||
"id": "fa5c65a8-2f74-4f3f-85fc-9ac91e0bb361",
|
||
"metadata": {},
|
||
"outputs": [],
|
||
"source": [
|
||
"pd.set_option('display.max_colwidth', None)\n",
|
||
"\n",
|
||
"print(df1_tags['name'])"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": null,
|
||
"id": "a59bf932-5b54-4600-81f5-c55ac93ae510",
|
||
"metadata": {},
|
||
"outputs": [],
|
||
"source": [
|
||
"pd.set_option('display.max_rows', None)"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": null,
|
||
"id": "a4ab298e-2cae-4865-9f00-4caff5f75ea1",
|
||
"metadata": {
|
||
"scrolled": true
|
||
},
|
||
"outputs": [],
|
||
"source": [
|
||
"print(df1_tags['name'])"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "markdown",
|
||
"id": "76bffba1-5f7e-4308-9224-437ca66148f8",
|
||
"metadata": {
|
||
"jp-MarkdownHeadingCollapsed": true
|
||
},
|
||
"source": [
|
||
"## KPI sur target_type"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": null,
|
||
"id": "d91d5895",
|
||
"metadata": {},
|
||
"outputs": [],
|
||
"source": [
|
||
"pd.set_option('display.max_colwidth', None)\n"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "markdown",
|
||
"id": "c58b17d3",
|
||
"metadata": {},
|
||
"source": [
"Raisonnement : on prend les target_type qui représentent 90 % des clients et on construit des catégories à partir de ceux-ci."
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": null,
|
||
"id": "d74426b3",
|
||
"metadata": {},
|
||
"outputs": [],
|
||
"source": [
|
||
"targets = load_dataset_2(\"3\", \"targets\")\n",
|
||
"target_types = load_dataset_2(\"3\", \"target_types\")\n",
|
||
"\n",
|
||
"# target_all = pd.merge(targets, target_types, left_on= 'target_type_id', right_on= 'id' ,how = 'inner')\n"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": null,
|
||
"id": "6930bff5",
|
||
"metadata": {},
|
||
"outputs": [],
|
||
"source": [
"def print_main_target(tenant_id, nb_print = 40):\n",
"    \"\"\"\n",
"    Print targeting counts for one tenant and return its most frequent targets.\n",
"\n",
"    The data comes from display_databases(tenant_id, \"target_information\"), a\n",
"    helper defined outside this cell (presumably returning a DataFrame; the code\n",
"    relies on its 'customer_id' and 'target_name' columns -- to be confirmed).\n",
"\n",
"    The returned table holds the first nb_print targets, most frequent first:\n",
"    - customer_id: rows of the target divided by the number of distinct customers;\n",
"    - cumulative_customers: running total of rows divided by the total row count.\n",
"    \"\"\"\n",
"    df_target = display_databases(tenant_id, \"target_information\")\n",
"    nb_lignes = len(df_target)\n",
"\n",
"    print('Nombre de ciblage : ', nb_lignes)\n",
"    nb_customers = df_target['customer_id'].nunique()\n",
"    print('Nombre de client avec étiquette target : ', nb_customers)\n",
"\n",
"    # Row count per target, largest first\n",
"    comptage = df_target.groupby(\"target_name\")['customer_id'].count()\n",
"    par_target = comptage.reset_index().sort_values('customer_id', ascending=False)\n",
"\n",
"    # The cumulative share must be computed before customer_id is rescaled\n",
"    par_target['cumulative_customers'] = par_target['customer_id'].cumsum()/nb_lignes\n",
"    par_target['customer_id'] = par_target['customer_id']/nb_customers\n",
"\n",
"    return par_target.head(nb_print)"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": null,
|
||
"id": "1e7ee1a0",
|
||
"metadata": {},
|
||
"outputs": [],
|
||
"source": [
|
||
"print_main_target('1')"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": null,
|
||
"id": "b57a28ac",
|
||
"metadata": {},
|
||
"outputs": [],
|
||
"source": [
|
||
"print_main_target('2', 25)"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": null,
|
||
"id": "9a65991f",
|
||
"metadata": {},
|
||
"outputs": [],
|
||
"source": [
|
||
"print_main_target('3', 40)"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": null,
|
||
"id": "c66a4dc1",
|
||
"metadata": {},
|
||
"outputs": [],
|
||
"source": [
|
||
"pd.set_option('display.max_rows', None)\n"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": null,
|
||
"id": "5f34b8bf",
|
||
"metadata": {},
|
||
"outputs": [],
|
||
"source": [
|
||
"print_main_target('4', 80)"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": null,
|
||
"id": "40fe3676",
|
||
"metadata": {},
|
||
"outputs": [],
|
||
"source": [
|
||
"print_main_target('101', 100)"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "markdown",
|
||
"id": "605cced5-052f-4a99-ac26-020c5d2ab633",
|
||
"metadata": {},
|
||
"source": [
|
||
"## KPI sur tags"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 28,
|
||
"id": "916c3e2b-04d3-4877-b894-8f26f10d926e",
|
||
"metadata": {},
|
||
"outputs": [],
|
||
"source": [
|
||
"customersplus = load_dataset_2(\"4\", \"customersplus\")[['id', 'structure_id']]"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 35,
|
||
"id": "46847b24-15a4-464e-969f-f16ed3653f1f",
|
||
"metadata": {},
|
||
"outputs": [],
|
||
"source": [
|
||
"structure_tag_mappings = load_dataset_2('4', \"structure_tag_mappings\")[['structure_id', 'tag_id']]"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 34,
|
||
"id": "3c10c69d-735f-453e-96bf-750697d965d0",
|
||
"metadata": {},
|
||
"outputs": [
|
||
{
|
||
"data": {
|
||
"text/plain": [
|
||
"19427"
|
||
]
|
||
},
|
||
"execution_count": 34,
|
||
"metadata": {},
|
||
"output_type": "execute_result"
|
||
}
|
||
],
|
||
"source": [
|
||
"customersplus[customersplus['structure_id'].notna()]['structure_id'].nunique()"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 32,
|
||
"id": "9b0e77b3-5f16-4484-9564-7d3826583418",
|
||
"metadata": {},
|
||
"outputs": [
|
||
{
|
||
"data": {
|
||
"text/plain": [
|
||
"33645"
|
||
]
|
||
},
|
||
"execution_count": 32,
|
||
"metadata": {},
|
||
"output_type": "execute_result"
|
||
}
|
||
],
|
||
"source": [
|
||
"len(customersplus[customersplus['structure_id'].notna()])"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 36,
|
||
"id": "dfa27722-37f9-435a-8221-8aa6f9a4a107",
|
||
"metadata": {},
|
||
"outputs": [
|
||
{
|
||
"data": {
|
||
"text/plain": [
|
||
"3431"
|
||
]
|
||
},
|
||
"execution_count": 36,
|
||
"metadata": {},
|
||
"output_type": "execute_result"
|
||
}
|
||
],
|
||
"source": [
|
||
"structure_tag_mappings['structure_id'].nunique()"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 26,
|
||
"id": "2daabdd5-31e3-4918-9856-9bbc30cde602",
|
||
"metadata": {},
|
||
"outputs": [],
|
||
"source": [
"def tags_information(tenant_id, first_tags):\n",
"    \"\"\"\n",
"    Summarise how the customers of one tenant are tagged through their structure.\n",
"\n",
"    Loads the tenant's customersplus, tags and structure_tag_mappings tables with\n",
"    load_dataset_2, links every customer to the tags of its structure, prints three\n",
"    coverage statistics and returns the most frequent tags.\n",
"\n",
"    Parameters\n",
"    ----------\n",
"    tenant_id : str\n",
"        Company identifier forwarded to load_dataset_2 (e.g. \"4\").\n",
"    first_tags : int\n",
"        Number of tags (most frequent first) to return.\n",
"\n",
"    Returns\n",
"    -------\n",
"    pandas.DataFrame\n",
"        Columns tag_id, tag_name and customer_id, where customer_id is the number\n",
"        of customer/tag rows of the tag, in decreasing order.\n",
"    \"\"\"\n",
"    customersplus = load_dataset_2(tenant_id, \"customersplus\")[['id', 'structure_id']]\n",
"    customersplus = customersplus.rename(columns = {'id' : 'customer_id'})\n",
"    tags = load_dataset_2(tenant_id, \"tags\")[['id', 'name']]\n",
"    tags = tags.rename(columns = {'id' : 'tag_id', 'name' : 'tag_name'})\n",
"    structure_tag_mappings = load_dataset_2(tenant_id, \"structure_tag_mappings\")[['structure_id', 'tag_id']]\n",
"\n",
"    # Inner joins only: customers whose structure has no tag were dropped by the\n",
"    # tag join anyway, and avoiding the left join keeps tag_id from being upcast\n",
"    # to float by the missing values it introduced.\n",
"    customer_tags = pd.merge(customersplus, structure_tag_mappings, on = 'structure_id', how = 'inner')\n",
"    customer_tags = pd.merge(customer_tags, tags, on = 'tag_id', how = 'inner')\n",
"\n",
"    nb_customers = len(customersplus)\n",
"    nb_customers_with_tag = customer_tags['customer_id'].nunique()\n",
"\n",
"    # Both ratios fall back to NaN instead of raising ZeroDivisionError when the\n",
"    # tenant has no customer or no tagged customer.\n",
"    proportion = nb_customers_with_tag/nb_customers if nb_customers else float('nan')\n",
"    moyenne = len(customer_tags)/nb_customers_with_tag if nb_customers_with_tag else float('nan')\n",
"\n",
"    print('Nombre de client avec tag : ', nb_customers_with_tag)\n",
"    print('Proportion de clients avec tags : ', proportion)\n",
"    print('Moyenne de tags par client : ', moyenne)\n",
"\n",
"    info = customer_tags.groupby(['tag_id', 'tag_name'])['customer_id'].count().reset_index().sort_values('customer_id', ascending = False).head(first_tags)\n",
"\n",
"    return info"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 37,
|
||
"id": "0b9f5f71-a927-4cc8-bb0c-9538e28d3553",
|
||
"metadata": {},
|
||
"outputs": [
|
||
{
|
||
"name": "stdout",
|
||
"output_type": "stream",
|
||
"text": [
|
||
"Nombre de client avec tag : 13320\n",
|
||
"Proportion de clients avec tags : 0.0877089012682233\n",
|
||
"Moyenne de tags par client : 2.1725975975975977\n"
|
||
]
|
||
},
|
||
{
|
||
"data": {
|
||
"text/html": [
|
||
"<div>\n",
|
||
"<style scoped>\n",
|
||
" .dataframe tbody tr th:only-of-type {\n",
|
||
" vertical-align: middle;\n",
|
||
" }\n",
|
||
"\n",
|
||
" .dataframe tbody tr th {\n",
|
||
" vertical-align: top;\n",
|
||
" }\n",
|
||
"\n",
|
||
" .dataframe thead th {\n",
|
||
" text-align: right;\n",
|
||
" }\n",
|
||
"</style>\n",
|
||
"<table border=\"1\" class=\"dataframe\">\n",
|
||
" <thead>\n",
|
||
" <tr style=\"text-align: right;\">\n",
|
||
" <th></th>\n",
|
||
" <th>tag_id</th>\n",
|
||
" <th>tag_name</th>\n",
|
||
" <th>customer_id</th>\n",
|
||
" </tr>\n",
|
||
" </thead>\n",
|
||
" <tbody>\n",
|
||
" <tr>\n",
|
||
" <th>1</th>\n",
|
||
" <td>11029.0</td>\n",
|
||
" <td>individuels</td>\n",
|
||
" <td>3270</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>18</th>\n",
|
||
" <td>11047.0</td>\n",
|
||
" <td>groupes scolaires</td>\n",
|
||
" <td>2417</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>4</th>\n",
|
||
" <td>11033.0</td>\n",
|
||
" <td>association</td>\n",
|
||
" <td>2308</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>0</th>\n",
|
||
" <td>11028.0</td>\n",
|
||
" <td>structures culturelles</td>\n",
|
||
" <td>2011</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>22</th>\n",
|
||
" <td>11051.0</td>\n",
|
||
" <td>etablissement ens scolaire</td>\n",
|
||
" <td>1732</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>7</th>\n",
|
||
" <td>11036.0</td>\n",
|
||
" <td>champ social</td>\n",
|
||
" <td>1603</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>43</th>\n",
|
||
" <td>11072.0</td>\n",
|
||
" <td>etab d'enseignement</td>\n",
|
||
" <td>1036</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>14</th>\n",
|
||
" <td>11043.0</td>\n",
|
||
" <td>etablissement public</td>\n",
|
||
" <td>935</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>6</th>\n",
|
||
" <td>11035.0</td>\n",
|
||
" <td>organisme de tourisme</td>\n",
|
||
" <td>892</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>16</th>\n",
|
||
" <td>11045.0</td>\n",
|
||
" <td>centre de loisirs</td>\n",
|
||
" <td>864</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>44</th>\n",
|
||
" <td>11073.0</td>\n",
|
||
" <td>musée, site & fondation</td>\n",
|
||
" <td>786</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>24</th>\n",
|
||
" <td>11053.0</td>\n",
|
||
" <td>groupes etudiants</td>\n",
|
||
" <td>758</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>3</th>\n",
|
||
" <td>11032.0</td>\n",
|
||
" <td>entreprise</td>\n",
|
||
" <td>750</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>10</th>\n",
|
||
" <td>11039.0</td>\n",
|
||
" <td>etablissement d'enseignement</td>\n",
|
||
" <td>741</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>5</th>\n",
|
||
" <td>11034.0</td>\n",
|
||
" <td>asso. culturelle</td>\n",
|
||
" <td>692</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>15</th>\n",
|
||
" <td>11044.0</td>\n",
|
||
" <td>administration et collectivité</td>\n",
|
||
" <td>676</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>17</th>\n",
|
||
" <td>11046.0</td>\n",
|
||
" <td>tour opérateur</td>\n",
|
||
" <td>642</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>19</th>\n",
|
||
" <td>11048.0</td>\n",
|
||
" <td>entreprises</td>\n",
|
||
" <td>515</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>72</th>\n",
|
||
" <td>11619.0</td>\n",
|
||
" <td>structures culturelles;musée, site & fondation</td>\n",
|
||
" <td>427</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>8</th>\n",
|
||
" <td>11037.0</td>\n",
|
||
" <td>handicap</td>\n",
|
||
" <td>426</td>\n",
|
||
" </tr>\n",
|
||
" </tbody>\n",
|
||
"</table>\n",
|
||
"</div>"
|
||
],
|
||
"text/plain": [
|
||
" tag_id tag_name customer_id\n",
|
||
"1 11029.0 individuels 3270\n",
|
||
"18 11047.0 groupes scolaires 2417\n",
|
||
"4 11033.0 association 2308\n",
|
||
"0 11028.0 structures culturelles 2011\n",
|
||
"22 11051.0 etablissement ens scolaire 1732\n",
|
||
"7 11036.0 champ social 1603\n",
|
||
"43 11072.0 etab d'enseignement 1036\n",
|
||
"14 11043.0 etablissement public 935\n",
|
||
"6 11035.0 organisme de tourisme 892\n",
|
||
"16 11045.0 centre de loisirs 864\n",
|
||
"44 11073.0 musée, site & fondation 786\n",
|
||
"24 11053.0 groupes etudiants 758\n",
|
||
"3 11032.0 entreprise 750\n",
|
||
"10 11039.0 etablissement d'enseignement 741\n",
|
||
"5 11034.0 asso. culturelle 692\n",
|
||
"15 11044.0 administration et collectivité 676\n",
|
||
"17 11046.0 tour opérateur 642\n",
|
||
"19 11048.0 entreprises 515\n",
|
||
"72 11619.0 structures culturelles;musée, site & fondation 427\n",
|
||
"8 11037.0 handicap 426"
|
||
]
|
||
},
|
||
"execution_count": 37,
|
||
"metadata": {},
|
||
"output_type": "execute_result"
|
||
}
|
||
],
|
||
"source": [
|
||
"tags_information(\"1\", 20)"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 43,
|
||
"id": "bd5bef41-1774-4601-86b5-b7c1aea8f1d2",
|
||
"metadata": {},
|
||
"outputs": [
|
||
{
|
||
"name": "stdout",
|
||
"output_type": "stream",
|
||
"text": [
|
||
"Nombre de client avec tag : 5953\n",
|
||
"Proportion de clients avec tags : 0.021598421025897787\n",
|
||
"Moyenne de tags par client : 1.0\n"
|
||
]
|
||
},
|
||
{
|
||
"data": {
|
||
"text/html": [
|
||
"<div>\n",
|
||
"<style scoped>\n",
|
||
" .dataframe tbody tr th:only-of-type {\n",
|
||
" vertical-align: middle;\n",
|
||
" }\n",
|
||
"\n",
|
||
" .dataframe tbody tr th {\n",
|
||
" vertical-align: top;\n",
|
||
" }\n",
|
||
"\n",
|
||
" .dataframe thead th {\n",
|
||
" text-align: right;\n",
|
||
" }\n",
|
||
"</style>\n",
|
||
"<table border=\"1\" class=\"dataframe\">\n",
|
||
" <thead>\n",
|
||
" <tr style=\"text-align: right;\">\n",
|
||
" <th></th>\n",
|
||
" <th>tag_id</th>\n",
|
||
" <th>tag_name</th>\n",
|
||
" <th>customer_id</th>\n",
|
||
" </tr>\n",
|
||
" </thead>\n",
|
||
" <tbody>\n",
|
||
" <tr>\n",
|
||
" <th>0</th>\n",
|
||
" <td>1.0</td>\n",
|
||
" <td>training-sb-ax</td>\n",
|
||
" <td>5</td>\n",
|
||
" </tr>\n",
|
||
" </tbody>\n",
|
||
"</table>\n",
|
||
"</div>"
|
||
],
|
||
"text/plain": [
|
||
" tag_id tag_name customer_id\n",
|
||
"0 1.0 training-sb-ax 5"
|
||
]
|
||
},
|
||
"execution_count": 43,
|
||
"metadata": {},
|
||
"output_type": "execute_result"
|
||
}
|
||
],
|
||
"source": [
|
||
"tags_information(\"2\", 20)"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 39,
|
||
"id": "7c2dc3e6-1418-44db-a8c0-4a9d59ec5232",
|
||
"metadata": {},
|
||
"outputs": [
|
||
{
|
||
"data": {
|
||
"text/html": [
|
||
"<div>\n",
|
||
"<style scoped>\n",
|
||
" .dataframe tbody tr th:only-of-type {\n",
|
||
" vertical-align: middle;\n",
|
||
" }\n",
|
||
"\n",
|
||
" .dataframe tbody tr th {\n",
|
||
" vertical-align: top;\n",
|
||
" }\n",
|
||
"\n",
|
||
" .dataframe thead th {\n",
|
||
" text-align: right;\n",
|
||
" }\n",
|
||
"</style>\n",
|
||
"<table border=\"1\" class=\"dataframe\">\n",
|
||
" <thead>\n",
|
||
" <tr style=\"text-align: right;\">\n",
|
||
" <th></th>\n",
|
||
" <th>id</th>\n",
|
||
" <th>name</th>\n",
|
||
" </tr>\n",
|
||
" </thead>\n",
|
||
" <tbody>\n",
|
||
" <tr>\n",
|
||
" <th>0</th>\n",
|
||
" <td>1</td>\n",
|
||
" <td>training-sb-ax</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>1</th>\n",
|
||
" <td>2</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" </tr>\n",
|
||
" </tbody>\n",
|
||
"</table>\n",
|
||
"</div>"
|
||
],
|
||
"text/plain": [
|
||
" id name\n",
|
||
"0 1 training-sb-ax\n",
|
||
"1 2 NaN"
|
||
]
|
||
},
|
||
"execution_count": 39,
|
||
"metadata": {},
|
||
"output_type": "execute_result"
|
||
}
|
||
],
|
||
"source": [
|
||
"load_dataset_2(\"2\", \"tags\")[['id', 'name']]"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 42,
|
||
"id": "c7b2c670-7122-4f67-b1aa-8c80a10f16d8",
|
||
"metadata": {},
|
||
"outputs": [
|
||
{
|
||
"name": "stdout",
|
||
"output_type": "stream",
|
||
"text": [
|
||
"Nombre de client avec tag : 23659\n",
|
||
"Proportion de clients avec tags : 0.09207484608139978\n",
|
||
"Moyenne de tags par client : 3.0620482691576143\n"
|
||
]
|
||
},
|
||
{
|
||
"data": {
|
||
"text/html": [
|
||
"<div>\n",
|
||
"<style scoped>\n",
|
||
" .dataframe tbody tr th:only-of-type {\n",
|
||
" vertical-align: middle;\n",
|
||
" }\n",
|
||
"\n",
|
||
" .dataframe tbody tr th {\n",
|
||
" vertical-align: top;\n",
|
||
" }\n",
|
||
"\n",
|
||
" .dataframe thead th {\n",
|
||
" text-align: right;\n",
|
||
" }\n",
|
||
"</style>\n",
|
||
"<table border=\"1\" class=\"dataframe\">\n",
|
||
" <thead>\n",
|
||
" <tr style=\"text-align: right;\">\n",
|
||
" <th></th>\n",
|
||
" <th>tag_id</th>\n",
|
||
" <th>tag_name</th>\n",
|
||
" <th>customer_id</th>\n",
|
||
" </tr>\n",
|
||
" </thead>\n",
|
||
" <tbody>\n",
|
||
" <tr>\n",
|
||
" <th>164</th>\n",
|
||
" <td>44539.0</td>\n",
|
||
" <td>*individuel/particulier</td>\n",
|
||
" <td>13148</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>30</th>\n",
|
||
" <td>26926.0</td>\n",
|
||
" <td>ce</td>\n",
|
||
" <td>3216</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>14</th>\n",
|
||
" <td>6995.0</td>\n",
|
||
" <td>college</td>\n",
|
||
" <td>2126</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>16</th>\n",
|
||
" <td>7028.0</td>\n",
|
||
" <td>lycee</td>\n",
|
||
" <td>1577</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>154</th>\n",
|
||
" <td>44524.0</td>\n",
|
||
" <td>iraiser</td>\n",
|
||
" <td>1453</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>2</th>\n",
|
||
" <td>6714.0</td>\n",
|
||
" <td>ecole primaire</td>\n",
|
||
" <td>1200</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>155</th>\n",
|
||
" <td>44525.0</td>\n",
|
||
" <td>bp</td>\n",
|
||
" <td>1094</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>15</th>\n",
|
||
" <td>7024.0</td>\n",
|
||
" <td>centre de loisirs</td>\n",
|
||
" <td>1080</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>153</th>\n",
|
||
" <td>44515.0</td>\n",
|
||
" <td>entreprise</td>\n",
|
||
" <td>998</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>126</th>\n",
|
||
" <td>44039.0</td>\n",
|
||
" <td>ca fondation d'aumale</td>\n",
|
||
" <td>891</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>152</th>\n",
|
||
" <td>44514.0</td>\n",
|
||
" <td>particulier</td>\n",
|
||
" <td>838</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>36</th>\n",
|
||
" <td>43663.0</td>\n",
|
||
" <td>président</td>\n",
|
||
" <td>816</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>76</th>\n",
|
||
" <td>43703.0</td>\n",
|
||
" <td>directeur</td>\n",
|
||
" <td>812</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>158</th>\n",
|
||
" <td>44528.0</td>\n",
|
||
" <td>dc</td>\n",
|
||
" <td>807</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>54</th>\n",
|
||
" <td>43681.0</td>\n",
|
||
" <td>présidente</td>\n",
|
||
" <td>805</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>149</th>\n",
|
||
" <td>44511.0</td>\n",
|
||
" <td>entreprise (financier)</td>\n",
|
||
" <td>805</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>90</th>\n",
|
||
" <td>43718.0</td>\n",
|
||
" <td>conseillère régionale déléguée titulaire</td>\n",
|
||
" <td>804</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>40</th>\n",
|
||
" <td>43667.0</td>\n",
|
||
" <td>directeur de l'agence</td>\n",
|
||
" <td>801</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>78</th>\n",
|
||
" <td>43705.0</td>\n",
|
||
" <td>sous-préfet</td>\n",
|
||
" <td>798</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>100</th>\n",
|
||
" <td>43728.0</td>\n",
|
||
" <td>chargée de mission paysage</td>\n",
|
||
" <td>797</td>\n",
|
||
" </tr>\n",
|
||
" </tbody>\n",
|
||
"</table>\n",
|
||
"</div>"
|
||
],
|
||
"text/plain": [
|
||
" tag_id tag_name customer_id\n",
|
||
"164 44539.0 *individuel/particulier 13148\n",
|
||
"30 26926.0 ce 3216\n",
|
||
"14 6995.0 college 2126\n",
|
||
"16 7028.0 lycee 1577\n",
|
||
"154 44524.0 iraiser 1453\n",
|
||
"2 6714.0 ecole primaire 1200\n",
|
||
"155 44525.0 bp 1094\n",
|
||
"15 7024.0 centre de loisirs 1080\n",
|
||
"153 44515.0 entreprise 998\n",
|
||
"126 44039.0 ca fondation d'aumale 891\n",
|
||
"152 44514.0 particulier 838\n",
|
||
"36 43663.0 président 816\n",
|
||
"76 43703.0 directeur 812\n",
|
||
"158 44528.0 dc 807\n",
|
||
"54 43681.0 présidente 805\n",
|
||
"149 44511.0 entreprise (financier) 805\n",
|
||
"90 43718.0 conseillère régionale déléguée titulaire 804\n",
|
||
"40 43667.0 directeur de l'agence 801\n",
|
||
"78 43705.0 sous-préfet 798\n",
|
||
"100 43728.0 chargée de mission paysage 797"
|
||
]
|
||
},
|
||
"execution_count": 42,
|
||
"metadata": {},
|
||
"output_type": "execute_result"
|
||
}
|
||
],
|
||
"source": [
|
||
"tags_information(\"3\", 20)"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 40,
|
||
"id": "76639995-252d-4a58-83d8-c0c00900c3a9",
|
||
"metadata": {},
|
||
"outputs": [
|
||
{
|
||
"name": "stdout",
|
||
"output_type": "stream",
|
||
"text": [
|
||
"Nombre de client avec tag : 10495\n",
|
||
"Proportion de clients avec tags : 0.03271416949025744\n",
|
||
"Moyenne de tags par client : 5.298427822772749\n"
|
||
]
|
||
},
|
||
{
|
||
"data": {
|
||
"text/html": [
|
||
"<div>\n",
|
||
"<style scoped>\n",
|
||
" .dataframe tbody tr th:only-of-type {\n",
|
||
" vertical-align: middle;\n",
|
||
" }\n",
|
||
"\n",
|
||
" .dataframe tbody tr th {\n",
|
||
" vertical-align: top;\n",
|
||
" }\n",
|
||
"\n",
|
||
" .dataframe thead th {\n",
|
||
" text-align: right;\n",
|
||
" }\n",
|
||
"</style>\n",
|
||
"<table border=\"1\" class=\"dataframe\">\n",
|
||
" <thead>\n",
|
||
" <tr style=\"text-align: right;\">\n",
|
||
" <th></th>\n",
|
||
" <th>tag_id</th>\n",
|
||
" <th>tag_name</th>\n",
|
||
" <th>customer_id</th>\n",
|
||
" </tr>\n",
|
||
" </thead>\n",
|
||
" <tbody>\n",
|
||
" <tr>\n",
|
||
" <th>147</th>\n",
|
||
" <td>298.0</td>\n",
|
||
" <td>jhima</td>\n",
|
||
" <td>4219</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>146</th>\n",
|
||
" <td>297.0</td>\n",
|
||
" <td>colloque algérie</td>\n",
|
||
" <td>3851</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>142</th>\n",
|
||
" <td>292.0</td>\n",
|
||
" <td>i&ma</td>\n",
|
||
" <td>3826</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>154</th>\n",
|
||
" <td>305.0</td>\n",
|
||
" <td>mardis de la philo</td>\n",
|
||
" <td>3674</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>150</th>\n",
|
||
" <td>301.0</td>\n",
|
||
" <td>le grand continant</td>\n",
|
||
" <td>3670</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>144</th>\n",
|
||
" <td>295.0</td>\n",
|
||
" <td>araborama</td>\n",
|
||
" <td>3669</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>155</th>\n",
|
||
" <td>306.0</td>\n",
|
||
" <td>marie descourtieux</td>\n",
|
||
" <td>3669</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>145</th>\n",
|
||
" <td>296.0</td>\n",
|
||
" <td>c'était la guerre d'algérie</td>\n",
|
||
" <td>3669</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>141</th>\n",
|
||
" <td>291.0</td>\n",
|
||
" <td>araborama 3</td>\n",
|
||
" <td>3669</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>102</th>\n",
|
||
" <td>198.0</td>\n",
|
||
" <td>association de collectivités territoriales spé...</td>\n",
|
||
" <td>3669</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>143</th>\n",
|
||
" <td>294.0</td>\n",
|
||
" <td>arabofolies</td>\n",
|
||
" <td>3669</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>103</th>\n",
|
||
" <td>199.0</td>\n",
|
||
" <td>rassemble les 11 000 élus de toute la france a...</td>\n",
|
||
" <td>3669</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>2</th>\n",
|
||
" <td>50.0</td>\n",
|
||
" <td>association</td>\n",
|
||
" <td>463</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>6</th>\n",
|
||
" <td>54.0</td>\n",
|
||
" <td>collège</td>\n",
|
||
" <td>446</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>1</th>\n",
|
||
" <td>49.0</td>\n",
|
||
" <td>ecole</td>\n",
|
||
" <td>374</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>7</th>\n",
|
||
" <td>55.0</td>\n",
|
||
" <td>lycée</td>\n",
|
||
" <td>275</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>5</th>\n",
|
||
" <td>53.0</td>\n",
|
||
" <td>centre social</td>\n",
|
||
" <td>200</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>53</th>\n",
|
||
" <td>130.0</td>\n",
|
||
" <td>cultures et arts</td>\n",
|
||
" <td>141</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>3</th>\n",
|
||
" <td>51.0</td>\n",
|
||
" <td>mairie</td>\n",
|
||
" <td>136</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>13</th>\n",
|
||
" <td>64.0</td>\n",
|
||
" <td>formation_ima_ax</td>\n",
|
||
" <td>87</td>\n",
|
||
" </tr>\n",
|
||
" </tbody>\n",
|
||
"</table>\n",
|
||
"</div>"
|
||
],
|
||
"text/plain": [
|
||
" tag_id tag_name customer_id\n",
|
||
"147 298.0 jhima 4219\n",
|
||
"146 297.0 colloque algérie 3851\n",
|
||
"142 292.0 i&ma 3826\n",
|
||
"154 305.0 mardis de la philo 3674\n",
|
||
"150 301.0 le grand continant 3670\n",
|
||
"144 295.0 araborama 3669\n",
|
||
"155 306.0 marie descourtieux 3669\n",
|
||
"145 296.0 c'était la guerre d'algérie 3669\n",
|
||
"141 291.0 araborama 3 3669\n",
|
||
"102 198.0 association de collectivités territoriales spé... 3669\n",
|
||
"143 294.0 arabofolies 3669\n",
|
||
"103 199.0 rassemble les 11 000 élus de toute la france a... 3669\n",
|
||
"2 50.0 association 463\n",
|
||
"6 54.0 collège 446\n",
|
||
"1 49.0 ecole 374\n",
|
||
"7 55.0 lycée 275\n",
|
||
"5 53.0 centre social 200\n",
|
||
"53 130.0 cultures et arts 141\n",
|
||
"3 51.0 mairie 136\n",
|
||
"13 64.0 formation_ima_ax 87"
|
||
]
|
||
},
|
||
"execution_count": 40,
|
||
"metadata": {},
|
||
"output_type": "execute_result"
|
||
}
|
||
],
|
||
"source": [
|
||
"tags_information(\"4\", 20)"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 41,
|
||
"id": "07e91791-d4d4-42b1-ac18-22d3b0b9f7bd",
|
||
"metadata": {},
|
||
"outputs": [
|
||
{
|
||
"name": "stdout",
|
||
"output_type": "stream",
|
||
"text": [
|
||
"Nombre de client avec tag : 532342\n",
|
||
"Proportion de clients avec tags : 0.18660686931118298\n",
|
||
"Moyenne de tags par client : 24.114082676174338\n"
|
||
]
|
||
},
|
||
{
|
||
"data": {
|
||
"text/html": [
|
||
"<div>\n",
|
||
"<style scoped>\n",
|
||
" .dataframe tbody tr th:only-of-type {\n",
|
||
" vertical-align: middle;\n",
|
||
" }\n",
|
||
"\n",
|
||
" .dataframe tbody tr th {\n",
|
||
" vertical-align: top;\n",
|
||
" }\n",
|
||
"\n",
|
||
" .dataframe thead th {\n",
|
||
" text-align: right;\n",
|
||
" }\n",
|
||
"</style>\n",
|
||
"<table border=\"1\" class=\"dataframe\">\n",
|
||
" <thead>\n",
|
||
" <tr style=\"text-align: right;\">\n",
|
||
" <th></th>\n",
|
||
" <th>tag_id</th>\n",
|
||
" <th>tag_name</th>\n",
|
||
" <th>customer_id</th>\n",
|
||
" </tr>\n",
|
||
" </thead>\n",
|
||
" <tbody>\n",
|
||
" <tr>\n",
|
||
" <th>20</th>\n",
|
||
" <td>349.0</td>\n",
|
||
" <td>clients internet</td>\n",
|
||
" <td>517491</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>24</th>\n",
|
||
" <td>356.0</td>\n",
|
||
" <td>associations / clubs</td>\n",
|
||
" <td>495520</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>5</th>\n",
|
||
" <td>10.0</td>\n",
|
||
" <td>agence de voyages</td>\n",
|
||
" <td>493774</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>32</th>\n",
|
||
" <td>410.0</td>\n",
|
||
" <td>guides conférenciers</td>\n",
|
||
" <td>493378</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>26</th>\n",
|
||
" <td>360.0</td>\n",
|
||
" <td>groupe amis ou famille</td>\n",
|
||
" <td>493021</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>23</th>\n",
|
||
" <td>354.0</td>\n",
|
||
" <td>ce / entreprises</td>\n",
|
||
" <td>493016</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>8</th>\n",
|
||
" <td>17.0</td>\n",
|
||
" <td>association/club</td>\n",
|
||
" <td>493008</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>1</th>\n",
|
||
" <td>3.0</td>\n",
|
||
" <td>c.e. / entreprise</td>\n",
|
||
" <td>492656</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>6</th>\n",
|
||
" <td>11.0</td>\n",
|
||
" <td>college</td>\n",
|
||
" <td>492552</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>13</th>\n",
|
||
" <td>69.0</td>\n",
|
||
" <td>tour operator</td>\n",
|
||
" <td>492549</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>4</th>\n",
|
||
" <td>9.0</td>\n",
|
||
" <td>ecole primaire</td>\n",
|
||
" <td>492540</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>31</th>\n",
|
||
" <td>379.0</td>\n",
|
||
" <td>parent goûter anniversaire</td>\n",
|
||
" <td>492468</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>30</th>\n",
|
||
" <td>364.0</td>\n",
|
||
" <td>institutions</td>\n",
|
||
" <td>492364</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>2</th>\n",
|
||
" <td>6.0</td>\n",
|
||
" <td>institution</td>\n",
|
||
" <td>492321</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>18</th>\n",
|
||
" <td>186.0</td>\n",
|
||
" <td>autocaristes</td>\n",
|
||
" <td>492153</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>7</th>\n",
|
||
" <td>13.0</td>\n",
|
||
" <td>enseignement superieur</td>\n",
|
||
" <td>492131</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>25</th>\n",
|
||
" <td>359.0</td>\n",
|
||
" <td>hotels / campings</td>\n",
|
||
" <td>492078</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>42</th>\n",
|
||
" <td>7186.0</td>\n",
|
||
" <td>individuel</td>\n",
|
||
" <td>491913</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>3</th>\n",
|
||
" <td>7.0</td>\n",
|
||
" <td>groupe amis / famille</td>\n",
|
||
" <td>491900</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>0</th>\n",
|
||
" <td>2.0</td>\n",
|
||
" <td>client internet</td>\n",
|
||
" <td>491896</td>\n",
|
||
" </tr>\n",
|
||
" </tbody>\n",
|
||
"</table>\n",
|
||
"</div>"
|
||
],
|
||
"text/plain": [
|
||
" tag_id tag_name customer_id\n",
|
||
"20 349.0 clients internet 517491\n",
|
||
"24 356.0 associations / clubs 495520\n",
|
||
"5 10.0 agence de voyages 493774\n",
|
||
"32 410.0 guides conférenciers 493378\n",
|
||
"26 360.0 groupe amis ou famille 493021\n",
|
||
"23 354.0 ce / entreprises 493016\n",
|
||
"8 17.0 association/club 493008\n",
|
||
"1 3.0 c.e. / entreprise 492656\n",
|
||
"6 11.0 college 492552\n",
|
||
"13 69.0 tour operator 492549\n",
|
||
"4 9.0 ecole primaire 492540\n",
|
||
"31 379.0 parent goûter anniversaire 492468\n",
|
||
"30 364.0 institutions 492364\n",
|
||
"2 6.0 institution 492321\n",
|
||
"18 186.0 autocaristes 492153\n",
|
||
"7 13.0 enseignement superieur 492131\n",
|
||
"25 359.0 hotels / campings 492078\n",
|
||
"42 7186.0 individuel 491913\n",
|
||
"3 7.0 groupe amis / famille 491900\n",
|
||
"0 2.0 client internet 491896"
|
||
]
|
||
},
|
||
"execution_count": 41,
|
||
"metadata": {},
|
||
"output_type": "execute_result"
|
||
}
|
||
],
|
||
"source": [
|
||
"tags_information(\"101\", 20)"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "markdown",
|
||
"id": "1ede9eaa-7f0a-4856-9349-b2747d6a4901",
|
||
"metadata": {},
|
||
"source": [
|
||
"# Fin travail 25/02"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "markdown",
|
||
"id": "c437eaec",
|
||
"metadata": {
|
||
"jp-MarkdownHeadingCollapsed": true
|
||
},
|
||
"source": [
|
||
"# Exemple sur Company 1"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "markdown",
|
||
"id": "a1c1fc39",
|
||
"metadata": {},
|
||
"source": [
|
||
"## Chargement données"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 3,
|
||
"id": "66f8c17b",
|
||
"metadata": {},
|
||
"outputs": [],
|
||
"source": [
|
||
"BUCKET = \"bdc2324-data/1\"\n",
|
||
"liste_database = fs.ls(BUCKET)"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 5,
|
||
"id": "c08e6798",
|
||
"metadata": {},
|
||
"outputs": [],
|
||
"source": [
|
||
"liste_database_select = ['suppliers', 'ticket', 'purchase', 'consumption', 'type_ofs']\n",
|
||
"\n",
|
||
"# Keep only the paths that contain at least one of the selected table names\n",
|
||
"liste_database_filtered = []\n",
|
||
"for database_path in liste_database:\n",
|
||
"    if any(table_name in database_path for table_name in liste_database_select):\n",
|
||
"        liste_database_filtered.append(database_path)\n",
|
||
"\n",
|
||
"# Show the result\n",
|
||
"print(liste_database_filtered)"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 6,
|
||
"id": "675f518d",
|
||
"metadata": {},
|
||
"outputs": [
|
||
{
|
||
"name": "stderr",
|
||
"output_type": "stream",
|
||
"text": [
|
||
"/tmp/ipykernel_15285/4081512283.py:10: DtypeWarning: Columns (1) have mixed types. Specify dtype option on import or set low_memory=False.\n",
|
||
" df = pd.read_csv(file_in)\n"
|
||
]
|
||
}
|
||
],
|
||
"source": [
|
||
"# Load every CSV file listed for the bucket into a global dataframe named df<client>_<table>\n",
|
||
"files_path = liste_database\n",
|
||
"\n",
|
||
"client_number = files_path[0].split(\"/\")[1]\n",
|
||
"df_prefix = \"df\" + str(client_number) + \"_\"\n",
|
||
"\n",
|
||
"for current_path in files_path:\n",
|
||
"    # Expected path ending: /<digits>/<digits><table_name>.csv\n",
|
||
"    # Check the name before downloading so an unexpected file is reported instead of\n",
|
||
"    # raising AttributeError on the missing match\n",
|
||
"    name_match = re.search(r'\\/(\\d+)\\/(\\d+)([a-zA-Z_]+)\\.csv$', current_path)\n",
|
||
"    if name_match is None:\n",
|
||
"        print('Skipped (unexpected file name) : ', current_path)\n",
|
||
"        continue\n",
|
||
"    with fs.open(current_path, mode=\"rb\") as file_in:\n",
|
||
"        # low_memory=False infers each column dtype from the whole file, which avoids\n",
|
||
"        # the DtypeWarning raised for mixed-type columns\n",
|
||
"        df = pd.read_csv(file_in, low_memory=False)\n",
|
||
"    # the pattern of the name is df1xxx\n",
|
||
"    nom_dataframe = df_prefix + name_match.group(3)\n",
|
||
"    globals()[nom_dataframe] = df"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "markdown",
|
||
"id": "e855f403",
|
||
"metadata": {},
|
||
"source": [
|
||
"## customersplus.csv"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 22,
|
||
"id": "91a8f8c4",
|
||
"metadata": {},
|
||
"outputs": [],
|
||
"source": [
|
||
"# DataFrame.info() prints its summary and returns None, so wrapping it in pd.DataFrame only built an empty frame\n",
|
||
"df1_customersplus.info()"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 31,
|
||
"id": "2fda171d",
|
||
"metadata": {},
|
||
"outputs": [],
|
||
"source": [
|
||
"def info_colonnes_dataframe(df):\n",
|
||
"    \"\"\"Summarise every column of df: its name, its dtype as a string and its share of missing values.\n",
|
||
"\n",
|
||
"    Returns a DataFrame with one row per column of df and the columns\n",
|
||
"    Nom_colonne, Type_colonne and Taux_NA (missing values, in percent).\n",
|
||
"    \"\"\"\n",
|
||
"    # One record per column, in the order of df\n",
|
||
"    column_summaries = [\n",
|
||
"        {\n",
|
||
"            'Nom_colonne': column_label,\n",
|
||
"            'Type_colonne': str(column_values.dtype),\n",
|
||
"            'Taux_NA': column_values.isna().mean() * 100\n",
|
||
"        }\n",
|
||
"        for column_label, column_values in df.items()\n",
|
||
"    ]\n",
|
||
"\n",
|
||
"    return pd.DataFrame(column_summaries)"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 35,
|
||
"id": "205eeeab",
|
||
"metadata": {},
|
||
"outputs": [],
|
||
"source": [
|
||
"def cleaning_date(df, column_name):\n",
|
||
"    \"\"\"\n",
|
||
"    Convert one column of a DataFrame to UTC datetimes, parsing the values with the ISO8601 format.\n",
|
||
"\n",
|
||
"    Parameters:\n",
|
||
"    - df: DataFrame\n",
|
||
"        The DataFrame holding the column to convert; the column is replaced on this object.\n",
|
||
"    - column_name: str\n",
|
||
"        The name of the column to convert.\n",
|
||
"\n",
|
||
"    Returns:\n",
|
||
"    - DataFrame\n",
|
||
"        The same DataFrame object, with the column converted.\n",
|
||
"    \"\"\"\n",
|
||
"    parsed_dates = pd.to_datetime(df[column_name], utc = True, format = 'ISO8601')\n",
|
||
"    df[column_name] = parsed_dates\n",
|
||
"    return df"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 32,
|
||
"id": "634282c5",
|
||
"metadata": {},
|
||
"outputs": [],
|
||
"source": [
|
||
"a = info_colonnes_dataframe(df1_customersplus)"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 33,
|
||
"id": "0e8d4133",
|
||
"metadata": {},
|
||
"outputs": [],
|
||
"source": [
|
||
"a"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 16,
|
||
"id": "1268ad5a",
|
||
"metadata": {},
|
||
"outputs": [],
|
||
"source": [
|
||
"a = pd.DataFrame(df1_customersplus.isna().sum()/len(df1_customersplus)*100)"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 40,
|
||
"id": "bd41dc80",
|
||
"metadata": {},
|
||
"outputs": [],
|
||
"source": [
|
||
"# Variable selection: drop the identifying / unused columns and rename the key\n",
|
||
"# drop() and rename() each return a new frame, so df1_customersplus itself is left untouched\n",
|
||
"df1_customersplus_clean = (\n",
|
||
"    df1_customersplus\n",
|
||
"    .drop(['lastname', 'firstname', 'email', 'civility', 'note', 'created_at', 'updated_at', 'deleted_at', 'extra', 'reference', 'extra_field', 'identifier', 'need_reload', 'preferred_category', 'preferred_supplier', 'preferred_formula', 'zipcode', 'last_visiting_date'], axis = 1)\n",
|
||
"    .rename(columns = {'id' : 'customer_id'})\n",
|
||
")\n",
|
||
"\n",
|
||
"# last_visiting_date is dropped above, so first_buying_date is the only date column left to parse\n",
|
||
"cleaning_date(df1_customersplus_clean, 'first_buying_date')"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "markdown",
|
||
"id": "64d0f76b",
|
||
"metadata": {
|
||
"jp-MarkdownHeadingCollapsed": true
|
||
},
|
||
"source": [
|
||
"## tickets.csv"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 6,
|
||
"id": "7e683711",
|
||
"metadata": {},
|
||
"outputs": [],
|
||
"source": [
|
||
"df1_tickets"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 7,
|
||
"id": "e7b9a52e",
|
||
"metadata": {},
|
||
"outputs": [],
|
||
"source": [
|
||
"df1_tickets.info()"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 8,
|
||
"id": "568280e8",
|
||
"metadata": {},
|
||
"outputs": [],
|
||
"source": [
|
||
"df1_tickets.isna().sum()/len(df1_tickets)*100"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 9,
|
||
"id": "29ecec90",
|
||
"metadata": {},
|
||
"outputs": [],
|
||
"source": [
|
||
"# Variable selection\n",
|
||
"# drop(..., inplace=True) returns None, which made the rename below fail on None:\n",
|
||
"# keep the frame returned by drop() instead, which also leaves df1_tickets untouched\n",
|
||
"# NOTE(review): this column list and the id -> customer_id rename look copied from the customersplus cell - confirm against the tickets schema\n",
|
||
"df1_tickets_clean = (\n",
|
||
"    df1_tickets\n",
|
||
"    .drop(['lastname', 'firstname', 'email', 'created_at', 'updated_at', 'extra', 'reference', 'extra_field', 'identifier', 'need_reload', 'preferred_category', 'preferred_supplier', 'preferred_formula', 'zipcode'], axis = 1)\n",
|
||
"    .rename(columns = {'id' : 'customer_id'})\n",
|
||
")"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "markdown",
|
||
"id": "22bb5de4",
|
||
"metadata": {
|
||
"jp-MarkdownHeadingCollapsed": true
|
||
},
|
||
"source": [
|
||
"## suppliers.csv"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 10,
|
||
"id": "6a9a91f4",
|
||
"metadata": {},
|
||
"outputs": [],
|
||
"source": [
|
||
"df1_suppliers"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 11,
|
||
"id": "bab4758a",
|
||
"metadata": {},
|
||
"outputs": [],
|
||
"source": [
|
||
"df1_suppliers.info()"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 12,
|
||
"id": "b5fff251",
|
||
"metadata": {},
|
||
"outputs": [],
|
||
"source": [
|
||
"df1_suppliers.isna().sum()/len(df1_suppliers)*100"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 13,
|
||
"id": "8b09e2a3",
|
||
"metadata": {},
|
||
"outputs": [],
|
||
"source": [
|
||
"# Variable selection\n",
|
||
"# rename() returns a new frame; calling it with inplace=True on a column selection raises SettingWithCopyWarning\n",
|
||
"df1_suppliers_clean = df1_suppliers[['id', 'name']].rename(columns = {'name' : 'supplier_name'})"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 14,
|
||
"id": "ecee7cdc",
|
||
"metadata": {},
|
||
"outputs": [],
|
||
"source": [
|
||
"df1_suppliers_clean"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "markdown",
|
||
"id": "c8e6e69b",
|
||
"metadata": {
|
||
"jp-MarkdownHeadingCollapsed": true
|
||
},
|
||
"source": [
|
||
"## type_ofs.csv"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 15,
|
||
"id": "1a6cff1f",
|
||
"metadata": {},
|
||
"outputs": [],
|
||
"source": [
|
||
"df1_type_ofs"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 16,
|
||
"id": "93630b41",
|
||
"metadata": {},
|
||
"outputs": [],
|
||
"source": [
|
||
"df1_type_ofs.info()"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 17,
|
||
"id": "4f94481a",
|
||
"metadata": {},
|
||
"outputs": [],
|
||
"source": [
|
||
"# Selection des variables\n",
|
||
"# Select and rename in one expression so that no subset of df1_type_ofs is modified in place\n",
|
||
"df1_type_ofs_clean = df1_type_ofs[['id', 'name', 'children']].rename(columns = {'name' : 'type_of_ticket_name'})"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "markdown",
|
||
"id": "1b2811e2",
|
||
"metadata": {
|
||
"jp-MarkdownHeadingCollapsed": true
|
||
},
|
||
"source": [
|
||
"## purchases.csv"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 18,
|
||
"id": "2455d2e1",
|
||
"metadata": {
|
||
"scrolled": true
|
||
},
|
||
"outputs": [],
|
||
"source": [
|
||
"df1_purchases"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 19,
|
||
"id": "5f9a159d",
|
||
"metadata": {},
|
||
"outputs": [],
|
||
"source": [
|
||
"df1_purchases.info()"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 20,
|
||
"id": "db201bf7",
|
||
"metadata": {},
|
||
"outputs": [],
|
||
"source": [
|
||
"# Nettoyage purchase_date\n",
|
||
"# Parse the ISO 8601 strings and normalise them to UTC in a single call\n",
|
||
"df1_purchases['purchase_date'] = pd.to_datetime(df1_purchases['purchase_date'], utc = True, format = 'ISO8601')"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 21,
|
||
"id": "bd436fca",
|
||
"metadata": {},
|
||
"outputs": [],
|
||
"source": [
|
||
"df1_purchases.info()"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 22,
|
||
"id": "83435862",
|
||
"metadata": {},
|
||
"outputs": [],
|
||
"source": [
|
||
"# Selection des variables\n",
|
||
"df1_purchases_clean = df1_purchases[['id', 'purchase_date', 'customer_id']]"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "markdown",
|
||
"id": "f210e730",
|
||
"metadata": {
|
||
"jp-MarkdownHeadingCollapsed": true
|
||
},
|
||
"source": [
|
||
"## Fusion de l'ensemble des données billétiques"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 23,
|
||
"id": "1f8b3aa7",
|
||
"metadata": {},
|
||
"outputs": [],
|
||
"source": [
|
||
"# Fusion avec fournisseurs\n",
|
||
"df1_ticket_information = pd.merge(df1_tickets_clean, df1_suppliers_clean, left_on = 'supplier_id', right_on = 'id', how = 'inner')\n",
|
||
"df1_ticket_information.drop(['supplier_id', 'id'], axis = 1, inplace=True)\n",
|
||
"\n",
|
||
"# Fusion avec type de tickets\n",
|
||
"df1_ticket_information = pd.merge(df1_ticket_information, df1_type_ofs_clean, left_on = 'type_of', right_on = 'id', how = 'inner')\n",
|
||
"df1_ticket_information.drop(['type_of', 'id'], axis = 1, inplace=True)\n",
|
||
"\n",
|
||
"# Fusion avec achats\n",
|
||
"df1_ticket_information = pd.merge(df1_ticket_information, df1_purchases_clean, left_on = 'purchase_id', right_on = 'id', how = 'inner')\n",
|
||
"df1_ticket_information.drop(['purchase_id', 'id'], axis = 1, inplace=True)"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 24,
|
||
"id": "83a4d021",
|
||
"metadata": {
|
||
"scrolled": true
|
||
},
|
||
"outputs": [],
|
||
"source": [
|
||
"df1_ticket_information"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "markdown",
|
||
"id": "56e6ebd1",
|
||
"metadata": {
|
||
"jp-MarkdownHeadingCollapsed": true
|
||
},
|
||
"source": [
|
||
"# Utilisation de fonctions"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 51,
|
||
"id": "88fcde4b",
|
||
"metadata": {},
|
||
"outputs": [],
|
||
"source": [
|
||
"# Créer un DataFrame exemple\n",
|
||
"df_not_clean = df1_campaign_stats[['opened_at']].head(20)\n",
|
||
"\n",
|
||
"# Appliquer la fonction pour nettoyer la colonne 'purchase_date' de manière vectorisée\n",
|
||
"df_clean = cleaning_date(df_not_clean, 'opened_at')\n",
|
||
"df_clean.rename(columns = {'opened_at' : 'opened_at_clean'}, inplace = True)\n",
|
||
"\n",
|
||
"test = pd.concat([df1_campaign_stats[['opened_at']].head(20), df_clean], axis=1)\n",
|
||
"\n",
|
||
"test.info()"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "markdown",
|
||
"id": "818f69db",
|
||
"metadata": {},
|
||
"source": [
|
||
"## Nettoyage, selection et fusion"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 23,
|
||
"id": "c9654eda",
|
||
"metadata": {},
|
||
"outputs": [],
|
||
"source": [
|
||
"df1_ticket_information"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 14,
|
||
"id": "7f2b620c",
|
||
"metadata": {},
|
||
"outputs": [],
|
||
"source": [
|
||
"df1_ticket_information.info()"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "markdown",
|
||
"id": "637bdb72",
|
||
"metadata": {},
|
||
"source": [
|
||
"# Customer information"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "markdown",
|
||
"id": "14c52894",
|
||
"metadata": {
|
||
"jp-MarkdownHeadingCollapsed": true
|
||
},
|
||
"source": [
|
||
"## Target area"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 8,
|
||
"id": "d83abfbf",
|
||
"metadata": {},
|
||
"outputs": [
|
||
{
|
||
"name": "stderr",
|
||
"output_type": "stream",
|
||
"text": [
|
||
"/tmp/ipykernel_15285/2625134041.py:3: SettingWithCopyWarning: \n",
|
||
"A value is trying to be set on a copy of a slice from a DataFrame\n",
|
||
"\n",
|
||
"See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n",
|
||
" df1_targets_clean.rename(columns = {'id' : 'target_id' , 'name' : 'target_name'}, inplace = True)\n"
|
||
]
|
||
}
|
||
],
|
||
"source": [
|
||
"# Target.csv cleaning\n",
|
||
"# Select and rename in one expression: renaming a column subset in place is what raised SettingWithCopyWarning\n",
|
||
"df1_targets_clean = df1_targets[[\"id\", \"target_type_id\", \"name\"]].rename(columns = {'id' : 'target_id' , 'name' : 'target_name'})\n",
|
||
"\n",
|
||
"# target_type cleaning\n",
|
||
"df1_target_types_clean = df1_target_types[[\"id\",\"is_import\",\"name\"]].add_prefix(\"target_type_\")\n",
|
||
"\n",
|
||
"#customer_target_mappings cleaning\n",
|
||
"df1_customer_target_mappings_clean = df1_customer_target_mappings[[\"id\", \"customer_id\", \"target_id\"]]\n",
|
||
"\n",
|
||
"# Merge target et target_type\n",
|
||
"df1_targets_full = pd.merge(df1_targets_clean, df1_target_types_clean, left_on='target_type_id', right_on='target_type_id', how='inner')\n",
|
||
"df1_targets_full.drop(['target_type_id'], axis = 1, inplace=True)\n",
|
||
"\n",
|
||
"# Merge\n",
|
||
"df1_targets_full = pd.merge(df1_customer_target_mappings_clean, df1_targets_full, left_on='target_id', right_on='target_id', how='inner')\n",
|
||
"df1_targets_full.drop(['target_id'], axis = 1, inplace=True)"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 62,
|
||
"id": "90d71b2c",
|
||
"metadata": {},
|
||
"outputs": [],
|
||
"source": [
|
||
"# Number of target mappings per customer\n",
|
||
"df1_targets_test = df1_targets_full[['id', 'customer_id']].groupby(['customer_id']).count()\n",
|
||
"\n",
|
||
"# Share of customers mapped to more than one target, and mean number of mappings per customer.\n",
|
||
"# Earlier run noted by the authors: 99.6% of the ~151,000 targeted customers, about 5 mappings on average.\n",
|
||
"share_multi_target = (df1_targets_test['id'] > 1).mean()\n",
|
||
"\n",
|
||
"# Only the last expression of a cell is shown automatically, so display both results explicitly\n",
|
||
"display(share_multi_target, df1_targets_test.mean())"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 10,
|
||
"id": "2301de1e",
|
||
"metadata": {},
|
||
"outputs": [
|
||
{
|
||
"data": {
|
||
"text/html": [
|
||
"<div>\n",
|
||
"<style scoped>\n",
|
||
" .dataframe tbody tr th:only-of-type {\n",
|
||
" vertical-align: middle;\n",
|
||
" }\n",
|
||
"\n",
|
||
" .dataframe tbody tr th {\n",
|
||
" vertical-align: top;\n",
|
||
" }\n",
|
||
"\n",
|
||
" .dataframe thead th {\n",
|
||
" text-align: right;\n",
|
||
" }\n",
|
||
"</style>\n",
|
||
"<table border=\"1\" class=\"dataframe\">\n",
|
||
" <thead>\n",
|
||
" <tr style=\"text-align: right;\">\n",
|
||
" <th></th>\n",
|
||
" <th>id</th>\n",
|
||
" <th>customer_id</th>\n",
|
||
" <th>target_name</th>\n",
|
||
" <th>target_type_is_import</th>\n",
|
||
" <th>target_type_name</th>\n",
|
||
" </tr>\n",
|
||
" </thead>\n",
|
||
" <tbody>\n",
|
||
" <tr>\n",
|
||
" <th>0</th>\n",
|
||
" <td>1184824</td>\n",
|
||
" <td>645400</td>\n",
|
||
" <td>DDCP PROMO Réseau livres</td>\n",
|
||
" <td>False</td>\n",
|
||
" <td>manual_static_filter</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>1</th>\n",
|
||
" <td>210571</td>\n",
|
||
" <td>2412</td>\n",
|
||
" <td>DDCP PROMO Réseau livres</td>\n",
|
||
" <td>False</td>\n",
|
||
" <td>manual_static_filter</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>2</th>\n",
|
||
" <td>210572</td>\n",
|
||
" <td>4536</td>\n",
|
||
" <td>DDCP PROMO Réseau livres</td>\n",
|
||
" <td>False</td>\n",
|
||
" <td>manual_static_filter</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>3</th>\n",
|
||
" <td>210573</td>\n",
|
||
" <td>6736</td>\n",
|
||
" <td>DDCP PROMO Réseau livres</td>\n",
|
||
" <td>False</td>\n",
|
||
" <td>manual_static_filter</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>4</th>\n",
|
||
" <td>210574</td>\n",
|
||
" <td>38210</td>\n",
|
||
" <td>DDCP PROMO Réseau livres</td>\n",
|
||
" <td>False</td>\n",
|
||
" <td>manual_static_filter</td>\n",
|
||
" </tr>\n",
|
||
" </tbody>\n",
|
||
"</table>\n",
|
||
"</div>"
|
||
],
|
||
"text/plain": [
|
||
" id customer_id target_name target_type_is_import \\\n",
|
||
"0 1184824 645400 DDCP PROMO Réseau livres False \n",
|
||
"1 210571 2412 DDCP PROMO Réseau livres False \n",
|
||
"2 210572 4536 DDCP PROMO Réseau livres False \n",
|
||
"3 210573 6736 DDCP PROMO Réseau livres False \n",
|
||
"4 210574 38210 DDCP PROMO Réseau livres False \n",
|
||
"\n",
|
||
" target_type_name \n",
|
||
"0 manual_static_filter \n",
|
||
"1 manual_static_filter \n",
|
||
"2 manual_static_filter \n",
|
||
"3 manual_static_filter \n",
|
||
"4 manual_static_filter "
|
||
]
|
||
},
|
||
"execution_count": 10,
|
||
"metadata": {},
|
||
"output_type": "execute_result"
|
||
}
|
||
],
|
||
"source": [
|
||
"df1_targets_full.head()"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 14,
|
||
"id": "75fbc2f7",
|
||
"metadata": {},
|
||
"outputs": [
|
||
{
|
||
"name": "stderr",
|
||
"output_type": "stream",
|
||
"text": [
|
||
"[nltk_data] Downloading package punkt to /home/onyxia/nltk_data...\n",
|
||
"[nltk_data] Package punkt is already up-to-date!\n",
|
||
"[nltk_data] Downloading package stopwords to /home/onyxia/nltk_data...\n",
|
||
"[nltk_data] Package stopwords is already up-to-date!\n",
|
||
"[nltk_data] Downloading package wordnet to /home/onyxia/nltk_data...\n",
|
||
"[nltk_data] Package wordnet is already up-to-date!\n"
|
||
]
|
||
},
|
||
{
|
||
"data": {
|
||
"text/plain": [
|
||
"True"
|
||
]
|
||
},
|
||
"execution_count": 14,
|
||
"metadata": {},
|
||
"output_type": "execute_result"
|
||
}
|
||
],
|
||
"source": [
|
||
"# Catégorisation des target_name\n",
|
||
"import pandas as pd\n",
|
||
"import nltk\n",
|
||
"from nltk.tokenize import word_tokenize\n",
|
||
"from nltk.corpus import stopwords\n",
|
||
"from nltk.stem import WordNetLemmatizer\n",
|
||
"from nltk.probability import FreqDist\n",
|
||
"\n",
|
||
"# Téléchargement des ressources nécessaires\n",
|
||
"nltk.download('punkt')\n",
|
||
"nltk.download('stopwords')\n",
|
||
"nltk.download('wordnet')\n",
|
||
"\n"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 19,
|
||
"id": "55cddf92",
|
||
"metadata": {},
|
||
"outputs": [
|
||
{
|
||
"name": "stdout",
|
||
"output_type": "stream",
|
||
"text": [
|
||
"Mots les plus fréquents:\n",
|
||
"consentement: 550777\n",
|
||
"optin: 463579\n",
|
||
"jeune: 155103\n",
|
||
"public: 155103\n",
|
||
"mediation: 150001\n"
|
||
]
|
||
}
|
||
],
|
||
"source": [
|
||
"# Définition des fonctions de tokenisation, suppression des mots vides et lemmatisation\n",
|
||
"# Built once: reloading the stopword list and the lemmatizer for every row is wasted work\n",
|
||
"_FRENCH_STOP_WORDS = set(stopwords.words('french'))\n",
|
||
"_LEMMATIZER = WordNetLemmatizer()\n",
|
||
"\n",
|
||
"\n",
|
||
"def preprocess_text(texte):\n",
|
||
"    \"\"\"Tokenize a text, drop French stopwords and lemmatize what is left.\n",
|
||
"\n",
|
||
"    texte: a single string, or an iterable of strings (joined with spaces first).\n",
|
||
"    Returns the list of processed tokens.\n",
|
||
"    \"\"\"\n",
|
||
"    # A str is itself iterable: ' '.join('chat') gives 'c h a t' and every token\n",
|
||
"    # degenerates to a single letter, so only join inputs that are not already a string.\n",
|
||
"    if not isinstance(texte, str):\n",
|
||
"        texte = ' '.join(texte)\n",
|
||
"\n",
|
||
"    # Lower-case, then split into word tokens\n",
|
||
"    tokens = word_tokenize(texte.lower())\n",
|
||
"\n",
|
||
"    # Remove French stopwords\n",
|
||
"    filtered_tokens = [word for word in tokens if word not in _FRENCH_STOP_WORDS]\n",
|
||
"\n",
|
||
"    # NOTE(review): WordNetLemmatizer relies on the English WordNet - confirm it has any effect on French tokens\n",
|
||
"    lemmatized_tokens = [_LEMMATIZER.lemmatize(word) for word in filtered_tokens]\n",
|
||
"\n",
|
||
"    return lemmatized_tokens\n",
|
||
"\n",
|
||
"\n",
|
||
"# Appliquer le prétraitement à la colonne de texte\n",
|
||
"df1_targets_full['target_name_tokened'] = df1_targets_full['target_name'].apply(preprocess_text)\n",
|
||
"\n",
|
||
"# Concaténer les listes de mots pour obtenir une liste de tous les mots dans le corpus\n",
|
||
"all_words = [word for tokens in df1_targets_full['target_name_tokened'] for word in tokens]\n",
|
||
"\n",
|
||
"# Calculer la fréquence des mots\n",
|
||
"freq_dist = FreqDist(all_words)\n",
|
||
"\n",
|
||
"\n"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 22,
|
||
"id": "7fd98a85",
|
||
"metadata": {},
|
||
"outputs": [
|
||
{
|
||
"name": "stdout",
|
||
"output_type": "stream",
|
||
"text": [
|
||
"Mots les plus fréquents:\n",
|
||
"consentement: 550777\n",
|
||
"optin: 463579\n",
|
||
"jeune: 155103\n",
|
||
"public: 155103\n",
|
||
"mediation: 150001\n",
|
||
"specialisee: 150001\n",
|
||
"b2c: 143432\n",
|
||
"optout: 97683\n",
|
||
"newsletter: 56022\n",
|
||
"(: 46084\n",
|
||
"): 46084\n",
|
||
"inscrits: 42296\n",
|
||
"nl: 42294\n",
|
||
"générale: 41037\n",
|
||
"generale: 40950\n"
|
||
]
|
||
}
|
||
],
|
||
"source": [
|
||
"# Affichage des mots les plus fréquents\n",
|
||
"print(\"Mots les plus fréquents:\")\n",
|
||
"for mot, freq in freq_dist.most_common(15):\n",
|
||
" print(f\"{mot}: {freq}\")"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 18,
|
||
"id": "cf94bb1d",
|
||
"metadata": {},
|
||
"outputs": [
|
||
{
|
||
"name": "stdout",
|
||
"output_type": "stream",
|
||
"text": [
|
||
" texte \\\n",
|
||
"0 Le chat noir mange une souris. \n",
|
||
"1 Le chien blanc aboie. \n",
|
||
"\n",
|
||
" texte_preprocessed \n",
|
||
"0 [e, h, a, o, i, r, a, g, e, u, e, o, u, r, i, .] \n",
|
||
"1 [e, h, i, e, b, a, a, b, o, i, e, .] \n"
|
||
]
|
||
},
|
||
{
|
||
"name": "stderr",
|
||
"output_type": "stream",
|
||
"text": [
|
||
"[nltk_data] Downloading package punkt to /home/onyxia/nltk_data...\n",
|
||
"[nltk_data] Package punkt is already up-to-date!\n",
|
||
"[nltk_data] Downloading package stopwords to /home/onyxia/nltk_data...\n",
|
||
"[nltk_data] Package stopwords is already up-to-date!\n",
|
||
"[nltk_data] Downloading package wordnet to /home/onyxia/nltk_data...\n",
|
||
"[nltk_data] Package wordnet is already up-to-date!\n"
|
||
]
|
||
}
|
||
],
|
||
"source": [
|
||
"import pandas as pd\n",
|
||
"import nltk\n",
|
||
"from nltk.tokenize import word_tokenize\n",
|
||
"from nltk.corpus import stopwords\n",
|
||
"from nltk.stem import WordNetLemmatizer\n",
|
||
"\n",
|
||
"# Téléchargement des ressources nécessaires\n",
|
||
"nltk.download('punkt')\n",
|
||
"nltk.download('stopwords')\n",
|
||
"nltk.download('wordnet')\n",
|
||
"\n",
|
||
"# Création de la DataFrame d'exemple\n",
|
||
"data = {'texte': [\"Le chat noir mange une souris.\", \"Le chien blanc aboie.\"]}\n",
|
||
"df = pd.DataFrame(data)\n",
|
||
"\n",
|
||
"# Fonction pour prétraiter le texte\n",
|
||
"# NOTE(review): preprocess_text is also defined in an earlier cell of this notebook; this later definition shadows it - keep only one\n",
|
||
"# Built once: reloading the stopword list and the lemmatizer for every row is wasted work\n",
|
||
"_FRENCH_STOP_WORDS = set(stopwords.words('french'))\n",
|
||
"_LEMMATIZER = WordNetLemmatizer()\n",
|
||
"\n",
|
||
"\n",
|
||
"def preprocess_text(texte):\n",
|
||
"    \"\"\"Tokenize a text, drop French stopwords and lemmatize what is left.\n",
|
||
"\n",
|
||
"    texte: a single string, or an iterable of strings (joined with spaces first).\n",
|
||
"    Returns the list of processed tokens.\n",
|
||
"    \"\"\"\n",
|
||
"    # A str is itself iterable: ' '.join('chat') gives 'c h a t' and every token\n",
|
||
"    # degenerates to a single letter, so only join inputs that are not already a string.\n",
|
||
"    if not isinstance(texte, str):\n",
|
||
"        texte = ' '.join(texte)\n",
|
||
"\n",
|
||
"    # Lower-case, then split into word tokens\n",
|
||
"    tokens = word_tokenize(texte.lower())\n",
|
||
"\n",
|
||
"    # Remove French stopwords\n",
|
||
"    filtered_tokens = [word for word in tokens if word not in _FRENCH_STOP_WORDS]\n",
|
||
"\n",
|
||
"    # NOTE(review): WordNetLemmatizer relies on the English WordNet - confirm it has any effect on French tokens\n",
|
||
"    lemmatized_tokens = [_LEMMATIZER.lemmatize(word) for word in filtered_tokens]\n",
|
||
"\n",
|
||
"    return lemmatized_tokens\n",
|
||
"\n",
|
||
"# Appliquer la fonction de prétraitement à la colonne de texte\n",
|
||
"df['texte_preprocessed'] = df['texte'].apply(preprocess_text)\n",
|
||
"\n",
|
||
"# Afficher le résultat\n",
|
||
"print(df)\n"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "markdown",
|
||
"id": "711d3884",
|
||
"metadata": {
|
||
"jp-MarkdownHeadingCollapsed": true
|
||
},
|
||
"source": [
|
||
"## Campaign area"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 52,
|
||
"id": "c25b5295",
|
||
"metadata": {},
|
||
"outputs": [],
|
||
"source": [
|
||
"# campaign_stats cleaning \n",
|
||
"# Work on an explicit copy: cleaning_date is called for its effect on the frame it receives\n",
|
||
"df1_campaign_stats_clean = df1_campaign_stats[[\"id\", \"campaign_id\", \"customer_id\", \"opened_at\", \"sent_at\", \"delivered_at\"]].copy()\n",
|
||
"for date_column in ['opened_at', 'sent_at', 'delivered_at']:\n",
|
||
"    cleaning_date(df1_campaign_stats_clean, date_column)\n",
|
||
"\n",
|
||
"# campaigns cleaning (add_prefix already returns a new frame)\n",
|
||
"df1_campaigns_clean = df1_campaigns[[\"id\", \"name\", \"service_id\", \"sent_at\"]].add_prefix(\"campaign_\")\n",
|
||
"cleaning_date(df1_campaigns_clean, 'campaign_sent_at')\n",
|
||
"\n",
|
||
"# Merge \n",
|
||
"df1_campaigns_full = pd.merge(df1_campaign_stats_clean, df1_campaigns_clean, on = \"campaign_id\", how = \"left\")\n",
|
||
"df1_campaigns_full.drop(['campaign_id'], axis = 1, inplace=True)"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 53,
|
||
"id": "2a3de6a5",
|
||
"metadata": {},
|
||
"outputs": [],
|
||
"source": [
|
||
"df1_campaigns_full.info()"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 56,
|
||
"id": "3fc1f446",
|
||
"metadata": {},
|
||
"outputs": [],
|
||
"source": [
|
||
"df1_campaigns_information"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "markdown",
|
||
"id": "20e69ee3",
|
||
"metadata": {
|
||
"jp-MarkdownHeadingCollapsed": true
|
||
},
|
||
"source": [
|
||
"## Link area"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 37,
|
||
"id": "d9cbdbce",
|
||
"metadata": {},
|
||
"outputs": [],
|
||
"source": [
|
||
"df1_campaigns"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 38,
|
||
"id": "c07459f0",
|
||
"metadata": {},
|
||
"outputs": [],
|
||
"source": [
|
||
"df1_link_stats"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "markdown",
|
||
"id": "80ae4c42",
|
||
"metadata": {},
|
||
"source": [
|
||
"## Exploration variables"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 7,
|
||
"id": "b50b8f95",
|
||
"metadata": {},
|
||
"outputs": [],
|
||
"source": [
|
||
"# Fonction d'exploration pour suppliers.csv = label itr et commission inconnues\n",
|
||
"def suppliers_exploration(suppliers = None) :\n",
|
||
"    \"\"\"Summarise a suppliers table in a one-row DataFrame.\n",
|
||
"\n",
|
||
"    suppliers: DataFrame with the columns 'name', 'label', 'itr' and 'commission'.\n",
|
||
"    Returns a DataFrame with the number of distinct names (nb_suppliers) and the\n",
|
||
"    percentage of missing values of each of the three other columns (<column>_na).\n",
|
||
"    \"\"\"\n",
|
||
"    row_count = len(suppliers)\n",
|
||
"\n",
|
||
"    # Missing-value rate, in percent, of each descriptive column\n",
|
||
"    na_rates = {column + '_na' : [suppliers[column].isna().sum()/row_count*100]\n",
|
||
"                for column in ('label', 'itr', 'commission')}\n",
|
||
"\n",
|
||
"    return pd.DataFrame({'nb_suppliers' : [suppliers['name'].nunique()], **na_rates})"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 8,
|
||
"id": "7e292935",
|
||
"metadata": {},
|
||
"outputs": [],
|
||
"source": [
|
||
"df1_suppliers_desc = suppliers_exploration(suppliers = df1_suppliers)"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 9,
|
||
"id": "05b6f2b0",
|
||
"metadata": {},
|
||
"outputs": [
|
||
{
|
||
"data": {
|
||
"text/html": [
|
||
"<div>\n",
|
||
"<style scoped>\n",
|
||
" .dataframe tbody tr th:only-of-type {\n",
|
||
" vertical-align: middle;\n",
|
||
" }\n",
|
||
"\n",
|
||
" .dataframe tbody tr th {\n",
|
||
" vertical-align: top;\n",
|
||
" }\n",
|
||
"\n",
|
||
" .dataframe thead th {\n",
|
||
" text-align: right;\n",
|
||
" }\n",
|
||
"</style>\n",
|
||
"<table border=\"1\" class=\"dataframe\">\n",
|
||
" <thead>\n",
|
||
" <tr style=\"text-align: right;\">\n",
|
||
" <th></th>\n",
|
||
" <th>nb_suppliers</th>\n",
|
||
" <th>label_na</th>\n",
|
||
" <th>itr_na</th>\n",
|
||
" <th>commission_na</th>\n",
|
||
" </tr>\n",
|
||
" </thead>\n",
|
||
" <tbody>\n",
|
||
" <tr>\n",
|
||
" <th>0</th>\n",
|
||
" <td>9</td>\n",
|
||
" <td>100.0</td>\n",
|
||
" <td>100.0</td>\n",
|
||
" <td>100.0</td>\n",
|
||
" </tr>\n",
|
||
" </tbody>\n",
|
||
"</table>\n",
|
||
"</div>"
|
||
],
|
||
"text/plain": [
|
||
" nb_suppliers label_na itr_na commission_na\n",
|
||
"0 9 100.0 100.0 100.0"
|
||
]
|
||
},
|
||
"execution_count": 9,
|
||
"metadata": {},
|
||
"output_type": "execute_result"
|
||
}
|
||
],
|
||
"source": [
|
||
"df1_suppliers_desc"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 10,
|
||
"id": "c9324d80",
|
||
"metadata": {},
|
||
"outputs": [],
|
||
"source": [
|
||
"BUCKET = \"bdc2324-data\"\n",
|
||
"liste_folders = fs.ls(BUCKET)\n",
|
||
"\n",
|
||
"liste_files = []\n",
|
||
"for company_folder in liste_folders : \n",
|
||
" liste_files.extend(fs.ls(company_folder))"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 11,
|
||
"id": "10304058",
|
||
"metadata": {},
|
||
"outputs": [
|
||
{
|
||
"name": "stdout",
|
||
"output_type": "stream",
|
||
"text": [
|
||
"['bdc2324-data/1/1suppliers.csv', 'bdc2324-data/10/10suppliers.csv', 'bdc2324-data/101/101suppliers.csv', 'bdc2324-data/11/11suppliers.csv', 'bdc2324-data/12/12suppliers.csv', 'bdc2324-data/13/13suppliers.csv', 'bdc2324-data/14/14suppliers.csv', 'bdc2324-data/2/2suppliers.csv', 'bdc2324-data/3/3suppliers.csv', 'bdc2324-data/4/4suppliers.csv', 'bdc2324-data/5/5suppliers.csv', 'bdc2324-data/6/6suppliers.csv', 'bdc2324-data/7/7suppliers.csv', 'bdc2324-data/8/8suppliers.csv', 'bdc2324-data/9/9suppliers.csv']\n"
|
||
]
|
||
}
|
||
],
|
||
"source": [
|
||
"liste_database_select = ['suppliers']\n",
|
||
"\n",
|
||
"# Filtrer la liste pour les éléments contenant au moins un élément de la liste à tester\n",
|
||
"liste_suppliers = [element for element in liste_files if any(element_part in element for element_part in liste_database_select)]\n",
|
||
"\n",
|
||
"# Afficher le résultat\n",
|
||
"print(liste_suppliers)"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 32,
|
||
"id": "ffa423e5",
|
||
"metadata": {},
|
||
"outputs": [],
|
||
"source": [
|
||
"# loop to create dataframes from file 2\n",
|
||
"def database_loading(database_name = None):\n",
|
||
"    \"\"\"Read one CSV file through the notebook-level filesystem object `fs`.\n",
|
||
"\n",
|
||
"    database_name: path of the file; its second '/'-separated component is\n",
|
||
"    returned as the client number.\n",
|
||
"    Returns a tuple (DataFrame read from the file, client number as a string).\n",
|
||
"    \"\"\"\n",
|
||
"    client_number = database_name.split(\"/\")[1]\n",
|
||
"\n",
|
||
"    with fs.open(database_name, mode=\"rb\") as file_in:\n",
|
||
"        df = pd.read_csv(file_in)\n",
|
||
"\n",
|
||
"    return df, client_number"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": null,
|
||
"id": "70bdc88d",
|
||
"metadata": {},
|
||
"outputs": [],
|
||
"source": []
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 45,
|
||
"id": "6a0f567d",
|
||
"metadata": {},
|
||
"outputs": [],
|
||
"source": [
|
||
"# Load every suppliers file, tag its rows with the tenant id, then concatenate once\n",
|
||
"# (calling pd.concat inside the loop copies all previously loaded rows at each iteration)\n",
|
||
"supplier_frames = []\n",
|
||
"\n",
|
||
"for link in liste_suppliers:\n",
|
||
"    df_supplier, tenant_id = database_loading(link)\n",
|
||
"    df_supplier['tenant_id'] = int(tenant_id)\n",
|
||
"    supplier_frames.append(df_supplier)\n",
|
||
"\n",
|
||
"# pd.concat raises on an empty list, so keep an empty frame when no file was found\n",
|
||
"df_all = pd.concat(supplier_frames, axis = 0) if supplier_frames else pd.DataFrame()"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 63,
|
||
"id": "1522d8cd",
|
||
"metadata": {},
|
||
"outputs": [],
|
||
"source": [
|
||
"# df_all[df_all['tenant_id'] == 101]['name'].unique()"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 66,
|
||
"id": "b0e42a61",
|
||
"metadata": {},
|
||
"outputs": [],
|
||
"source": [
|
||
"liste_mots = ['en ligne', 'internet', 'web', 'net', 'vad', 'online'] \n",
|
||
"# vad = vente à distance\n",
|
||
"df_all['name'] = df_all['name'].fillna('')\n",
|
||
"\n",
|
||
"df_all['canal_vente_internet'] = df_all['name'].str.contains('|'.join(liste_mots), case=False).astype(int)\n"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 68,
|
||
"id": "d299ae91",
|
||
"metadata": {},
|
||
"outputs": [
|
||
{
|
||
"data": {
|
||
"text/plain": [
|
||
"tenant_id\n",
|
||
"1 1\n",
|
||
"2 1\n",
|
||
"3 1\n",
|
||
"4 1\n",
|
||
"5 1\n",
|
||
"6 1\n",
|
||
"7 1\n",
|
||
"8 1\n",
|
||
"9 1\n",
|
||
"10 1\n",
|
||
"11 1\n",
|
||
"12 1\n",
|
||
"13 1\n",
|
||
"14 1\n",
|
||
"101 1\n",
|
||
"Name: canal_vente_internet, dtype: int64"
|
||
]
|
||
},
|
||
"execution_count": 68,
|
||
"metadata": {},
|
||
"output_type": "execute_result"
|
||
}
|
||
],
|
||
"source": [
|
||
"df_all.groupby('tenant_id')['canal_vente_internet'].max()"
|
||
]
|
||
}
|
||
],
|
||
"metadata": {
|
||
"kernelspec": {
|
||
"display_name": "Python 3 (ipykernel)",
|
||
"language": "python",
|
||
"name": "python3"
|
||
},
|
||
"language_info": {
|
||
"codemirror_mode": {
|
||
"name": "ipython",
|
||
"version": 3
|
||
},
|
||
"file_extension": ".py",
|
||
"mimetype": "text/x-python",
|
||
"name": "python",
|
||
"nbconvert_exporter": "python",
|
||
"pygments_lexer": "ipython3",
|
||
"version": "3.11.6"
|
||
}
|
||
},
|
||
"nbformat": 4,
|
||
"nbformat_minor": 5
|
||
}
|