Project_Carmignac/Clustering.ipynb

4666 lines
738 KiB
Plaintext
Raw Normal View History

2026-04-03 10:55:04 +02:00
{
"cells": [
{
"cell_type": "code",
"execution_count": 43,
"id": "7aa09644-4d17-4a7a-841e-3bfcfb8a8901",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Fichiers Flows : ['projet-bdc-data/carmignac/Flows ENSAE V1 -20251027.csv', 'projet-bdc-data/carmignac/Flows ENSAE V2 -20251105.csv']\n",
"Fichiers AUM : ['projet-bdc-data/carmignac/AUM ENSAE V1 -20251027.csv', 'projet-bdc-data/carmignac/AUM ENSAE V2 -20251105.csv']\n"
]
}
],
"source": [
"# Load the Carmignac Flows and AUM CSV extracts from S3\n",
"\n",
"import os\n",
"import s3fs\n",
"import pandas as pd\n",
"\n",
"s3_ENDPOINT_URL = \"https://\" + os.environ[\"AWS_S3_ENDPOINT\"]\n",
"\n",
"fs = s3fs.S3FileSystem(client_kwargs={'endpoint_url': s3_ENDPOINT_URL})\n",
"\n",
"BUCKET = \"projet-bdc-data\"\n",
"carmignac_path = BUCKET + \"/carmignac\"\n",
"\n",
"\n",
"def read_csv_files(paths):\n",
"    \"\"\"Read every ';'-separated CSV listed in `paths` through `fs`.\n",
"\n",
"    Returns a dict mapping each file's base name to its DataFrame.\n",
"    \"\"\"\n",
"    frames = {}\n",
"    for path in paths:\n",
"        with fs.open(path, 'r') as handle:\n",
"            frames[os.path.basename(path)] = pd.read_csv(handle, sep=';', low_memory=False)\n",
"    return frames\n",
"\n",
"\n",
"# List the folder once, then split the CSV files by dataset\n",
"all_files = fs.ls(carmignac_path)\n",
"flows_files = [f for f in all_files if \"Flows\" in f and f.endswith(\".csv\")]\n",
"aum_files = [f for f in all_files if \"AUM\" in f and f.endswith(\".csv\")]\n",
"print(\"Fichiers Flows :\", flows_files)\n",
"print(\"Fichiers AUM :\", aum_files)\n",
"\n",
"# One DataFrame per file, keyed by file name\n",
"flows_data = read_csv_files(flows_files)\n",
"aum_data = read_csv_files(aum_files)\n",
"\n",
"# Working frames: df holds the V2 AUM extract, dg the V2 Flows extract\n",
"df = aum_data['AUM ENSAE V2 -20251105.csv']\n",
"dg = flows_data['Flows ENSAE V2 -20251105.csv']"
]
},
{
"cell_type": "code",
"execution_count": 2,
"id": "35bb08c3-873a-462b-879d-dde601388d8f",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>Agreement - Code</th>\n",
" <th>Company - Id</th>\n",
" <th>Company - Ultimate Parent Id</th>\n",
" <th>Registrar Account - ID</th>\n",
" <th>Registrar Account - Region</th>\n",
" <th>RegistrarAccount - Country</th>\n",
" <th>Product - Asset Type</th>\n",
" <th>Product - Strategy</th>\n",
" <th>Product - Legal Status</th>\n",
" <th>Product - Is Dedie ?</th>\n",
" <th>Product - Fund</th>\n",
" <th>Product - Shareclass Type</th>\n",
" <th>Product - Shareclass Currency</th>\n",
" <th>Product - Isin</th>\n",
" <th>Centralisation Date</th>\n",
" <th>Quantity - AUM</th>\n",
" <th>Value - AUM CCY</th>\n",
" <th>Value - AUM €</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>003</td>\n",
" <td>166</td>\n",
" <td>166</td>\n",
" <td>200000647</td>\n",
" <td>France</td>\n",
" <td>France</td>\n",
" <td>Diversified</td>\n",
" <td>Patrimoine</td>\n",
" <td>FCP</td>\n",
" <td>NO</td>\n",
" <td>Carmignac Patrimoine</td>\n",
" <td>A</td>\n",
" <td>EUR</td>\n",
" <td>FR0010135103</td>\n",
" <td>2015-03-31</td>\n",
" <td>35.368</td>\n",
" <td>24648.6666</td>\n",
" <td>24648.6666</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>003</td>\n",
" <td>166</td>\n",
" <td>166</td>\n",
" <td>200000647</td>\n",
" <td>France</td>\n",
" <td>France</td>\n",
" <td>Diversified</td>\n",
" <td>Patrimoine</td>\n",
" <td>FCP</td>\n",
" <td>NO</td>\n",
" <td>Carmignac Patrimoine</td>\n",
" <td>A</td>\n",
" <td>EUR</td>\n",
" <td>FR0010135103</td>\n",
" <td>2015-11-30</td>\n",
" <td>35.368</td>\n",
" <td>22413.0553</td>\n",
" <td>22413.0553</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>003</td>\n",
" <td>166</td>\n",
" <td>166</td>\n",
" <td>200000647</td>\n",
" <td>France</td>\n",
" <td>France</td>\n",
" <td>Diversified</td>\n",
" <td>Patrimoine</td>\n",
" <td>FCP</td>\n",
" <td>NO</td>\n",
" <td>Carmignac Patrimoine</td>\n",
" <td>A</td>\n",
" <td>EUR</td>\n",
" <td>FR0010135103</td>\n",
" <td>2015-12-31</td>\n",
" <td>35.368</td>\n",
" <td>22051.2406</td>\n",
" <td>22051.2406</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>003</td>\n",
" <td>166</td>\n",
" <td>166</td>\n",
" <td>200000647</td>\n",
" <td>France</td>\n",
" <td>France</td>\n",
" <td>Diversified</td>\n",
" <td>Patrimoine</td>\n",
" <td>FCP</td>\n",
" <td>NO</td>\n",
" <td>Carmignac Patrimoine</td>\n",
" <td>A</td>\n",
" <td>EUR</td>\n",
" <td>FR0010135103</td>\n",
" <td>2016-03-31</td>\n",
" <td>35.368</td>\n",
" <td>21626.1173</td>\n",
" <td>21626.1173</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>003</td>\n",
" <td>166</td>\n",
" <td>166</td>\n",
" <td>200000647</td>\n",
" <td>France</td>\n",
" <td>France</td>\n",
" <td>Diversified</td>\n",
" <td>Patrimoine</td>\n",
" <td>FCP</td>\n",
" <td>NO</td>\n",
" <td>Carmignac Patrimoine</td>\n",
" <td>A</td>\n",
" <td>EUR</td>\n",
" <td>FR0010135103</td>\n",
" <td>2016-11-30</td>\n",
" <td>35.368</td>\n",
" <td>22489.4502</td>\n",
" <td>22489.4502</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" Agreement - Code Company - Id Company - Ultimate Parent Id \\\n",
"0 003 166 166 \n",
"1 003 166 166 \n",
"2 003 166 166 \n",
"3 003 166 166 \n",
"4 003 166 166 \n",
"\n",
" Registrar Account - ID Registrar Account - Region \\\n",
"0 200000647 France \n",
"1 200000647 France \n",
"2 200000647 France \n",
"3 200000647 France \n",
"4 200000647 France \n",
"\n",
" RegistrarAccount - Country Product - Asset Type Product - Strategy \\\n",
"0 France Diversified Patrimoine \n",
"1 France Diversified Patrimoine \n",
"2 France Diversified Patrimoine \n",
"3 France Diversified Patrimoine \n",
"4 France Diversified Patrimoine \n",
"\n",
" Product - Legal Status Product - Is Dedie ? Product - Fund \\\n",
"0 FCP NO Carmignac Patrimoine \n",
"1 FCP NO Carmignac Patrimoine \n",
"2 FCP NO Carmignac Patrimoine \n",
"3 FCP NO Carmignac Patrimoine \n",
"4 FCP NO Carmignac Patrimoine \n",
"\n",
" Product - Shareclass Type Product - Shareclass Currency Product - Isin \\\n",
"0 A EUR FR0010135103 \n",
"1 A EUR FR0010135103 \n",
"2 A EUR FR0010135103 \n",
"3 A EUR FR0010135103 \n",
"4 A EUR FR0010135103 \n",
"\n",
" Centralisation Date Quantity - AUM Value - AUM CCY Value - AUM € \n",
"0 2015-03-31 35.368 24648.6666 24648.6666 \n",
"1 2015-11-30 35.368 22413.0553 22413.0553 \n",
"2 2015-12-31 35.368 22051.2406 22051.2406 \n",
"3 2016-03-31 35.368 21626.1173 21626.1173 \n",
"4 2016-11-30 35.368 22489.4502 22489.4502 "
]
},
"execution_count": 2,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Preview the first rows of the AUM frame\n",
"df.head(n=5)"
]
},
{
"cell_type": "code",
"execution_count": 40,
"id": "d5262683-6ae5-4ee6-b949-58a468c7c7b5",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>Agreement - Code</th>\n",
" <th>Company - Id</th>\n",
" <th>Company - Ultimate Parent Id</th>\n",
" <th>Registrar Account - ID</th>\n",
" <th>Registrar Account - Region</th>\n",
" <th>RegistrarAccount - Country</th>\n",
" <th>Product - Asset Type</th>\n",
" <th>Product - Strategy</th>\n",
" <th>Product - Legal Status</th>\n",
" <th>Product - Is Dedie ?</th>\n",
" <th>...</th>\n",
" <th>Centralisation Date</th>\n",
" <th>Quantity - Subscription</th>\n",
" <th>Quantity - Redemption</th>\n",
" <th>Quantity - NetFlows</th>\n",
" <th>Value Ccy - Subscription</th>\n",
" <th>Value Ccy - Redemption</th>\n",
" <th>Value Ccy - NetFlows</th>\n",
" <th>Value € - Subscription</th>\n",
" <th>Value € - Redemption</th>\n",
" <th>Value € - NetFlows</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>003</td>\n",
" <td>166</td>\n",
" <td>166</td>\n",
" <td>200127202</td>\n",
" <td>France</td>\n",
" <td>France</td>\n",
" <td>Equity</td>\n",
" <td>Investissement</td>\n",
" <td>SICAV</td>\n",
" <td>NO</td>\n",
" <td>...</td>\n",
" <td>2020-11-05</td>\n",
" <td>1636.000</td>\n",
" <td>0.000</td>\n",
" <td>1636.000</td>\n",
" <td>280983.00</td>\n",
" <td>0.00</td>\n",
" <td>280983.00</td>\n",
" <td>280983.00</td>\n",
" <td>0.00</td>\n",
" <td>280983.00</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>003</td>\n",
" <td>166</td>\n",
" <td>166</td>\n",
" <td>406533</td>\n",
" <td>France</td>\n",
" <td>France</td>\n",
" <td>Diversified</td>\n",
" <td>Patrimoine</td>\n",
" <td>FCP</td>\n",
" <td>NO</td>\n",
" <td>...</td>\n",
" <td>2015-03-09</td>\n",
" <td>144.690</td>\n",
" <td>0.000</td>\n",
" <td>144.690</td>\n",
" <td>99985.13</td>\n",
" <td>0.00</td>\n",
" <td>99985.13</td>\n",
" <td>99985.13</td>\n",
" <td>0.00</td>\n",
" <td>99985.13</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>003</td>\n",
" <td>166</td>\n",
" <td>166</td>\n",
" <td>406533</td>\n",
" <td>France</td>\n",
" <td>France</td>\n",
" <td>Equity</td>\n",
" <td>Investissement</td>\n",
" <td>FCP</td>\n",
" <td>NO</td>\n",
" <td>...</td>\n",
" <td>2016-10-26</td>\n",
" <td>0.000</td>\n",
" <td>-8.321</td>\n",
" <td>-8.321</td>\n",
" <td>0.00</td>\n",
" <td>-9384.76</td>\n",
" <td>-9384.76</td>\n",
" <td>0.00</td>\n",
" <td>-9384.76</td>\n",
" <td>-9384.76</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>003</td>\n",
" <td>166</td>\n",
" <td>166</td>\n",
" <td>406533</td>\n",
" <td>France</td>\n",
" <td>France</td>\n",
" <td>Equity</td>\n",
" <td>Investissement</td>\n",
" <td>FCP</td>\n",
" <td>NO</td>\n",
" <td>...</td>\n",
" <td>2018-10-18</td>\n",
" <td>0.000</td>\n",
" <td>-22.083</td>\n",
" <td>-22.083</td>\n",
" <td>0.00</td>\n",
" <td>-25227.40</td>\n",
" <td>-25227.40</td>\n",
" <td>0.00</td>\n",
" <td>-25227.40</td>\n",
" <td>-25227.40</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>003</td>\n",
" <td>166</td>\n",
" <td>166</td>\n",
" <td>406533</td>\n",
" <td>France</td>\n",
" <td>France</td>\n",
" <td>Equity</td>\n",
" <td>Investissement</td>\n",
" <td>FCP</td>\n",
" <td>NO</td>\n",
" <td>...</td>\n",
" <td>2019-04-08</td>\n",
" <td>0.000</td>\n",
" <td>-465.992</td>\n",
" <td>-465.992</td>\n",
" <td>0.00</td>\n",
" <td>-563775.76</td>\n",
" <td>-563775.76</td>\n",
" <td>0.00</td>\n",
" <td>-563775.76</td>\n",
" <td>-563775.76</td>\n",
" </tr>\n",
" <tr>\n",
" <th>...</th>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2574456</th>\n",
" <td>Private Client</td>\n",
" <td>Private Client</td>\n",
" <td>Private Client</td>\n",
" <td>Private Client</td>\n",
" <td>Luxembourg</td>\n",
" <td>Luxembourg</td>\n",
" <td>Fixed Income</td>\n",
" <td>Sécurité</td>\n",
" <td>FCP</td>\n",
" <td>NO</td>\n",
" <td>...</td>\n",
" <td>2015-06-12</td>\n",
" <td>0.000</td>\n",
" <td>-20.000</td>\n",
" <td>-20.000</td>\n",
" <td>0.00</td>\n",
" <td>-34294.40</td>\n",
" <td>-34294.40</td>\n",
" <td>0.00</td>\n",
" <td>-34294.40</td>\n",
" <td>-34294.40</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2574457</th>\n",
" <td>Private Client</td>\n",
" <td>Private Client</td>\n",
" <td>Private Client</td>\n",
" <td>Private Client</td>\n",
" <td>Luxembourg</td>\n",
" <td>Luxembourg</td>\n",
" <td>Fixed Income</td>\n",
" <td>Sécurité</td>\n",
" <td>FCP</td>\n",
" <td>NO</td>\n",
" <td>...</td>\n",
" <td>2015-09-18</td>\n",
" <td>328.726</td>\n",
" <td>0.000</td>\n",
" <td>328.726</td>\n",
" <td>564028.07</td>\n",
" <td>0.00</td>\n",
" <td>564028.07</td>\n",
" <td>564028.07</td>\n",
" <td>0.00</td>\n",
" <td>564028.07</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2574458</th>\n",
" <td>Private Client</td>\n",
" <td>Private Client</td>\n",
" <td>Private Client</td>\n",
" <td>Private Client</td>\n",
" <td>Luxembourg</td>\n",
" <td>Luxembourg</td>\n",
" <td>Fixed Income</td>\n",
" <td>Sécurité</td>\n",
" <td>FCP</td>\n",
" <td>NO</td>\n",
" <td>...</td>\n",
" <td>2015-09-25</td>\n",
" <td>4.443</td>\n",
" <td>0.000</td>\n",
" <td>4.443</td>\n",
" <td>7603.66</td>\n",
" <td>0.00</td>\n",
" <td>7603.66</td>\n",
" <td>7603.66</td>\n",
" <td>0.00</td>\n",
" <td>7603.66</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2574459</th>\n",
" <td>Private Client</td>\n",
" <td>Private Client</td>\n",
" <td>Private Client</td>\n",
" <td>Private Client</td>\n",
" <td>Luxembourg</td>\n",
" <td>Luxembourg</td>\n",
" <td>Fixed Income</td>\n",
" <td>Sécurité</td>\n",
" <td>FCP</td>\n",
" <td>NO</td>\n",
" <td>...</td>\n",
" <td>2015-11-09</td>\n",
" <td>0.000</td>\n",
" <td>-440.000</td>\n",
" <td>-440.000</td>\n",
" <td>0.00</td>\n",
" <td>-754696.80</td>\n",
" <td>-754696.80</td>\n",
" <td>0.00</td>\n",
" <td>-754696.80</td>\n",
" <td>-754696.80</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2574460</th>\n",
" <td>Private Client</td>\n",
" <td>Private Client</td>\n",
" <td>Private Client</td>\n",
" <td>Private Client</td>\n",
" <td>Luxembourg</td>\n",
" <td>Luxembourg</td>\n",
" <td>Fixed Income</td>\n",
" <td>Sécurité</td>\n",
" <td>SICAV</td>\n",
" <td>NO</td>\n",
" <td>...</td>\n",
" <td>2016-01-11</td>\n",
" <td>3595.000</td>\n",
" <td>0.000</td>\n",
" <td>3595.000</td>\n",
" <td>358385.55</td>\n",
" <td>0.00</td>\n",
" <td>358385.55</td>\n",
" <td>358385.55</td>\n",
" <td>0.00</td>\n",
" <td>358385.55</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"<p>2574461 rows × 24 columns</p>\n",
"</div>"
],
"text/plain": [
" Agreement - Code Company - Id Company - Ultimate Parent Id \\\n",
"0 003 166 166 \n",
"1 003 166 166 \n",
"2 003 166 166 \n",
"3 003 166 166 \n",
"4 003 166 166 \n",
"... ... ... ... \n",
"2574456 Private Client Private Client Private Client \n",
"2574457 Private Client Private Client Private Client \n",
"2574458 Private Client Private Client Private Client \n",
"2574459 Private Client Private Client Private Client \n",
"2574460 Private Client Private Client Private Client \n",
"\n",
" Registrar Account - ID Registrar Account - Region \\\n",
"0 200127202 France \n",
"1 406533 France \n",
"2 406533 France \n",
"3 406533 France \n",
"4 406533 France \n",
"... ... ... \n",
"2574456 Private Client Luxembourg \n",
"2574457 Private Client Luxembourg \n",
"2574458 Private Client Luxembourg \n",
"2574459 Private Client Luxembourg \n",
"2574460 Private Client Luxembourg \n",
"\n",
" RegistrarAccount - Country Product - Asset Type Product - Strategy \\\n",
"0 France Equity Investissement \n",
"1 France Diversified Patrimoine \n",
"2 France Equity Investissement \n",
"3 France Equity Investissement \n",
"4 France Equity Investissement \n",
"... ... ... ... \n",
"2574456 Luxembourg Fixed Income Sécurité \n",
"2574457 Luxembourg Fixed Income Sécurité \n",
"2574458 Luxembourg Fixed Income Sécurité \n",
"2574459 Luxembourg Fixed Income Sécurité \n",
"2574460 Luxembourg Fixed Income Sécurité \n",
"\n",
" Product - Legal Status Product - Is Dedie ? ... Centralisation Date \\\n",
"0 SICAV NO ... 2020-11-05 \n",
"1 FCP NO ... 2015-03-09 \n",
"2 FCP NO ... 2016-10-26 \n",
"3 FCP NO ... 2018-10-18 \n",
"4 FCP NO ... 2019-04-08 \n",
"... ... ... ... ... \n",
"2574456 FCP NO ... 2015-06-12 \n",
"2574457 FCP NO ... 2015-09-18 \n",
"2574458 FCP NO ... 2015-09-25 \n",
"2574459 FCP NO ... 2015-11-09 \n",
"2574460 SICAV NO ... 2016-01-11 \n",
"\n",
" Quantity - Subscription Quantity - Redemption Quantity - NetFlows \\\n",
"0 1636.000 0.000 1636.000 \n",
"1 144.690 0.000 144.690 \n",
"2 0.000 -8.321 -8.321 \n",
"3 0.000 -22.083 -22.083 \n",
"4 0.000 -465.992 -465.992 \n",
"... ... ... ... \n",
"2574456 0.000 -20.000 -20.000 \n",
"2574457 328.726 0.000 328.726 \n",
"2574458 4.443 0.000 4.443 \n",
"2574459 0.000 -440.000 -440.000 \n",
"2574460 3595.000 0.000 3595.000 \n",
"\n",
" Value Ccy - Subscription Value Ccy - Redemption \\\n",
"0 280983.00 0.00 \n",
"1 99985.13 0.00 \n",
"2 0.00 -9384.76 \n",
"3 0.00 -25227.40 \n",
"4 0.00 -563775.76 \n",
"... ... ... \n",
"2574456 0.00 -34294.40 \n",
"2574457 564028.07 0.00 \n",
"2574458 7603.66 0.00 \n",
"2574459 0.00 -754696.80 \n",
"2574460 358385.55 0.00 \n",
"\n",
" Value Ccy - NetFlows Value € - Subscription Value € - Redemption \\\n",
"0 280983.00 280983.00 0.00 \n",
"1 99985.13 99985.13 0.00 \n",
"2 -9384.76 0.00 -9384.76 \n",
"3 -25227.40 0.00 -25227.40 \n",
"4 -563775.76 0.00 -563775.76 \n",
"... ... ... ... \n",
"2574456 -34294.40 0.00 -34294.40 \n",
"2574457 564028.07 564028.07 0.00 \n",
"2574458 7603.66 7603.66 0.00 \n",
"2574459 -754696.80 0.00 -754696.80 \n",
"2574460 358385.55 358385.55 0.00 \n",
"\n",
" Value € - NetFlows \n",
"0 280983.00 \n",
"1 99985.13 \n",
"2 -9384.76 \n",
"3 -25227.40 \n",
"4 -563775.76 \n",
"... ... \n",
"2574456 -34294.40 \n",
"2574457 564028.07 \n",
"2574458 7603.66 \n",
"2574459 -754696.80 \n",
"2574460 358385.55 \n",
"\n",
"[2574461 rows x 24 columns]"
]
},
"execution_count": 40,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Preview the first rows of the Flows frame\n",
"dg.head(n=5)"
]
},
{
"cell_type": "code",
"execution_count": 76,
"id": "b31d3cb3-479c-4d2b-b6f5-8f78cd69407a",
"metadata": {},
"outputs": [],
"source": [
"# Drop the technical accounts and parse the centralisation dates\n",
"\n",
"import pandas as pd\n",
"import numpy as np\n",
"\n",
"# Labels found in the account-ID column that are filtered out as technical accounts\n",
"TECHNICAL_ACCOUNT_IDS = ['Off Distribution', 'Private Clients', 'Private Client']\n",
"\n",
"\n",
"def drop_technical_accounts(frame):\n",
"    \"\"\"Return a new frame without technical accounts and with parsed dates.\n",
"\n",
"    The filtered rows are copied before the date column is assigned, so the\n",
"    cell can be re-run on an already filtered frame without assigning on a\n",
"    slice (SettingWithCopyWarning) and without mutating the frame passed in.\n",
"    \"\"\"\n",
"    is_technical = frame['Registrar Account - ID'].isin(TECHNICAL_ACCOUNT_IDS)\n",
"    kept = frame.loc[~is_technical].copy()\n",
"    kept['Centralisation Date'] = pd.to_datetime(kept['Centralisation Date'])\n",
"    return kept\n",
"\n",
"\n",
"df = drop_technical_accounts(df)\n",
"dg = drop_technical_accounts(dg)"
]
},
{
"cell_type": "code",
"execution_count": 77,
"id": "dec37ff8-0f54-4e3e-ac63-a71f9b3583d9",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"(431, 2)\n"
]
},
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>Registrar Account - ID</th>\n",
" <th>Value - AUM €</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>3890</th>\n",
" <td>420350</td>\n",
" <td>1.623308e+09</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2622</th>\n",
" <td>364765</td>\n",
" <td>1.383209e+09</td>\n",
" </tr>\n",
" <tr>\n",
" <th>956</th>\n",
" <td>200127454</td>\n",
" <td>8.784361e+08</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2598</th>\n",
" <td>312933</td>\n",
" <td>8.379604e+08</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1099</th>\n",
" <td>200127809</td>\n",
" <td>8.342839e+08</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3880</th>\n",
" <td>420259</td>\n",
" <td>8.296663e+08</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2634</th>\n",
" <td>364907</td>\n",
" <td>8.151083e+08</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2785</th>\n",
" <td>366441</td>\n",
" <td>7.707213e+08</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2637</th>\n",
" <td>364929</td>\n",
" <td>7.479766e+08</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2710</th>\n",
" <td>365538</td>\n",
" <td>7.200408e+08</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2775</th>\n",
" <td>366403</td>\n",
" <td>7.092081e+08</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3818</th>\n",
" <td>418961</td>\n",
" <td>6.529718e+08</td>\n",
" </tr>\n",
" <tr>\n",
" <th>336</th>\n",
" <td>200058108</td>\n",
" <td>6.110961e+08</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1764</th>\n",
" <td>200131722</td>\n",
" <td>5.758019e+08</td>\n",
" </tr>\n",
" <tr>\n",
" <th>409</th>\n",
" <td>200073354</td>\n",
" <td>4.619978e+08</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2741</th>\n",
" <td>365848</td>\n",
" <td>4.563625e+08</td>\n",
" </tr>\n",
" <tr>\n",
" <th>423</th>\n",
" <td>200075932</td>\n",
" <td>4.375607e+08</td>\n",
" </tr>\n",
" <tr>\n",
" <th>925</th>\n",
" <td>200127410</td>\n",
" <td>3.920364e+08</td>\n",
" </tr>\n",
" <tr>\n",
" <th>872</th>\n",
" <td>200127316</td>\n",
" <td>3.707238e+08</td>\n",
" </tr>\n",
" <tr>\n",
" <th>74</th>\n",
" <td>200001349</td>\n",
" <td>3.650226e+08</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" Registrar Account - ID Value - AUM €\n",
"3890 420350 1.623308e+09\n",
"2622 364765 1.383209e+09\n",
"956 200127454 8.784361e+08\n",
"2598 312933 8.379604e+08\n",
"1099 200127809 8.342839e+08\n",
"3880 420259 8.296663e+08\n",
"2634 364907 8.151083e+08\n",
"2785 366441 7.707213e+08\n",
"2637 364929 7.479766e+08\n",
"2710 365538 7.200408e+08\n",
"2775 366403 7.092081e+08\n",
"3818 418961 6.529718e+08\n",
"336 200058108 6.110961e+08\n",
"1764 200131722 5.758019e+08\n",
"409 200073354 4.619978e+08\n",
"2741 365848 4.563625e+08\n",
"423 200075932 4.375607e+08\n",
"925 200127410 3.920364e+08\n",
"872 200127316 3.707238e+08\n",
"74 200001349 3.650226e+08"
]
},
"execution_count": 77,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Reference date and selection of the main accounts (more than 5M in 'Value - AUM €')\n",
"\n",
"ref_date = pd.Timestamp('2025-10-31')\n",
"\n",
"# AUM rows recorded exactly at the reference date\n",
"on_ref_date = df['Centralisation Date'] == ref_date\n",
"df_ref = df.loc[on_ref_date]\n",
"\n",
"# Total AUM per account, largest first\n",
"aum_account = df_ref.groupby('Registrar Account - ID', as_index=False)['Value - AUM €'].sum()\n",
"aum_account = aum_account.sort_values(by='Value - AUM €', ascending=False)\n",
"\n",
"# Keep only the accounts above the 5M threshold\n",
"above_threshold = aum_account['Value - AUM €'] > 5_000_000\n",
"aum_account = aum_account.loc[above_threshold]\n",
"selected_accounts = aum_account['Registrar Account - ID']\n",
"\n",
"print(aum_account.shape)\n",
"aum_account.head(20)"
]
},
{
"cell_type": "code",
"execution_count": 44,
"id": "95238a61-c263-43b4-965f-e09226ec4c73",
"metadata": {},
"outputs": [],
"source": [
"# Restrict both datasets to the selected accounts; copies are taken so that\n",
"# columns added later do not touch df / dg\n",
"keep_aum = df['Registrar Account - ID'].isin(selected_accounts)\n",
"keep_flows = dg['Registrar Account - ID'].isin(selected_accounts)\n",
"df_aum = df.loc[keep_aum].copy()\n",
"df_flows = dg.loc[keep_flows].copy()"
]
},
{
"cell_type": "code",
"execution_count": 45,
"id": "a6dcac05-e9f9-4c25-ade9-7b1f9e35582f",
"metadata": {},
"outputs": [],
"source": [
"# Clustering: shared column names and a common monthly key\n",
"\n",
"ID_COL = \"Registrar Account - ID\"\n",
"FLOW_COL = \"Quantity - NetFlows\"\n",
"AUM_COL = \"Quantity - AUM\"\n",
"\n",
"# Flows are daily while AUM is monthly, so both frames get the same month key\n",
"for frame in (df_flows, df_aum):\n",
"    # Dates that cannot be parsed become NaT\n",
"    frame[\"Centralisation Date\"] = pd.to_datetime(frame[\"Centralisation Date\"], errors=\"coerce\")\n",
"    frame[\"month\"] = frame[\"Centralisation Date\"].dt.to_period(\"M\").dt.to_timestamp(\"M\")"
]
},
{
"cell_type": "code",
"execution_count": 46,
"id": "5ea26597-af38-41f1-9cde-e9cd115e8678",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"(33972, 6)\n"
]
},
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>Registrar Account - ID</th>\n",
" <th>month</th>\n",
" <th>aum_qty</th>\n",
" <th>net_flow_qty</th>\n",
" <th>gross_flow_qty</th>\n",
" <th>n_tx</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>18872</td>\n",
" <td>2015-01-31</td>\n",
" <td>179864.637</td>\n",
" <td>-1524.010</td>\n",
" <td>15230.010</td>\n",
" <td>32</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>18872</td>\n",
" <td>2015-02-28</td>\n",
" <td>186761.736</td>\n",
" <td>7247.100</td>\n",
" <td>18571.880</td>\n",
" <td>38</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>18872</td>\n",
" <td>2015-03-31</td>\n",
" <td>190357.718</td>\n",
" <td>3655.380</td>\n",
" <td>9754.040</td>\n",
" <td>47</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>18872</td>\n",
" <td>2015-04-30</td>\n",
" <td>191429.324</td>\n",
" <td>-218.394</td>\n",
" <td>12840.950</td>\n",
" <td>39</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>18872</td>\n",
" <td>2015-05-31</td>\n",
" <td>189056.475</td>\n",
" <td>-4782.849</td>\n",
" <td>6332.849</td>\n",
" <td>24</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" Registrar Account - ID month aum_qty net_flow_qty gross_flow_qty \\\n",
"0 18872 2015-01-31 179864.637 -1524.010 15230.010 \n",
"1 18872 2015-02-28 186761.736 7247.100 18571.880 \n",
"2 18872 2015-03-31 190357.718 3655.380 9754.040 \n",
"3 18872 2015-04-30 191429.324 -218.394 12840.950 \n",
"4 18872 2015-05-31 189056.475 -4782.849 6332.849 \n",
"\n",
" n_tx \n",
"0 32 \n",
"1 38 \n",
"2 47 \n",
"3 39 \n",
"4 24 "
]
},
"execution_count": 46,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Client-month panel: the study is monthly because AUM is a monthly value\n",
"\n",
"ID_COL = \"Registrar Account - ID\"\n",
"FLOW_COL = \"Quantity - NetFlows\"\n",
"AUM_COL = \"Quantity - AUM\"\n",
"month_keys = [ID_COL, \"month\"]\n",
"\n",
"# 1) Monthly flows per client\n",
"flows_valid = df_flows.dropna(subset=month_keys + [FLOW_COL])\n",
"flows_valid = flows_valid.assign(gross_flow_qty=flows_valid[FLOW_COL].abs())  # absolute quantity moved\n",
"df_flows_m = flows_valid.groupby(month_keys, as_index=False).agg(\n",
"    net_flow_qty=(FLOW_COL, \"sum\"),  # net quantity change over the month\n",
"    gross_flow_qty=(\"gross_flow_qty\", \"sum\"),  # total quantity traded over the month\n",
"    n_tx=(FLOW_COL, \"size\"),  # number of flow rows in the month\n",
")\n",
"\n",
"# 2) Monthly holdings per client: quantity summed over all rows of the month\n",
"aum_valid = df_aum.dropna(subset=month_keys + [AUM_COL])\n",
"df_aum_m = aum_valid.groupby(month_keys, as_index=False).agg(aum_qty=(AUM_COL, \"sum\"))\n",
"\n",
"# 3) Left join: keep every client-month that has AUM, attach flows when present\n",
"df_month = df_aum_m.merge(df_flows_m, on=month_keys, how=\"left\")\n",
"\n",
"# 4) Client-months without any transaction get zero flows\n",
"df_month = df_month.fillna({\"net_flow_qty\": 0.0, \"gross_flow_qty\": 0.0, \"n_tx\": 0})\n",
"df_month = df_month.astype({\"n_tx\": int})\n",
"\n",
"print(df_month.shape)\n",
"df_month.head()"
]
},
{
"cell_type": "code",
"execution_count": 53,
"id": "f9c7a0e3-b15a-4404-a99d-b23894cbd3f4",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>Registrar Account - ID</th>\n",
" <th>n_months</th>\n",
" <th>n_active_months</th>\n",
" <th>flow_freq</th>\n",
" <th>aum_qty_mean</th>\n",
" <th>aum_qty_median</th>\n",
" <th>net_flow_qty_sum</th>\n",
" <th>gross_flow_qty_sum</th>\n",
" <th>gross_flow_qty_mean</th>\n",
" <th>net_flow_qty_vol</th>\n",
" <th>rel_intensity</th>\n",
" <th>netflow_to_aum</th>\n",
" <th>n_tx_total</th>\n",
" <th>log_aum_qty_mean</th>\n",
" <th>log_gross_flow_qty_mean</th>\n",
" <th>gross_flow_to_aum</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>18872</td>\n",
" <td>130</td>\n",
" <td>130</td>\n",
" <td>1.000000</td>\n",
" <td>143505.697354</td>\n",
" <td>144653.1645</td>\n",
" <td>-45677.1480</td>\n",
" <td>1.244126e+06</td>\n",
" <td>9570.200015</td>\n",
" <td>9832.357264</td>\n",
" <td>0.069449</td>\n",
" <td>-0.003918</td>\n",
" <td>1926</td>\n",
" <td>11.874137</td>\n",
" <td>9.166514</td>\n",
" <td>8.669523</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>200000076</td>\n",
" <td>130</td>\n",
" <td>119</td>\n",
" <td>0.915385</td>\n",
" <td>24141.541138</td>\n",
" <td>19888.8255</td>\n",
" <td>54791.9840</td>\n",
" <td>2.314415e+05</td>\n",
" <td>1780.319492</td>\n",
" <td>2838.000232</td>\n",
" <td>0.083230</td>\n",
" <td>-0.000893</td>\n",
" <td>518</td>\n",
" <td>10.091731</td>\n",
" <td>7.485110</td>\n",
" <td>9.586858</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>200000082</td>\n",
" <td>130</td>\n",
" <td>130</td>\n",
" <td>1.000000</td>\n",
" <td>422994.464523</td>\n",
" <td>462973.7880</td>\n",
" <td>178371.1590</td>\n",
" <td>2.327246e+06</td>\n",
" <td>17901.894469</td>\n",
" <td>13288.481111</td>\n",
" <td>0.047480</td>\n",
" <td>0.005194</td>\n",
" <td>7103</td>\n",
" <td>12.955117</td>\n",
" <td>9.792718</td>\n",
" <td>5.501836</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>200000146</td>\n",
" <td>130</td>\n",
" <td>130</td>\n",
" <td>1.000000</td>\n",
" <td>212108.397869</td>\n",
" <td>210616.5330</td>\n",
" <td>457533.3310</td>\n",
" <td>1.150546e+06</td>\n",
" <td>8850.350438</td>\n",
" <td>10074.748210</td>\n",
" <td>0.051622</td>\n",
" <td>0.024910</td>\n",
" <td>4774</td>\n",
" <td>12.264857</td>\n",
" <td>9.088325</td>\n",
" <td>5.424328</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>200000147</td>\n",
" <td>130</td>\n",
" <td>130</td>\n",
" <td>1.000000</td>\n",
" <td>145729.199224</td>\n",
" <td>79260.8255</td>\n",
" <td>677492.4351</td>\n",
" <td>1.213963e+06</td>\n",
" <td>9338.178685</td>\n",
" <td>13868.197522</td>\n",
" <td>0.061164</td>\n",
" <td>0.022213</td>\n",
" <td>7585</td>\n",
" <td>11.889512</td>\n",
" <td>9.141974</td>\n",
" <td>8.330268</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" Registrar Account - ID n_months n_active_months flow_freq aum_qty_mean \\\n",
"0 18872 130 130 1.000000 143505.697354 \n",
"1 200000076 130 119 0.915385 24141.541138 \n",
"2 200000082 130 130 1.000000 422994.464523 \n",
"3 200000146 130 130 1.000000 212108.397869 \n",
"4 200000147 130 130 1.000000 145729.199224 \n",
"\n",
" aum_qty_median net_flow_qty_sum gross_flow_qty_sum gross_flow_qty_mean \\\n",
"0 144653.1645 -45677.1480 1.244126e+06 9570.200015 \n",
"1 19888.8255 54791.9840 2.314415e+05 1780.319492 \n",
"2 462973.7880 178371.1590 2.327246e+06 17901.894469 \n",
"3 210616.5330 457533.3310 1.150546e+06 8850.350438 \n",
"4 79260.8255 677492.4351 1.213963e+06 9338.178685 \n",
"\n",
" net_flow_qty_vol rel_intensity netflow_to_aum n_tx_total \\\n",
"0 9832.357264 0.069449 -0.003918 1926 \n",
"1 2838.000232 0.083230 -0.000893 518 \n",
"2 13288.481111 0.047480 0.005194 7103 \n",
"3 10074.748210 0.051622 0.024910 4774 \n",
"4 13868.197522 0.061164 0.022213 7585 \n",
"\n",
" log_aum_qty_mean log_gross_flow_qty_mean gross_flow_to_aum \n",
"0 11.874137 9.166514 8.669523 \n",
"1 10.091731 7.485110 9.586858 \n",
"2 12.955117 9.792718 5.501836 \n",
"3 12.264857 9.088325 5.424328 \n",
"4 11.889512 9.141974 8.330268 "
]
},
"execution_count": 53,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"eps = 1e-9 \n",
"\n",
"# 1) Active month indicator: did the client trade this month?\n",
"df_month[\"active_month\"] = (df_month[\"gross_flow_qty\"] > 0).astype(int)\n",
"\n",
"#client avec beaucoup de mois à 0 → “stable / dormant”\n",
"#client actif presque tous les mois → “rebalancer / institutionnel actif”\n",
"\n",
"\n",
"# 2) Monthly relative intensity (turnover proxy in quantity terms) : Mesurer lintensité de trading relativement à la taille et pouvoir ocmparer client petit avec client plus gros\n",
"df_month[\"rel_intensity_m\"] = df_month[\"gross_flow_qty\"] / (df_month[\"aum_qty\"].abs() + eps)\n",
"\n",
"# 3) Monthly net flow ratio (directional change): sert a Capturer la direction de la dynamique\n",
"df_month[\"netflow_to_aum_m\"] = df_month[\"net_flow_qty\"] / (df_month[\"aum_qty\"].abs() + eps)\n",
"\n",
"# 4) Aggregate to client-level features (1 row per client)\n",
"df_client_feat = (\n",
" df_month.groupby(ID_COL, as_index=False)\n",
" .agg(\n",
" # Coverage / activity\n",
" n_months=(\"month\", \"nunique\"),\n",
" n_active_months=(\"active_month\", \"sum\"),\n",
" flow_freq=(\"active_month\", \"mean\"),\n",
"\n",
" # Size in quantity terms\n",
" aum_qty_mean=(\"aum_qty\", \"mean\"),\n",
" aum_qty_median=(\"aum_qty\", \"median\"),\n",
"\n",
" # Flows in quantity terms\n",
" net_flow_qty_sum=(\"net_flow_qty\", \"sum\"),\n",
" gross_flow_qty_sum=(\"gross_flow_qty\", \"sum\"),\n",
" gross_flow_qty_mean=(\"gross_flow_qty\", \"mean\"),\n",
"\n",
" # Dispersion / volatility proxy\n",
" net_flow_qty_vol=(\"net_flow_qty\", \"std\"),\n",
" rel_intensity=(\"rel_intensity_m\", \"mean\"),\n",
" netflow_to_aum=(\"netflow_to_aum_m\", \"mean\"),\n",
"\n",
" # Trading frequency proxy\n",
" n_tx_total=(\"n_tx\", \"sum\"),\n",
" )\n",
")\n",
"\n",
"# 5) Clean NaNs due to std on constant series\n",
"df_client_feat[\"net_flow_qty_vol\"] = df_client_feat[\"net_flow_qty_vol\"].fillna(0.0)\n",
"\n",
"# 6) Log transforms (useful because distributions are heavy-tailed)\n",
"df_client_feat[\"log_aum_qty_mean\"] = np.log1p(df_client_feat[\"aum_qty_mean\"].clip(lower=0))\n",
"df_client_feat[\"log_gross_flow_qty_mean\"] = np.log1p(df_client_feat[\"gross_flow_qty_mean\"].clip(lower=0))\n",
"\n",
"# 7) Global turnover proxy\n",
"df_client_feat[\"gross_flow_to_aum\"] = df_client_feat[\"gross_flow_qty_sum\"] / (df_client_feat[\"aum_qty_mean\"].abs() + eps)\n",
"\n",
"df_client_feat.head()"
]
},
{
"cell_type": "code",
"execution_count": 54,
"id": "4ddd1305-fe5a-4d0f-b4d3-b07de27b5dc6",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"(431, 16)"
]
},
"execution_count": 54,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df_client_feat.shape"
]
},
{
"cell_type": "code",
"execution_count": 55,
"id": "34a37448-ab63-4fc1-8c93-f9f59db385c2",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Filtered clients: (421, 16)\n"
]
}
],
"source": [
"# Minimal eligibility filters (adjust if needed):\n",
"#   - at least 6 observed months\n",
"#   - non-zero mean holdings\n",
"has_enough_history = df_client_feat[\"n_months\"] >= 6\n",
"has_holdings = df_client_feat[\"aum_qty_mean\"].abs() > 0\n",
"\n",
"# Work on an independent copy so later column assignments never touch df_client_feat.\n",
"dfc = df_client_feat.loc[has_enough_history & has_holdings].copy()\n",
"print(\"Filtered clients:\", dfc.shape)"
]
},
{
"cell_type": "code",
"execution_count": 59,
"id": "2763cc28-f9a7-4ced-8331-c2b79ac7c122",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Q33: 1946.5229933948517 Q66: 8013.920450704226\n",
"seg_quantiles\n",
"High-flow 143\n",
"Low-flow 139\n",
"Intermediate-flow 139\n",
"Name: count, dtype: int64\n"
]
}
],
"source": [
"# Baseline clustering 1: three segments from a single activity variable\n",
"\n",
"import matplotlib.pyplot as plt\n",
"\n",
"# Segmentation variable: average monthly gross traded quantity per client\n",
"x = dfc[\"gross_flow_qty_mean\"].copy()\n",
"\n",
"# Cut points at the 33rd and 66th percentiles of that variable\n",
"q33 = x.quantile(0.33)\n",
"q66 = x.quantile(0.66)\n",
"\n",
"segment_edges = [-np.inf, q33, q66, np.inf]\n",
"segment_names = [\"Low-flow\", \"Intermediate-flow\", \"High-flow\"]\n",
"dfc[\"seg_quantiles\"] = pd.cut(x, bins=segment_edges, labels=segment_names)\n",
"\n",
"print(\"Q33:\", q33, \" Q66:\", q66)\n",
"print(dfc[\"seg_quantiles\"].value_counts(dropna=False))"
]
},
{
"cell_type": "code",
"execution_count": 60,
"id": "5afe137b-a09c-4fbc-a03c-b54f0422862d",
"metadata": {},
"outputs": [
{
"data": {
"image/png": "iVBORw0KGgoAAAANSUhEUgAAAqQAAAHHCAYAAABpzkrAAAAAOnRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjEwLjgsIGh0dHBzOi8vbWF0cGxvdGxpYi5vcmcvwVt1zgAAAAlwSFlzAAAPYQAAD2EBqD+naQAAaHZJREFUeJzt3XdYU+ffBvA7BAgbBJmKoLjALSqi4qSiolXr1ipYVytu62rrrBZrrVqt1da2av25rXvvjQv3AEVxVcENIsrK8/7hxXkNCco+BO/PdXFpnjw553tOkpM7ZzxRCCEEiIiIiIhkYiB3AURERET0cWMgJSIiIiJZMZASERERkawYSImIiIhIVgykRERERCQrBlIiIiIikhUDKRERERHJioGUiIiIiGTFQEpEREREssr3QDpp0iQoFIr8ng0AoHHjxmjcuLF0++DBg1AoFFi3bl2BzD84OBju7u4FMq+cSkhIQN++feHk5ASFQoFhw4bJXRIVkCVLlkChUODMmTMf7JvxvURvxcbGomPHjrCzs4NCocCcOXOk7czBgwflLi9LEhIS4ODggOXLl8tdSp5Jf23fvn07z6a5cOFClCpVCklJSXk2TV0UCgUmTZqU5b6DBg3K13oKs4zrKj+e98y4u7ujdevW+T6frMrrem7fvg2FQoElS5Z8sG9+ZZ1sBdL0Jz/9z8TEBC4uLggICMDcuXPx8uXLPCnqwYMHmDRpEs6fP58n08tLhbm2rPjhhx+wZMkSfPXVV1i2bBl69uwpd0mUx3777bcsbVQo+4YPH45du3Zh3LhxWLZsGVq0aCF3Sdn2yy+/wNLSEl27dpW7lGz74YcfsHHjxiz1ze37IDg4GMnJyfj9999zPI2cOH78OCZNmoQXL14U6HwLi+3bt2c5oOe1q1evYtKkSQUScElbjvaQTpkyBcuWLcOCBQswePBgAMCwYcNQpUoVXLx4UaPvd999h9evX2dr+g8ePMDkyZOzHfp2796N3bt3Z+sx2fW+2hYtWoTIyMh8nX9u7d+/H3Xr1sXEiRPx+eefw9vbW+6SKI8xkOaf/fv3o23btvj666/x+eefo2LFinKXlC0pKSn45Zdf0LdvXyiVSrnLybbMAmnPnj3x+vVruLm5SW25fR+YmJggKCgIs2bNghAix9P5kNevX+O7776Tbh8/fhyTJ0/+qAPp5MmTdd6XcV3ltatXr2Ly5MkMpDLJUSBt2bIlPv/8c/Tu3Rvjxo3Drl27sHfvXjx69AiffvqpRgA1NDSEiYlJnhWsS2JiIgDA2NgYxsbG+Tqv9zEyMoJKpZJt/lnx6NEj2NjYyF1GtqU/x/TxefXqldwlSPT1/ZNu69atePz4MTp37ix3KXlKqVTCxMQkz08P69y5M+7cuYMDBw7k6XTfZWJiAkNDw3ybflGir+uqMG3DCrM8O4e0adOmGD9+PO7cuYP//e9/Uruuc0j37NmDBg0awMbGBhYWFqhQoQK++eYbAG/P+6xduzYAoHfv3tLpAenfdBs3bozKlSsjPDwcDRs2hJmZmfTYzM57S0tLwzfffAMnJyeYm5vj008/xb179zT6uLu7Izg4WOux707zQ7XpOq/i1atXGDlyJFxdXaFSqVChQgXMnDlT6xt3+rlBGzduROXKlaFSqVCpUiXs3LlT9wrP4NGjR+jTpw8cHR1hYmKCatWqYenSpdL96ee5RUdHY9u2bVLt7/smuHjxYjRt2hQODg5QqVTw8vLCggULNPq0bt0aZcqU0fl4X19f1KpVS6Ptf//7H7y9vWFqagpbW1t07dpV67l433O8adMmBAYGwsXFBSqVCh4eHvj++++RlpamNf/58+ejTJkyMDU1RZ06dXDkyBGdr5GkpCRMnDgRZcuWhUqlgqurK0aPHp2lc8fSa7148SIaNWoEMzMzlC1bVjpv+dChQ/Dx8YGpqSkqVKiAvXv3ak3j3LlzaNmyJaysrGBhYYFmzZrhxIkTGn
3ST5c5duwYRowYAXt7e5ibm6N9+/Z4/Pix1M/d3R1XrlzBoUOHpOdY1/K+bxoZJSQkwNzcHEOHDtW67/79+1AqlQgNDX3venr69Cl69uwJKysr2NjYICgoCBcuXNA6Zyk4OBgWFha4efMmWrVqBUtLS/To0QNA1t9L79u+pJs3bx4qVaoEMzMzFCtWDLVq1cKKFSsyrT99/QshMH/+fGndvs/atWul13rx4sXx+eef47///pPu37x5MxQKhcZRpX///RcKhQKfffaZxrQ8PT3RpUuXbC2jLhs3boS7uzs8PDx03le5cmWYmJigcuXK2LBhg9Y2LbPzZXWdf3bx4kUEBwejTJkyMDExgZOTE7744gs8ffpU47HpnxFRUVEIDg6GjY0NrK2t0bt3b40vogqFAq9evcLSpUul9Z++zc54LmFm74Nbt25BoVBg9uzZWst//PhxKBQKrFy5Umrz9vaGra0tNm3a9N71OnfuXCiVSo29mj///DMUCgVGjBghtaWlpcHS0hJjxozRWK70Q9STJk3CqFGjAAClS5fOdDud08+J+/fvo127djA3N4eDg4N0CkrG5zQrn4cAkJycjAkTJsDb2xvW1tYwNzeHn5+fVoBPf33MnDkTf/zxBzw8PKBSqVC7dm2cPn1a6hccHIz58+dL6yXj+yyr59vu2LEDfn5+MDc3h6WlJQIDA3HlypX3PmbJkiXo1KkTAKBJkybSvDO+1o8ePYo6derAxMQEZcqUwT///KM1HYVCgUOHDmHgwIFwcHBAyZIls1VbTEwMevfujZIlS0KlUsHZ2Rlt27bV+Xn9oXoA4NatW+jUqRNsbW1hZmaGunXrYtu2be9dH+l0bRd0WbVqFby9vWFpaQkrKytUqVIFv/zyS5bmkS5Pv2r07NkT33zzDXbv3o1+/frp7HPlyhW0bt0aVatWxZQpU6BSqRAVFYVjx44BeLvhnTJlCiZMmID+/fvDz88PAFCvXj1pGk+fPkXLli3RtWtXfP7553B0dHxvXdOmTYNCocCYMWPw6NEjzJkzB/7+/jh//jxMTU2zvHxZqe1dQgh8+umnOHDgAPr06YPq1atj165dGDVqFP777z+tjeLRo0exfv16DBw4EJaWlpg7dy46dOiAu3fvws7OLtO6Xr9+jcaNGyMqKgqDBg1C6dKlsXbtWgQHB+PFixcYOnQoPD09sWzZMgwfPhwlS5bEyJEjAQD29vaZTnfBggWoVKkSPv30UxgaGmLLli0YOHAg1Go1QkJCAABdunRBr169cPr0aSmsA8CdO3dw4sQJ/PTTT1LbtGnTMH78eHTu3Bl9+/bF48ePMW/ePDRs2BDnzp3T2POU2XO8ZMkSWFhYYMSIEbCwsMD+/fsxYcIExMfHa8xrwYIFGDRoEPz8/DB8+HDcvn0b7dq1Q7FixTQ2Dmq1Gp9++imOHj2K/v37w9PTE5cuXcLs2bNx/fr1LJ2v9vz5c7Ru3Rpdu3ZFp06dsGDBAnTt2hXLly/HsGHD8OWXX6J79+746aef0LFjR9y7dw+WlpYA3r4f/Pz8YGVlhdGjR8PIyAi///47GjduLIXZdw0ePBjFihXDxIkTcfv2bcyZMweDBg3C6tWrAQBz5szB4MGDYWFhgW+//RYAtN4fH5pGRhYWFmjfvj1Wr16NWbNmaRzqXblyJYQQUmjURa1Wo02bNjh16hS++uorVKxYEZs2bUJQUJDO/qmpqQgICECDBg0wc+ZMmJmZZfm99KHtC/D21JohQ4agY8eOGDp0KN68eYOLFy/i5MmT6N69u86aGjZsKJ1z/cknn6BXr16ZLi/w9nXau3dv1K5dG6GhoYiNjcUvv/yCY8eOSa/1Bg0aQKFQ4PDhw6hatSoA4MiRIzAwMMDRo0elaT1+/BgRERHSxSxZWcbMHD9+HDVr1tRq3717Nzp06AAvLy+Ehobi6dOn0gdiTu3Zswe3bt1C79694eTkhCtXruCPP/7AlStXcOLECa1A37lzZ5
QuXRqhoaE4e/Ys/vzzTzg4OODHH38EACxbtgx9+/ZFnTp10L9/fwDQGayBzN8HZcqUQf369bF8+XIMHz5c4zHLly+
"text/plain": [
"<Figure size 640x480 with 1 Axes>"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"# Histogram of the per-client average monthly gross flow, with the two quantile cut points.\n",
"# X axis = average monthly activity; Y axis = number of clients at that activity level.\n",
"fig, ax = plt.subplots()\n",
"ax.hist(dfc[\"gross_flow_qty_mean\"], bins=100)\n",
"for cut in (q33, q66):\n",
"    ax.axvline(cut, linestyle=\"--\")\n",
"ax.set_xlabel(\"Average monthly gross flow (quantity)\")\n",
"ax.set_ylabel(\"Count\")\n",
"ax.set_title(\"Distribution of average monthly gross flows (quantity) with quantile thresholds\")\n",
"plt.show()"
]
},
{
"cell_type": "code",
"execution_count": 61,
"id": "619559a8-c205-4e25-a810-4d80894644c2",
"metadata": {},
"outputs": [
{
"data": {
"image/png": "iVBORw0KGgoAAAANSUhEUgAAAjIAAAHHCAYAAACle7JuAAAAOnRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjEwLjgsIGh0dHBzOi8vbWF0cGxvdGxpYi5vcmcvwVt1zgAAAAlwSFlzAAAPYQAAD2EBqD+naQAAUxBJREFUeJzt3XdYFFfbBvB76UgHqdKs2HvsWImIaNTYJQrGFntJYjTGHks0MUZjLHnf2KKxJLEk2BvW2BB7D9gRK0gRKef7g5f5HFhgwYVh9P5d117XztnZM8/Ozsw+e86ZGY0QQoCIiIhIhQyUDoCIiIiooJjIEBERkWoxkSEiIiLVYiJDREREqsVEhoiIiFSLiQwRERGpFhMZIiIiUi0mMkRERKRaTGSIiIhItZjIqIRGo8GUKVOk6RUrVkCj0SAqKqrQlx0SEgJvb29pOioqChqNBt9++22hLxsApkyZAo1GUyTLKqjU1FSMHTsWHh4eMDAwQMeOHZUOiYrIgQMHoNFo8Pvvv+c5b9Z9iTLEx8ejf//+cHFxgUajwahRo6TjzIoVK5QOTyfp6emoWrUqZsyYoXQoepO5bR84cEBvde7YsQOWlpZ49OiR3up85xOZzITg1KlTSodS6BITEzFlyhS9bpT6Upxj08Uvv/yCuXPnokuXLli5ciVGjx6tdEikZ2vXrsX8+fOVDuOtNHPmTKxYsQKDBw/G6tWr0bt3b6VDyrfffvsNd+7cwbBhw5QOJd9++uknnRPGN90P2rRpg3LlymHWrFkFriMb8Y5bvny5ACBOnjypdCi5AiAmT54sTaempoqkpCSRnp6ucx2PHj3KVo8uXr16JV6+fClNR0ZGCgBi7ty5+aqnoLGlpKSIpKQkvS2rMHTv3l2UKlVK6TCoEAUGBgovL69s5fv37xcAxMaNG/OsIzg4WGsd77r69euLxo0by8oyjzPLly9XJqh8qlGjhhg4cKDSYRRIlSpVRLNmzbKVp6WliaSkJJGWliaV5bQf5MdPP/0kSpQoIeLi4t6onkzvfIuMWhkaGsLMzKxQu1wSEhIAAMbGxjA1NS205eTFyMgIZmZmii1fFzExMbC1tVU6jHzL/I7p3VOcvnu17j+Zzpw5g7Nnz6Jbt25Kh6JXBgYGMDMzg4GBflOFzp07Izk5GRs3btRLfUxkdHTmzBkEBATA2toalpaWaNWqFf75559s8507dw7NmjWDubk53N3d8fXXX2P58uU6j2dJTk7G6NGj4ejoCCsrK3zwwQe4e/dutvm0jZE5deoU/P39UbJkSZibm6N06dL4+OOPAWSMa3F0dAQATJ06FRqNRjbuJiQkBJaWlrh58ybatm0LKysrBAUFSa/l1K///fffw8vLC+bm5mjWrBkuXLgge7158+Zo3rx5tve9XmdesWkbI5Oamorp06ejbNmyMDU1hbe3N7788kskJyfL5vP29ka7du1w+PBh1KtXD2ZmZihTpgxWrVql9fNklZCQgE8//RQeHh4wNTWFj48Pvv32W4j/3TQ+sx9///79uHjxohR7bl1kW7ZsQWBgINzc3GBqaoqyZcti+vTpSEtLk+YZNmwYLC0tkZiYmO39PXv2hIuLi2z+7du3w9fXFxYWFrCyskJgYCAuXrwoe19u3/GhQ4fQtWtXeHp6wtTUFB4eHhg9ejSSkpKyLX/jxo2oXLkyzMzMULVqVWzatEnrNpKeno758+ejSpUqMDMzg7OzMwYNGoRnz57lud4zY719+zbatWsHS0tLlCpVCosWLQIAnD9/Hi1btoSFhQW8vLywdu3abHX8+++/6Nq1K+zt7VGiRAk0aNAAoaGhsnkyxwBs2LABM2bMgLu7O8zMzNCqVSvcuHFDmq958+YIDQ3FrVu3pO9Y2+fNrY6shBDw9vZGhw4dsr328uVL2NjYYNCgQbmup6SkJIwYMQIlS5aUjhf37t3LNqYucx+6dOkSevXqBTs7OzRp0gSA7vtSbs
eXTOvWrUOdOnVgZWUFa2trVKtWDT/88EOO8Weu/8jISISGhkrrNrdj5b59+6Rt3dbWFh06dMDly5el18+dOweNRoOtW7dKZadPn4ZGo0Ht2rVldQUEBKB+/fr5+ozabN68GSYmJmjatGm21w4fPoz33nsPZmZmKFu2LJYuXZrtmJbbeKCs3+WtW7cwZMgQ+Pj4wNzcHA4ODujatWu2dZb5G3HkyBGMGTMGjo6OsLCwQKdOnWTjU7y9vXHx4kWEhYVJ6z/zmJ11jExO+0F8fDwsLCwwcuTIbPHfvXsXhoaGsq4kJycnVK9eHVu2bMlz3erCSC+1vOUuXrwIX19fWFtbY+zYsTA2NsbSpUvRvHlzhIWFSTvCvXv30KJFC2g0GowfPx4WFhb4z3/+k6/WjP79++PXX39Fr1690KhRI+zbtw+BgYF5vi8mJgatW7eGo6Mjxo0bB1tbW0RFReHPP/8EADg6OmLx4sUYPHgwOnXqhA8//BAAUL16damO1NRU+Pv7o0mTJvj2229RokSJXJe5atUqvHjxAkOHDsXLly/xww8/oGXLljh//jycnZ11/sy6xJZV//79sXLlSnTp0gWffvopjh8/jlmzZuHy5cvYtGmTbN4bN26gS5cu6NevH4KDg/HLL78gJCQEderUQZUqVXJchhACH3zwAfbv349+/fqhZs2a2LlzJz7//HPcu3cP33//PRwdHbF69WrMmDED8fHx0s5aqVKlHOtdsWIFLC0tMWbMGFhaWmLfvn2YNGkS4uLiMHfuXABA9+7dsWjRIoSGhqJr167SexMTE/HXX38hJCQEhoaGAIDVq1cjODgY/v7++Oabb5CYmIjFixejSZMmOHPmjOwHN6fveOPGjUhMTMTgwYPh4OCAEydOYOHChbh7967sX1NoaCi6d++OatWqYdasWXj27Bn69euHUqVKZfucgwYNwooVK9C3b1+MGDECkZGR+PHHH3HmzBkcOXIExsbGOa4jAEhLS0NAQACaNm2KOXPmYM2aNRg2bBgsLCwwYcIEBAUF4cMPP8SSJUvQp08fNGzYEKVLlwYAPHz4EI0aNUJiYiJGjBgBBwcHrFy5Eh988AF+//13dOrUSbas2bNnw8DAAJ999hliY2MxZ84cBAUF4fjx4wCACRMmIDY2Fnfv3sX3338PALC0tMxXHVlpNBp89NFHmDNnDp4+fQp7e3vptb/++gtxcXH46KOPcl1HISEh2LBhA3r37o0GDRogLCws1+NF165dUb58ecycOVNKxnXZl/I6vgDA7t270bNnT7Rq1QrffPMNAODy5cs4cuSI1h84IGM/Wb16NUaPHg13d3d8+umnADKOCdoGg+7ZswcBAQEoU6YMpkyZgqSkJCxcuBCNGzdGeHg4vL29UbVqVdja2uLgwYP44IMPAGQk6gYGBjh79izi4uJgbW2N9PR0HD16FAMHDtT5M+bk6NGjqFq1arZt+vz581KdU6ZMQWpqKiZPnpyv42NWJ0+exNGjR9GjRw+4u7sjKioKixcvRvPmzXHp0qVsx+3hw4fDzs4OkydPRlRUFObPn49hw4Zh/fr1AID58+dj+PDhsLS0xIQJEwAgx/hy2g8sLS3RqVMnrF+/HvPmzZOOTUDG2CEhhPSnKVOdOnWwefPmAq8HGb10UKmYLmNkOnbsKExMTMTNmzelsvv37wsrKyvRtGlTqWz48OFCo9GIM2fOSGVPnjwR9vb2AoCIjIzMNZaIiAgBQAwZMkRW3qtXr2zjRzLjzqxz06ZNeX6O3MahBAcHCwBi3LhxWl97vU80s+/a3Nxc3L17Vyo/fvy4ACBGjx4tlTVr1kxr32vWOnOLbfLkyeL1TTVzPfXv318232effSYAiH379kllXl5eAoA4ePCgVBYTEyNMTU3Fp59+mm1Zr9u8ebMAIL7++mtZeZcuXYRGoxE3btyQfc4qVarkWl+mxMTEbGWDBg0SJUqUkMYipaeni1KlSonOnTvL5tuwYYPs87
x48ULY2tqKAQMGyOaLjo4WNjY2svLcvmNtMc2aNUtoNBpx69YtqaxatWrC3d1dvHjxQio7cOCAACD7Pg8dOiQAiDV
"text/plain": [
"<Figure size 640x480 with 1 Axes>"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"# Same histogram on a log1p scale, which spreads out the heavy right tail.\n",
"fig, ax = plt.subplots()\n",
"ax.hist(np.log1p(dfc[\"gross_flow_qty_mean\"]), bins=100)\n",
"for cut in (q33, q66):\n",
"    ax.axvline(np.log1p(cut), linestyle=\"--\")\n",
"ax.set_xlabel(\"log(1 + avg monthly gross flow) (quantity)\")\n",
"ax.set_ylabel(\"Count\")\n",
"ax.set_title(\"Log-distribution of average monthly gross flows (quantity)\")\n",
"plt.show()"
]
},
{
"cell_type": "code",
"execution_count": 62,
"id": "af32c07d-3908-428a-b388-a6e1a8d528bd",
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"/tmp/ipykernel_1011/630006569.py:3: MatplotlibDeprecationWarning: The 'labels' parameter of boxplot() has been renamed 'tick_labels' since Matplotlib 3.9; support for the old name will be dropped in 3.11.\n",
" plt.boxplot(\n"
]
},
{
"data": {
"image/png": "iVBORw0KGgoAAAANSUhEUgAAAmsAAAGzCAYAAABwyVA7AAAAOnRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjEwLjgsIGh0dHBzOi8vbWF0cGxvdGxpYi5vcmcvwVt1zgAAAAlwSFlzAAAPYQAAD2EBqD+naQAAakZJREFUeJzt3XdcU1f/B/BPQEFAhoIMJ4gDUOpAHCAq1Rapi+Leo46nolZxVG2rtrXSuhdqnfQRV6VIrRatdVRU2iqOSgU31SqCi6UokpzfH/7IYwQkgYQk5PN+vXhpzj2593uTm5tvzr3nHIkQQoCIiIiIdJKRtgMgIiIiouIxWSMiIiLSYUzWiIiIiHQYkzUiIiIiHcZkjYiIiEiHMVkjIiIi0mFM1oiIiIh0GJM1IiIiIh3GZI2IiIhIh2klWXN2dsaIESPkj48dOwaJRIJjx45pI5xSi4iIgEQiQUpKirZDwYgRI+Ds7KxQJpFIMG/ePIWy06dPw8fHBxYWFpBIJDh//jwA4MCBA2jevDmqVKkCiUSCjIyMcolb3V4/topTlvdu3rx5kEgkqgenRikpKZBIJIiIiFC67uLFi0usq4l9S0tLQ58+fWBrawuJRILly5fr1Ge+qM+JvlL2+Cfd995772HMmDHaDkNtVDlnVUQvXrxAnTp1sGbNmlI9X63J2vXr1zFu3DjUr18fVapUgZWVFXx9fbFixQrk5uaqc1OlsmDBAsTExGg7DK158eIF+vbti0ePHmHZsmXYunUr6tWrh4cPH6Jfv34wMzNDeHg4tm7dCgsLC22HSyr6+eefdTLpmDJlCg4ePIhZs2Zh69at6Nq1q7ZDUhtdP6dcunQJ8+bN04kflBXR3bt3MW/ePPmPXnU5efIkfvnlF3z88cdqXW952L59O5YvX65UXU2fs6RSKWrWrAmJRILY2Ngi64wYMQJVq1Ytdh1Vq1YtsnFJIpEgMjKyyOf4+vpCIpGgadOm8rLKlSsjNDQUX331FZ49e6b6zgg12bdvnzAzMxM2NjZi0qRJYv369WL16tViwIABonLlymLMmDHyuvXq1RPDhw+XP5ZKpSI3N1dIpVJ1hVMkCwsLhe2WVX5+vsjNzRUymUxt6yyt4cOHi3r16imU5ebmihcvXsgfJyUlCQBiw4YNCvViY2MFAHHo0KHyCFWjnj17JvLy8kqst2XLFgFA3Lx5U+VtzJ07V6jxo1MqMplM5Obmivz8fHlZSEhIkXHdvHlTABCLFi0qcb0vXrwQubm5ao3VwcFBDB48WKHs6NGjAoA4evSoWrdVGq9/TlSh7nNKWb1+/O/evVtnXueK6PTp0wKA2LJli1rX26tXL/Huu++qdZ3lpVu3boW+i4RQ7ZylLr/88osAIJydnQudgwoMHz5cWFhYFLuO1z/jBeeuKlWqiMDAwEL1C863VapUEU2aNFFY9vjxY2FiYiI2bdqk8r5UUj29K+zmzZsYMGAA6tWrhyNHjsDJyUm+LCQkBNeuXcP+/fuLfb6RkRGqVKmijlDKlbGxMYyNjbUdRrFef03T09MBADY2NkqVl8WTJ0+00jpnampa7tvUBolEopHPTKVKlVCpklpOC3Lp6elqPbbUTR/PPcUxlOO/IktPT8f+/fuxbt06bYeiVpo6Z71JZGQkWrZsieHDh2P27Nlq/V567733sHfvXjx48AB2dnby8u3bt8PBwQENGzbE48ePFZ5jY2ODd999FxERERg1apRqG1Q5vSvCf/7zHwFAnDx5Uqn6r7esFfcr+/fffxcBAQHCyspKmJmZiQ4dOogTJ04o1Clo5bh69aoYPny4sLa2FlZWVmLEiBHiyZMn8noACv2V9It45cqVwsPDQ95i6OXlJbZt2yZf/nrrTEEsJW1LKpWKZcuWCQ8PD2Fqairs7e3F2LFjxaNHj5R6/fbs2SOaNGkiTE1NRZMmTUR0dHSRLWsAxNy5c4UQL389vB5Tx44dRceOHd8Yqy
rvwd9//y0GDhwobGxsRPPmzYUQQqSmpooRI0aIWrVqCRMTE+Ho6Ch69uz5xhatH3/8UQAQFy5ckJdFRUUJAOL9999XqOvm5ib69esnf/z6sSWEEImJicLf319UqVJF1KpVS3z55Zdi06ZNRbas/fzzz6J9+/bC3NxcVK1aVbz33nsiMTGxyP0t0KFDB/HWW28VuS+NGjV64y/kKVOmiOrVqyu0zk6YMEEAECtWrJCX3bt3TwAQa9asEUL879dbwS/6ot7fghhfbVn79ttvRf369YWJiYlo1aqV+PPPP9+4b0K8PI5CQkLkx52JiYnw8PAQsbGxxe6XEP/7fBQVU3Gf+e+//160bNlSVKlSRdja2orBgweLf//9V768LMdGcV79nLz6GpT1nPLvv/+KkSNHCnt7e/lr9vov6oLXYdeuXWL+/PmiVq1awtTUVLz99tvi6tWrCnWvXLkigoODhYODgzA1NRW1atUS/fv3FxkZGfI6rx7/xb3+R48eFcOGDRO2trZFtkK/8847olGjRm98zZSJRQghtm7dKn8/q1WrJvr37y9u3bpVaH2rV68WLi4uokqVKsLb21scP35cfn4q6rWaN2+eqFmzpqhataro3bu3yMjIEM+ePRMfffSRqFGjhrCwsBAjRowQz549K7QtZWLq2LGjaNKkifj7779Fp06dhJmZmahZs6b45ptvCsXz+l/BZ1LZ1+h1mzdvFgBESkpKoWXKnsteP6YLvH5+fPjwoZg6dapo2rSpsLCwEJaWlqJr167i/PnzCs9T9jgt6vuk4HtJ2XOWTCYT9erVEz179iwUf25urrCyshJjx45942sohBBPnz4VlpaWYuHChSI1NVUYGRkpfH8XKG3L2nfffScsLCzk5+QCTZo0ERMnTpQfQ69bsWKFkEgk4uHDhyXuw6vU8hP6p59+Qv369eHj46OO1QEAjhw5gsDAQHh5eWHu3LkwMjLCli1b8PbbbyMuLg6tW7dWqN+vXz+4uLggLCwMZ8+excaNG2Fvb49vvvkGALB161aMHj0arVu3xtixYwEArq6uxW5/w4YNmDRpEvr06YOPPvoIz549w19//YU//vgDgwYNKvI5wcHBaNCggUJZQkICli9fDnt7e3nZuHHjEBERgZEjR2LSpEm4efMmVq9ejXPnzuHkyZOoXLlysXH98ssv6N27Nzw8PBAWFoaHDx9i5MiRqF279htfz3HjxqFWrVpYsGABJk2aBG9vbzg4OAAAGjdujPXr1+OLL76Ai4uL/HVR9T3o27cvGjZsiAULFkAIAQDo3bs3/v77b0ycOBHOzs5IT0/HoUOHcOvWrUIdIgq0b98eEokEx48fx1tvvQUAiIuLg5GREU6cOCGvd//+fSQnJ2PChAnF7ve9e/fg7++P/Px8zJw5ExYWFli/fj3MzMwK1d26dSuGDx+OgIAAfPPNN3j69CnWrl2L9u3b49y5c8XGO3ToUIwZMwaJiYkK9yicPn0aV65cwaefflpsfH5+fli2bBn+/vtv+XML9jUuLg6TJk2SlwFAhw4dilzPuHHjcPfuXRw6dAhbt24tss727duRnZ2NcePGQSKRYOHChQgODsaNGzfeeMwBwIkTJxAdHY3x48fD0tISK1euRO/evXHr1i3Y2toW+ZwOHTpg69atGDp0KN555x0MGzbsjdso+Ex4e3sjLCwMaWlpWLFiBU6ePIlz587BxsZGrcdGScpyTklLS0Pbtm0hkUgwYcIE1KhRA7Gxsfjggw+QlZWFyZMnK2zr66+/hpGREaZNm4bMzEwsXLgQgwcPxh9//AEAyMvLQ0BAAJ4/f46JEyfC0dERd+7cwb59+5CRkQFra+tC8Xfo0AGTJk3CypUrMXv2bLi7uwMA3N3dMXToUPz3v//FwYMH0b17d/lz7t27hyNHjmDu3LnFvi7KxvLVV1/hs88+Q79+/TB69Gjcv38fq1atQocOHeTvJwCsXbsWEyZMgJ+fH6ZMmYKUlBQEBQ
WhWrVqRZ7XwsLCYGZmhpkzZ+LatWtYtWoVKleuDCMjIzx+/Bjz5s3D77//joiICLi4uGDOnDny5yobEwA8fvwYXbt
"text/plain": [
"<Figure size 640x480 with 1 Axes>"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"# Limitation of the one-variable baseline: client size (mean AUM) still varies widely\n",
"# inside each flow segment, as the box plot below shows.\n",
"segment_order = [\"Low-flow\", \"Intermediate-flow\", \"High-flow\"]\n",
"aum_by_segment = [\n",
"    dfc.loc[dfc[\"seg_quantiles\"] == s, \"aum_qty_mean\"].dropna()\n",
"    for s in segment_order\n",
"]\n",
"\n",
"plt.figure()\n",
"plt.boxplot(aum_by_segment)\n",
"# Label the boxes through xticks rather than boxplot(labels=...): that keyword was renamed\n",
"# to tick_labels in Matplotlib 3.9 and the old name raises a deprecation warning.\n",
"# Boxes are drawn at positions 1..n by default, so the ticks line up with them.\n",
"plt.xticks(range(1, len(segment_order) + 1), [\"Low\", \"Mid\", \"High\"])\n",
"plt.yscale(\"log\")\n",
"plt.ylabel(\"Mean AUM (quantity) [log scale]\")\n",
"plt.title(\"Client size differs widely within flow-intensity segments (quantity AUM)\")\n",
"plt.show()"
]
},
{
"cell_type": "code",
"execution_count": 63,
"id": "0596d9fe-524a-493a-948a-69f37075d1ca",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"count 421.000000\n",
"mean 80.643705\n",
"std 37.155098\n",
"min 7.000000\n",
"25% 52.000000\n",
"50% 71.000000\n",
"75% 130.000000\n",
"max 130.000000\n",
"Name: n_months, dtype: float64"
]
},
"execution_count": 63,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Baseline 2\n",
"\n",
"\n",
"\n",
"\n",
"dfc[\"n_months\"].describe()"
]
},
{
"cell_type": "code",
"execution_count": 116,
"id": "56ac17f8-a25f-4726-a7ac-103a35ac6d6a",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"seg_2D\n",
"Highly active (high int, high freq) 109\n",
"Small rebalancers (low int, high freq) 106\n",
"Dormant (low int, low freq) 104\n",
"Occasional large movers (high int, low freq) 102\n",
"Name: count, dtype: int64\n",
"thr_int: 4.174084305917157 thr_freq: 0.9859154929577465\n"
]
}
],
"source": [
"# Baseline 2: 2D segmentation on relative intensity x activity frequency\n",
"dfc[\"rel_intensity_total\"] = dfc[\"gross_flow_to_aum\"]  # turnover proxy\n",
"dfc[\"frequency\"] = dfc[\"flow_freq\"]  # share of active months\n",
"\n",
"# Thresholds: medians (simple + explainable)\n",
"thr_int = dfc[\"rel_intensity_total\"].median()\n",
"thr_freq = dfc[\"frequency\"].median()\n",
"thr_tx = dfc[\"n_tx_total\"].median()  # not read in this cell\n",
"\n",
"# Segment label for each (low intensity?, low frequency?) combination\n",
"QUADRANT_LABELS = {\n",
"    (True, True): \"Dormant (low int, low freq)\",\n",
"    (True, False): \"Small rebalancers (low int, high freq)\",\n",
"    (False, True): \"Occasional large movers (high int, low freq)\",\n",
"    (False, False): \"Highly active (high int, high freq)\",\n",
"}\n",
"\n",
"\n",
"def quadrant(row):\n",
"    \"\"\"Return the 2D segment label for one client row.\n",
"\n",
"    A value counts as 'low' when it is strictly below the median threshold\n",
"    (thr_int for rel_intensity_total, thr_freq for frequency); values equal\n",
"    to the threshold fall on the 'high' side.\n",
"    \"\"\"\n",
"    is_low_intensity = bool(row[\"rel_intensity_total\"] < thr_int)\n",
"    is_low_frequency = bool(row[\"frequency\"] < thr_freq)\n",
"    return QUADRANT_LABELS[(is_low_intensity, is_low_frequency)]\n",
"\n",
"\n",
"dfc[\"seg_2D\"] = dfc.apply(quadrant, axis=1)\n",
"\n",
"print(dfc[\"seg_2D\"].value_counts())\n",
"print(\"thr_int:\", thr_int, \"thr_freq:\", thr_freq)\n"
]
},
{
"cell_type": "code",
"execution_count": 196,
"id": "8635328a-fb66-45cb-b0fa-d1dc6c3cc1e1",
"metadata": {},
"outputs": [
{
"data": {
"image/png": "iVBORw0KGgoAAAANSUhEUgAAApgAAAHHCAYAAAAbASh2AAAAOnRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjEwLjgsIGh0dHBzOi8vbWF0cGxvdGxpYi5vcmcvwVt1zgAAAAlwSFlzAAAPYQAAD2EBqD+naQAA3ENJREFUeJzs3XdYFMf/B/D3Hb33KiCCFZUmaDQqqCg27DVGscavYuyJGhNrjC22KBGNNcYoNtQYO7FrrIi9o6CCiggoKOVufn/wuwvH3cEd3F6Bz+t5ePT25nZm9/Z2PzszO8NjjDEQQgghhBCiInxNF4AQQgghhFQuFGASQgghhBCVogCTEEIIIYSoFAWYhBBCCCFEpSjAJIQQQgghKkUBJiGEEEIIUSkKMAkhhBBCiEpRgEkIIYQQQlSKAkxCCCGEEKJSWh9ghoaGokGDBipdJ4/Hw5gxY1S6TkVs2rQJPB4PT58+VXveADB48GB4enpqJG8i6enTp+DxeNi0aZOmi6IyXG2Tp6cnBg8erNJ1anO+XAgNDUVoaKimi6ESW7ZsQd26dWFgYABra2tNF6dSSklJgbGxMc6dO6fpohANO3z4MMzNzfHmzRulP6tUgHn58mWMGTMG9evXh5mZGTw8PNCnTx88ePBAKm1oaCh4PB54PB74fD4sLS1Rp04dDBw4EMeOHVO6oISo2507dzBr1qwK3RD8+eefWL58ucrKVFmdP38es2bNQmZmpqaLohK//vqrVt88vHz5ErNmzcL169c1XRSl3Lt3D4MHD4a3tzd+++03rF27VtNFqpTmzJmDJk2a4PPPP5ebpm3btqVW1qxfvx716tWDsbExatWqhZUrV8pM9+LFC/Tp0wfW1tawtLRE165d8eTJE5VsR1kyMzNhbGwMHo+Hu3fvqiVPTTh48CBmzZpVrs+2b98eNWvWxPz585X/MFNCz549mbOzM/v666/Zb7/9xubOncucnJyYmZkZu3nzpkTakJAQ5ubmxrZs2cK2bNnCYmJi2OTJk5mXlxcDwPr06cPy8/PLzDMkJITVr19fmWKWCQCLiopS6ToVUVhYyD5+/MiEQqHa82aMscjISFa9enWN5K2Ldu7cyQCwEydOlHsdnTp1krnPhUIh+/jxIyssLCx/AbVMUlISA8A2btyo9GcXL17MALCkpCSp9z59+qTQuULVKpJv/fr1WUhIiGoLVAF5eXksLy9P/Pry5cvl/q40afXq1QwAe/jwoaaLUmm9fv2aGRgYsD///FNumt27dzMzMzO519KYmBgGgPXs2ZOtXbuWDRw4kAFgCxYskEj3/v17VqtWLebo6MgWLlzIli5dytzd3ZmbmxtLT09X+baVtHbtWmZsbMycnZ3Z9OnTOc9PU6KiopiS4Z6EX3/9lZmamrLs7GylPqdUDebEiRPx7Nkz/PLLLxg+fDi+//57nDlzBoWFhViwYIFUeisrK3z55Zf48ssvMXLkSCxevBgPHjzA6NGjsWPHDnz//ffKR8Q6TE9PT3y3pAq5ubkqWQ9RPx6PB2NjY+jp6Wm6KHIJhUJ8+vRJ08WAkZERDAwMqky+XDA0NIShoaGmi1Fhr1+/BoAym8YZY/j48aMaSlT5/PHHH9DX10dERITM9z99+oRJkyZhypQpMt//+PEjpk+fjk6dOmHXrl0YMWIEfv/9dwwYMABz587Fu3fvxGl//fVXPHz4EAcOHMC3336LCRMm4OjRo0hNTcWSJUtKLeesWbMq3OXrjz/+QMeOHdG/f3/8+eefFVpXZdazZ0/k5eVh586dyn2w3CFtMYGBgSwwMFBiWWk1j4WFhczHx4eZmpqyzMzMUtctWs+VK1dY06ZNmbGxMfP09GSrV6+WSvvp0yc2Y8YM5u3tzQwNDZmbmxv75ptv2KdPnyTS4f/vuuLi4lj9+vWZoaEh8/HxYYcOHZJI9/TpUzZq1ChWu3ZtZmxszGxtbVmvXr0kallENQGbNm2SKs
/hw4cZAPbXX38xxhjbuHGjzFqa6Oho5uPjwwwNDZmLiwsbPXo0e/fundz90KJFC2ZiYsLGjRvHGGNs7969rGPHjszFxYUZGhoyLy8vNmfOHKnaMUVrMC9fvszatWvH7OzsxPt7yJAhEmkEAgFbtmwZ8/HxYUZGRszR0ZF99dVXLCMjQyrdzJkzmYuLCzMxMWGhoaHs9u3brHr16iwyMlKcTrRvzpw5w77++mtmb2/PrKys2FdffcXy8vLYu3fv2MCBA5m1tTWztrZm33zzjVRNsKJlql69OuvUqRM7c+YMCw4OZkZGRqxGjRps8+bNUuUp+SeqzVRkn4eEhEh9XrT/5dX2xcfHs+bNmzNTU1NmZWXFunTpwu7cuSORZubMmeJanMjISGZlZcUsLS3Z4MGDWU5OjkTaN2/esLt370otl0X0u/jjjz+Yj48P09fXZ3FxcYwxxp4/f86GDBnCHB0dxb+X9evXS3xe1jYlJiayyMhIVqNGDWZkZMScnJzYkCFDJGonRNtT8k/0Oyl+rCjze1O03PLIO0bPnj3LJkyYwOzt7ZmpqSnr1q0be/36tcTnSm5L8drMd+/esXHjxjE3NzdmaGjIvL292YIFC5hAIJDal4sXL2Zr1qxhXl5ezNDQkAUFBbFLly5JlDM1NZUNHjyYVatWjRkaGjJnZ2fWpUsXifNMSEiIuAwnTpyQub83btzIZsyYwfT19SW2R2TEiBHMysqKffz4Ueb+EtVCP336VOq9qVOnMgMDA/Fv8cGDB6xHjx7MycmJGRkZsWrVqrG+ffuWej2QtV9nzpwpfq9Tp07s8OHDrFGjRszIyIgtW7ZM4f0tShcZGcksLS2ZlZUVGzRoEEtISJA6povvy+JknV9VeU4qXs7x48ez6tWrM0NDQ1atWjU2cOBA9ubNG/b+/XtmamrKxo4dK/W5lJQUxufz2U8//SR3HzPGWMuWLVloaKjc92fPns08PDxYbm6uzBrMv//+mwFgf//9t8Ty8+fPMwBsy5Yt4mXBwcEsODhYKo927doxb2/vUss5c+bMCrXIPXv2jPF4PLZjxw528eJFBoCdO3dOZtotW7aw4OBgZmJiwqytrVmLFi3YkSNHJNIcPHiQtWzZkpmbmzMLCwsWFBTEtm7dKpFmx44dLDAwkBkbGzM7Ozs2YMAA9vz5c4k0ih5fip4jIiMjZf7eRbZt28YCAwPF5W7QoAFbvny5VP4BAQGsS5cucvenLBUOMIVCIatWrRpr166dxPKymrbnzp3LALADBw6Uuv6QkBDm6urKHB0d2ZgxY9gvv/zCmjdvzgBIXCgEAgFr164dMzU1ZePHj2dr1qxhY8aMYfr6+qxr164S6wTA/Pz8mIuLC5s7dy5bvnw58/LyYqamphIXvp07dzI/Pz82Y8YMtnbtWvbdd98xGxsbVr16dYkLtpeXF+vYsaNU2YcMGcJsbGzEzWyyAkzRxTUsLIytXLmSjRkzhunp6bHg4GCJ5rmQkBDm7OzMHBwc2Ndff83WrFnD9u7dyxhjrFu3bqxPnz5s8eLFbPXq1ax3794MAJs8ebJEeRQJMF+9esVsbGxY7dq12eLFi9lvv/3Gpk+fzurVqyeRbvjw4UxfX5+NGDGCxcTEsClTpjAzMzOpcn/77bcMAIuIiGCrVq1iI0aMYG5ubsze3l7mxdvf35+1b9+eRUdHi5tVvv32W9a8eXP2xRdfsF9//ZV17tyZAZA6+SpapurVq7M6deowJycn9t1337FVq1axwMBAxuPx2K1btxhjjD1+/JiNHTuWAWDfffeduKtHWlqawvv86NGjzN/fn9nb24s/LwrYZAVjx44dY/r6+qx27dps0aJFbPbs2cze3p7Z2NjIPGYCAgJYjx492K+//sqGDx8u3lfFidIq0swPgNWrV485ODiw2bNns+joaJaQkMDS0tKYm5sbc3d3Z3PmzGGrV69mXbp0YQDEF3F52/Tzzz+zFi1asDlz5rC1a9eycePGMR
MTE9a4cWPxDUJiYiLr37+/eH2iffXhwwfx91X8WFH096ZoueWRF2AGBASw1q1bs5UrV7JJkyYxPT091qdPH3G6uLg
"text/plain": [
"<Figure size 640x480 with 1 Axes>"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"# Scatter of the 2D segmentation: one series per quadrant, dashed lines at the two\n",
"# median thresholds used to build the quadrants.\n",
"fig, ax = plt.subplots()\n",
"for segment_name, members in dfc.groupby(\"seg_2D\"):\n",
"    ax.scatter(members[\"frequency\"], members[\"rel_intensity_total\"], s=10, label=segment_name)\n",
"\n",
"ax.set_yscale(\"log\")\n",
"ax.axvline(thr_freq, linestyle=\"--\")\n",
"ax.axhline(thr_int, linestyle=\"--\")\n",
"ax.set_xlabel(\"Activity frequency (share of active months)\")\n",
"ax.set_ylabel(\"Gross flow / mean AUM (quantity) [log scale]\")\n",
"ax.set_title(\"2D behavioral segmentation: relative intensity vs frequency (400+ Accounts)\")\n",
"ax.legend(markerscale=2)\n",
"plt.show()\n"
]
},
{
"cell_type": "code",
"execution_count": 66,
"id": "9bf72f4b-95ac-4233-929b-47f6f101db49",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>n_clients</th>\n",
" <th>aum_qty_med</th>\n",
" <th>gross_flow_qty_med</th>\n",
" <th>freq_med</th>\n",
" <th>rel_int_med</th>\n",
" <th>n_tx_med</th>\n",
" </tr>\n",
" <tr>\n",
" <th>seg_2D</th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>Highly active (high int, high freq)</th>\n",
" <td>109</td>\n",
" <td>106244.381208</td>\n",
" <td>7877.054714</td>\n",
" <td>1.000000</td>\n",
" <td>7.201297</td>\n",
" <td>3861.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>Small rebalancers (low int, high freq)</th>\n",
" <td>106</td>\n",
" <td>108438.852153</td>\n",
" <td>4100.832454</td>\n",
" <td>1.000000</td>\n",
" <td>2.468000</td>\n",
" <td>2067.5</td>\n",
" </tr>\n",
" <tr>\n",
" <th>Dormant (low int, low freq)</th>\n",
" <td>104</td>\n",
" <td>55310.790504</td>\n",
" <td>1687.835370</td>\n",
" <td>0.632500</td>\n",
" <td>1.641374</td>\n",
" <td>110.5</td>\n",
" </tr>\n",
" <tr>\n",
" <th>Occasional large movers (high int, low freq)</th>\n",
" <td>102</td>\n",
" <td>37406.845662</td>\n",
" <td>2949.680688</td>\n",
" <td>0.830986</td>\n",
" <td>8.951903</td>\n",
" <td>536.0</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" n_clients aum_qty_med \\\n",
"seg_2D \n",
"Highly active (high int, high freq) 109 106244.381208 \n",
"Small rebalancers (low int, high freq) 106 108438.852153 \n",
"Dormant (low int, low freq) 104 55310.790504 \n",
"Occasional large movers (high int, low freq) 102 37406.845662 \n",
"\n",
" gross_flow_qty_med freq_med \\\n",
"seg_2D \n",
"Highly active (high int, high freq) 7877.054714 1.000000 \n",
"Small rebalancers (low int, high freq) 4100.832454 1.000000 \n",
"Dormant (low int, low freq) 1687.835370 0.632500 \n",
"Occasional large movers (high int, low freq) 2949.680688 0.830986 \n",
"\n",
" rel_int_med n_tx_med \n",
"seg_2D \n",
"Highly active (high int, high freq) 7.201297 3861.0 \n",
"Small rebalancers (low int, high freq) 2.468000 2067.5 \n",
"Dormant (low int, low freq) 1.641374 110.5 \n",
"Occasional large movers (high int, low freq) 8.951903 536.0 "
]
},
"execution_count": 66,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Per-segment profile: client count plus the median of each descriptive feature.\n",
"# Keys are the output column names; values are (source column, aggregation).\n",
"profile_spec = {\n",
"    \"n_clients\": (ID_COL, \"count\"),\n",
"    \"aum_qty_med\": (\"aum_qty_mean\", \"median\"),\n",
"    \"gross_flow_qty_med\": (\"gross_flow_qty_mean\", \"median\"),\n",
"    \"freq_med\": (\"frequency\", \"median\"),\n",
"    \"rel_int_med\": (\"rel_intensity_total\", \"median\"),\n",
"    \"n_tx_med\": (\"n_tx_total\", \"median\"),\n",
"}\n",
"\n",
"segment_groups = dfc.groupby(\"seg_2D\")\n",
"profile_2d = segment_groups.agg(**profile_spec).sort_values(\"n_clients\", ascending=False)\n",
"profile_2d\n"
]
},
{
"cell_type": "code",
"execution_count": 67,
"id": "0434097b-ff04-4fc7-8430-8e3e4c8ab120",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Clustering matrix shape: (421, 6)\n"
]
}
],
"source": [
"# Kmeans\n",
"\n",
"\n",
"\n",
"\n",
"\n",
"\n",
"import numpy as np\n",
"import pandas as pd\n",
"import matplotlib.pyplot as plt\n",
"\n",
"from sklearn.preprocessing import StandardScaler\n",
"from sklearn.cluster import KMeans\n",
"from sklearn.metrics import silhouette_score\n",
"from sklearn.mixture import GaussianMixture\n",
"\n",
"# Safety: ensure baseline-2 columns exist\n",
"dfc = dfc.copy()\n",
"dfc[\"frequency\"] = dfc[\"flow_freq\"]\n",
"dfc[\"rel_intensity_total\"] = dfc[\"gross_flow_to_aum\"]\n",
"\n",
"# Choose a compact, interpretable feature set (quantity-based)\n",
"features = [\n",
" \"log_aum_qty_mean\", # size (log)\n",
" \"log_gross_flow_qty_mean\", # activity intensity (log)\n",
" \"frequency\", # activity frequency\n",
" \"rel_intensity_total\", # turnover proxy\n",
" \"net_flow_qty_vol\", # volatility of net flows\n",
" \"n_tx_total\", # total number of transactions\n",
"]\n",
"\n",
"# Build X (drop NaNs/Infs)\n",
"X = (dfc[features]\n",
" .replace([np.inf, -np.inf], np.nan)\n",
" .dropna()\n",
" .copy())\n",
"\n",
"# Keep IDs aligned\n",
"ids = dfc.loc[X.index, ID_COL].copy()\n",
"\n",
"# Standardize (critical for distance-based clustering)\n",
"scaler = StandardScaler()\n",
"X_scaled = scaler.fit_transform(X)\n",
"\n",
"print(\"Clustering matrix shape:\", X_scaled.shape)\n"
]
},
{
"cell_type": "code",
"execution_count": 70,
"id": "e18be5a6-c8af-47f9-888f-3edf21bd28dd",
"metadata": {},
"outputs": [
{
"data": {
"image/png": "iVBORw0KGgoAAAANSUhEUgAAAkQAAAHHCAYAAABeLEexAAAAOnRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjEwLjgsIGh0dHBzOi8vbWF0cGxvdGxpYi5vcmcvwVt1zgAAAAlwSFlzAAAPYQAAD2EBqD+naQAAcwdJREFUeJzt3XdYU2f7B/BvAiTssAkIIm5R3FVxDyruOlpbtY7W0Traqh3WDlH7qzi6+6pt37eO1rbaWrWO1hYVnDgREVCqiKBAQEGm7JzfH5TUCEiCCQnk+7muXHLOec7J/XCA3D7nGSJBEAQQERERmTCxoQMgIiIiMjQmRERERGTymBARERGRyWNCRERERCaPCRERERGZPCZEREREZPKYEBEREZHJY0JEREREJo8JEREREZk8JkRERkwkEmH58uWq7eXLl0MkEuHu3buGC8qE5OfnY9asWZDL5RCJRFi4cKGhQyIiPWFCRFTPtmzZApFIVOPr9OnThg6R/rFq1Sps2bIFc+fOxffff4+pU6fq9f2aNWuGUaNGVdn//fffw8zMDMOGDUNRUZFeYyAyVeaGDoDIVK1cuRK+vr5V9rds2dIA0VB1jhw5gl69eiE4ONhgMfzwww+YMWMGAgMDsWfPHlhaWhosFqLGjAkRkYEMHz4c3bt3N3QYDUpBQQFsbGzq7f0yMjLg5+ens+uVlZVBqVRCIpFoVH779u2YPn06Bg8ejN9++43JEJEe8ZEZUQN09+5dTJw4Efb29nB2dsZrr71W5VFKWVkZPvjgA7Ro0QJSqRTNmjXDO++8g+LiYlWZxYsXw9nZGYIgqPa98sorEIlE+OKLL1T70tPTIRKJsHHjxlpj27ZtG3r06AFra2s4Ojqif//++Ouvv1THH+4XValZs2aYMWOGarvy0eLRo0cxb948uLm5wcvLCzt37lTtf9jXX38NkUiEmJgY1b6rV6/i6aefhpOTEywtLdG9e3fs3bv3kXUIDw+HSCRCYmIiDhw4oHqcefPmTQAVidLMmTPh7u4OS0tLdOrUCVu3blW7xs2bNyESifDRRx/hs88+U92HuLi4Wr+HAPDzzz/j+eefx8CBA7F3795akyGRSIQFCxbgl19+gZ+fH6ysrBAQEIDLly+rvjctW7aEpaUlBg4cqKrLg86cOYNhw4ZBJpPB2toaAwYMwMmTJ9XKJCUlYd68eWjTpg2srKzg7OyMZ555psr1Ku/fyZMnsXjxYri6usLGxgbjxo3DnTt31MqeP38eQUFBcHFxgZWVFXx9ffHiiy9q9H0i0hW2EBEZSE5OTpXO0SKRCM7OzrWeO3HiRDRr1gwhISE4ffo0vvjiC9y7dw/fffedqsysWbOwdetWPP3003j99ddx5swZhISE4MqVK9i9ezcAoF+/fvj0008RGxuLDh06AACOHz8OsViM48eP49VXX1XtA4D+/fs/Mq4VK1Zg+fLl6N27N1auXAmJRIIzZ87gyJEjGDp0qObfnAfMmzcPrq6uWLZsGQoKCjBy5EjY2tri559/xoABA9TK7tixA+3bt1fVJTY2Fn369EGTJk3w9ttvw8bGBj///DPGjh2LX3/9FePGjav2Pdu1a4fvv/8eixYtgpeXF15//XUAgKurKwoLCzFw4EBcv34dCxYsgK+vL3755RfMmDED2dnZeO2119SutXnzZhQVFWHOnDmQSqVwcnKqtc6//vorpkyZgv79+2Pfvn2wsrLS6Ht1/Phx7N27F/PnzwcAhISEYNSoUXjrrbewYcMGzJs3D/fu3cPatWvx4osv4siRI6pzjxw5guHDh6Nbt24IDg6GWCzG5s2bMXjwYBw/fhw9evQAAJw7dw6nTp3Cc889By8vL9y8eRMbN27EwIEDERcXB2tra7WYXnnlFTg6OiI4OBg3b97EZ599hgULFmDHjh0AKpLLoUOHwtXVFW+//TYcHBxw8+ZN7Nq1S6M6E+mMQET1avPmzQKAal9SqV
StLAAhODhYtR0cHCwAEMaMGaNWbt68eQIA4dKlS4IgCEJUVJQAQJg1a5ZauTfeeEMAIBw5ckQQBEHIyMgQAAgbNmwQBEEQsrOzBbFYLDzzzDOCu7u76rxXX31VcHJyEpRKZY31unbtmiAWi4Vx48YJ5eXlascePO/hOlXy8fERpk+fXuX71LdvX6GsrEyt7KRJkwQ3Nze1/WlpaYJYLBZWrlyp2jdkyBDB399fKCoqUould+/eQqtWrWqsy4MxjRw5Um3fZ599JgAQtm3bptpXUlIiBAQECLa2tkJubq4gCIKQmJgoABDs7e2FjIyMWt+r8v08PT0Fc3NzYeDAgUJBQYFG5wmCoPr5SUxMVO37+uuvBQCCXC5XxSUIgrB06VIBgKqsUqkUWrVqJQQFBandq/v37wu+vr7Ck08+qbbvYREREQIA4bvvvlPtq7x/gYGBatdctGiRYGZmJmRnZwuCIAi7d+8WAAjnzp3TuK5E+sBHZkQGsn79eoSGhqq9/vjjD43OrWwBqPTKK68AAH7//Xe1fxcvXqxWrrKl48CBAwAqWjzatm2LY8eOAQBOnjwJMzMzvPnmm0hPT8e1a9cAVLQ89O3bFyKRqMaY9uzZA6VSiWXLlkEsVv/T8qjzajN79myYmZmp7Xv22WeRkZGB8PBw1b6dO3dCqVTi2WefBQBkZWXhyJEjmDhxIvLy8nD37l3cvXsXmZmZCAoKwrVr15CSkqJ1PL///jvkcjkmTZqk2mdhYYFXX30V+fn5VR7lTZgwAa6urhpfPysrC2VlZfDy8tK4ZajSkCFD0KxZM9V2z549VTHY2dlV2X/jxg0AQFRUFK5du4bJkycjMzNT9b0qKCjAkCFDcOzYMSiVSgBQi6m0tBSZmZlo2bIlHBwcEBkZWSWmOXPmqN3/fv36oby8HElJSQAABwcHAMD+/ftRWlqqVX2JdImPzIgMpEePHnXuVN2qVSu17RYtWkAsFqv6cSQlJUEsFlcZsSaXy+Hg4KD6MAIqPqAqE6jjx4+je/fu6N69O5ycnHD8+HG4u7vj0qVLmDx58iNjSkhIgFgs1mknZADVjsSr7OeyY8cODBkyBEDF47LOnTujdevWAIDr169DEAS8//77eP/996u9dkZGBpo0aaJVPElJSWjVqlWVpK9du3aq47XF/yhDhgxB06ZNsXHjRjg5OeHzzz9XHcvJyUFhYaFqWyKRqD2Ca9q0qdq1ZDIZAMDb27va/ffu3QMAVeI7ffr0GuPKycmBo6MjCgsLERISgs2bNyMlJUWt/1lOTk6V8x6OydHRUe29BwwYgAkTJmDFihX49NNPMXDgQIwdOxaTJ0+GVCqtMR4iXWNCRNQI1NQCo0nLTN++ffHf//4XN27cwPHjx9GvXz+IRCL07dsXx48fh6enJ5RKJfr166frsNWUl5dXu7+6VhKpVIqxY8di9+7d2LBhA9LT03Hy5EmsWrVKVaayReONN95AUFBQtdeujykOtG3lAYD//Oc/uHfvHr744gs4OjqqOqG/9tprap23BwwYoNZK9nBLWm37K5OZyu/VunXr0Llz52rL2traAqhojdy8eTMWLlyIgIAAyGQyiEQiPPfcc6rraPPeIpEIO3fuxOnTp7Fv3z78+eefePHFF/Hxxx/j9OnTqvcl0jcmREQN0LVr19RaHq5fvw6lUql6XOLj4wOlUolr166pWi6AitFi2dnZ8PHxUe2rTHRCQ0Nx7tw5vP322wAqOlBv3LgRnp6esLGxQbdu3R4ZU4sWLaBUKhEXF1fjhypQ0UKQnZ2ttq+kpARpaWmaVF3l2WefxdatW3H48GFcuXIFgiCoHpcBQPPmzQFUPM4KDAzU6tqP4uPjg+joaCiVSrVWoqtXr6qOPy6xWIzvvvsOOTk5WLFiBZycnPDqq6/irbfewvPPP68qV9na8rhatGgBALC3t6/1e7Vz505Mnz4dH3/8sWpfUVFRlXuqrV69eqFXr1748MMP8eOPP2LKlC
nYvn07Zs2a9VjXJdIU+xARNUDr169X2/7yyy8BVMxtBAAjRowAAHz22Wdq5T755BMAwMiRI1X7fH190aRJE3z66ac
"text/plain": [
"<Figure size 640x480 with 1 Axes>"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"image/png": "iVBORw0KGgoAAAANSUhEUgAAAkAAAAHHCAYAAABXx+fLAAAAOnRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjEwLjgsIGh0dHBzOi8vbWF0cGxvdGxpYi5vcmcvwVt1zgAAAAlwSFlzAAAPYQAAD2EBqD+naQAAiUlJREFUeJzt3Xd4k+X6B/BvkjZJZ7onpQtKKavKqCBLKRZElgPkoAwVFUVFcMDxh5WjHobj4OCAogzleOSooOAoyigCFouUVUYppVBG994jeX9/pAmErqRktfl+rquX9s2bN/fbFHLzPPfz3CJBEAQQERER2RCxpQMgIiIiMjcmQERERGRzmAARERGRzWECRERERDaHCRARERHZHCZAREREZHOYABEREZHNYQJERERENocJEBEREdkcJkBERhISEoJZs2Zpv09MTIRIJEJiYqL22MiRI9G7d2/zB0dm88477yAsLAwSiQTR0dGWDoeIWsAEiKgNJ0+exIMPPojg4GDI5XIEBgZi9OjR+OijjywdmkmcPn0ab7zxBi5evNjksX//+9/YuHGj2WPqKH799Ve88soruPPOO7Fhwwb885//NOnrzZo1C87Ozk2OnzhxAl5eXggJCWn2fSQiwM7SARBZsz/++AN33XUXunbtijlz5sDPzw+XL1/GoUOH8MEHH+C5557TnpuWlgaxuOP/m+L06dNYunQpRo4ciZCQEJ3H/v3vf8PLy0tnpIuu27NnD8RiMT7//HNIpVKLxJCamopRo0bByckJe/fubfIeEpEaEyCiVrz99ttQKBQ4fPgw3NzcdB7Ly8vT+V4mk5kxMjImQRBQU1MDBweHW7pOXl4eHBwcjJb8GBrXqVOncPfdd8PBwQF79+5FaGioUeIg6ow6/j9XiUwoIyMDvXr1apL8AICPj4/O9zfXALXm9OnTuOuuu+Do6IjAwECsXLmyyTl5eXl4/PHH4evrC7lcjn79+mHTpk065zRXZwQAFy9ehEgkajJddfbsWTz44IPw8PCAXC7HgAEDsH37du3jGzduxEMPPQQAuOuuuyASibTXDwkJwalTp7Bv3z7t8ZEjR2qfW1JSgvnz5yMoKAgymQzdunXDihUroFKp2vx5/PXXX4iLi4OXlxccHBwQGhqKxx57TOcclUqFDz74AH369IFcLoe3tzfGjBmDv/76S3tOQ0MD3nzzTYSHh0MmkyEkJAR///vfUVtbq3OtkJAQ3Hfffdi5cycGDBgABwcHfPLJJ7d0HyKRCBs2bEBlZaX256P5+RsjrracOXMGo0aNgkwmw969exEWFtbq+Zp6tBMnTmDEiBFwdHREt27d8O233wIA9u3bh5iYGDg4OKBHjx7YtWtXk2tcvXoVjz32GHx9fSGTydCrVy+sX79e55y6ujq8/vrr6N+/PxQKBZycnDBs2DDs3btX5zzN7+y7776LTz/9VPuzGjhwIA4fPqxzbk5ODmbPno0uXbpAJpPB398fEydO5HQfGYQjQEStCA4ORlJSElJTU41WvFxcXIwxY8bg/vvvx5QpU/Dtt9/i1VdfRZ8+fTB27FgAQHV1NUaOHInz589j3rx5CA0NxTfffINZs2ahpKQEL7zwgsGve+rUKdx5550IDAzEokWL4OTkhP/973+YNGkSvvvuO0yePBnDhw/H888/jw8//BB///vf0bNnTwBAz549sWrVKjz33HNwdnbGa6+9BgDw9fUFAFRVVWHEiBG4evUqnnrqKXTt2hV//PEHFi9ejOzsbKxatarFuPLy8nDPPffA29sbixYtgpubGy5evIitW7fqnPf4449j48aNGDt2LJ544gk0NDRg//79OHToEAYMGAAAeOKJJ7Bp0yY8+OCDWLhwIf78808sW7YMZ86cwbZt23Sul5aWhmnTpuGpp57CnDlz0KNHj1u6jy+//BKffvopkpOT8dlnnwEAhgwZYpS42pKWloa7774bdnZ22Lt3L8LDw9t8Dq
D+Xbzvvvvw8MMP46GHHsKaNWvw8MMP4z//+Q/mz5+Pp59+Gn/729/wzjvv4MEHH8Tly5fh4uICAMjNzcUdd9wBkUiEefPmwdvbG7/88gsef/xxlJWVYf78+QCAsrIyfPbZZ5g2bRrmzJmD8vJyfP7554iLi0NycnKTQvGvvvoK5eXleOqppyASibBy5Urcf//9uHDhAuzt7QEADzzwAE6dOoXnnnsOISEhyMvLw2+//YasrCxO+ZH+BCJq0a+//ipIJBJBIpEIgwcPFl555RVh586dQl1dXZNzg4ODhZkzZ2q/37t3rwBA2Lt3r/bYiBEjBADCF198oT1WW1sr+Pn5CQ888ID22KpVqwQAwubNm7XH6urqhMGDBwvOzs5CWVlZi68hCIKQmZkpABA2bNigPTZq1CihT58+Qk1NjfaYSqUShgwZInTv3l177Jtvvmn2moIgCL169RJGjBjR5Pibb74pODk5CefOndM5vmjRIkEikQhZWVlNnqOxbds2AYBw+PDhFs/Zs2ePAEB4/vnnmzymUqkEQRCEY8eOCQCEJ554Qufxl156SQAg7NmzR3ssODhYACAkJCQY7T4EQRBmzpwpODk56RwzRlytvZ69vb3g7+8vBAQENIm7NZrfxa+++kp77OzZswIAQSwWC4cOHdIe37lzZ5Pfp8cff1zw9/cXCgoKdK778MMPCwqFQqiqqhIEQRAaGhqE2tpanXOKi4sFX19f4bHHHtMe0/zOenp6CkVFRdrjP/zwgwBA2LFjh/a5AIR33nlH73slag6nwIhaMXr0aCQlJWHChAk4fvw4Vq5cibi4OAQGBupMHRnC2dkZjzzyiPZ7qVSKQYMG4cKFC9pjP//8M/z8/DBt2jTtMXt7ezz//POoqKjAvn37DHrNoqIi7NmzB1OmTEF5eTkKCgpQUFCAwsJCxMXFIT09HVevXm3X/QDAN998g2HDhsHd3V177YKCAsTGxkKpVOL3339v8bma6cUff/wR9fX1zZ7z3XffQSQSIT4+vsljIpEIgPpnBgALFizQeXzhwoUAgJ9++knneGhoKOLi4ox2Hy0xRlytUSqVKCgogIeHB7y8vAyKzdnZGQ8//LD2+x49esDNzQ09e/ZETEyM9rjm/zW/o4Ig4LvvvsP48eMhCILOzyouLg6lpaVISUkBAEgkEm1NlEqlQlFRERoaGjBgwADtOTeaOnUq3N3dtd8PGzZM57U1NVaJiYkoLi426H6JbsQpMKI2DBw4EFu3bkVdXR2OHz+Obdu24V//+hcefPBBHDt2DFFRUQZdr0uXLtoPbQ13d3ecOHFC+/2lS5fQvXv3JqvKNFNSly5dMug1z58/D0EQsGTJEixZsqTZc/Ly8hAYGGjQdTXS09Nx4sQJeHt7t3jtlowYMQIPPPAAli5din/9618YOXIkJk2ahL/97W/awvKMjAwEBATAw8OjxetcunQJYrEY3bp10znu5+cHNze3Jj+z5gqEb+U+TBlXaxwcHPDZZ59h+vTpGDduHH777Tc4OTkBUE+llpaWNnldjeZ+FxUKBYKCgpocA6BNOPLz81FSUoJPP/0Un376abNx3fiz2rRpE9577z2cPXtWJ8lt7l67du2q870mGdK8tkwmw4oVK7Bw4UL4+vrijjvuwH333YcZM2bo3BtRW5gAEelJKpVi4MCBGDhwICIiIjB79mx88803zY5KtEYikTR7XBAEg2O6+cNLQ6lU6nyvKeB96aWXWhxduPkD2hAqlQqjR4/GK6+80uzjERERLT5XJBLh22+/xaFDh7Bjxw7s3LkTjz32GN577z0cOnSo2X1uWtPSz+Rmza2supX7MGVcbXn44YdRXFyMZ555Bvfffz927NgBqVSKLVu2YPbs2Trn3vh71tLvYlu/o5rfp0ceeQQzZ85s9ty+ffsCADZv3oxZs2Zh0qRJePnll+Hj4wOJRIJly5YhIyPD4NcGgPnz52P8+PH4/vvvsXPnTixZsgTLli3Dnj
17cNtttzX7fKKbMQEiagdN0W12drZJrh8cHIwTJ05ApVLpjAKdPXtW+zhw/V/HJSUlOs+/eVRBsyLI3t4esbGxrb5
"text/plain": [
"<Figure size 640x480 with 1 Axes>"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"Best K by silhouette: 5\n"
]
},
{
"data": {
"text/plain": [
"' Ce que cest :\\nInertia = somme des distances intra-cluster (SSE).\\nPlus elle baisse, plus les clusters sont “serrés”.\\n\\nComment lire :\\nQuand K augmente, inertia baisse toujours (normal).\\nOn cherche un “coude” : à partir dun certain K, ajouter des clusters apporte peu\\n'"
]
},
"execution_count": 70,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"k_range = range(2, 21)\n",
"inertias = []\n",
"silhouettes = []\n",
"\n",
"for k in k_range:\n",
" km = KMeans(n_clusters=k, n_init=30, random_state=42)\n",
" labels = km.fit_predict(X_scaled)\n",
" inertias.append(km.inertia_)\n",
" silhouettes.append(silhouette_score(X_scaled, labels))\n",
"\n",
"# Elbow plot\n",
"plt.figure()\n",
"plt.plot(list(k_range), inertias, marker=\"o\")\n",
"plt.xlabel(\"Number of clusters K\")\n",
"plt.ylabel(\"Inertia (within-cluster SSE)\")\n",
"plt.title(\"Elbow curve for K-means\")\n",
"plt.show()\n",
"\n",
"# Silhouette plot\n",
"plt.figure()\n",
"plt.plot(list(k_range), silhouettes, marker=\"o\")\n",
"plt.xlabel(\"Number of clusters K\")\n",
"plt.ylabel(\"Silhouette score\")\n",
"plt.title(\"Silhouette score for K-means\")\n",
"plt.show()\n",
"\n",
"best_k = list(k_range)[int(np.argmax(silhouettes))]\n",
"print(\"Best K by silhouette:\", best_k)\n",
"\n",
"\n",
"''' Ce que cest :\n",
"Inertia = somme des distances intra-cluster (SSE).\n",
"Plus elle baisse, plus les clusters sont “serrés”.\n",
"\n",
"Comment lire :\n",
"Quand K augmente, inertia baisse toujours (normal).\n",
"On cherche un “coude” : à partir dun certain K, ajouter des clusters apporte peu\n",
"'''"
]
},
{
"cell_type": "code",
"execution_count": 71,
"id": "2759f049-d8fe-4fee-9bc9-856a28b392a9",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>n_clients</th>\n",
" <th>aum_qty_med</th>\n",
" <th>freq_med</th>\n",
" <th>rel_int_med</th>\n",
" <th>gross_flow_med</th>\n",
" <th>n_tx_med</th>\n",
" <th>vol_med</th>\n",
" </tr>\n",
" <tr>\n",
" <th>cluster_kmeans</th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>2.0</th>\n",
" <td>235</td>\n",
" <td>3.936071e+04</td>\n",
" <td>0.986111</td>\n",
" <td>4.136974</td>\n",
" <td>2031.883965</td>\n",
" <td>1069.0</td>\n",
" <td>2.735326e+03</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1.0</th>\n",
" <td>105</td>\n",
" <td>4.528840e+05</td>\n",
" <td>1.000000</td>\n",
" <td>4.651358</td>\n",
" <td>28651.252789</td>\n",
" <td>7585.0</td>\n",
" <td>3.004524e+04</td>\n",
" </tr>\n",
" <tr>\n",
" <th>0.0</th>\n",
" <td>66</td>\n",
" <td>6.912599e+04</td>\n",
" <td>0.109903</td>\n",
" <td>1.632692</td>\n",
" <td>2773.037334</td>\n",
" <td>7.5</td>\n",
" <td>1.080610e+04</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4.0</th>\n",
" <td>13</td>\n",
" <td>4.783496e+04</td>\n",
" <td>0.884615</td>\n",
" <td>27.093690</td>\n",
" <td>10629.415385</td>\n",
" <td>1712.0</td>\n",
" <td>1.876254e+04</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3.0</th>\n",
" <td>2</td>\n",
" <td>1.470709e+07</td>\n",
" <td>0.586207</td>\n",
" <td>5.705179</td>\n",
" <td>851698.564766</td>\n",
" <td>2210.5</td>\n",
" <td>3.218539e+06</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" n_clients aum_qty_med freq_med rel_int_med \\\n",
"cluster_kmeans \n",
"2.0 235 3.936071e+04 0.986111 4.136974 \n",
"1.0 105 4.528840e+05 1.000000 4.651358 \n",
"0.0 66 6.912599e+04 0.109903 1.632692 \n",
"4.0 13 4.783496e+04 0.884615 27.093690 \n",
"3.0 2 1.470709e+07 0.586207 5.705179 \n",
"\n",
" gross_flow_med n_tx_med vol_med \n",
"cluster_kmeans \n",
"2.0 2031.883965 1069.0 2.735326e+03 \n",
"1.0 28651.252789 7585.0 3.004524e+04 \n",
"0.0 2773.037334 7.5 1.080610e+04 \n",
"4.0 10629.415385 1712.0 1.876254e+04 \n",
"3.0 851698.564766 2210.5 3.218539e+06 "
]
},
"execution_count": 71,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"km = KMeans(n_clusters=best_k, n_init=50, random_state=42)\n",
"labels_km = km.fit_predict(X_scaled)\n",
"\n",
"dfc.loc[X.index, \"cluster_kmeans\"] = labels_km\n",
"\n",
"# Profiling table (medians = robust to outliers)\n",
"k_profile = (\n",
" dfc.loc[X.index]\n",
" .groupby(\"cluster_kmeans\")\n",
" .agg(\n",
" n_clients=(ID_COL, \"count\"),\n",
" aum_qty_med=(\"aum_qty_mean\", \"median\"),\n",
" freq_med=(\"frequency\", \"median\"),\n",
" rel_int_med=(\"rel_intensity_total\", \"median\"),\n",
" gross_flow_med=(\"gross_flow_qty_mean\", \"median\"),\n",
" n_tx_med=(\"n_tx_total\", \"median\"),\n",
" vol_med=(\"net_flow_qty_vol\", \"median\"),\n",
" )\n",
" .sort_values(\"n_clients\", ascending=False)\n",
")\n",
"\n",
"k_profile\n"
]
},
{
"cell_type": "code",
"execution_count": 74,
"id": "f7883188-9981-431b-9d33-d330b8b9dfc2",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>Registrar Account - ID</th>\n",
" <th>n_months</th>\n",
" <th>n_active_months</th>\n",
" <th>flow_freq</th>\n",
" <th>aum_qty_mean</th>\n",
" <th>aum_qty_median</th>\n",
" <th>net_flow_qty_sum</th>\n",
" <th>gross_flow_qty_sum</th>\n",
" <th>gross_flow_qty_mean</th>\n",
" <th>net_flow_qty_vol</th>\n",
" <th>...</th>\n",
" <th>netflow_to_aum</th>\n",
" <th>n_tx_total</th>\n",
" <th>log_aum_qty_mean</th>\n",
" <th>log_gross_flow_qty_mean</th>\n",
" <th>gross_flow_to_aum</th>\n",
" <th>seg_quantiles</th>\n",
" <th>rel_intensity_total</th>\n",
" <th>frequency</th>\n",
" <th>seg_2D</th>\n",
" <th>cluster_kmeans</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>240</th>\n",
" <td>200130818</td>\n",
" <td>58</td>\n",
" <td>10</td>\n",
" <td>0.172414</td>\n",
" <td>3.819992e+06</td>\n",
" <td>0.000000e+00</td>\n",
" <td>9.586849e+06</td>\n",
" <td>3.429192e+07</td>\n",
" <td>5.912401e+05</td>\n",
" <td>2.088032e+06</td>\n",
" <td>...</td>\n",
" <td>-4.619540e+07</td>\n",
" <td>11</td>\n",
" <td>15.155759</td>\n",
" <td>13.289979</td>\n",
" <td>8.976963</td>\n",
" <td>High-flow</td>\n",
" <td>8.976963</td>\n",
" <td>0.172414</td>\n",
" <td>Occasional large movers (high int, low freq)</td>\n",
" <td>3.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>246</th>\n",
" <td>200130906</td>\n",
" <td>56</td>\n",
" <td>56</td>\n",
" <td>1.000000</td>\n",
" <td>2.559419e+07</td>\n",
" <td>2.814182e+07</td>\n",
" <td>1.482869e+07</td>\n",
" <td>6.228080e+07</td>\n",
" <td>1.112157e+06</td>\n",
" <td>4.349047e+06</td>\n",
" <td>...</td>\n",
" <td>4.092506e-03</td>\n",
" <td>4410</td>\n",
" <td>17.057876</td>\n",
" <td>13.921813</td>\n",
" <td>2.433395</td>\n",
" <td>High-flow</td>\n",
" <td>2.433395</td>\n",
" <td>1.000000</td>\n",
" <td>Small rebalancers (low int, high freq)</td>\n",
" <td>3.0</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"<p>2 rows × 21 columns</p>\n",
"</div>"
],
"text/plain": [
" Registrar Account - ID n_months n_active_months flow_freq \\\n",
"240 200130818 58 10 0.172414 \n",
"246 200130906 56 56 1.000000 \n",
"\n",
" aum_qty_mean aum_qty_median net_flow_qty_sum gross_flow_qty_sum \\\n",
"240 3.819992e+06 0.000000e+00 9.586849e+06 3.429192e+07 \n",
"246 2.559419e+07 2.814182e+07 1.482869e+07 6.228080e+07 \n",
"\n",
" gross_flow_qty_mean net_flow_qty_vol ... netflow_to_aum n_tx_total \\\n",
"240 5.912401e+05 2.088032e+06 ... -4.619540e+07 11 \n",
"246 1.112157e+06 4.349047e+06 ... 4.092506e-03 4410 \n",
"\n",
" log_aum_qty_mean log_gross_flow_qty_mean gross_flow_to_aum \\\n",
"240 15.155759 13.289979 8.976963 \n",
"246 17.057876 13.921813 2.433395 \n",
"\n",
" seg_quantiles rel_intensity_total frequency \\\n",
"240 High-flow 8.976963 0.172414 \n",
"246 High-flow 2.433395 1.000000 \n",
"\n",
" seg_2D cluster_kmeans \n",
"240 Occasional large movers (high int, low freq) 3.0 \n",
"246 Small rebalancers (low int, high freq) 3.0 \n",
"\n",
"[2 rows x 21 columns]"
]
},
"execution_count": 74,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"dfc[dfc['cluster_kmeans']==3.0]"
]
},
{
"cell_type": "code",
"execution_count": 83,
"id": "9e26e6c4-ea3e-4aad-9136-a3bbbad8ad47",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>Registrar Account - ID</th>\n",
" <th>n_months</th>\n",
" <th>n_active_months</th>\n",
" <th>flow_freq</th>\n",
" <th>aum_qty_mean</th>\n",
" <th>aum_qty_median</th>\n",
" <th>net_flow_qty_sum</th>\n",
" <th>gross_flow_qty_sum</th>\n",
" <th>gross_flow_qty_mean</th>\n",
" <th>net_flow_qty_vol</th>\n",
" <th>...</th>\n",
" <th>netflow_to_aum</th>\n",
" <th>n_tx_total</th>\n",
" <th>log_aum_qty_mean</th>\n",
" <th>log_gross_flow_qty_mean</th>\n",
" <th>gross_flow_to_aum</th>\n",
" <th>seg_quantiles</th>\n",
" <th>rel_intensity_total</th>\n",
" <th>frequency</th>\n",
" <th>seg_2D</th>\n",
" <th>cluster_kmeans</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>246</th>\n",
" <td>200130906</td>\n",
" <td>56</td>\n",
" <td>56</td>\n",
" <td>1.000000</td>\n",
" <td>2.559419e+07</td>\n",
" <td>2.814182e+07</td>\n",
" <td>1.482869e+07</td>\n",
" <td>6.228080e+07</td>\n",
" <td>1.112157e+06</td>\n",
" <td>4.349047e+06</td>\n",
" <td>...</td>\n",
" <td>4.092506e-03</td>\n",
" <td>4410</td>\n",
" <td>17.057876</td>\n",
" <td>13.921813</td>\n",
" <td>2.433395</td>\n",
" <td>High-flow</td>\n",
" <td>2.433395</td>\n",
" <td>1.000000</td>\n",
" <td>Small rebalancers (low int, high freq)</td>\n",
" <td>3.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>355</th>\n",
" <td>364765</td>\n",
" <td>130</td>\n",
" <td>130</td>\n",
" <td>1.000000</td>\n",
" <td>2.729485e+06</td>\n",
" <td>2.268174e+06</td>\n",
" <td>5.924221e+06</td>\n",
" <td>3.910049e+07</td>\n",
" <td>3.007730e+05</td>\n",
" <td>4.111484e+05</td>\n",
" <td>...</td>\n",
" <td>9.821006e-03</td>\n",
" <td>17976</td>\n",
" <td>14.819624</td>\n",
" <td>12.614115</td>\n",
" <td>14.325226</td>\n",
" <td>High-flow</td>\n",
" <td>14.325226</td>\n",
" <td>1.000000</td>\n",
" <td>Highly active (high int, high freq)</td>\n",
" <td>1.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>183</th>\n",
" <td>200127603</td>\n",
" <td>71</td>\n",
" <td>71</td>\n",
" <td>1.000000</td>\n",
" <td>1.365998e+07</td>\n",
" <td>1.293037e+07</td>\n",
" <td>1.588720e+07</td>\n",
" <td>3.896140e+07</td>\n",
" <td>5.487521e+05</td>\n",
" <td>5.535868e+05</td>\n",
" <td>...</td>\n",
" <td>2.203899e-02</td>\n",
" <td>2044</td>\n",
" <td>16.429981</td>\n",
" <td>13.215404</td>\n",
" <td>2.852229</td>\n",
" <td>High-flow</td>\n",
" <td>2.852229</td>\n",
" <td>1.000000</td>\n",
" <td>Small rebalancers (low int, high freq)</td>\n",
" <td>1.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>213</th>\n",
" <td>200128363</td>\n",
" <td>67</td>\n",
" <td>55</td>\n",
" <td>0.820896</td>\n",
" <td>1.092064e+07</td>\n",
" <td>9.611394e+06</td>\n",
" <td>7.331428e+06</td>\n",
" <td>3.823942e+07</td>\n",
" <td>5.707376e+05</td>\n",
" <td>7.883754e+05</td>\n",
" <td>...</td>\n",
" <td>4.514853e-02</td>\n",
" <td>957</td>\n",
" <td>16.206165</td>\n",
" <td>13.254687</td>\n",
" <td>3.501573</td>\n",
" <td>High-flow</td>\n",
" <td>3.501573</td>\n",
" <td>0.820896</td>\n",
" <td>Dormant (low int, low freq)</td>\n",
" <td>1.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>240</th>\n",
" <td>200130818</td>\n",
" <td>58</td>\n",
" <td>10</td>\n",
" <td>0.172414</td>\n",
" <td>3.819992e+06</td>\n",
" <td>0.000000e+00</td>\n",
" <td>9.586849e+06</td>\n",
" <td>3.429192e+07</td>\n",
" <td>5.912401e+05</td>\n",
" <td>2.088032e+06</td>\n",
" <td>...</td>\n",
" <td>-4.619540e+07</td>\n",
" <td>11</td>\n",
" <td>15.155759</td>\n",
" <td>13.289979</td>\n",
" <td>8.976963</td>\n",
" <td>High-flow</td>\n",
" <td>8.976963</td>\n",
" <td>0.172414</td>\n",
" <td>Occasional large movers (high int, low freq)</td>\n",
" <td>3.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>...</th>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>210</th>\n",
" <td>200127901</td>\n",
" <td>71</td>\n",
" <td>2</td>\n",
" <td>0.028169</td>\n",
" <td>2.728996e+04</td>\n",
" <td>2.731466e+04</td>\n",
" <td>-3.729000e+01</td>\n",
" <td>3.729000e+01</td>\n",
" <td>5.252113e-01</td>\n",
" <td>3.776849e+00</td>\n",
" <td>...</td>\n",
" <td>-1.905423e-05</td>\n",
" <td>2</td>\n",
" <td>10.214311</td>\n",
" <td>0.422133</td>\n",
" <td>0.001366</td>\n",
" <td>Low-flow</td>\n",
" <td>0.001366</td>\n",
" <td>0.028169</td>\n",
" <td>Dormant (low int, low freq)</td>\n",
" <td>0.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>266</th>\n",
" <td>200131477</td>\n",
" <td>34</td>\n",
" <td>0</td>\n",
" <td>0.000000</td>\n",
" <td>7.623185e+03</td>\n",
" <td>7.623185e+03</td>\n",
" <td>0.000000e+00</td>\n",
" <td>0.000000e+00</td>\n",
" <td>0.000000e+00</td>\n",
" <td>0.000000e+00</td>\n",
" <td>...</td>\n",
" <td>0.000000e+00</td>\n",
" <td>0</td>\n",
" <td>8.939081</td>\n",
" <td>0.000000</td>\n",
" <td>0.000000</td>\n",
" <td>Low-flow</td>\n",
" <td>0.000000</td>\n",
" <td>0.000000</td>\n",
" <td>Dormant (low int, low freq)</td>\n",
" <td>0.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>238</th>\n",
" <td>200130743</td>\n",
" <td>69</td>\n",
" <td>0</td>\n",
" <td>0.000000</td>\n",
" <td>8.899686e+03</td>\n",
" <td>8.410000e+03</td>\n",
" <td>0.000000e+00</td>\n",
" <td>0.000000e+00</td>\n",
" <td>0.000000e+00</td>\n",
" <td>0.000000e+00</td>\n",
" <td>...</td>\n",
" <td>0.000000e+00</td>\n",
" <td>0</td>\n",
" <td>9.093884</td>\n",
" <td>0.000000</td>\n",
" <td>0.000000</td>\n",
" <td>Low-flow</td>\n",
" <td>0.000000</td>\n",
" <td>0.000000</td>\n",
" <td>Dormant (low int, low freq)</td>\n",
" <td>0.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>198</th>\n",
" <td>200127798</td>\n",
" <td>71</td>\n",
" <td>0</td>\n",
" <td>0.000000</td>\n",
" <td>2.667762e+04</td>\n",
" <td>2.790356e+04</td>\n",
" <td>0.000000e+00</td>\n",
" <td>0.000000e+00</td>\n",
" <td>0.000000e+00</td>\n",
" <td>0.000000e+00</td>\n",
" <td>...</td>\n",
" <td>0.000000e+00</td>\n",
" <td>0</td>\n",
" <td>10.191618</td>\n",
" <td>0.000000</td>\n",
" <td>0.000000</td>\n",
" <td>Low-flow</td>\n",
" <td>0.000000</td>\n",
" <td>0.000000</td>\n",
" <td>Dormant (low int, low freq)</td>\n",
" <td>0.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>328</th>\n",
" <td>200139346</td>\n",
" <td>13</td>\n",
" <td>0</td>\n",
" <td>0.000000</td>\n",
" <td>2.908100e+05</td>\n",
" <td>2.908100e+05</td>\n",
" <td>0.000000e+00</td>\n",
" <td>0.000000e+00</td>\n",
" <td>0.000000e+00</td>\n",
" <td>0.000000e+00</td>\n",
" <td>...</td>\n",
" <td>0.000000e+00</td>\n",
" <td>0</td>\n",
" <td>12.580429</td>\n",
" <td>0.000000</td>\n",
" <td>0.000000</td>\n",
" <td>Low-flow</td>\n",
" <td>0.000000</td>\n",
" <td>0.000000</td>\n",
" <td>Dormant (low int, low freq)</td>\n",
" <td>0.0</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"<p>421 rows × 21 columns</p>\n",
"</div>"
],
"text/plain": [
" Registrar Account - ID n_months n_active_months flow_freq \\\n",
"246 200130906 56 56 1.000000 \n",
"355 364765 130 130 1.000000 \n",
"183 200127603 71 71 1.000000 \n",
"213 200128363 67 55 0.820896 \n",
"240 200130818 58 10 0.172414 \n",
".. ... ... ... ... \n",
"210 200127901 71 2 0.028169 \n",
"266 200131477 34 0 0.000000 \n",
"238 200130743 69 0 0.000000 \n",
"198 200127798 71 0 0.000000 \n",
"328 200139346 13 0 0.000000 \n",
"\n",
" aum_qty_mean aum_qty_median net_flow_qty_sum gross_flow_qty_sum \\\n",
"246 2.559419e+07 2.814182e+07 1.482869e+07 6.228080e+07 \n",
"355 2.729485e+06 2.268174e+06 5.924221e+06 3.910049e+07 \n",
"183 1.365998e+07 1.293037e+07 1.588720e+07 3.896140e+07 \n",
"213 1.092064e+07 9.611394e+06 7.331428e+06 3.823942e+07 \n",
"240 3.819992e+06 0.000000e+00 9.586849e+06 3.429192e+07 \n",
".. ... ... ... ... \n",
"210 2.728996e+04 2.731466e+04 -3.729000e+01 3.729000e+01 \n",
"266 7.623185e+03 7.623185e+03 0.000000e+00 0.000000e+00 \n",
"238 8.899686e+03 8.410000e+03 0.000000e+00 0.000000e+00 \n",
"198 2.667762e+04 2.790356e+04 0.000000e+00 0.000000e+00 \n",
"328 2.908100e+05 2.908100e+05 0.000000e+00 0.000000e+00 \n",
"\n",
" gross_flow_qty_mean net_flow_qty_vol ... netflow_to_aum n_tx_total \\\n",
"246 1.112157e+06 4.349047e+06 ... 4.092506e-03 4410 \n",
"355 3.007730e+05 4.111484e+05 ... 9.821006e-03 17976 \n",
"183 5.487521e+05 5.535868e+05 ... 2.203899e-02 2044 \n",
"213 5.707376e+05 7.883754e+05 ... 4.514853e-02 957 \n",
"240 5.912401e+05 2.088032e+06 ... -4.619540e+07 11 \n",
".. ... ... ... ... ... \n",
"210 5.252113e-01 3.776849e+00 ... -1.905423e-05 2 \n",
"266 0.000000e+00 0.000000e+00 ... 0.000000e+00 0 \n",
"238 0.000000e+00 0.000000e+00 ... 0.000000e+00 0 \n",
"198 0.000000e+00 0.000000e+00 ... 0.000000e+00 0 \n",
"328 0.000000e+00 0.000000e+00 ... 0.000000e+00 0 \n",
"\n",
" log_aum_qty_mean log_gross_flow_qty_mean gross_flow_to_aum \\\n",
"246 17.057876 13.921813 2.433395 \n",
"355 14.819624 12.614115 14.325226 \n",
"183 16.429981 13.215404 2.852229 \n",
"213 16.206165 13.254687 3.501573 \n",
"240 15.155759 13.289979 8.976963 \n",
".. ... ... ... \n",
"210 10.214311 0.422133 0.001366 \n",
"266 8.939081 0.000000 0.000000 \n",
"238 9.093884 0.000000 0.000000 \n",
"198 10.191618 0.000000 0.000000 \n",
"328 12.580429 0.000000 0.000000 \n",
"\n",
" seg_quantiles rel_intensity_total frequency \\\n",
"246 High-flow 2.433395 1.000000 \n",
"355 High-flow 14.325226 1.000000 \n",
"183 High-flow 2.852229 1.000000 \n",
"213 High-flow 3.501573 0.820896 \n",
"240 High-flow 8.976963 0.172414 \n",
".. ... ... ... \n",
"210 Low-flow 0.001366 0.028169 \n",
"266 Low-flow 0.000000 0.000000 \n",
"238 Low-flow 0.000000 0.000000 \n",
"198 Low-flow 0.000000 0.000000 \n",
"328 Low-flow 0.000000 0.000000 \n",
"\n",
" seg_2D cluster_kmeans \n",
"246 Small rebalancers (low int, high freq) 3.0 \n",
"355 Highly active (high int, high freq) 1.0 \n",
"183 Small rebalancers (low int, high freq) 1.0 \n",
"213 Dormant (low int, low freq) 1.0 \n",
"240 Occasional large movers (high int, low freq) 3.0 \n",
".. ... ... \n",
"210 Dormant (low int, low freq) 0.0 \n",
"266 Dormant (low int, low freq) 0.0 \n",
"238 Dormant (low int, low freq) 0.0 \n",
"198 Dormant (low int, low freq) 0.0 \n",
"328 Dormant (low int, low freq) 0.0 \n",
"\n",
"[421 rows x 21 columns]"
]
},
"execution_count": 83,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"dfc.sort_values(by=\"gross_flow_qty_sum\", ascending=False)"
]
},
{
"cell_type": "code",
"execution_count": 197,
"id": "2437e11b-04d2-4c49-b265-32a5681ef96a",
"metadata": {},
"outputs": [],
"source": [
"# Définition de la correspondance entre les codes numériques et les nouveaux labels\n",
"mapping = {\n",
" 0.0: \"Cluster 1 (66) : Dormant\",\n",
" 1.0: \"Cluster 2 (105) : Highly Active\",\n",
" 2.0: \"Cluster 3 (235)\",\n",
" 3.0: \"Cluster 4 (2)\",\n",
" 4.0: \"Cluster 5 (13) : Large Movers\"\n",
"}\n",
"\n",
"# Création de la nouvelle colonne 'cluster'\n",
"dfc['cluster'] = dfc['cluster_kmeans'].map(mapping)"
]
},
{
"cell_type": "code",
"execution_count": 198,
"id": "2a677706-ae26-4474-8a69-a0e486421feb",
"metadata": {},
"outputs": [
{
"data": {
"image/png": "iVBORw0KGgoAAAANSUhEUgAAAkQAAAHHCAYAAABeLEexAAAAOnRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjEwLjgsIGh0dHBzOi8vbWF0cGxvdGxpYi5vcmcvwVt1zgAAAAlwSFlzAAAPYQAAD2EBqD+naQAAqqNJREFUeJzs3XlczPkfB/DXdB906NR9iUqRwmKJlU1sznUvybGOyLXIsu5rWdYVYd3LuhZr3bS5+bkqR26RK6EUlY6Zz++Ptu82zUzN1FzV+/l49GA+3+s936aZ93xOHmOMgRBCCCGkGtNQdQCEEEIIIapGCREhhBBCqj1KiAghhBBS7VFCRAghhJBqjxIiQgghhFR7lBARQgghpNqjhIgQQggh1R4lRIQQQgip9ighIoQQQki1RwmRmmndujXq168v13PyeDyMGjVKrueUxubNm8Hj8fD06VOlXxsABg4cCCcnJ5Vcmwh7+vQpeDweNm/erOpQ5EZRz8nJyQkDBw6U6znV+bqK0Lp1a7Ru3VrVYcjFtm3bUK9ePWhra8PExETV4VRplBCV4urVqxg1ahS8vLxgaGgIBwcH9OzZEw8ePBDZt3Xr1uDxeODxeNDQ0ICRkRHq1q2L/v374+TJkyqInhDZJCYmYubMmRVKYHfs2IFly5bJLaaq6uLFi5g5cyY+fPig6lDkYvXq1Wqd7L569QozZ85EfHy8qkORyb179zBw4EC4urpi/fr1WLdunapDqtK0VB2AOvv5559x4cIF9OjRAz4+PkhJScGqVavQqFEjXL58WaQmx87ODgsWLAAAZGVl4dGjR9i3bx9+//139OzZE7///ju0tbVV8VRUon///ujduzd0dXVVHQqRQmJiImbNmoXWrVuXu2Ztx44duH37NsaOHStU7ujoiJycnGr1+i/NxYsXMWvWLAwcOFDkW//9+/ehoaH876oVue7q1athbm6uNjVMJ06cEHr86tUrzJo1C05OTmjYsKFqgiqH06dPQyAQYPny5XBzc1N1OFUeJUSlGD9+PHbs2AEdHR2urFevXvD29sbChQvx+++/C+1vbGyM7777Tqhs4cKFiIiIwOrVq+Hk5ISff/5ZKbGrA01NTWhqasrtfNnZ2TAwMJDb+Yjy8Hg86OnpqTqMUgkEAuTl5ak8TlV9gahKX1yKv2dXZqmpqQBQZlMZYwyfP3+Gvr6+EqKqwhiRWaNGjVijRo2EygICApiXl5fY/QsKCpinpyczMDBgHz58KPXcRee5du0aa9asGdPT02NOTk5szZo1Ivt+/vyZTZ8+nbm6ujIdHR1mZ2fHJk6cyD5//iy0HwAWHh7O9u/fz7y8vJiOjg7z9PRkR48eFdrv6dOnbMSIEczd3Z3p6emxWrVqsW+//ZYlJSVx+1y9epUBYJs3bxaJ59ixYwwA+/vvvxljjG3atIkBEDqeMcaioqKYp6cn09HRYbVr12YjR45k6enpEu9Dy5Ytmb6+PhszZgxjjLEDBw6wDh06sNq1azMdHR3m4uLCZs+ezQoKCoTOERoayhwdHUu52/89p6+//pqZmZlx9zssLExoHz6fz3799Vfm6enJdHV1maWlJfv+++9ZWlqayH4zZsxgtWvXZvr6+qx169bszp07zNHRkYWGhnL7Fd2bc+fOsdGjRzNzc3NmbGzMvv/+e5abm8vS09NZ//79mYmJCTMxMWETJ05kAoGgXDE5Ojqyjh07snPnzrHGjRszXV1d5uzszLZs2SIST8mf2NhYqe95QECAyPFF9z8pKYkBYJs2bRKKLSYmhn355ZfMwMCAGRsbs06dOrHExEShfWbMmMEAsIcPH7LQ0FBmbGzMjIyM2MCBA1lWVpbQvm/fvmV3794VKRen6O/i999/Z56enkxLS4vt37+fMcbYixcvWFhYGLO0tOT+XjZs2CB0vLjnlJCQwEJDQ5mzszPT1dVlVlZWLCwsjL17907k+ZT8Kfo7Kf5akeXvTd
q4JZH0Gj1//jwbN24cMzc3ZwYGBqxLly4sNTVV6LiSzyUgIIDbnp6ezsaMGcPs7OyYjo4Oc3V1ZQsXLmR8Pl/kXi5evJitXbuWubi4MB0dHebv78+uXLkiFOfr16/ZwIEDma2tLdPR0WHW1tasU6dOQu8zAQEBXAyxsbFi7/emTZvY9OnTmZaWltDzKTJ06FBmbGzMcnJyxN6vxYsXMwDs6dOnItsiIyOZtrY297f44MED1q1bN2ZlZcV0dXWZra0t69WrV6mfB+Lu64wZM7htHTt2ZMeOHWN+fn5MV1eX/frrr1Lf76L9QkNDmZGRETM2NmYDBgxgcXFxIq/p4veyOHHvr/J8Tyoe59ixY5mjoyPT0dFhtra2rH///uzt27fs48ePzMDAgEVERIgc9/z5c6ahocHmz58v8R6XRAmRjAQCAbO1tWVff/21UHlpCRFjjM2ZM4cBYIcOHSr1/AEBAczGxoZZWlqyUaNGsRUrVrAvv/ySARB6Y+Pz+ezrr79mBgYGbOzYsWzt2rVs1KhRTEtLi3Xu3FnonABYgwYNWO3atdmcOXPYsmXLmIuLCzMwMBB6o96zZw9r0KABmz59Olu3bh378ccfmampKXN0dBT6gHFxcWEdOnQQiT0sLIyZmpqyvLw8xpj4hKjowyAwMJCtXLmSjRo1imlqarLGjRtzxxXdB2tra2ZhYcFGjx7N1q5dyw4cOMAYY6xLly6sZ8+ebPHixWzNmjWsR48eDAD74YcfhOKRJiF68+YNMzU1Ze7u7mzx4sVs/fr1bOrUqczDw0NovyFDhjAtLS02dOhQFh0dzSZPnswMDQ1F4p40aRIDwEJCQtiqVavY0KFDmZ2dHTM3Nxf7YdOwYUPWvn17FhUVxfr3788AsEmTJrEvv/yS9e3bl61evZp98803DIDIm4W0MTk6OrK6desyKysr9uOPP7JVq1axRo0aMR6Px27fvs0YY+zx48csIiKCAWA//vgj27ZtG9u2bRtLSUmR+p6fOHGCNWzYkJmbm3PHFyUY4pKHkydPMi0tLebu7s4WLVrEZs2axczNzZmpqanY14yvry/r1q0bW716NRsyZAh3r4or2rcokSsNAObh4cEsLCzYrFmzWFRUFIuLi2MpKSnMzs6O2dvbs9mzZ7M1a9awTp06MQDch46k5/TLL7+wli1bstmzZ7N169axMWPGMH19fdakSRMuoU1ISGB9+vThzld0rz59+sT9voq/VqT9e5M2bkkkJUS+vr7sq6++YitXrmQTJkxgmpqarGfPntx++/fvZ3Z2dqxevXrcczlx4gRjjLGsrCzm4+PDzMzM2I8//siio6PZgAEDGI/H477gFL+Xvr6+zM3Njf38889s0aJFzNzcnNnZ2Qm9nps3b86MjY3ZtGnT2G+//cbmz5/P2rRpw86cOcPtU/xDPCUlhc2ePZsBYN9//z0X4+PHj9nDhw8ZALZy5Uqhe5Gbm8tMTU3ZoEGDJN6vZ8+eMR6PxxYtWiSyzcXFhXXs2JE7l7OzM7OxsWFz585lv/32G5s1axZr3Lix2GSq+H3t2rUrA8DWrFnDtm3bxhISErjflZubGzM1NWWRkZEsOjqaxcbGSn2/BQIBa9WqFdPQ0GAjR45kK1euZF999RXz8fGpUEIkz/ckxhj7+PEjq1+/PtPU1GRDhw5la9asYXPmzGGNGzdmcXFxjDHG+vXrx6ysrES+EC9atIjxeDz27Nkzife4JEqIZLRt2zaR5ISxshOi/fv3MwBs+fLlpZ6/6Fv2kiVLuLLc3FzWsGFDZmlpyb2otm3bxjQ0NNi5c+eEjo+OjmYA2IULF7gyAExHR4c9evSIK0tISBB5I8jOzhaJ59KlSwwA27p1K1c2ZcoUoW8/RTGamJgIvYGUTIhSU1OZjo4O+/rrr4W+raxatYoBYBs3bhS5D9HR0SIxiYtz2LBhzMDAQKh2TJqEqOj3cvXqVYn7nDt3jgFg27dvFyov+oZeVJ6SksK0tL
RYly5dhPabOXMmAyD2wyYoKEio5qdZs2aMx+Ox4cOHc2UFBQXMzs5O6E1J2pgY+++b5tmzZ7my1NRUpquryyZMmMC
"text/plain": [
"<Figure size 640x480 with 1 Axes>"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"plt.figure()\n",
"for name, g in dfc[~dfc['cluster_kmeans'].isin([2.0, 3.0])].groupby(\"cluster\"):\n",
" plt.scatter(g[\"frequency\"], g[\"rel_intensity_total\"], s=10, label=name)\n",
"\n",
"plt.yscale(\"log\")\n",
"plt.axvline(thr_freq, linestyle=\"--\")\n",
"plt.axhline(thr_int, linestyle=\"--\")\n",
"plt.xlabel(\"Activity frequency (share of active months)\")\n",
"plt.ylabel(\"Gross flow / mean AUM (quantity) [log scale]\")\n",
"plt.title(\"2D behavioral segmentation: relative intensity vs frequency\")\n",
"plt.legend(markerscale=2)\n",
"plt.ylim(0.1,100)\n",
"plt.show()"
]
},
{
"cell_type": "code",
"execution_count": 201,
"id": "41f5ffd2-1c90-48a6-be8f-adcce2d42185",
"metadata": {},
"outputs": [
{
"data": {
"image/png": "iVBORw0KGgoAAAANSUhEUgAAAjwAAAHHCAYAAAC7soLdAAAAOnRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjEwLjgsIGh0dHBzOi8vbWF0cGxvdGxpYi5vcmcvwVt1zgAAAAlwSFlzAAAPYQAAD2EBqD+naQAAjhBJREFUeJzt3XdYU9f/B/B3GGHJUpAhS3CBA3AWHLgp7lHFUbe2VdxWxdq662q1tYqzrq97obV1i3u0DkZVBBVRcC+GijKS8/vDHykhCeSGjJvweT0Pz0NOTu795Obm3k/OPedcAWOMgRBCCCHEgBnpOgBCCCGEEE2jhIcQQgghBo8SHkIIIYQYPEp4CCGEEGLwKOEhhBBCiMGjhIcQQgghBo8SHkIIIYQYPEp4CCGEEGLwKOEhhBBCiMGjhEcHWrZsiTp16qh1mQKBAKNHj1brMpWxadMmCAQCPHjwQOvrBoDBgwfDy8tLJ+sm0h48eACBQIBNmzbpOhS10dR78vLywuDBg9W6TD6vVxNatmyJli1b6joMtdiyZQtq1aoFU1NT2NnZ6Tocg1XuE56rV69i9OjRqF27NqysrODh4YHevXvjzp07MnVbtmwJgUAAgUAAIyMj2NjYoGbNmhgwYABOnDihg+gJ4SYxMRGzZs0qU4K6fft2/Prrr2qLyVBdunQJs2bNQmZmpq5DUYuVK1fyOpl98uQJZs2ahfj4eF2HwklSUhIGDx4MHx8frFu3DmvXrtV1SAbLRNcB6NqiRYtw8eJF9OrVC/Xq1cOzZ8+wYsUK1K9fH3///bdMS4ybmxsWLFgAAHj//j3u3buH6OhobN26Fb1798bWrVthamqqi7eiEwMGDECfPn1gZmam61CIEhITEzF79my0bNlS5Zax7du34+bNmxg/frxUuaenJz58+FCu9v+SXLp0CbNnz8bgwYNlfrUnJyfDyEj7vzfLst6VK1fCwcGBNy1Ex48fl3r85MkTzJ49G15eXggICNBNUCo4c+YMxGIxli1bhmrVquk6HINW7hOeiRMnYvv27RAKhZKy8PBw1K1bFwsXLsTWrVul6tva2uLLL7+UKlu4cCHGjh2LlStXwsvLC4sWLdJK7HxgbGwMY2NjtS0vJycHlpaWalse0R6BQABzc3Ndh1EisViMvLw8ncepqx8IhvTDpOgxW5+9ePECAEq9lMUYw8ePH2FhYaGFqAwUI3LVr1+f1a9fX6osJCSE1a5dW279goIC5ufnxywtLVlmZmaJyy5czrVr11hQUBAzNzdnXl5ebNWqVTJ1P378yGbMmMF8fHyYUChkbm5ubPLkyezjx49S9QCwiIgItn//fla7dm0mFAqZn58fO3LkiFS9Bw8esJEjR7IaNWowc3NzVrFiRfbFF1+w1NRUSZ2rV68yAGzTpk0y8Rw9epQBYH/++SdjjLGNGzcyAFKvZ4yxqKgo5ufnx4RCIXNxcWGjRo1iGRkZCrdD8+bNmYWFBRs3bhxjjLEDBw6wDh06MBcXFyYUCpm3tzebM2cOKygokFrGoEGDmKenZwlb+7/31L59e1apUiXJ9h4yZIhUHZFIxH755Rfm5+fHzMzMWOXKldlXX33F3rx5I1Nv5syZzMXFhVlYWLCWLVuyW7duMU9PTzZo0CBJvcJtc/78eTZmzBjm4ODAbG1t2VdffcVyc3NZRkYGGzBgALOzs2N2dnZs8uTJTCwWqxSTp6cn69ixIzt//jxr1KgRMzMzY1WrVmWbN2+Wiaf43+nTp5Xe5iEhITKvL9z+qampDADbuHGjVGwxMTGsWbNmzNLSktna2rIuXbqwxMREqTozZ85kANjdu3fZoEGDmK2tLbOxsWGDBw9m79+/l6r78uVLdvv2bZlyeQq/F1u3bmV+fn7MxMSE7d+/nzHG2KNHj9iQIUNY5cqVJd+X9evXS71e3ntKSEhggwYNYlWrVmVmZmbMycmJDRkyhL169Urm/RT/K/yeFN1XuH
zflI1bEUX76IULF9iECROYg4MDs7S0ZN26dWMvXryQel3x9xISEiJ5PiMjg40bN465ubkxoVDIfHx82MKFC5lIJJLZlj/99BNbs2YN8/b2ZkKhkDVs2JBduXJFKs6nT5+ywYMHsypVqjChUMicnZ1Zly5dpI4zISEhkhhOnz4td3tv3LiRzZgxg5mYmEi9n0IjRoxgtra27MOHD3K3108//cQAsAcPHsg8FxkZyUxNTSXfxTt37rAePXowJycnZmZmxqpUqcLCw8NLPB/I264zZ86UPNexY0d29OhR1qBBA2ZmZsZ++eUXpbd3Yb1BgwYxGxsbZmtrywYOHMji4uJk9umi27IoecdXdR6TisY5fvx45unpyYRCIatSpQobMGAAe/nyJXv79i2ztLRkY8eOlXldeno6MzIyYvPnz1e4jYuihEcOsVjMqlSpwtq3by9VXlLCwxhjc+fOZQDYX3/9VeLyQ0JCmKurK6tcuTIbPXo0++2331izZs0YAKkDl0gkYu3bt2eWlpZs/PjxbM2aNWz06NHMxMSEde3aVWqZAJi/vz9zcXFhc+fOZb/++ivz9vZmlpaWUgfiPXv2MH9/fzZjxgy2du1a9t133zF7e3vm6ekpdQLx9vZmHTp0kIl9yJAhzN7enuXl5THG5Cc8hQf7tm3bsuXLl7PRo0czY2Nj1qhRI8nrCreDs7Mzc3R0ZGPGjGFr1qxhBw4cYIwx1q1bN9a7d2/2008/sVWrVrFevXoxAOzbb7+VikeZhOf58+fM3t6e1ahRg/30009s3bp1bPr06czX11eq3vDhw5mJiQkbMWIEW716NZs6dSqzsrKSiXvKlCkMAOvcuTNbsWIFGzFiBHNzc2MODg5yTyYBAQHs888/Z1FRUWzAgAEMAJsyZQpr1qwZ69evH1u5ciXr1KkTAyBzMFA2Jk9PT1azZk3m5OTEvvvuO7ZixQpWv359JhAI2M2bNxljjKWkpLCxY8cyAOy7775jW7ZsYVu2bGHPnj1TepsfP36cBQQEMAcHB8nrCxMIecnBiRMnmImJCatRowZbvHgxmz17NnNwcGD29vZy95nAwEDWo0cPtnLlSjZ8+HDJtiqqsG5holYSAMzX15c5Ojqy2bNns6ioKBYXF8eePXvG3NzcmLu7O5szZw5btWoV69KlCwMgOakoek8///wza968OZszZw5bu3YtGzduHLOwsGCNGzeWJKwJCQmsb9++kuUVbqt3795JPq+i+4qy3zdl41ZEUcITGBjIWrduzZYvX84mTZrEjI2NWe/evSX19u/fz9zc3FitWrUk7+X48eOMMcbev3/P6tWrxypVqsS+++47tnr1ajZw4EAmEAgkP2CKbsvAwEBWrVo1tmjRIrZ48WLm4ODA3NzcpPbn4OBgZmtry77//nv2+++/s/nz57NWrVqxs2fPSuoUPUk/e/aMzZkzhwFgX331lSTGlJQUdvfuXQaALV++XGpb5ObmMnt7ezZ06FCF2+vhw4dMIBCwxYsXyzzn7e3NOnbsKFlW1apVmaurK5s3bx77/fff2ezZs1mjRo3kJktFt2v37t0ZALZq1Sq2ZcsWlpCQIPmsqlWrxuzt7VlkZCRbvXo1O336tNLbWywWsxYtWjAjIyM2atQotnz5cta6dWtWr169MiU86jwmMcbY27dvWZ06dZixsTEbMWIEW7VqFZs7dy5r1KgRi4uLY4wx1r9/f+bk5CTzg3fx4sVMIBCwhw8fKtzGRVHCI8eWLVtkkg/GSk949u/fzwCwZcuWlbj8wl/JS5YskZTl5uaygIAAVrlyZclOs2XLFmZkZMTOnz8v9frVq1czAOzixYuSMgBMKBSye/fuScoSEhJkvug5OTky8Vy+fJkBYP/73/8kZdOmTZP69VIYo52dndQBonjC8+LFCyYUCln79u2lfm2sWLGCAWAbNmyQ2Q6rV6+WiUlenF9//TWztLSUat1SJuEp/FyuXr2qsM758+cZALZt2zap8s
Jf2IXlz549YyYmJqxbt25S9WbNmsUAyD2ZhIaGSrXcBAUFMYFAwL755htJWUFBAXNzc5M66CgbE2P//VI8d+6cpOz
"text/plain": [
"<Figure size 640x480 with 1 Axes>"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"# Frequency vs. relative-intensity scatter restricted to clients with cluster_kmeans == 2.0,\n",
"# one series per `cluster` label; the dashed lines mark the thr_freq / thr_int thresholds.\n",
"fig, ax = plt.subplots()\n",
"kmeans2_clients = dfc[dfc['cluster_kmeans']==2.0]\n",
"for cluster_name, members in kmeans2_clients.groupby(\"cluster\"):\n",
"    ax.scatter(members[\"frequency\"], members[\"rel_intensity_total\"], s=10, label=cluster_name)\n",
"\n",
"ax.set_yscale(\"log\")\n",
"ax.axvline(thr_freq, linestyle=\"--\")\n",
"ax.axhline(thr_int, linestyle=\"--\")\n",
"ax.set_xlabel(\"Activity frequency (share of active months)\")\n",
"ax.set_ylabel(\"Gross flow / mean AUM (quantity) [log scale]\")\n",
"ax.set_title(\"2D behavioral segmentation: relative intensity vs frequency\")\n",
"ax.legend(markerscale=2)\n",
"plt.show()\n",
"\n",
"# Feature names noted next to this plot (presumably the clustering inputs; TODO confirm):\n",
"#   log_aum_qty_mean          size (log)\n",
"#   log_gross_flow_qty_mean   activity intensity (log)\n",
"#   frequency                 activity frequency\n",
"#   rel_intensity_total       turnover proxy\n",
"#   net_flow_qty_vol          volatility of net flows\n",
"#   n_tx_total"
]
},
{
"cell_type": "code",
"execution_count": 206,
"id": "ba298e96-5919-44d1-91e3-67d38041349f",
"metadata": {},
"outputs": [
{
"data": {
"image/png": "iVBORw0KGgoAAAANSUhEUgAABKUAAAHqCAYAAADVi/1VAAAAOnRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjEwLjgsIGh0dHBzOi8vbWF0cGxvdGxpYi5vcmcvwVt1zgAAAAlwSFlzAAAPYQAAD2EBqD+naQAA1CdJREFUeJzs3XdYU9cbB/Bv2ENBFFwsBScqigtHLVq1uK3aatW6tWpxW8VVZ911VXHW/XNVq9Zqratq3XvjXtS9caCsnN8faSIhgyRkwvfzPDyQk3PvfXO5uRzenCERQggQERERERERERGZkZ2lAyAiIiIiIiIiouyHSSkiIiIiIiIiIjI7JqWIiIiIiIiIiMjsmJQiIiIiIiIiIiKzY1KKiIiIiIiIiIjMjkkpIiIiIiIiIiIyOyaliIiIiIiIiIjI7JiUIiIiIiIiIiIis2NSioiIiIiIiIiIzI5JqWykZs2aKF26tFH3KZFI0KtXL6PuUxfLli2DRCLBnTt3zH5sAOjYsSMKFSpkkWOTsjt37kAikWDZsmWWDsVoTPWaChUqhI4dOxp1n9Z8XFOoWbMmatasaekwjGLlypUoUaIEHB0dkStXLkuHQ6Q3tmuMh+0a68F2je7Yrsk8tmvIGjApZSEnTpxAr169UKpUKbi7uyMgIAAtW7bEtWvXVOrWrFkTEokEEokEdnZ28PDwQPHixdGuXTvs2rXLAtET6Sc2NhajR4/OVGN79erVmDlzptFiyqoOHz6M0aNH49WrV5YOxSjmzp1r1Q3zBw8eYPTo0Th79qylQ9HLlStX0LFjRwQHB2PRokVYuHChpUMiG8d2DWUnbNeYD9s15sV2DVmCg6UDyK4mT56MQ4cO4auvvkJoaCgePXqEOXPmoHz58jh69KjKJ39+fn6YOHEiAODdu3e4ceMGNm7ciP/9739o2bIl/ve//8HR0dESL8Ui2rVrh6+//hrOzs6WDoV0EBsbizFjxqBmzZoGfxK7evVqXLx4Ef369VMqDwwMxPv377PV9a/N4cOHMWbMGHTs2FHlU6KrV6/Czs78n0Vk5rhz586Ft7e31XwiuXPnTqXHDx48wJgxY1CoUCGUK1fOMkEZYN++fZBKpZg1axaKFCli6XAoC2C7JnPYrrEtbNeYD9s1psV2DVkDJqUsZMCAAVi9ejWcnJwUZa1atUKZMmUwadIk/O9//1Oq7+npiW+++UapbNKkSejTpw/mzp2LQoUKYfLkyWaJ3RrY29vD3t7eaPtLSEiAm5ub0fZH5iORSODi4mLpMLSSSqVISkqyeJyW+mcnK/2TlfaebcuePHkCABl2bxdC4MOHD3B1dTVDVGTL2K7JHLZrSI7tGt2xXZN5bNeQVRBkVcqXLy/Kly+vVBYRESFKlSqltn5KSooICQkRbm5u4tWrV1r3Ld/PyZMnRdWqVYWLi4soVKiQmDdvnkrdDx8+iJEjR4rg4GDh5OQk/Pz8xKBBg8SHDx+U6gEQUVFRYtOmTaJUqVLCyclJhISEiO3btyvVu3PnjujZs6coVqyYcHFxEblz5xZffvmluH37tqLOiRMnBACxbNkylXj++usvAUD88ccfQgghli5dKgAobS+EEDExMSIkJEQ4OTmJAgUKiO+++068fPlS43moUaOGcHV1FX379hVCCLF582bRoEEDUaBAAeHk5CSCgoLE2LFjRUpKitI+OnToIAIDA7Wc7Y+v6fPPPxd58uRRnO9OnTop1UlNTRUzZswQISEhwtnZWeTNm1d8++234sWLFyr1Ro0aJQoUKCBcXV1FzZo1xaVLl0RgYKDo0KGDop783Bw4cED07t1beHt7C09PT/Htt9+KxMRE8fLlS9GuXTuRK1cukStXLjFo0CAhlUoNiikwMFA0bNhQHDhwQFSqVEk4OzuLwoULi+XLl6vEk/5r7969Op/ziIgIle3l5//27dsCgFi6dKlSbHv27BGffPKJcHNzE56enqJJkyYiNj
ZWqc6oUaMEAHH9+nXRoUMH4enpKTw8PETHjh3Fu3fvlOo+ffpUXL58WaVcHfn74n//+58ICQkRDg4OYtOmTUIIIe7duyc6deok8ubNq3i/LF68WGl7da/p3LlzokOHDqJw4cLC2dlZ5MuXT3Tq1Ek8e/ZM5fWk/5K/T9JeK/q833SNWxNN1+jBgwdF//79hbe3t3BzcxNffPGFePLkidJ26V9LRESE4vmXL1+Kvn37Cj8/P+Hk5CSCg4PFpEmTRGpqqsq5nDp1qliwYIEICgoSTk5OomLFiuL48eNKcT58+FB07NhR+Pr6CicnJ5E/f37RpEkTpftMRESEIoa9e/eqPd9Lly4VI0eOFA4ODkqvR65bt27C09NTvH//Xu35mjp1qgAg7ty5o/LckCFDhKOjo+K9eO3aNdG8eXORL18+4ezsLHx9fUWrVq20/j1Qd15HjRqleK5hw4bir7/+EhUqVBDOzs5ixowZOp9veb0OHToIDw8P4enpKdq3by/OnDmjck2nPZdpqbu/GvOelDbOfv36icDAQOHk5CR8fX1Fu3btxNOnT8WbN2+Em5ub6NOnj8p2//77r7CzsxMTJkzQeI7pI7Zr2K5hu4btGrZrPm7Hdo0ytmvYrhFCCCalrIhUKhW+vr7i888/VyrX1ngTQohx48YJAGLr1q1a9x8RESEKFiwo8ubNK3r16iV+/vln8cknnwgASjfh1NRU8fnnnws3NzfRr18/sWDBAtGrVy/h4OAgmjZtqrRPAKJs2bKiQIECYty4cWLmzJkiKChIuLm5Kf1RWb9+vShbtqwYOXKkWLhwoRg2bJjw8vISgYGBSn8Mg4KCRIMGDVRi79Spk/Dy8hJJSUlCCPWNN/kfrjp16ojZs2eLXr16CXt7e1GpUiXFdvLzkD9/fuHj4yN69+4tFixYIDZv3iyEEOKLL74QLVu2FFOnThXz5s0TX331lQAgvv/+e6V4dGm8PX78WHh5eYlixYqJqVOnikWLFonhw4eLkiVLKtXr2rWrcHBwEN26dRPz588X0dHRwt3dXSXuwYMHCwCicePGYs6cOaJbt27Cz89PeHt7q/3DWK5cOVGvXj0RExMj2rVrJwCIwYMHi08++US0adNGzJ07VzRq1EgAULmx6RpTYGCgKF68uMiXL58YNmyYmDNnjihfvryQSCTi4sWLQgghbt68Kfr06SMAiGHDhomVK1eKlStXikePHul8znfu3CnKlSsnvL29FdvLG0PqGjq7du0SDg4OolixYmLKlClizJgxwtvbW3h5eam9ZsLCwkTz5s3F3LlzRdeuXRXnKi15XXmjUxsAomTJksLHx0eMGTNGxMTEiDNnzohHjx4JPz8/4e/vL8aOHSvmzZsnmjRpIgAo/kBqek0//fSTqFGjhhg7dqxYuHCh6Nu3r3B1dRWVK1dWNL7PnTsnWrdurdif/Fy9fftW8ftKe63o+n7TNW5NNDXewsLCxGeffSZmz54tBg4cKOzt7UXLli0V9TZt2iT8/PxEiRIlFK9l586dQggh3r17J0JDQ0WePHnEsGHDxPz580X79u2FRCJR/DOW9lyGhYWJIkWKiMmTJ4spU6YIb29v4efnp3Q9V6tWTXh6eooRI0aIX375RUyYMEHUqlVL7N+/X1EnbYPj0aNHYuzYsQKA+PbbbxUx3rx5U1y/fl0AELNnz1Y6F4mJicLLy0t07txZ4/m6e/eukEgkYsqUKSrPBQUFiYYNGyr2VbhwYVGwYEHx448/il9++UWMGTNGVKpUSW3DL+15bdasmQAg5s2bJ1auXCnOnTun+F0VKVJEeHl5iSFDhoj58+eLvXv36ny+pVKp+PTTT4WdnZ347rvvxOzZs8Vnn30mQkNDM9V4M+Y9SQgh3rx5I0qXLi3s7e1Ft27dxLx588S4ceNEpUqVxJkzZ4QQQrRt21bky5dP5Z/3KVOmCIlEIu7evavxHJMM2zVs17Bdw3aNptfEdg3bNUKwXcN2jQyTUl
Zk5cqVKg0pITJuvG3atEkAELNmzdK6f/mnMtOmTVOUJSYminLlyom8efMq3gArV64UdnZ24sCBA0rbz58/XwAQhw4
"text/plain": [
"<Figure size 1200x500 with 2 Axes>"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"def _scatter_panel(ax, clients, ylabel=None, ylim=None, **scatter_kwargs):\n",
"    \"\"\"Draw one frequency-vs-intensity panel on `ax`, one scatter series per `cluster` label.\n",
"\n",
"    `ylabel` and `ylim` are applied only when given; extra keyword arguments go to `ax.scatter`.\n",
"    \"\"\"\n",
"    for cluster_name, members in clients.groupby(\"cluster\"):\n",
"        ax.scatter(\n",
"            members[\"frequency\"], members[\"rel_intensity_total\"],\n",
"            s=10,\n",
"            label=cluster_name,\n",
"            **scatter_kwargs,\n",
"        )\n",
"\n",
"    ax.set_yscale(\"log\")\n",
"    ax.axvline(thr_freq, linestyle=\"--\")\n",
"    ax.axhline(thr_int, linestyle=\"--\")\n",
"    ax.set_xlabel(\"Activity frequency\")\n",
"    if ylabel is not None:\n",
"        ax.set_ylabel(ylabel)\n",
"    ax.set_title(\"2D behavioral segmentation: relative intensity vs frequency\")\n",
"    if ylim is not None:\n",
"        ax.set_ylim(*ylim)\n",
"    ax.legend(markerscale=2)\n",
"\n",
"\n",
"fig, axes = plt.subplots(1, 2, figsize=(12,5), sharey=True)\n",
"\n",
"# Left panel: every client outside KMeans clusters 2.0 and 3.0\n",
"_scatter_panel(\n",
"    axes[0],\n",
"    dfc[~dfc['cluster_kmeans'].isin([2.0, 3.0])],\n",
"    ylabel=\"Gross flow / mean AUM\",\n",
"    ylim=(0.1, 100),\n",
")\n",
"\n",
"# Right panel: KMeans cluster 2.0 only, drawn in red\n",
"_scatter_panel(axes[1], dfc[dfc['cluster_kmeans']==2.0], color=\"red\")\n",
"\n",
"plt.tight_layout()\n",
"plt.show()"
]
},
{
"cell_type": "code",
"execution_count": 129,
"id": "64176145-ee5b-4ea8-98ed-c155146dd2f6",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>Registrar Account - ID</th>\n",
" <th>month</th>\n",
" <th>aum_qty</th>\n",
" <th>net_flow_qty</th>\n",
" <th>gross_flow_qty</th>\n",
" <th>n_tx</th>\n",
" <th>active_month</th>\n",
" <th>rel_intensity_m</th>\n",
" <th>netflow_to_aum_m</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>18872</td>\n",
" <td>2015-01-31</td>\n",
" <td>179864.637</td>\n",
" <td>-1524.010</td>\n",
" <td>15230.010</td>\n",
" <td>32</td>\n",
" <td>1</td>\n",
" <td>0.084675</td>\n",
" <td>-0.008473</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>18872</td>\n",
" <td>2015-02-28</td>\n",
" <td>186761.736</td>\n",
" <td>7247.100</td>\n",
" <td>18571.880</td>\n",
" <td>38</td>\n",
" <td>1</td>\n",
" <td>0.099442</td>\n",
" <td>0.038804</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>18872</td>\n",
" <td>2015-03-31</td>\n",
" <td>190357.718</td>\n",
" <td>3655.380</td>\n",
" <td>9754.040</td>\n",
" <td>47</td>\n",
" <td>1</td>\n",
" <td>0.051241</td>\n",
" <td>0.019203</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>18872</td>\n",
" <td>2015-04-30</td>\n",
" <td>191429.324</td>\n",
" <td>-218.394</td>\n",
" <td>12840.950</td>\n",
" <td>39</td>\n",
" <td>1</td>\n",
" <td>0.067079</td>\n",
" <td>-0.001141</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>18872</td>\n",
" <td>2015-05-31</td>\n",
" <td>189056.475</td>\n",
" <td>-4782.849</td>\n",
" <td>6332.849</td>\n",
" <td>24</td>\n",
" <td>1</td>\n",
" <td>0.033497</td>\n",
" <td>-0.025299</td>\n",
" </tr>\n",
" <tr>\n",
" <th>...</th>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>33967</th>\n",
" <td>422874</td>\n",
" <td>2025-06-30</td>\n",
" <td>55540.077</td>\n",
" <td>1303.393</td>\n",
" <td>1303.393</td>\n",
" <td>5</td>\n",
" <td>1</td>\n",
" <td>0.023468</td>\n",
" <td>0.023468</td>\n",
" </tr>\n",
" <tr>\n",
" <th>33968</th>\n",
" <td>422874</td>\n",
" <td>2025-07-31</td>\n",
" <td>55179.460</td>\n",
" <td>-1013.363</td>\n",
" <td>2066.489</td>\n",
" <td>9</td>\n",
" <td>1</td>\n",
" <td>0.037450</td>\n",
" <td>-0.018365</td>\n",
" </tr>\n",
" <tr>\n",
" <th>33969</th>\n",
" <td>422874</td>\n",
" <td>2025-08-31</td>\n",
" <td>56928.472</td>\n",
" <td>1749.012</td>\n",
" <td>2010.564</td>\n",
" <td>3</td>\n",
" <td>1</td>\n",
" <td>0.035317</td>\n",
" <td>0.030723</td>\n",
" </tr>\n",
" <tr>\n",
" <th>33970</th>\n",
" <td>422874</td>\n",
" <td>2025-09-30</td>\n",
" <td>57042.358</td>\n",
" <td>113.886</td>\n",
" <td>3895.248</td>\n",
" <td>7</td>\n",
" <td>1</td>\n",
" <td>0.068287</td>\n",
" <td>0.001997</td>\n",
" </tr>\n",
" <tr>\n",
" <th>33971</th>\n",
" <td>422874</td>\n",
" <td>2025-10-31</td>\n",
" <td>56522.708</td>\n",
" <td>-555.680</td>\n",
" <td>1619.142</td>\n",
" <td>6</td>\n",
" <td>1</td>\n",
" <td>0.028646</td>\n",
" <td>-0.009831</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"<p>33972 rows × 9 columns</p>\n",
"</div>"
],
"text/plain": [
" Registrar Account - ID month aum_qty net_flow_qty \\\n",
"0 18872 2015-01-31 179864.637 -1524.010 \n",
"1 18872 2015-02-28 186761.736 7247.100 \n",
"2 18872 2015-03-31 190357.718 3655.380 \n",
"3 18872 2015-04-30 191429.324 -218.394 \n",
"4 18872 2015-05-31 189056.475 -4782.849 \n",
"... ... ... ... ... \n",
"33967 422874 2025-06-30 55540.077 1303.393 \n",
"33968 422874 2025-07-31 55179.460 -1013.363 \n",
"33969 422874 2025-08-31 56928.472 1749.012 \n",
"33970 422874 2025-09-30 57042.358 113.886 \n",
"33971 422874 2025-10-31 56522.708 -555.680 \n",
"\n",
" gross_flow_qty n_tx active_month rel_intensity_m netflow_to_aum_m \n",
"0 15230.010 32 1 0.084675 -0.008473 \n",
"1 18571.880 38 1 0.099442 0.038804 \n",
"2 9754.040 47 1 0.051241 0.019203 \n",
"3 12840.950 39 1 0.067079 -0.001141 \n",
"4 6332.849 24 1 0.033497 -0.025299 \n",
"... ... ... ... ... ... \n",
"33967 1303.393 5 1 0.023468 0.023468 \n",
"33968 2066.489 9 1 0.037450 -0.018365 \n",
"33969 2010.564 3 1 0.035317 0.030723 \n",
"33970 3895.248 7 1 0.068287 0.001997 \n",
"33971 1619.142 6 1 0.028646 -0.009831 \n",
"\n",
"[33972 rows x 9 columns]"
]
},
"execution_count": 129,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Temporal analysis: look at the monthly client table\n",
"df"
]
},
{
"cell_type": "code",
"execution_count": 130,
"id": "17787765-a4bb-4e23-80f0-92c91d16f1cc",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>Registrar Account - ID</th>\n",
" <th>month</th>\n",
" <th>aum_qty</th>\n",
" <th>net_flow_qty</th>\n",
" <th>gross_flow_qty</th>\n",
" <th>n_tx</th>\n",
" <th>active_month</th>\n",
" <th>rel_intensity_m</th>\n",
" <th>netflow_to_aum_m</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>18872</td>\n",
" <td>2015-01-31</td>\n",
" <td>179864.637</td>\n",
" <td>-1524.010</td>\n",
" <td>15230.010</td>\n",
" <td>32</td>\n",
" <td>1</td>\n",
" <td>8.467484e-02</td>\n",
" <td>-8.473094e-03</td>\n",
" </tr>\n",
" <tr>\n",
" <th>27192</th>\n",
" <td>365377</td>\n",
" <td>2015-01-31</td>\n",
" <td>0.000</td>\n",
" <td>3640.020</td>\n",
" <td>7687.660</td>\n",
" <td>63</td>\n",
" <td>1</td>\n",
" <td>7.687660e+12</td>\n",
" <td>3.640020e+12</td>\n",
" </tr>\n",
" <tr>\n",
" <th>650</th>\n",
" <td>200000201</td>\n",
" <td>2015-01-31</td>\n",
" <td>17072.819</td>\n",
" <td>-494.780</td>\n",
" <td>800.440</td>\n",
" <td>20</td>\n",
" <td>1</td>\n",
" <td>4.688388e-02</td>\n",
" <td>-2.898057e-02</td>\n",
" </tr>\n",
" <tr>\n",
" <th>25893</th>\n",
" <td>365172</td>\n",
" <td>2015-01-31</td>\n",
" <td>67707.000</td>\n",
" <td>11917.000</td>\n",
" <td>11957.000</td>\n",
" <td>4</td>\n",
" <td>1</td>\n",
" <td>1.765992e-01</td>\n",
" <td>1.760084e-01</td>\n",
" </tr>\n",
" <tr>\n",
" <th>33712</th>\n",
" <td>422691</td>\n",
" <td>2015-01-31</td>\n",
" <td>60705.316</td>\n",
" <td>3724.160</td>\n",
" <td>6372.040</td>\n",
" <td>24</td>\n",
" <td>1</td>\n",
" <td>1.049668e-01</td>\n",
" <td>6.134817e-02</td>\n",
" </tr>\n",
" <tr>\n",
" <th>...</th>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>26152</th>\n",
" <td>365236</td>\n",
" <td>2025-10-31</td>\n",
" <td>74195.145</td>\n",
" <td>-29100.206</td>\n",
" <td>32046.852</td>\n",
" <td>98</td>\n",
" <td>1</td>\n",
" <td>4.319265e-01</td>\n",
" <td>-3.922117e-01</td>\n",
" </tr>\n",
" <tr>\n",
" <th>13522</th>\n",
" <td>200127403</td>\n",
" <td>2025-10-31</td>\n",
" <td>17711.000</td>\n",
" <td>197.000</td>\n",
" <td>197.000</td>\n",
" <td>4</td>\n",
" <td>1</td>\n",
" <td>1.112303e-02</td>\n",
" <td>1.112303e-02</td>\n",
" </tr>\n",
" <tr>\n",
" <th>13593</th>\n",
" <td>200127404</td>\n",
" <td>2025-10-31</td>\n",
" <td>44881.000</td>\n",
" <td>3099.500</td>\n",
" <td>3539.500</td>\n",
" <td>15</td>\n",
" <td>1</td>\n",
" <td>7.886411e-02</td>\n",
" <td>6.906040e-02</td>\n",
" </tr>\n",
" <tr>\n",
" <th>18584</th>\n",
" <td>200128363</td>\n",
" <td>2025-10-31</td>\n",
" <td>7491447.864</td>\n",
" <td>0.000</td>\n",
" <td>0.000</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0.000000e+00</td>\n",
" <td>0.000000e+00</td>\n",
" </tr>\n",
" <tr>\n",
" <th>33971</th>\n",
" <td>422874</td>\n",
" <td>2025-10-31</td>\n",
" <td>56522.708</td>\n",
" <td>-555.680</td>\n",
" <td>1619.142</td>\n",
" <td>6</td>\n",
" <td>1</td>\n",
" <td>2.864587e-02</td>\n",
" <td>-9.831093e-03</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"<p>33972 rows × 9 columns</p>\n",
"</div>"
],
"text/plain": [
" Registrar Account - ID month aum_qty net_flow_qty \\\n",
"0 18872 2015-01-31 179864.637 -1524.010 \n",
"27192 365377 2015-01-31 0.000 3640.020 \n",
"650 200000201 2015-01-31 17072.819 -494.780 \n",
"25893 365172 2015-01-31 67707.000 11917.000 \n",
"33712 422691 2015-01-31 60705.316 3724.160 \n",
"... ... ... ... ... \n",
"26152 365236 2025-10-31 74195.145 -29100.206 \n",
"13522 200127403 2025-10-31 17711.000 197.000 \n",
"13593 200127404 2025-10-31 44881.000 3099.500 \n",
"18584 200128363 2025-10-31 7491447.864 0.000 \n",
"33971 422874 2025-10-31 56522.708 -555.680 \n",
"\n",
" gross_flow_qty n_tx active_month rel_intensity_m netflow_to_aum_m \n",
"0 15230.010 32 1 8.467484e-02 -8.473094e-03 \n",
"27192 7687.660 63 1 7.687660e+12 3.640020e+12 \n",
"650 800.440 20 1 4.688388e-02 -2.898057e-02 \n",
"25893 11957.000 4 1 1.765992e-01 1.760084e-01 \n",
"33712 6372.040 24 1 1.049668e-01 6.134817e-02 \n",
"... ... ... ... ... ... \n",
"26152 32046.852 98 1 4.319265e-01 -3.922117e-01 \n",
"13522 197.000 4 1 1.112303e-02 1.112303e-02 \n",
"13593 3539.500 15 1 7.886411e-02 6.906040e-02 \n",
"18584 0.000 0 0 0.000000e+00 0.000000e+00 \n",
"33971 1619.142 6 1 2.864587e-02 -9.831093e-03 \n",
"\n",
"[33972 rows x 9 columns]"
]
},
"execution_count": 130,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Monthly panel in chronological order (sort_values sorts ascending by default).\n",
"# NOTE(review): in the saved output, rows with aum_qty == 0 show ratio columns around 1e12,\n",
"# presumably from an epsilon-guarded division; confirm before averaging those columns.\n",
"df_month.sort_values(\"month\")"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "c99453ef-040e-44f1-ac8a-4c3754472232",
"metadata": {},
"outputs": [],
"source": [
"import pandas as pd\n",
"\n",
"# Cluster-size stability over time: rebuild the client-level features on calendar-year\n",
"# windows and project them onto the already fitted `scaler` / `km` (no refit).\n",
"windows = [\n",
"    (2017, 2019),\n",
"    (2020, 2022),\n",
"    (2023, 2025)\n",
"]\n",
"MIN_MONTHS = 6  # a client needs at least this many observed months inside a window\n",
"EPS = 1e-9      # added to |AUM| so the turnover ratios stay finite\n",
"\n",
"# df_month carries a `month` column but no `year` column, so the year is derived here\n",
"month_year = pd.to_datetime(df_month[\"month\"]).dt.year\n",
"\n",
"stability_results = []\n",
"\n",
"for start, end in windows:\n",
"    # Months whose calendar year lies in [start, end]\n",
"    df_period = df_month[month_year.between(start, end)].copy()\n",
"\n",
"    # Monthly indicators recomputed on the slice\n",
"    df_period[\"active_month\"] = (df_period[\"gross_flow_qty\"] > 0).astype(int)\n",
"    df_period[\"rel_intensity_m\"] = df_period[\"gross_flow_qty\"] / (df_period[\"aum_qty\"].abs() + EPS)\n",
"    df_period[\"netflow_to_aum_m\"] = df_period[\"net_flow_qty\"] / (df_period[\"aum_qty\"].abs() + EPS)\n",
"\n",
"    # One row per client, aggregated from the monthly columns (index = client ID)\n",
"    X_period = df_period.groupby(ID_COL).agg(\n",
"        n_months=(\"month\", \"nunique\"),\n",
"        n_active_months=(\"active_month\", \"sum\"),\n",
"        flow_freq=(\"active_month\", \"mean\"),\n",
"        aum_qty_mean=(\"aum_qty\", \"mean\"),\n",
"        aum_qty_median=(\"aum_qty\", \"median\"),\n",
"        net_flow_qty_sum=(\"net_flow_qty\", \"sum\"),\n",
"        gross_flow_qty_sum=(\"gross_flow_qty\", \"sum\"),\n",
"        gross_flow_qty_mean=(\"gross_flow_qty\", \"mean\"),\n",
"        net_flow_qty_vol=(\"net_flow_qty\", \"std\"),\n",
"        rel_intensity=(\"rel_intensity_m\", \"mean\"),\n",
"        netflow_to_aum=(\"netflow_to_aum_m\", \"mean\"),\n",
"        n_tx_total=(\"n_tx\", \"sum\"),\n",
"    )\n",
"\n",
"    # std is NaN for a client observed during a single month\n",
"    X_period[\"net_flow_qty_vol\"] = X_period[\"net_flow_qty_vol\"].fillna(0.0)\n",
"\n",
"    # Log transforms and turnover proxy\n",
"    X_period[\"log_aum_qty_mean\"] = np.log1p(X_period[\"aum_qty_mean\"].clip(lower=0))\n",
"    X_period[\"log_gross_flow_qty_mean\"] = np.log1p(X_period[\"gross_flow_qty_mean\"].clip(lower=0))\n",
"    X_period[\"gross_flow_to_aum\"] = X_period[\"gross_flow_qty_sum\"] / (X_period[\"aum_qty_mean\"].abs() + EPS)\n",
"    X_period[\"rel_intensity_total\"] = X_period[\"gross_flow_to_aum\"]\n",
"    X_period[\"frequency\"] = X_period[\"flow_freq\"]\n",
"\n",
"    # Keep clients with enough history and non-zero holdings, then only the model features\n",
"    keep = (X_period[\"n_months\"] >= MIN_MONTHS) & (X_period[\"aum_qty_mean\"].abs() > 0)\n",
"    X_period = X_period.loc[keep, features].replace([np.inf, -np.inf], np.nan).dropna()\n",
"\n",
"    # Projection with the fitted scaler / KMeans; an empty window is recorded with no clients\n",
"    counts = {}\n",
"    if not X_period.empty:\n",
"        X_scaled_period = scaler.transform(X_period)\n",
"        X_period['assigned_cluster'] = km.predict(X_scaled_period)\n",
"        counts = X_period['assigned_cluster'].value_counts().to_dict()\n",
"    counts['period'] = f\"{start}-{end}\"\n",
"    stability_results.append(counts)\n",
"\n",
"# Cluster sizes per window (clusters absent from a window count as 0)\n",
"df_stability = pd.DataFrame(stability_results).set_index('period').fillna(0).astype(int)\n",
"df_stability = df_stability[sorted(df_stability.columns)]\n",
"df_stability.plot(kind='bar', stacked=True, figsize=(10, 6))\n",
"plt.title(\"Évolution de la taille des clusters dans le temps\")\n",
"plt.ylabel(\"Nombre de clients\")\n",
"plt.show()"
]
},
{
"cell_type": "code",
"execution_count": 150,
"id": "0ff98dd4-cd21-443a-a603-6dbdee066b87",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Filtered clients: (154, 18)\n",
"Filtered clients: (355, 18)\n",
"Filtered clients: (421, 18)\n"
]
}
],
"source": [
"import pandas as pd\n",
"\n",
"# 1. Three consecutive 3-year windows: `start` is excluded, `end` is included\n",
"windows = [\n",
"    (\"2016-10-31\", \"2019-10-31\"),\n",
"    (\"2019-10-31\", \"2022-10-31\"),\n",
"    (\"2022-10-31\", \"2025-10-31\")\n",
"]\n",
"\n",
"stability_results = []\n",
"\n",
"# Client-level aggregations: output column -> (monthly column, aggregation)\n",
"client_aggregations = dict(\n",
"    # Coverage / activity\n",
"    n_months=(\"month\", \"nunique\"),\n",
"    n_active_months=(\"active_month\", \"sum\"),\n",
"    flow_freq=(\"active_month\", \"mean\"),\n",
"    # Size in quantity terms\n",
"    aum_qty_mean=(\"aum_qty\", \"mean\"),\n",
"    aum_qty_median=(\"aum_qty\", \"median\"),\n",
"    # Flows in quantity terms\n",
"    net_flow_qty_sum=(\"net_flow_qty\", \"sum\"),\n",
"    gross_flow_qty_sum=(\"gross_flow_qty\", \"sum\"),\n",
"    gross_flow_qty_mean=(\"gross_flow_qty\", \"mean\"),\n",
"    # Dispersion / volatility proxy\n",
"    net_flow_qty_vol=(\"net_flow_qty\", \"std\"),\n",
"    rel_intensity=(\"rel_intensity_m\", \"mean\"),\n",
"    netflow_to_aum=(\"netflow_to_aum_m\", \"mean\"),\n",
"    # Trading frequency proxy\n",
"    n_tx_total=(\"n_tx\", \"sum\"),\n",
")\n",
"\n",
"window_clients = {}  # window start date -> filtered client-level features\n",
"\n",
"for start, end in windows:\n",
"    eps = 1e-9  # added to |AUM| so the ratios below stay finite when AUM is zero\n",
"\n",
"    # Monthly rows of the window, with the monthly indicators recomputed on that slice:\n",
"    # - active_month: did the client trade this month? Many inactive months -> 'stable / dormant',\n",
"    #   active almost every month -> 'rebalancer / active institutional'\n",
"    # - rel_intensity_m: trading intensity relative to size, so small and large clients compare\n",
"    # - netflow_to_aum_m: net flow ratio, which captures the direction of the change\n",
"    in_window = (df_month['month'] > start) & (df_month['month'] <= end)\n",
"    df_period = df_month[in_window].assign(\n",
"        active_month=lambda m: (m[\"gross_flow_qty\"] > 0).astype(int),\n",
"        rel_intensity_m=lambda m: m[\"gross_flow_qty\"] / (m[\"aum_qty\"].abs() + eps),\n",
"        netflow_to_aum_m=lambda m: m[\"net_flow_qty\"] / (m[\"aum_qty\"].abs() + eps),\n",
"    )\n",
"\n",
"    # One row per client, then derived columns\n",
"    dft_client_feat = (\n",
"        df_period.groupby(ID_COL, as_index=False)\n",
"        .agg(**client_aggregations)\n",
"        .assign(\n",
"            # std is NaN on a single observation\n",
"            net_flow_qty_vol=lambda c: c[\"net_flow_qty_vol\"].fillna(0.0),\n",
"            # log transforms (heavy-tailed distributions)\n",
"            log_aum_qty_mean=lambda c: np.log1p(c[\"aum_qty_mean\"].clip(lower=0)),\n",
"            log_gross_flow_qty_mean=lambda c: np.log1p(c[\"gross_flow_qty_mean\"].clip(lower=0)),\n",
"            # global turnover proxy\n",
"            gross_flow_to_aum=lambda c: c[\"gross_flow_qty_sum\"] / (c[\"aum_qty_mean\"].abs() + eps),\n",
"        )\n",
"    )\n",
"\n",
"    # Minimal filters: at least 6 observed months and non-zero mean holdings\n",
"    enough_history = dft_client_feat[\"n_months\"] >= 6\n",
"    holds_something = dft_client_feat[\"aum_qty_mean\"].abs() > 0\n",
"    dfct = dft_client_feat[enough_history & holds_something].assign(\n",
"        rel_intensity_total=lambda c: c[\"gross_flow_to_aum\"],  # turnover proxy\n",
"        frequency=lambda c: c[\"flow_freq\"],\n",
"    )\n",
"\n",
"    window_clients[start] = dfct.copy()\n",
"    print(\"Filtered clients:\", dfct.shape)\n",
"\n",
"# Named per-window frames used by the following cells\n",
"df_2016 = window_clients[\"2016-10-31\"]\n",
"df_2019 = window_clients[\"2019-10-31\"]\n",
"df_2022 = window_clients[\"2022-10-31\"]"
]
},
{
"cell_type": "code",
"execution_count": 180,
"id": "39bcd74a-5828-47d5-9709-a1c1eb4540f7",
"metadata": {},
"outputs": [],
"source": [
"def _restrict_to(frame, account_ids):\n",
"    \"\"\"Copy of `frame` limited to the rows whose account ID is in `account_ids`.\"\"\"\n",
"    return frame[frame['Registrar Account - ID'].isin(account_ids)].copy()\n",
"\n",
"\n",
"# Account IDs seen in each window\n",
"ids_2016, ids_2019, ids_2022 = (\n",
"    set(window_frame['Registrar Account - ID'])\n",
"    for window_frame in (df_2016, df_2019, df_2022)\n",
")\n",
"\n",
"# Accounts present in all three windows / in the last two windows only\n",
"common_ids = ids_2016.intersection(ids_2019, ids_2022)\n",
"common_id = ids_2019.intersection(ids_2022)\n",
"\n",
"df_2016_common = _restrict_to(df_2016, common_ids)\n",
"df_2019_common = _restrict_to(df_2019, common_ids)\n",
"df_2022_common = _restrict_to(df_2022, common_ids)\n",
"\n",
"df_2019_common2 = _restrict_to(df_2019, common_id)\n",
"df_2022_common2 = _restrict_to(df_2022, common_id)"
]
},
{
"cell_type": "code",
"execution_count": 181,
"id": "c9526723-3aaf-4a7a-961a-a426e8aa7a9b",
"metadata": {},
"outputs": [],
"source": [
"# Cluster evolution over time: project each window onto the already fitted scaler / KMeans\n",
"\n",
"def _finite_features(frame):\n",
"    \"\"\"Feature columns of `frame`, with infinite values turned into NaN and incomplete rows dropped.\"\"\"\n",
"    return frame[features].replace([np.inf, -np.inf], np.nan).dropna().copy()\n",
"\n",
"\n",
"# Feature matrices: clients present in all three windows, then in the last two windows\n",
"X_2016, X_2019, X_2022 = (\n",
"    _finite_features(common_frame)\n",
"    for common_frame in (df_2016_common, df_2019_common, df_2022_common)\n",
")\n",
"X_2019_2, X_2022_2 = (\n",
"    _finite_features(common_frame)\n",
"    for common_frame in (df_2019_common2, df_2022_common2)\n",
")\n",
"\n",
"# Scaling with the existing scaler (transform only, no refit)\n",
"X_2016_scaled, X_2019_scaled, X_2022_scaled = (\n",
"    scaler.transform(matrix) for matrix in (X_2016, X_2019, X_2022)\n",
")\n",
"X_2019_scaled2, X_2022_scaled2 = (\n",
"    scaler.transform(matrix) for matrix in (X_2019_2, X_2022_2)\n",
")\n",
"\n",
"# Cluster assignment with the existing KMeans model\n",
"labels_2016, labels_2019, labels_2022 = (\n",
"    km.predict(scaled) for scaled in (X_2016_scaled, X_2019_scaled, X_2022_scaled)\n",
")\n",
"labels_2019_2, labels_2022_2 = (\n",
"    km.predict(scaled) for scaled in (X_2019_scaled2, X_2022_scaled2)\n",
")"
]
},
{
"cell_type": "code",
"execution_count": 182,
"id": "85fde921-cad6-4528-b27f-261522304e33",
"metadata": {},
"outputs": [],
"source": [
"# Attach the predicted KMeans label to each client frame.\n",
"# The labels were predicted on X_* (the features after rows with NaN / inf were dropped), so they\n",
"# are re-attached through the index of X_*: a client dropped at that step gets NaN instead of\n",
"# triggering a length-mismatch error on a positional assignment.\n",
"df_2016_common[\"cluster_kmeans\"] = pd.Series(labels_2016, index=X_2016.index)\n",
"df_2019_common[\"cluster_kmeans\"] = pd.Series(labels_2019, index=X_2019.index)\n",
"df_2022_common[\"cluster_kmeans\"] = pd.Series(labels_2022, index=X_2022.index)\n",
"\n",
"df_2019_common2[\"cluster_kmeans\"] = pd.Series(labels_2019_2, index=X_2019_2.index)\n",
"df_2022_common2[\"cluster_kmeans\"] = pd.Series(labels_2022_2, index=X_2022_2.index)"
]
},
{
"cell_type": "code",
"execution_count": 208,
"id": "6a1351c8-5819-493e-80b6-7274bc6d8b03",
"metadata": {},
"outputs": [],
"source": [
"# Correspondence between the numeric KMeans codes (0.0 to 4.0) and the new labels (1 to 5)\n",
"mapping = {float(code): code + 1 for code in range(5)}\n",
"\n",
"# New 'cluster' column on every window frame (added in place)\n",
"for _window_frame in (\n",
"    df_2016_common, df_2019_common, df_2022_common,\n",
"    df_2019_common2, df_2022_common2,\n",
"):\n",
"    _window_frame['cluster'] = _window_frame['cluster_kmeans'].map(mapping)\n",
"\n",
"# Only these clusters are followed through time\n",
"clusters_keep = [1, 2, 3]\n",
"\n",
"df_2016_f = df_2016_common.loc[df_2016_common[\"cluster\"].isin(clusters_keep)]\n",
"df_2019_f = df_2019_common.loc[df_2019_common[\"cluster\"].isin(clusters_keep)]\n",
"df_2022_f = df_2022_common.loc[df_2022_common[\"cluster\"].isin(clusters_keep)]\n",
"\n",
"df_2019_f2 = df_2019_common2.loc[df_2019_common2[\"cluster\"].isin(clusters_keep)]\n",
"df_2022_f2 = df_2022_common2.loc[df_2022_common2[\"cluster\"].isin(clusters_keep)]"
]
},
{
"cell_type": "code",
"execution_count": 211,
"id": "c50a2b85-6124-4df2-8b92-590bbc4865ba",
"metadata": {},
"outputs": [],
"source": [
"# One row per client with its cluster in each window (inner joins on the client ID)\n",
"\n",
"def _cluster_column(frame, column_name):\n",
"    \"\"\"Client ID plus the frame's `cluster` column renamed to `column_name`.\"\"\"\n",
"    return frame[[ID_COL, \"cluster\"]].rename(columns={\"cluster\": column_name})\n",
"\n",
"\n",
"# Clients present in all three windows\n",
"df_evo = (\n",
"    _cluster_column(df_2016_f, \"cluster_2016\")\n",
"    .merge(_cluster_column(df_2019_f, \"cluster_2019\"), on=ID_COL)\n",
"    .merge(_cluster_column(df_2022_f, \"cluster_2022\"), on=ID_COL)\n",
")\n",
"\n",
"# Clients present in the last two windows\n",
"df_evo2 = _cluster_column(df_2019_f2, \"cluster_2019\").merge(\n",
"    _cluster_column(df_2022_f2, \"cluster_2022\"),\n",
"    on=ID_COL,\n",
")"
]
},
{
"cell_type": "code",
"execution_count": 212,
"id": "9d78a9ee-8b17-457d-8b4c-8f93ad1584b3",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th>cluster_2019</th>\n",
" <th>1</th>\n",
" <th>2</th>\n",
" <th>3</th>\n",
" </tr>\n",
" <tr>\n",
" <th>cluster_2016</th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>8.0</td>\n",
" <td>2.0</td>\n",
" <td>10.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>1.0</td>\n",
" <td>38.0</td>\n",
" <td>5.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>NaN</td>\n",
" <td>5.0</td>\n",
" <td>82.0</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
"cluster_2019 1 2 3\n",
"cluster_2016 \n",
"1 8.0 2.0 10.0\n",
"2 1.0 38.0 5.0\n",
"3 NaN 5.0 82.0"
]
},
"execution_count": 212,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Transition counts: rows = cluster in the window starting 2016-10-31, columns = cluster in the next window.\n",
"# fill_value=0 reports an unobserved transition as 0 instead of NaN, which also keeps the counts as integers.\n",
"df_evo.groupby([\"cluster_2016\", \"cluster_2019\"]).size().unstack(fill_value=0)"
]
},
{
"cell_type": "code",
"execution_count": 213,
"id": "9aaf2863-dbd2-43d1-861c-98ddd9354881",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th>cluster_2022</th>\n",
" <th>1</th>\n",
" <th>2</th>\n",
" <th>3</th>\n",
" </tr>\n",
" <tr>\n",
" <th>cluster_2019</th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>34.0</td>\n",
" <td>2.0</td>\n",
" <td>7.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>NaN</td>\n",
" <td>59.0</td>\n",
" <td>15.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>6.0</td>\n",
" <td>16.0</td>\n",
" <td>206.0</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
"cluster_2022 1 2 3\n",
"cluster_2019 \n",
"1 34.0 2.0 7.0\n",
"2 NaN 59.0 15.0\n",
"3 6.0 16.0 206.0"
]
},
"execution_count": 213,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Transition counts: rows = cluster in the window starting 2019-10-31, columns = cluster in the next window.\n",
"# fill_value=0 reports an unobserved transition as 0 instead of NaN, which also keeps the counts as integers.\n",
"df_evo2.groupby([\"cluster_2019\", \"cluster_2022\"]).size().unstack(fill_value=0)"
]
},
{
"cell_type": "code",
"execution_count": 218,
"id": "82e96b64-0d48-4bbf-b334-0dcb311440d0",
"metadata": {},
"outputs": [
{
"data": {
"image/png": "iVBORw0KGgoAAAANSUhEUgAAAoAAAAIjCAYAAACTRapjAAAAOnRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjEwLjgsIGh0dHBzOi8vbWF0cGxvdGxpYi5vcmcvwVt1zgAAAAlwSFlzAAAPYQAAD2EBqD+naQAAYqRJREFUeJzt3XdclfX///HnQeGALDeIA3Ei7lyR2xQ0c6Rm2dDUbJFmlpWZOVLRylWOlpGaNqy0Mkdu85Oai3LnKksFNwgoGFy/P/pxvp4APSjHg1yPe7frduu8r/e5rtd1WC9f7/f1viyGYRgCAACAabi5OgAAAADcWiSAAAAAJkMCCAAAYDIkgAAAACZDAggAAGAyJIAAAAAmQwIIAABgMiSAAAAAJkMCCAAAYDIkgHDIqFGjZLFYXB2GJOmTTz6RxWLRH3/84epQbrncfB3y4+dksVg0atQoV4eRRX6N61Z45pln1K5dO1eHgZuwfPly+fj46PTp064OBbcREsB8JvOPtsVi0caNG7PsNwxD5cuXl8Vi0b333ntD5xg/frwWL158k5EWHHv37tWoUaPyVaKUG3w987cTJ05o1KhRio2NdXUoWRw9elQfffSRXn31VVvbX3/9pdGjR6tx48YqVqyYSpYsqVatWmnVqlXZHuPChQt64oknVKpUKXl7e6t169basWNHln5ffPGFHnnkEVWtWlUWi0WtWrW6Zmw7duxQ586dVbx4cRUpUkS1atXSO++8c1PXm1urV69Wv379VK1aNRUpUkSVKlXS448/rpMnT2bb/+eff1azZs1UpEgRBQYGatCgQUpKSrLrk5SUpJEjR6p9+/YqXry4LBaLPvnkkxxjmD59umrUqCGr1aqyZctqyJAhSk5OtuvTvn17ValSRdHR0Td9zTARA/lKTEyMIcnw9PQ0nn766Sz7165da0gyrFar0bFjxxs6h7e3t9GnT59cvefKlSvGpUuXbuh8eS3zMzp69GieHG/hwoWGJGPt2rV5cjxnyu7rkNPX859//jEuXbpkZGRk3KLork+SMXLkSFeHkYUz49q6dashyYiJiXHK8W/Gc889Z1SrVs2u7d133zW8vLyMXr16GdOnTzemTp1q3HHHHYYk4+OPP7brm56ebtx1112Gt7e3MWrUKGP69OlGWFiY4evra/z+++92fVu2bGn4+PgYrVu3NooVK2a0bNkyx7hWrFhheHh4GE2aNDEmT55sfPDBB8bLL79sDB06NM+u3RENGjQwQkJCjJdeesn48MMPjWHDhhm+vr5GQECAcfLkSbu+O3fuNDw9PY369esbs2bNMoYPH25YrVajffv2dv2OHj1qSDIqVKhgtGrV6prfGy+99JIhyejRo4cxa9YsY+DAgUbhwoWNiIiILH1nzpxpFClSxEhMTMyz60fBRgKYz2QmN926dTNKlixpXLlyxW7/gAEDjAYNGhjBwcG3JAFMSkq6oXM4kysTwIyMDCMlJSVPzptXbiShdxUSwLxzsz+baWlpRsmSJY3XXnvNrn337t3G6dOn7douX75shIaGGuXKlbNr/+KLLwxJxsKFC21tp06dMooWLWr06tXLru+xY8eM9PR0wzAMo2bNmjkmgAkJCUZAQIBx33332fq7yvr167PEsH79ekOSMXz4cLv2Dh06GGXKlDESEhJsbR9++KEhyVixYoWt7fLly7bk8VrfGydOnDAKFy5sPProo3bt7777riHJ+O677+za4+PjjUKFChmzZ8++oWuF+TAEnE/16tVLZ8+e1cqVK21taWlp+uqrr/TQQw9l+563335bd911l0qUKCEvLy81aNBAX331lV0fi8Wi5ORkzZkzxzbU/Nhjj0n6v/lle/fu1UMPPaRixYqpWbNmdvv+69NPP1Xjxo1VpEgRFStWTC1atNCPP/5o12fZsmVq3ry5vL295evrq44dO2rPnj0OfQ579uxRmzZt5OXlpXLlymns2LHKyMjItu
+NnOeTTz7R/fffL0lq3bq17TNZt26dJKlixYq69957tWLFCjVs2FBeXl56//33JUkxMTFq06aNSpcuLavVqrCwMM2aNSvLOTKPsXHjRjVu3Fienp6qVKmS5s6da9fvypUrGj16tKpWrSpPT0+VKFFCzZo1s/se+O/X4Vpfz5zmAM6cOVM1a9aU1WpVUFCQoqKidOHCBbs+rVq1Uq1atbR37161bt1aRYoUUdmyZfXmm29e8/PMlJqaqueff16lSpWSr6+vOnfurL///jvbvsePH1e/fv0UEBAgq9WqmjVr6uOPP87S791331XNmjVt32sNGzbUggULrhvL5cuXNWrUKFWrVk2enp4qU6aMunXrpsOHD+f4nscee0wVK1bM0p7dz8HKlSvVrFkzFS1aVD4+PqpevbptSHXdunVq1KiRJKlv3762r9HVQ35btmxR+/bt5e/vryJFiqhly5b63//+l+15s/vZjIuLU9++fVWuXDlZrVaVKVNGXbp0ue6Uho0bN+rMmTNq27atXXvNmjVVsmRJuzar1ap77rlHf//9ty5evGhr/+qrrxQQEKBu3brZ2kqVKqWePXvq22+/VWpqqq29fPnycnO7/p+cBQsWKD4+XuPGjZObm5uSk5Nz/Jl3thYtWmSJuUWLFipevLj27dtna0tMTNTKlSv1yCOPyM/Pz9beu3dv+fj46Msvv7S1Wa1WBQYGXvfcmzZt0j///KMHH3zQrj3z9eeff27XXrp0adWpU0fffvut4xcIUyMBzKcqVqyo8PBwffbZZ7a2ZcuWKSEhIcsvhEzTpk1T/fr1NWbMGI0fP16FCxfW/fffrx9++MHWZ968ebJarWrevLnmzZunefPm6cknn7Q7zv3336+UlBSNHz9eAwYMyDHG0aNH69FHH5W7u7vGjBmj0aNHq3z58lqzZo3d+Tp27CgfHx9NnDhRI0aM0N69e9WsWbPr/oGKi4tT69atFRsbq1deeUWDBw/W3LlzNW3atCx9b/Q8LVq00KBBgyRJr776qu0zqVGjhq3PgQMH1KtXL7Vr107Tpk1TvXr1JEmzZs1ScHCwXn31VU2aNEnly5fXM888oxkzZmQ5z6FDh9SjRw+1a9dOkyZNUrFixfTYY4/ZJaijRo3S6NGj1bp1a02fPl3Dhw9XhQoVsp1PdfV1X+/rebVRo0YpKipKQUFBmjRpkrp37673339fERERunLlil3f8+fPq3379qpbt64mTZqk0NBQvfzyy1q2bFmOx8/0+OOPa+rUqYqIiNCECRPk7u6ujh07ZukXHx+vO++8U6tWrdKzzz6radOmqUqVKurfv7+mTp1q6/fhhx9q0KBBCgsL09SpUzV69GjVq1dPW7ZsuWYc6enpuvfeezV69Gg1aNBAkyZN0nPPPaeEhATt3r37utdxPXv27NG9996r1NRUjRkzRpMmTVLnzp1tCVyNGjU0ZswYSdITTzxh+xq1aNFCkrRmzRq1aNFCiYmJGjlypMaPH68LFy6oTZs2+uWXX7KcL7ufze7du2vRokXq27evZs6cqUGDBunixYs6duzYNWP/+eefZbFYVL9+fYeuNS4uTkWKFFGRIkVsbTt37tQdd9yRJUlq3LixUlJS9Pvvvzt07KutWrVKfn5+On78uKpXry4fHx/5+fnp6aef1uXLl3N9vKtdvnxZPXr00LZt2274GElJSUpKSrJLknft2qV//vlHDRs2tOvr4eGhevXqaefOnbk+T2by7OXlZdee+flv3749y3saNGign3/+Odfngkm5ugQJe5nDm1u3bjWmT59u+Pr62oYc77//fqN169aGYRjZDgH/d2gyLS3NqFWrltGmTRu79pyGDEeOHGlIyjJ0c/W+TAcPHjTc3NyyHabJnHN28eJFo2jRosaAAQPs9sfFxRn+/v5Z2v9r8ODBhiRjy5YttrZTp04Z/v7+dkPAN3ueaw0BBwcHG5KM5cuXZ9mX3VBwZGSkUalSpWyPsWHDBrvrsF
qtxgsvvGBrq1u37nWH9f/7dTCMnL+e/x0qP3XqlOHh4WFERETYfc2mT5+eZX5Xy5YtDUnG3LlzbW2pqalGYGCg0b1
"text/plain": [
"<Figure size 800x600 with 2 Axes>"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"import seaborn as sns\n",
"import matplotlib.pyplot as plt\n",
"\n",
"# matrice de transition\n",
"transition1 = df_evo.groupby([\"cluster_2016\", \"cluster_2019\"]).size().unstack(fill_value=0)\n",
"\n",
"plt.figure(figsize=(8,6))\n",
"sns.heatmap(transition1, annot=True, fmt=\"d\", cmap=\"Blues\")\n",
"\n",
"plt.ylabel(\"Cluster 2016-2019\")\n",
"plt.xlabel(\"Cluster 2019-2022\")\n",
"plt.title(\"Matrice de transition des clusters (2016 → 2019)\")\n",
"plt.show()"
]
},
{
"cell_type": "code",
"execution_count": 219,
"id": "aac4fc2b-8f5c-49d5-aaa6-8c563dfc0ef3",
"metadata": {},
"outputs": [
{
"data": {
"image/png": "iVBORw0KGgoAAAANSUhEUgAAAoUAAAIjCAYAAAB1bGEnAAAAOnRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjEwLjgsIGh0dHBzOi8vbWF0cGxvdGxpYi5vcmcvwVt1zgAAAAlwSFlzAAAPYQAAD2EBqD+naQAAYjtJREFUeJzt3Xt8zvX/x/HntdnRZmObjZk5H5ZjTg0ZkVMRJUVFTvXVUBZFyik5RJFzEYVECOXMckgUOZPzsbCxYRg2ts/vD79dddk1ttm1y3jcv7frdvvu/Xl/3p/X53LFa6/3+/O+TIZhGAIAAMAjzcHeAQAAAMD+SAoBAABAUggAAACSQgAAAIikEAAAACIpBAAAgEgKAQAAIJJCAAAAiKQQsItx48Zp5syZ9g4DAAAzkkI8kEwmk7p165Zl433zzTcymUz6888/79m3bt26qlu3rvnnEydOyGQy6ZtvvjG3DRw4UCaTKVOxjBs3ToMHD9YTTzyRqfNTmEwmDRw48L7GyIms/Xncjb3fp7feektPP/203a6P+7dixQp5eHjo/Pnz9g4FsCmSQqRbSmKV8nJ1dVWpUqXUrVs3RUdH2zs8uxs6dKgWLVp01z5bt25V//799fPPP6tkyZLZE1g6pSf+B9WyZcseyAT5+PHjmjp1qj744ANz299//61BgwapevXqyps3r3x9fVW3bl2tWbPG6hiXLl3SG2+8IT8/P+XOnVv16tXT9u3bU/WbO3euXn31VZUsWVImk8niFxtrtm/frubNmytfvnxyd3dXuXLlNHbs2Pu634yKjIxUx44dVapUKbm7u6tYsWLq3Lmzzp49a7X/pk2bVLt2bbm7uysgIEA9evTQ1atXLfpcvXpVAwYMUOPGjZUvX757/gIxfvx4lS1bVi4uLgoMDFRERITi4+Mt+jRu3FglSpTQsGHD7vuegQeaAaTT9OnTDUnG4MGDjZkzZxpTpkwx2rdvbzg4OBhFixY14uPjs+xakozw8PAsGy8l9q1bt96zb0JCgpGQkGD++fjx44YkY/r06ea2mzdvGtevX7c4L3fu3Eb79u3vGUdkZGSGYk+LJGPAgAFZMpZhpC/+B0FycrJx/fp149atW+a28PBwI62/zq5fv27cvHkzu8Kz8PbbbxulSpWyaBs3bpzh5uZmtGnTxhg/frwxZswY4/HHHzckGdOmTbPom5SUZNSsWdPInTu3MXDgQGP8+PFGSEiI4enpaRw6dMiib1hYmOHh4WHUq1fPyJs3rxEWFpZmXCtXrjScnZ2NGjVqGJ9//rnx1VdfGe+//77Ru3fvLLv39KhSpYpRtGhR47333jOmTJli9O3b1/D09DT8/f2Ns2fPWvTdsWOH4erqalSuXNmYNGmS0a9fP8PFxcVo3LixRb+U/14LFy5s1K1bN9V/u//13nvvGZKMVq1aGZMmTTK6d+9u5MqVy2jYsGGqvhMnTjTc3d2Ny5cvZ9n9Aw8akkKkW1qJVUREhCHJmD17dprnXr16NUPXsmdSeCdrSaE12Z1U2TMpzOifp63dLSm0l8TERMPX19f48MMPLdr37t1rnD9/3qLtxo0bRpkyZYxChQpZtM+dO9eQZMybN8/cdu7cOcPb29to06aNRd9Tp04ZSUlJhmEYxmOPPZZmUhgXF2f4+/sbLVu2NPe3l/Xr16eKYf369YYko1+/fhbtTZo0MQoUKGDExcWZ26ZMmWJIMlauXGluu3Hjhjmh3Lp1a5r/7Z45c8bIlSuX8dprr1m0jxs3zpBk/PTTTxbt0dHRhqOjo/H1119n6l6BnIDpY9y3p556StLtqTJJev311+Xh4aGjR4+qadOm8vT01CuvvCJJio+P17vvvqugoCC5uLiodOnSGjVqlAzDsDr2d999p9KlS8vV1VVVqlTRhg0bLI6fPHlSb731lkqXLi03Nzf5+PjoxRdf1IkTJ6yOd+3aNb355pvy8fFRnjx51K5dO128eNGiz5
1rCq25c02hyWRSfHy8vv32W/P0+uuvv24+fvr0aXXs2FH+/v5ycXHRY489pmnTpt31GikSEhLUs2dP+fn5ydPTU82bN9c///xjtW9mr3O3+FPu9a+//lLbtm2VN29e1a5dW5K0e/duvf766ypWrJhcXV0VEBCgjh07KjY21ur7deTIEb3++uvy9vaWl5eXOnTooGvXrln0Xb16tWrXri1vb295eHiodOnSFtOvd64pfP311zVhwgTzfaS8/ntvd04t79ixQ02aNFGePHnk4eGh+vXr6/fff7fok7Jc4rffflNERIR5+rZly5bpWlu2ceNGxcTEqEGDBhbtjz32mHx9fS3aXFxc1LRpU/3zzz+6cuWKuX3+/Pny9/fX888/b27z8/NT69attXjxYiUkJJjbg4KC5OBw77/SZ8+erejoaH3yySdycHBQfHy8kpOT73meLdSpUydVzHXq1FG+fPm0f/9+c9vly5e1evVqvfrqq8qTJ4+5vV27dvLw8NAPP/xgbnNxcVFAQMA9r71582bdunVLL7/8skV7ys9z5syxaM+fP78qVKigxYsXp/8GgRwml70DQM539OhRSZKPj4+57datW2rUqJFq166tUaNGyd3dXYZhqHnz5lq7dq06deqkSpUqaeXKlerdu7dOnz6t0aNHW4y7fv16zZ07Vz169JCLi4smTpyoxo0ba8uWLSpXrpyk22v0Nm3apJdfflmFChXSiRMnNGnSJNWtW1d//fWX3N3dLcbs1q2bvL29NXDgQB08eFCTJk3SyZMntW7dukw/OCJJM2fOVOfOnVW9enW98cYbkqTixYtLkqKjo/XEE0+YH57x8/PT8uXL1alTJ12+fFnvvPPOXcfu3LmzZs2apbZt26pmzZr65Zdf9Mwzz6Tqdz/XuVv8KV588UWVLFlSQ4cONSfxq1ev1rFjx9ShQwcFBARo3759+uqrr7Rv3z79/vvvqd7T1q1bq2jRoho2bJi2b9+uqVOnKn/+/BoxYoQkad++fXr22WdVoUIFDR48WC4uLjpy5Ih+++23NGN/8803debMGa1evTpdT3Tv27dPTz75pPLkyaP33ntPTk5O+vLLL1W3bl2tX79eNWrUsOjfvXt35c2bVwMGDNCJEyc0ZswYdevWTXPnzr3rdTZt2iSTyaTKlSvfMyZJioqKkru7u8VndseOHXr88cdTJU7Vq1fXV199pUOHDql8+fLpGj/FmjVrlCdPHp0+fVotWrTQoUOHlDt3br322msaPXq0XF1dMzTef924cUOvvvqq+vTpo6pVq2ZqjKtXr+rq1asWifOePXt069atVGM6OzurUqVK2rFjR4avk5JQu7m5WbSnvP/btm1LdU6VKlVy7LpbIF3sXKlEDpIyBbtmzRrj/Pnzxt9//23MmTPH8PHxMdzc3Ix//vnHMAzDaN++vSHJ6NOnj8X5ixYtMiQZQ4YMsWhv1aqVYTKZjCNHjpjbJBmSjD///NPcdvLkScPV1dVo2bKlue3atWup4ty8ebMhyZgxY0aq2KtUqWIkJiaa2z/99FNDkrF48WJzW1hYmMXUm7Xp4wEDBqSarkxr+rVTp05GgQIFjJiYGIv2l19+2fDy8rJ6Dyl27txpSDLeeusti/a2bdummj6+n+vcLf6Ue71zutIwrL//33//vSHJ2LBhQ6oxOnbsaNG3ZcuWho+Pj/nn0aNHG5JSTa/+l7U/j7tNH9/5PrVo0cJwdnY2jh49am47c+aM4enpadSpU8fclvKZadCggZGcnGxu79mzp+Ho6GhcunQpzRgNwzBeffVVi3u7m8OHDxuurq6ppjJz586d6j0zDMNYunSpIclYsWKF1fHuNn1coUIFw93d3XB3dze6d+9uLFiwwOjevbshyXj55ZfTFW9arly5YoSGhhp58+Y1duzYkakxPv74Y0OSxdrbefPmpfpMpXjxxReNgIAAq2Pdbfp427ZthiTj448/tmhfsWKFIcnw8PBIdc7QoUMNSU
Z0dHQG7wrIGZg+RoY1aNBAfn5+CgoK0ssvvywPDw8tXLhQgYGBFv26du1q8fOyZcvk6OioHj16WLS/++67MgxDy5c
"text/plain": [
"<Figure size 800x600 with 2 Axes>"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"transition1 = df_evo.groupby([\"cluster_2016\", \"cluster_2019\"]).size().unstack(fill_value=0)\n",
"transition2 = df_evo2.groupby([\"cluster_2019\", \"cluster_2022\"]).size().unstack(fill_value=0)\n",
"\n",
"transition_pct1 = transition1.div(transition1.sum(axis=1), axis=0)\n",
"transition_pct2 = transition2.div(transition2.sum(axis=1), axis=0)\n",
"\n",
"plt.figure(figsize=(8,6))\n",
"sns.heatmap(transition_pct2, annot=True, fmt=\".2f\", cmap=\"Blues\")\n",
"\n",
"plt.xlabel(\"Cluster 2019-2022\")\n",
"plt.ylabel(\"Cluster 2016-2019\")\n",
"plt.title(\"Probabilité de transition (2016 → 2019)\")\n",
"plt.show()"
]
},
{
"cell_type": "code",
"execution_count": 169,
"id": "0f5cc8dd-4f9b-4c66-9e21-951b05b506e0",
"metadata": {},
"outputs": [
{
"data": {
"application/vnd.plotly.v1+json": {
"config": {
"plotlyServerURL": "https://plot.ly"
},
"data": [
{
"link": {
"source": [
0,
0,
1,
1,
2,
2,
3
],
"target": [
4,
5,
5,
6,
5,
6,
6
],
"value": {
"bdata": "CQEgDQZcAQ==",
"dtype": "i1"
}
},
"node": {
"label": [
"2019_0",
"2019_1",
"2019_2",
"2019_4",
"2022_0",
"2022_1",
"2022_2",
"2022_4"
]
},
"type": "sankey"
}
],
"layout": {
"template": {
"data": {
"bar": [
{
"error_x": {
"color": "#2a3f5f"
},
"error_y": {
"color": "#2a3f5f"
},
"marker": {
"line": {
"color": "#E5ECF6",
"width": 0.5
},
"pattern": {
"fillmode": "overlay",
"size": 10,
"solidity": 0.2
}
},
"type": "bar"
}
],
"barpolar": [
{
"marker": {
"line": {
"color": "#E5ECF6",
"width": 0.5
},
"pattern": {
"fillmode": "overlay",
"size": 10,
"solidity": 0.2
}
},
"type": "barpolar"
}
],
"carpet": [
{
"aaxis": {
"endlinecolor": "#2a3f5f",
"gridcolor": "white",
"linecolor": "white",
"minorgridcolor": "white",
"startlinecolor": "#2a3f5f"
},
"baxis": {
"endlinecolor": "#2a3f5f",
"gridcolor": "white",
"linecolor": "white",
"minorgridcolor": "white",
"startlinecolor": "#2a3f5f"
},
"type": "carpet"
}
],
"choropleth": [
{
"colorbar": {
"outlinewidth": 0,
"ticks": ""
},
"type": "choropleth"
}
],
"contour": [
{
"colorbar": {
"outlinewidth": 0,
"ticks": ""
},
"colorscale": [
[
0,
"#0d0887"
],
[
0.1111111111111111,
"#46039f"
],
[
0.2222222222222222,
"#7201a8"
],
[
0.3333333333333333,
"#9c179e"
],
[
0.4444444444444444,
"#bd3786"
],
[
0.5555555555555556,
"#d8576b"
],
[
0.6666666666666666,
"#ed7953"
],
[
0.7777777777777778,
"#fb9f3a"
],
[
0.8888888888888888,
"#fdca26"
],
[
1,
"#f0f921"
]
],
"type": "contour"
}
],
"contourcarpet": [
{
"colorbar": {
"outlinewidth": 0,
"ticks": ""
},
"type": "contourcarpet"
}
],
"heatmap": [
{
"colorbar": {
"outlinewidth": 0,
"ticks": ""
},
"colorscale": [
[
0,
"#0d0887"
],
[
0.1111111111111111,
"#46039f"
],
[
0.2222222222222222,
"#7201a8"
],
[
0.3333333333333333,
"#9c179e"
],
[
0.4444444444444444,
"#bd3786"
],
[
0.5555555555555556,
"#d8576b"
],
[
0.6666666666666666,
"#ed7953"
],
[
0.7777777777777778,
"#fb9f3a"
],
[
0.8888888888888888,
"#fdca26"
],
[
1,
"#f0f921"
]
],
"type": "heatmap"
}
],
"histogram": [
{
"marker": {
"pattern": {
"fillmode": "overlay",
"size": 10,
"solidity": 0.2
}
},
"type": "histogram"
}
],
"histogram2d": [
{
"colorbar": {
"outlinewidth": 0,
"ticks": ""
},
"colorscale": [
[
0,
"#0d0887"
],
[
0.1111111111111111,
"#46039f"
],
[
0.2222222222222222,
"#7201a8"
],
[
0.3333333333333333,
"#9c179e"
],
[
0.4444444444444444,
"#bd3786"
],
[
0.5555555555555556,
"#d8576b"
],
[
0.6666666666666666,
"#ed7953"
],
[
0.7777777777777778,
"#fb9f3a"
],
[
0.8888888888888888,
"#fdca26"
],
[
1,
"#f0f921"
]
],
"type": "histogram2d"
}
],
"histogram2dcontour": [
{
"colorbar": {
"outlinewidth": 0,
"ticks": ""
},
"colorscale": [
[
0,
"#0d0887"
],
[
0.1111111111111111,
"#46039f"
],
[
0.2222222222222222,
"#7201a8"
],
[
0.3333333333333333,
"#9c179e"
],
[
0.4444444444444444,
"#bd3786"
],
[
0.5555555555555556,
"#d8576b"
],
[
0.6666666666666666,
"#ed7953"
],
[
0.7777777777777778,
"#fb9f3a"
],
[
0.8888888888888888,
"#fdca26"
],
[
1,
"#f0f921"
]
],
"type": "histogram2dcontour"
}
],
"mesh3d": [
{
"colorbar": {
"outlinewidth": 0,
"ticks": ""
},
"type": "mesh3d"
}
],
"parcoords": [
{
"line": {
"colorbar": {
"outlinewidth": 0,
"ticks": ""
}
},
"type": "parcoords"
}
],
"pie": [
{
"automargin": true,
"type": "pie"
}
],
"scatter": [
{
"fillpattern": {
"fillmode": "overlay",
"size": 10,
"solidity": 0.2
},
"type": "scatter"
}
],
"scatter3d": [
{
"line": {
"colorbar": {
"outlinewidth": 0,
"ticks": ""
}
},
"marker": {
"colorbar": {
"outlinewidth": 0,
"ticks": ""
}
},
"type": "scatter3d"
}
],
"scattercarpet": [
{
"marker": {
"colorbar": {
"outlinewidth": 0,
"ticks": ""
}
},
"type": "scattercarpet"
}
],
"scattergeo": [
{
"marker": {
"colorbar": {
"outlinewidth": 0,
"ticks": ""
}
},
"type": "scattergeo"
}
],
"scattergl": [
{
"marker": {
"colorbar": {
"outlinewidth": 0,
"ticks": ""
}
},
"type": "scattergl"
}
],
"scattermap": [
{
"marker": {
"colorbar": {
"outlinewidth": 0,
"ticks": ""
}
},
"type": "scattermap"
}
],
"scattermapbox": [
{
"marker": {
"colorbar": {
"outlinewidth": 0,
"ticks": ""
}
},
"type": "scattermapbox"
}
],
"scatterpolar": [
{
"marker": {
"colorbar": {
"outlinewidth": 0,
"ticks": ""
}
},
"type": "scatterpolar"
}
],
"scatterpolargl": [
{
"marker": {
"colorbar": {
"outlinewidth": 0,
"ticks": ""
}
},
"type": "scatterpolargl"
}
],
"scatterternary": [
{
"marker": {
"colorbar": {
"outlinewidth": 0,
"ticks": ""
}
},
"type": "scatterternary"
}
],
"surface": [
{
"colorbar": {
"outlinewidth": 0,
"ticks": ""
},
"colorscale": [
[
0,
"#0d0887"
],
[
0.1111111111111111,
"#46039f"
],
[
0.2222222222222222,
"#7201a8"
],
[
0.3333333333333333,
"#9c179e"
],
[
0.4444444444444444,
"#bd3786"
],
[
0.5555555555555556,
"#d8576b"
],
[
0.6666666666666666,
"#ed7953"
],
[
0.7777777777777778,
"#fb9f3a"
],
[
0.8888888888888888,
"#fdca26"
],
[
1,
"#f0f921"
]
],
"type": "surface"
}
],
"table": [
{
"cells": {
"fill": {
"color": "#EBF0F8"
},
"line": {
"color": "white"
}
},
"header": {
"fill": {
"color": "#C8D4E3"
},
"line": {
"color": "white"
}
},
"type": "table"
}
]
},
"layout": {
"annotationdefaults": {
"arrowcolor": "#2a3f5f",
"arrowhead": 0,
"arrowwidth": 1
},
"autotypenumbers": "strict",
"coloraxis": {
"colorbar": {
"outlinewidth": 0,
"ticks": ""
}
},
"colorscale": {
"diverging": [
[
0,
"#8e0152"
],
[
0.1,
"#c51b7d"
],
[
0.2,
"#de77ae"
],
[
0.3,
"#f1b6da"
],
[
0.4,
"#fde0ef"
],
[
0.5,
"#f7f7f7"
],
[
0.6,
"#e6f5d0"
],
[
0.7,
"#b8e186"
],
[
0.8,
"#7fbc41"
],
[
0.9,
"#4d9221"
],
[
1,
"#276419"
]
],
"sequential": [
[
0,
"#0d0887"
],
[
0.1111111111111111,
"#46039f"
],
[
0.2222222222222222,
"#7201a8"
],
[
0.3333333333333333,
"#9c179e"
],
[
0.4444444444444444,
"#bd3786"
],
[
0.5555555555555556,
"#d8576b"
],
[
0.6666666666666666,
"#ed7953"
],
[
0.7777777777777778,
"#fb9f3a"
],
[
0.8888888888888888,
"#fdca26"
],
[
1,
"#f0f921"
]
],
"sequentialminus": [
[
0,
"#0d0887"
],
[
0.1111111111111111,
"#46039f"
],
[
0.2222222222222222,
"#7201a8"
],
[
0.3333333333333333,
"#9c179e"
],
[
0.4444444444444444,
"#bd3786"
],
[
0.5555555555555556,
"#d8576b"
],
[
0.6666666666666666,
"#ed7953"
],
[
0.7777777777777778,
"#fb9f3a"
],
[
0.8888888888888888,
"#fdca26"
],
[
1,
"#f0f921"
]
]
},
"colorway": [
"#636efa",
"#EF553B",
"#00cc96",
"#ab63fa",
"#FFA15A",
"#19d3f3",
"#FF6692",
"#B6E880",
"#FF97FF",
"#FECB52"
],
"font": {
"color": "#2a3f5f"
},
"geo": {
"bgcolor": "white",
"lakecolor": "white",
"landcolor": "#E5ECF6",
"showlakes": true,
"showland": true,
"subunitcolor": "white"
},
"hoverlabel": {
"align": "left"
},
"hovermode": "closest",
"mapbox": {
"style": "light"
},
"paper_bgcolor": "white",
"plot_bgcolor": "#E5ECF6",
"polar": {
"angularaxis": {
"gridcolor": "white",
"linecolor": "white",
"ticks": ""
},
"bgcolor": "#E5ECF6",
"radialaxis": {
"gridcolor": "white",
"linecolor": "white",
"ticks": ""
}
},
"scene": {
"xaxis": {
"backgroundcolor": "#E5ECF6",
"gridcolor": "white",
"gridwidth": 2,
"linecolor": "white",
"showbackground": true,
"ticks": "",
"zerolinecolor": "white"
},
"yaxis": {
"backgroundcolor": "#E5ECF6",
"gridcolor": "white",
"gridwidth": 2,
"linecolor": "white",
"showbackground": true,
"ticks": "",
"zerolinecolor": "white"
},
"zaxis": {
"backgroundcolor": "#E5ECF6",
"gridcolor": "white",
"gridwidth": 2,
"linecolor": "white",
"showbackground": true,
"ticks": "",
"zerolinecolor": "white"
}
},
"shapedefaults": {
"line": {
"color": "#2a3f5f"
}
},
"ternary": {
"aaxis": {
"gridcolor": "white",
"linecolor": "white",
"ticks": ""
},
"baxis": {
"gridcolor": "white",
"linecolor": "white",
"ticks": ""
},
"bgcolor": "#E5ECF6",
"caxis": {
"gridcolor": "white",
"linecolor": "white",
"ticks": ""
}
},
"title": {
"x": 0.05
},
"xaxis": {
"automargin": true,
"gridcolor": "white",
"linecolor": "white",
"ticks": "",
"title": {
"standoff": 15
},
"zerolinecolor": "white",
"zerolinewidth": 2
},
"yaxis": {
"automargin": true,
"gridcolor": "white",
"linecolor": "white",
"ticks": "",
"title": {
"standoff": 15
},
"zerolinecolor": "white",
"zerolinewidth": 2
}
}
}
}
},
"image/png": "iVBORw0KGgoAAAANSUhEUgAAAt4AAAFoCAYAAABg7/yqAAAgAElEQVR4Xu2dCZgU1dWwz8AMDAzLgKiIOyBxS/iMye+fxGwmGmPyG7ckxhgVxQ0RNwTBhUVExBXcUMEYI6LGiKJ8xiW4JZovavyI2VwTt7iwDAyzMPtf55IzXorqmZ6p7uqe6befp5/urqp7b9V7a6bePn3q3qKW4CE8IAABCEAAAhCAAAQgAIGsEihCvLPKl8ohAAEIQAACEIAABCDgCCDenAgQgAAEIAABCEAAAhBIgADinQBkmoAABCAAAQhAAAIQgADizTkAAQhAAAIQgAAEIACBBAgg3glApgkIQAACEIAABCAAAQgg3pwDEIAABCAAAQhAAAIQSIAA4p0AZJqAAAQgAAEIQAACEIAA4s05AAEIQAACEIAABCAAgQQIIN4JQKYJCEAAAhCAAAQgAAEIIN6cAxCAAAQgAAEIQAACEEiAAOKdAGSagAAEIAABCEAAAhCAAOLNOQABCEAAAhCAAAQgAIEECCDeCUCmCQhAAAIQgAAEIAABCCDenAMQgAAEIAABCEAAAhBIgADinQBkmoAABCAAAQhAAAIQgADizTkAAQhAAAIQgAAEIACBBAgg3glApgkIQAACEIAABCAAAQgg3pwDEIAABCAAAQhAAAIQSIAA4p0AZJqAAAQgAAEIQAACEIAA4s05AAEIQAACEIAABCAAgQQIIN4JQKYJCEAAAhCAAAQgAAEIIN6cAxCAAAQgAAEIQAACEEiAAOKdAGSagAAEIAABCEAAAhCAAOLNOQABCEAAAhCAAAQgAIEECCDeCUCmCQhAAAIQgAAEIAABCCDenAMQgAAEIAABCEAAAhBIgADinQBkmoAABCAAAQhAAAIQgADizTkAAQhAAAIQgAAEIACBBAgg3glApgkIQAACEIAABCAAAQgg3pwDEIAABCAAAQhAAAIQSIAA4p0AZJqAAAQgAAEIQAACEIAA4s05AAEIQAACEIAABCAAgQQIIN4JQKYJCEAAAhCAAAQgAAEIIN6cAxCAAAQgAAEIQAACEEiAAOKdAGSagAAEIAABCEAAAhCAAOLNOQABCEAAAhCAAAQgAIEECCDeCUCmCQhAAAIQgAAEIAABCCDenAMQgAAEIAABCEAAAhBIgADinQBkmoAABCAAAQhAAAIQgADizTkAAQhAAAIQgAAEIACBBAgg3glApgkIQAACEIAABCAAAQgg3pwDEIAABCAAAQhAAAIQSIAA4p0AZJqAAAQgAAEIQAACEIAA4s05AAEIQAACEIAABCAAgQQIIN4JQKYJCEAAAhCAAAQgAAEIIN6cAxCAAAQgAAEIQAACEEiAAOKdAGSagAAEIAABCEAAAhCAAOLNOQABCEAAAhCAAAQgAIEECCDeCUCmCQhAAAIQgAAEIAABCCDenAMQgAAEIAABCEAAAhBIgADinQBkmoAABCAAAQhAAAIQgADizTkAAQhAAAIQgAAEIACBBAgg3glApgkIQAACEIAABCAAAQgg3pwDEIAABCAAAQhAAAIQSIAA4p0AZJqAAAQgAAEIQAACEIAA4s05AAEIQAACEIAABCAAgQQIIN4JQKYJCEAAAhCAAAQgAAEIIN6cAxCAAAQgAAEIQAACEEiAAOKdAGSagAAEIAABCEAAAhCAAOLNOQABCEAAAhCAAAQgAIEECCDeCUCmCQhAAAIQgAAEIAABCCDenAMQgAAEIAABCEAAAhBIgADinQBkmoAABCAAAQhAAAIQgADizTkAAQhAAAIQgAAEIACBBAgg3glApgkIQAACEIAABCAAAQgg3pwDEIAABCAAAQhAAAIQSIAA4p0AZJqAAAQgAAEIQAACEIAA4s05AAEIQAACEIAABCAAgQQIIN4JQKYJCEAAAhCAAAQgAAEIIN6cAxCAAAQgAAEIQAACEEiAAOKdAGSagAAEIAABCEAAAhCAAOLNOQABCEAAAhCAAAQgAI
EECCDeCUCmCQhAAAIQgAAEIAABCCDenAMQgAAEIAABCEAAAhBIgADinQBkmoAABCAAAQhAAAIQgADizTkAAQhAAAIQgAAEIACBBAgg3glApgkIQAACEIAABCAAAQgg3pwDEIAABCAAAQhAAAIQSIAA4p0AZJqAAAQgAAEIQAACEIAA4s05AAEIQAACEIAABCAAgQQIIN4JQKYJCEAAAhCAAAQgAAEIIN6cAxCAAAQgAAEIQAACEEiAAOKdAGSagAAEIAABCEAAAhCAAOLNOQABCEAAAhCAAAQgAIEECCDeCUCmCQhAAAIQgAAEIAABCCDenAMQgAAEIAABCEAAAhBIgADinQBkmoAABCAAAQhAAAIQgADizTkAAQhAAAIQgAAEIACBBAgg3glApgkIQAACEIAABCAAAQgg3pwDEIAABCAAAQhAAAIQSIAA4p0AZJqAAAQgAAEIQAACEIAA4s05AAEIQAACEIAABCAAgQQIIN4JQKYJCEAAAhCAAAQgAAEIIN6cAxCAAAQgAAEIQAACEEiAAOKdAGSagAAEIAABCEAAAhCAAOLNOQABCEAAAhCAAAQgAIEECCDeCUCmCQhAAAIQgAAEIAABCCDeXeQcqK7ZKI1NTTKwf9kWe9zS0iIV66ukZ48eMnDAluvbO8T1G6pd2X5lfdrbdIv1NbUbpXZjvQwI9qukuGeHy1MAAhCAAAQgkO8EmpqapaqmNuW1sr1rYVvX8Pr6BncdLS3tJb17lXQIRdzrf4caY+OMEEC8Y2CsqqqSt956K0YNWxYdMWKE9OvXr3XFMy+slHFTrm39vPvIneSWuefJkMED3bKK9RvkhydPkw8/Wes+H/2DA2TqhGOlZ88erWV0m4OOniiPLblKBpf3b12+LpD1c6bfIH985R9u2ec/O0qunzVBygd+2n5bB/fgb34nF85Z6DbpFfyzWHLTxaL7xwMCEIAABCDQHQio2M5f9IDcetfDrYej19kpE34qxT03BZvauha2dQ1vaGySY8ZdKn97/V+tdU+feIL88PvfSAtdOtf/tCpio0QJIN4xcK9cuVJuXviG9Os/PEYtnxbdUPm2fPvrLTJ8+PBAnHtKWVmZ/OGVN6S+sVkO2H8f2VjXIOOnzpMRuwyTOVPGSlGRyOTZC6W6pk7mXnSKrFqzTsace6VMPPUoOfCr+0jlhiqZOGuRvPHPD5wYL7t9pjQ31UtdXZ2Ulw+Ss6bdLD2CSPflQV11wTfu0yZf4+q+dOLxsn79OtmwYcMWx1VSUiKDBg2Sj1ZXygnnXOna3W+fPYJ/So/IQ48/L0sXTpeNtdVB+fXS3NycES5UAoFMEtC/LT3vw89MtkFdEIBA5gkUBRc9/bvVV3tGfc5Uy1p3r1695LqFS2X//7O37L37cHn73Q/llPOvlqlnHiNHfu9r8vrb78tPxs2Sqy45Tb60716y4M5lsjQISj1615zgV+Ae8punXpSaIJr9za/8l4tqnzHlOhm56/Zy1cWnSVV1jVx+wxIZ8+ODZeg2W8kTz74kc4LPd10/RYbvNFRqa2tFxT/80H0qLS2V82YuCOrYKFdPO10+WV0hx501R6aM/4l895tflI0bg1/JGxvl73//u7z0xAMydNCnQbc4fD6u0EDeyTJ69Og41RR0WcQ7RvereN95b5OUD/5cjFo+Lbp+7auBxL7pxFv/4Pv06SO9e/d2T32vf2jLArm9477H5N6bL5YNwR/t94+/SBbMOVv2/swurqK7H1whjz/zktw8e4LU19fJx2sqpb6hScZfOF8eWDhDpLlBGhoapLRPmRw+drpcO32c7LP3SFf2b2+8I2MnXi0PLpohzY11UlFRscUfve7HkCFDZOljL8iTz/5JFl090UXXa2rr5KBjJstNQbs7Dh0kq1atQrwzclZQCQQgAAEI5IJAcXGx9O3b18m3PjUYpsEnFd7hO20n444/VO564El54pmXI6+Fw3fcxgmwlglfw++af0GwrtZdJ/Xaru
00N7fIIT+bIpdOGiMjd9om8jqqXzi22mor6VHcSw4dc0nk9f/GWeOD6/da0V/l3377bRm15jkZvcOAjCD88/uV0vK
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"import plotly.graph_objects as go\n",
"\n",
"# Préparer les flux\n",
"flows = df_evo.groupby([\"cluster_2019\", \"cluster_2022\"]).size().reset_index(name=\"count\")\n",
"\n",
"labels = sorted(set(flows[\"cluster_2019\"]).union(set(flows[\"cluster_2022\"])))\n",
"\n",
"label_map = {k: i for i, k in enumerate(labels)}\n",
"\n",
"fig = go.Figure(data=[go.Sankey(\n",
" node=dict(label=[f\"2019_{l}\" for l in labels] + [f\"2022_{l}\" for l in labels]),\n",
" link=dict(\n",
" source=[label_map[s] for s in flows[\"cluster_2019\"]],\n",
" target=[label_map[t] + len(labels) for t in flows[\"cluster_2022\"]],\n",
" value=flows[\"count\"]\n",
" )\n",
")])\n",
"\n",
"fig.show()"
]
},
{
"cell_type": "code",
"execution_count": 221,
"id": "4e139972-f4de-4f18-b9b3-601819f06d3f",
"metadata": {},
"outputs": [
{
"data": {
"image/png": "iVBORw0KGgoAAAANSUhEUgAABUQAAAJOCAYAAAB/QA2/AAAAOnRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjEwLjgsIGh0dHBzOi8vbWF0cGxvdGxpYi5vcmcvwVt1zgAAAAlwSFlzAAAPYQAAD2EBqD+naQAAo+ZJREFUeJzs3Xd4FNUax/HfpocaICRA6J0QilRDVxFEBEGRKiAiKhBFIwiRJqDSFJGuKEUEQbGh0gTBhoh06R0USSCBUEOA5Nw/9rKyZBMSSHaB/X7uM89zc+bM7Du7S3xzzjtnLMYYIwAAAAAAAABwAx6uDgAAAAAAAAAAnIUBUQAAAAAAAABugwFRAAAAAAAAAG6DAVEAAAAAAAAAboMBUQAAAAAAAABugwFRAAAAAAAAAG6DAVEAAAAAAAAAboMBUQAAAAAAAABugwFRAAAAAAAAAG6DAVE4zaxZs2SxWHTo0KEb9l29erUsFotWr16d5XGlV/HixfXUU0+5OowUbte4nGHMmDEqX768kpOTXR0KcMe699579eqrr7o6DADIMuSgWeN2jcsZyEGBW9e+fXu1bdvW1WHAjTEgeoeyWCzp2m6nZM6RKVOmaNasWa4OA6m4cOGCXn/99dvye3TmzBmNHj1a/fv3l4eH9VdZXFycxo4dqwYNGih//vwKCAjQvffeqwULFjg8R2Jiovr3769ChQrJ399ftWvX1g8//JCi3/Lly9W9e3eFhYXJ09NTxYsXTzO2/fv3q2PHjgoKCpK/v7/KlCmjgQMH3vCaLly4oMmTJ6tJkyYqWLCgcubMqXvuuUdTp05VUlJSiv7JyckaM2aMSpQoIT8/P1WuXFmffvppin7r1q1Tr169VL16dXl7e8tisaQaQ0xMjLp162aLvVq1avr8889vGHtm27Vrl1599VVVrVpVOXPmVMGCBdW8eXOtX7/eYf+jR4+qbdu2CggIUK5cufToo4/qwIEDKfpNnTpVTzzxhIoWLSqLxZLmH3I//PCD6tWrp2zZsilPnjxq06ZNuv6YlqS///5bw4YNU61atZQnTx4FBgaqUaNGWrFihcP+8fHxevbZZ5U/f35lz55d9913nzZu3GjXJyPf7z///FMRERGqWLGismfPrqJFi6pt27bas2dPir79+/fX5MmTFR0dna5rA+DeyEHhDOSgVhnJQfft26c2bdooT548ypYtm+rVq6dVq1al65qyIgdNTk7WrFmz1LJlSxUpUkTZs2dXWFiY3njjDV28eNGub0bzpqx0J+Vwjrj6s5RS/+/EqFGj7Pr1799fX3zxhbZs2ZKuawMyncEdac6cOXbbgw8+aCSlaI+OjnZ1qDZXrlwxCQkJJjk52dZWsWJF07BhwxR9k5KSTEJCgklKSnJihGkrVqyY6dq1q6vDSCEr4zpx4oSRZIYOHZol578V7777rsmVK5dJSEiwtX377bfG29vbPProo2b8+PFm0qRJ5r777jOSzJAhQ1Kco3379sbLy8v07dvXvP/++yY8PNx4eXmZX375xa5f165djZ+fn6lTp44pXLiwKVasWKpxbdq0yeTOnduEhoaaUaNGmenTp5vBgwebp5566obX9NdffxmLxWIaN25sxowZY6ZNm2Zat25tJJkuXbqk6D9gwAAjyfTo0cN88MEHpnnz5kaS+fTTT+36DR061Hh7e5vq1aubsmXLmtR+9Z8+fdqULl3a5MyZ0wwaNMhMmjTJNGjQwEgyc+fOvWH8memVV14xAQEBpnv37ub99983Y8aMMaVKlTKenp7mhx9+sOt79uxZU6ZMGRMUFGRGjx5txo0bZ4oUKWIKFy5sYmNj7foWK1bM5M2b1zz00EPGy8sr1X873377rfHw8DA1atQw7733nhkxYoQJDAw0ISEh5vjx4zeMf+LEicbf39906NDBTJo0yYwfP95Uq1bNSDIzZsyw65uUlGTq1KljsmfPbl5//XUzadIkExoaanLmzG
n27NljF1N6v9+PP/64KVCggHnhhRfM9OnTzYgRI0xwcLDJnj27+euvv1K8foECBczgwYNveF0AQA7qfOSgt5fbMQc9cuSICQwMNMHBwebNN98048ePN1WqVDFeXl7mp59+uuE1ZUUOevbsWSPJ3HvvveaNN94wH3zwgenWrZvx8PAwjRo1svv3mJG8KavdSTmcI67+LI0xRpJ58MEHU/x3Ydu2bSlev1atWqZz5843vC4gKzAgepfo3bt3qoMc1zp//rwTokm/1JLR2xHJaOY5d+7cLZ+jcuXK5sknn7RrO3DggDl06JBdW3Jysrn//vuNr6+v3ev+8ccfRpIZO3asrS0hIcGUKlXKhIeH253j6NGj5tKlS8YYY5o3b55qMpqUlGTCwsJM7dq1zYULFzJ8TSdOnHCYKHTr1s1IMnv37rW1/fPPP8bb29v07t3b7lrr169vChcubK5cuWJrj46OtsWT1u+KMWPGGElm5cqVdtdUs2ZNU6BAAZOYmJjha7pZ69evN2fPnrVri42NNfnz5zd169a1ax89erSRZNatW2dr27lzp/H09DRRUVF2fQ8dOmRL2rJnz57qv53Q0FBTunRpu2vevHmz8fDwMJGRkTeMf9u2bebEiRN2bRcvXjTly5c3hQsXtmtfsGCBkWQ+//xzW9vx48dNQECA6dChg60tI9/v3377LcXntWfPHuPr62s6deqUIt6IiAhTrFixFAktANwIOWjWIwfNPHdrDtqrVy/j5eVldu3aZWs7f/68KVKkiKlWrdoNrykrctDExETz22+/pTjnsGHDjCS7Ce6M5E1Z7U7L4a7n6s/SGOuA6LXnTMvbb79tsmfPniLvB5yBAdG7hKNktGHDhqZixYpm/fr1pn79+sbf39/06dPHGGPM119/bR5++GFTsGBB4+PjY0qWLGmGDx9uN4hy7Tm2b99uGjVqZPz9/U2hQoXM6NGjU8QwYcIEExoaavz9/U1AQICpXr26XVXZzJkzjSRz8OBBY4w1iZJkt11NTFetWmUkmVWrVtm9xmeffWaqVatm/Pz8TL58+UynTp3MP//8Y9ena9euJnv27Oaff/4xjz76qMmePbsJDAw0r7zySorrcyQ5OdmMGDHChISEGH9/f9OoUSOzbds2h0nfqVOnTJ8+fUzhwoWNj4+PKVWqlBk1alSKqoJPP/3UVKtWzeTIkcPkzJnThIWFmfHjx98wlqSkJDN+/HgTFhZmfH19TWBgoGnatKn5888/bX2uj2vo0KEO/zC5/v03xpg///zTNGnSxOTLl8/4+fmZ4sWLm27duhljjDl48GCKz+f6xHTnzp3m8ccfN3ny5DG+vr6mevXq5ptvvnH4uqtXrzY9e/Y0+fPnNwEBAcYYY86cOWP69OljihUrZnx8fEz+/PlN48aNzYYNG9J8Xw4cOGAkmVmzZt3wPTTG+t2UZLZu3Wpr69evn/H09DSnT5+26/vWW28ZSebIkSMOz5VWMrpkyRIjySxevNgYY01E0/Odu5FFixYZSWbRokW2tsmTJxtJZvv27XZ9582bZySlqDC4Kq0/XFu0aGHy58+fon3s2LFGklm+fPktXEXmeOyxx0zevHnt2mrWrGlq1qyZom+TJk1MqVKlUj1XagOicXFxRpLp169fin0VK1Y0hQoVynjg/xcZGWkkmTNnztjannjiCRMcHJzi98azzz5rsmXLZi5evJjmOR19v1NTrVo1h38YffPNN0aS2bhxYzqvBACsyEH/Qw5KDno9Z+WglSpVcpgLXf33eW21YkZkZg561datW40kM2HChBu+vqO8yVVu1xwuvZz5WV4dEL1w4YJdJbUjW7ZsMZLMl19+mcErAm4da4je5eLi4tSsWTNVrVpV48eP13333SfJurh8jhw5FBkZqffee0/Vq1fXkCFDNGDAgBTnOHXqlB566CFVqVJF77zzjsqXL6/+/ftryZIltj7Tp0/Xiy++qNDQUI0fP17Dhg1T1apV9ccff6
Qa2/jx41W4cGGVL19ec+bM0Zw5c9JcZ3HWrFlq27atPD09NXLkSPXo0UNffvml6tWrp/j4eLu+SUlJatq0qfLly6e
"text/plain": [
"<Figure size 1400x600 with 4 Axes>"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"fig, axes = plt.subplots(1, 2, figsize=(14,6), sharey=True)\n",
"\n",
"# --- Heatmap 1 : 2016 → 2019 ---\n",
"sns.heatmap(\n",
" transition_pct1,\n",
" annot=True,\n",
" fmt=\".2f\",\n",
" cmap=\"Blues\",\n",
" ax=axes[0]\n",
")\n",
"\n",
"axes[0].set_xlabel(\"Cluster 2019-2022\")\n",
"axes[0].set_ylabel(\"Cluster 2016-2019\")\n",
"axes[0].set_title(\"Transition des clusters (2016-2019 → 2019-2022)\")\n",
"\n",
"# --- Heatmap 2 : 2019 → 2022 ---\n",
"sns.heatmap(\n",
" transition_pct2,\n",
" annot=True,\n",
" fmt=\".2f\",\n",
" cmap=\"Blues\",\n",
" ax=axes[1]\n",
")\n",
"\n",
"axes[1].set_xlabel(\"Cluster 2022-2025\")\n",
"axes[1].set_ylabel(\"Cluster 2019-2022\")\n",
"axes[1].set_title(\"Transition des clusters (2019-2022 → 2022-2025)\")\n",
"\n",
"plt.tight_layout()\n",
"plt.show()"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "d2701546-abaf-42e2-9070-7ef43824096c",
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.13.12"
}
},
"nbformat": 4,
"nbformat_minor": 5
}