Project_Carmignac/Clustering_2Feb (1).ipynb

3783 lines
497 KiB
Plaintext
Raw Permalink Normal View History

2026-03-11 11:21:52 +01:00
{
"cells": [
{
"cell_type": "markdown",
"id": "ee3b8d29-20bd-4c03-b366-0d16c21054c8",
"metadata": {},
"source": [
"# Clustering"
]
},
{
"cell_type": "code",
"execution_count": 1,
"id": "1161613a-7665-407a-9d2e-956947114767",
"metadata": {},
"outputs": [],
"source": [
"import pandas as pd\n",
"import numpy as np\n",
"import matplotlib.pyplot as plt"
]
},
{
"cell_type": "code",
"execution_count": 2,
"id": "ae8183d7-1ba0-443c-a5cc-14f43df82b42",
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"/tmp/ipykernel_9087/3355046467.py:2: DtypeWarning: Columns (1,2,3,4) have mixed types. Specify dtype option on import or set low_memory=False.\n",
" df_flows = pd.read_csv(\"flows.csv\")\n"
]
},
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>Unnamed: 0</th>\n",
" <th>Agreement - Code</th>\n",
" <th>Company - Id</th>\n",
" <th>Company - Ultimate Parent Id</th>\n",
" <th>Registrar Account - ID</th>\n",
" <th>Registrar Account - Region</th>\n",
" <th>RegistrarAccount - Country</th>\n",
" <th>Product - Asset Type</th>\n",
" <th>Product - Strategy</th>\n",
" <th>Product - Legal Status</th>\n",
" <th>...</th>\n",
" <th>Centralisation Date</th>\n",
" <th>Quantity - Subscription</th>\n",
" <th>Quantity - Redemption</th>\n",
" <th>Quantity - NetFlows</th>\n",
" <th>Value Ccy - Subscription</th>\n",
" <th>Value Ccy - Redemption</th>\n",
" <th>Value Ccy - NetFlows</th>\n",
" <th>Value € - Subscription</th>\n",
" <th>Value € - Redemption</th>\n",
" <th>Value € - NetFlows</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>0</td>\n",
" <td>003</td>\n",
" <td>166</td>\n",
" <td>166</td>\n",
" <td>200127202</td>\n",
" <td>FRANCE</td>\n",
" <td>FRANCE</td>\n",
" <td>EQUITY</td>\n",
" <td>INVESTISSEMENT</td>\n",
" <td>SICAV</td>\n",
" <td>...</td>\n",
" <td>2020-11-05</td>\n",
" <td>1636.000</td>\n",
" <td>0.000</td>\n",
" <td>1636.000</td>\n",
" <td>280983.00</td>\n",
" <td>0.00</td>\n",
" <td>280983.00</td>\n",
" <td>280983.00</td>\n",
" <td>0.00</td>\n",
" <td>280983.00</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>1</td>\n",
" <td>003</td>\n",
" <td>166</td>\n",
" <td>166</td>\n",
" <td>406533</td>\n",
" <td>FRANCE</td>\n",
" <td>FRANCE</td>\n",
" <td>DIVERSIFIED</td>\n",
" <td>PATRIMOINE</td>\n",
" <td>FCP</td>\n",
" <td>...</td>\n",
" <td>2015-03-09</td>\n",
" <td>144.690</td>\n",
" <td>0.000</td>\n",
" <td>144.690</td>\n",
" <td>99985.13</td>\n",
" <td>0.00</td>\n",
" <td>99985.13</td>\n",
" <td>99985.13</td>\n",
" <td>0.00</td>\n",
" <td>99985.13</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>2</td>\n",
" <td>003</td>\n",
" <td>166</td>\n",
" <td>166</td>\n",
" <td>406533</td>\n",
" <td>FRANCE</td>\n",
" <td>FRANCE</td>\n",
" <td>EQUITY</td>\n",
" <td>INVESTISSEMENT</td>\n",
" <td>FCP</td>\n",
" <td>...</td>\n",
" <td>2016-10-26</td>\n",
" <td>0.000</td>\n",
" <td>-8.321</td>\n",
" <td>-8.321</td>\n",
" <td>0.00</td>\n",
" <td>-9384.76</td>\n",
" <td>-9384.76</td>\n",
" <td>0.00</td>\n",
" <td>-9384.76</td>\n",
" <td>-9384.76</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>3</td>\n",
" <td>003</td>\n",
" <td>166</td>\n",
" <td>166</td>\n",
" <td>406533</td>\n",
" <td>FRANCE</td>\n",
" <td>FRANCE</td>\n",
" <td>EQUITY</td>\n",
" <td>INVESTISSEMENT</td>\n",
" <td>FCP</td>\n",
" <td>...</td>\n",
" <td>2018-10-18</td>\n",
" <td>0.000</td>\n",
" <td>-22.083</td>\n",
" <td>-22.083</td>\n",
" <td>0.00</td>\n",
" <td>-25227.40</td>\n",
" <td>-25227.40</td>\n",
" <td>0.00</td>\n",
" <td>-25227.40</td>\n",
" <td>-25227.40</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>4</td>\n",
" <td>003</td>\n",
" <td>166</td>\n",
" <td>166</td>\n",
" <td>406533</td>\n",
" <td>FRANCE</td>\n",
" <td>FRANCE</td>\n",
" <td>EQUITY</td>\n",
" <td>INVESTISSEMENT</td>\n",
" <td>FCP</td>\n",
" <td>...</td>\n",
" <td>2019-04-08</td>\n",
" <td>0.000</td>\n",
" <td>-465.992</td>\n",
" <td>-465.992</td>\n",
" <td>0.00</td>\n",
" <td>-563775.76</td>\n",
" <td>-563775.76</td>\n",
" <td>0.00</td>\n",
" <td>-563775.76</td>\n",
" <td>-563775.76</td>\n",
" </tr>\n",
" <tr>\n",
" <th>...</th>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2574456</th>\n",
" <td>2574456</td>\n",
" <td>PRIVATE CLIENT</td>\n",
" <td>PRIVATE CLIENT</td>\n",
" <td>PRIVATE CLIENT</td>\n",
" <td>PRIVATE CLIENT</td>\n",
" <td>LUXEMBOURG</td>\n",
" <td>LUXEMBOURG</td>\n",
" <td>FIXED INCOME</td>\n",
" <td>SÉCURITÉ</td>\n",
" <td>FCP</td>\n",
" <td>...</td>\n",
" <td>2015-06-12</td>\n",
" <td>0.000</td>\n",
" <td>-20.000</td>\n",
" <td>-20.000</td>\n",
" <td>0.00</td>\n",
" <td>-34294.40</td>\n",
" <td>-34294.40</td>\n",
" <td>0.00</td>\n",
" <td>-34294.40</td>\n",
" <td>-34294.40</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2574457</th>\n",
" <td>2574457</td>\n",
" <td>PRIVATE CLIENT</td>\n",
" <td>PRIVATE CLIENT</td>\n",
" <td>PRIVATE CLIENT</td>\n",
" <td>PRIVATE CLIENT</td>\n",
" <td>LUXEMBOURG</td>\n",
" <td>LUXEMBOURG</td>\n",
" <td>FIXED INCOME</td>\n",
" <td>SÉCURITÉ</td>\n",
" <td>FCP</td>\n",
" <td>...</td>\n",
" <td>2015-09-18</td>\n",
" <td>328.726</td>\n",
" <td>0.000</td>\n",
" <td>328.726</td>\n",
" <td>564028.07</td>\n",
" <td>0.00</td>\n",
" <td>564028.07</td>\n",
" <td>564028.07</td>\n",
" <td>0.00</td>\n",
" <td>564028.07</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2574458</th>\n",
" <td>2574458</td>\n",
" <td>PRIVATE CLIENT</td>\n",
" <td>PRIVATE CLIENT</td>\n",
" <td>PRIVATE CLIENT</td>\n",
" <td>PRIVATE CLIENT</td>\n",
" <td>LUXEMBOURG</td>\n",
" <td>LUXEMBOURG</td>\n",
" <td>FIXED INCOME</td>\n",
" <td>SÉCURITÉ</td>\n",
" <td>FCP</td>\n",
" <td>...</td>\n",
" <td>2015-09-25</td>\n",
" <td>4.443</td>\n",
" <td>0.000</td>\n",
" <td>4.443</td>\n",
" <td>7603.66</td>\n",
" <td>0.00</td>\n",
" <td>7603.66</td>\n",
" <td>7603.66</td>\n",
" <td>0.00</td>\n",
" <td>7603.66</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2574459</th>\n",
" <td>2574459</td>\n",
" <td>PRIVATE CLIENT</td>\n",
" <td>PRIVATE CLIENT</td>\n",
" <td>PRIVATE CLIENT</td>\n",
" <td>PRIVATE CLIENT</td>\n",
" <td>LUXEMBOURG</td>\n",
" <td>LUXEMBOURG</td>\n",
" <td>FIXED INCOME</td>\n",
" <td>SÉCURITÉ</td>\n",
" <td>FCP</td>\n",
" <td>...</td>\n",
" <td>2015-11-09</td>\n",
" <td>0.000</td>\n",
" <td>-440.000</td>\n",
" <td>-440.000</td>\n",
" <td>0.00</td>\n",
" <td>-754696.80</td>\n",
" <td>-754696.80</td>\n",
" <td>0.00</td>\n",
" <td>-754696.80</td>\n",
" <td>-754696.80</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2574460</th>\n",
" <td>2574460</td>\n",
" <td>PRIVATE CLIENT</td>\n",
" <td>PRIVATE CLIENT</td>\n",
" <td>PRIVATE CLIENT</td>\n",
" <td>PRIVATE CLIENT</td>\n",
" <td>LUXEMBOURG</td>\n",
" <td>LUXEMBOURG</td>\n",
" <td>FIXED INCOME</td>\n",
" <td>SÉCURITÉ</td>\n",
" <td>SICAV</td>\n",
" <td>...</td>\n",
" <td>2016-01-11</td>\n",
" <td>3595.000</td>\n",
" <td>0.000</td>\n",
" <td>3595.000</td>\n",
" <td>358385.55</td>\n",
" <td>0.00</td>\n",
" <td>358385.55</td>\n",
" <td>358385.55</td>\n",
" <td>0.00</td>\n",
" <td>358385.55</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"<p>2574461 rows × 25 columns</p>\n",
"</div>"
],
"text/plain": [
" Unnamed: 0 Agreement - Code Company - Id \\\n",
"0 0 003 166 \n",
"1 1 003 166 \n",
"2 2 003 166 \n",
"3 3 003 166 \n",
"4 4 003 166 \n",
"... ... ... ... \n",
"2574456 2574456 PRIVATE CLIENT PRIVATE CLIENT \n",
"2574457 2574457 PRIVATE CLIENT PRIVATE CLIENT \n",
"2574458 2574458 PRIVATE CLIENT PRIVATE CLIENT \n",
"2574459 2574459 PRIVATE CLIENT PRIVATE CLIENT \n",
"2574460 2574460 PRIVATE CLIENT PRIVATE CLIENT \n",
"\n",
" Company - Ultimate Parent Id Registrar Account - ID \\\n",
"0 166 200127202 \n",
"1 166 406533 \n",
"2 166 406533 \n",
"3 166 406533 \n",
"4 166 406533 \n",
"... ... ... \n",
"2574456 PRIVATE CLIENT PRIVATE CLIENT \n",
"2574457 PRIVATE CLIENT PRIVATE CLIENT \n",
"2574458 PRIVATE CLIENT PRIVATE CLIENT \n",
"2574459 PRIVATE CLIENT PRIVATE CLIENT \n",
"2574460 PRIVATE CLIENT PRIVATE CLIENT \n",
"\n",
" Registrar Account - Region RegistrarAccount - Country \\\n",
"0 FRANCE FRANCE \n",
"1 FRANCE FRANCE \n",
"2 FRANCE FRANCE \n",
"3 FRANCE FRANCE \n",
"4 FRANCE FRANCE \n",
"... ... ... \n",
"2574456 LUXEMBOURG LUXEMBOURG \n",
"2574457 LUXEMBOURG LUXEMBOURG \n",
"2574458 LUXEMBOURG LUXEMBOURG \n",
"2574459 LUXEMBOURG LUXEMBOURG \n",
"2574460 LUXEMBOURG LUXEMBOURG \n",
"\n",
" Product - Asset Type Product - Strategy Product - Legal Status ... \\\n",
"0 EQUITY INVESTISSEMENT SICAV ... \n",
"1 DIVERSIFIED PATRIMOINE FCP ... \n",
"2 EQUITY INVESTISSEMENT FCP ... \n",
"3 EQUITY INVESTISSEMENT FCP ... \n",
"4 EQUITY INVESTISSEMENT FCP ... \n",
"... ... ... ... ... \n",
"2574456 FIXED INCOME SÉCURITÉ FCP ... \n",
"2574457 FIXED INCOME SÉCURITÉ FCP ... \n",
"2574458 FIXED INCOME SÉCURITÉ FCP ... \n",
"2574459 FIXED INCOME SÉCURITÉ FCP ... \n",
"2574460 FIXED INCOME SÉCURITÉ SICAV ... \n",
"\n",
" Centralisation Date Quantity - Subscription Quantity - Redemption \\\n",
"0 2020-11-05 1636.000 0.000 \n",
"1 2015-03-09 144.690 0.000 \n",
"2 2016-10-26 0.000 -8.321 \n",
"3 2018-10-18 0.000 -22.083 \n",
"4 2019-04-08 0.000 -465.992 \n",
"... ... ... ... \n",
"2574456 2015-06-12 0.000 -20.000 \n",
"2574457 2015-09-18 328.726 0.000 \n",
"2574458 2015-09-25 4.443 0.000 \n",
"2574459 2015-11-09 0.000 -440.000 \n",
"2574460 2016-01-11 3595.000 0.000 \n",
"\n",
" Quantity - NetFlows Value Ccy - Subscription Value Ccy - Redemption \\\n",
"0 1636.000 280983.00 0.00 \n",
"1 144.690 99985.13 0.00 \n",
"2 -8.321 0.00 -9384.76 \n",
"3 -22.083 0.00 -25227.40 \n",
"4 -465.992 0.00 -563775.76 \n",
"... ... ... ... \n",
"2574456 -20.000 0.00 -34294.40 \n",
"2574457 328.726 564028.07 0.00 \n",
"2574458 4.443 7603.66 0.00 \n",
"2574459 -440.000 0.00 -754696.80 \n",
"2574460 3595.000 358385.55 0.00 \n",
"\n",
" Value Ccy - NetFlows Value € - Subscription Value € - Redemption \\\n",
"0 280983.00 280983.00 0.00 \n",
"1 99985.13 99985.13 0.00 \n",
"2 -9384.76 0.00 -9384.76 \n",
"3 -25227.40 0.00 -25227.40 \n",
"4 -563775.76 0.00 -563775.76 \n",
"... ... ... ... \n",
"2574456 -34294.40 0.00 -34294.40 \n",
"2574457 564028.07 564028.07 0.00 \n",
"2574458 7603.66 7603.66 0.00 \n",
"2574459 -754696.80 0.00 -754696.80 \n",
"2574460 358385.55 358385.55 0.00 \n",
"\n",
" Value € - NetFlows \n",
"0 280983.00 \n",
"1 99985.13 \n",
"2 -9384.76 \n",
"3 -25227.40 \n",
"4 -563775.76 \n",
"... ... \n",
"2574456 -34294.40 \n",
"2574457 564028.07 \n",
"2574458 7603.66 \n",
"2574459 -754696.80 \n",
"2574460 358385.55 \n",
"\n",
"[2574461 rows x 25 columns]"
]
},
"execution_count": 2,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"import pandas as pd \n",
"df_flows = pd.read_csv(\"flows.csv\")\n",
"df_flows"
]
},
{
"cell_type": "code",
"execution_count": 3,
"id": "0a614aff-b7e1-4032-94d1-cb87a3cd8806",
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"/tmp/ipykernel_9087/223063792.py:1: DtypeWarning: Columns (2,3,4,5) have mixed types. Specify dtype option on import or set low_memory=False.\n",
" df_aum = pd.read_csv(\"AUM_repaired.csv\")\n"
]
},
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>Unnamed: 0.1</th>\n",
" <th>Unnamed: 0</th>\n",
" <th>Agreement - Code</th>\n",
" <th>Company - Id</th>\n",
" <th>Company - Ultimate Parent Id</th>\n",
" <th>Registrar Account - ID</th>\n",
" <th>Registrar Account - Region</th>\n",
" <th>RegistrarAccount - Country</th>\n",
" <th>Product - Asset Type</th>\n",
" <th>Product - Strategy</th>\n",
" <th>...</th>\n",
" <th>Product - Is Dedie ?</th>\n",
" <th>Product - Fund</th>\n",
" <th>Product - Shareclass Type</th>\n",
" <th>Product - Shareclass Currency</th>\n",
" <th>Product - Isin</th>\n",
" <th>Centralisation Date</th>\n",
" <th>Quantity - AUM</th>\n",
" <th>Value - AUM CCY</th>\n",
" <th>Value - AUM €</th>\n",
" <th>repair_flag</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>3</td>\n",
" <td>166.0</td>\n",
" <td>166.0</td>\n",
" <td>200000647</td>\n",
" <td>FRANCE</td>\n",
" <td>FRANCE</td>\n",
" <td>DIVERSIFIED</td>\n",
" <td>PATRIMOINE</td>\n",
" <td>...</td>\n",
" <td>NO</td>\n",
" <td>CARMIGNAC PATRIMOINE</td>\n",
" <td>A</td>\n",
" <td>EUR</td>\n",
" <td>FR0010135103</td>\n",
" <td>2015-03-31</td>\n",
" <td>35.368</td>\n",
" <td>24648.6666</td>\n",
" <td>24648.6666</td>\n",
" <td>False</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>3</td>\n",
" <td>166.0</td>\n",
" <td>166.0</td>\n",
" <td>200000647</td>\n",
" <td>FRANCE</td>\n",
" <td>FRANCE</td>\n",
" <td>DIVERSIFIED</td>\n",
" <td>PATRIMOINE</td>\n",
" <td>...</td>\n",
" <td>NO</td>\n",
" <td>CARMIGNAC PATRIMOINE</td>\n",
" <td>A</td>\n",
" <td>EUR</td>\n",
" <td>FR0010135103</td>\n",
" <td>2015-11-30</td>\n",
" <td>35.368</td>\n",
" <td>22413.0553</td>\n",
" <td>22413.0553</td>\n",
" <td>False</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>2</td>\n",
" <td>2</td>\n",
" <td>3</td>\n",
" <td>166.0</td>\n",
" <td>166.0</td>\n",
" <td>200000647</td>\n",
" <td>FRANCE</td>\n",
" <td>FRANCE</td>\n",
" <td>DIVERSIFIED</td>\n",
" <td>PATRIMOINE</td>\n",
" <td>...</td>\n",
" <td>NO</td>\n",
" <td>CARMIGNAC PATRIMOINE</td>\n",
" <td>A</td>\n",
" <td>EUR</td>\n",
" <td>FR0010135103</td>\n",
" <td>2015-12-31</td>\n",
" <td>35.368</td>\n",
" <td>22051.2406</td>\n",
" <td>22051.2406</td>\n",
" <td>False</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>3</td>\n",
" <td>3</td>\n",
" <td>3</td>\n",
" <td>166.0</td>\n",
" <td>166.0</td>\n",
" <td>200000647</td>\n",
" <td>FRANCE</td>\n",
" <td>FRANCE</td>\n",
" <td>DIVERSIFIED</td>\n",
" <td>PATRIMOINE</td>\n",
" <td>...</td>\n",
" <td>NO</td>\n",
" <td>CARMIGNAC PATRIMOINE</td>\n",
" <td>A</td>\n",
" <td>EUR</td>\n",
" <td>FR0010135103</td>\n",
" <td>2016-03-31</td>\n",
" <td>35.368</td>\n",
" <td>21626.1173</td>\n",
" <td>21626.1173</td>\n",
" <td>False</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>4</td>\n",
" <td>4</td>\n",
" <td>3</td>\n",
" <td>166.0</td>\n",
" <td>166.0</td>\n",
" <td>200000647</td>\n",
" <td>FRANCE</td>\n",
" <td>FRANCE</td>\n",
" <td>DIVERSIFIED</td>\n",
" <td>PATRIMOINE</td>\n",
" <td>...</td>\n",
" <td>NO</td>\n",
" <td>CARMIGNAC PATRIMOINE</td>\n",
" <td>A</td>\n",
" <td>EUR</td>\n",
" <td>FR0010135103</td>\n",
" <td>2016-11-30</td>\n",
" <td>35.368</td>\n",
" <td>22489.4502</td>\n",
" <td>22489.4502</td>\n",
" <td>False</td>\n",
" </tr>\n",
" <tr>\n",
" <th>...</th>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>5516128</th>\n",
" <td>5516128</td>\n",
" <td>4880294</td>\n",
" <td>PRIVATE CLIENT</td>\n",
" <td>PRIVATE CLIENT</td>\n",
" <td>PRIVATE CLIENT</td>\n",
" <td>PRIVATE CLIENT</td>\n",
" <td>SWITZERLAND</td>\n",
" <td>SWITZERLAND</td>\n",
" <td>FIXED INCOME</td>\n",
" <td>SÉCURITÉ</td>\n",
" <td>...</td>\n",
" <td>NO</td>\n",
" <td>CARMIGNAC PORTFOLIO SÉCURITÉ</td>\n",
" <td>AW &amp; AW-R</td>\n",
" <td>EUR</td>\n",
" <td>LU1299306321</td>\n",
" <td>2020-10-31</td>\n",
" <td>3099.000</td>\n",
" <td>318422.2500</td>\n",
" <td>318422.2500</td>\n",
" <td>False</td>\n",
" </tr>\n",
" <tr>\n",
" <th>5516129</th>\n",
" <td>5516129</td>\n",
" <td>4880294</td>\n",
" <td>PRIVATE CLIENT</td>\n",
" <td>PRIVATE CLIENT</td>\n",
" <td>PRIVATE CLIENT</td>\n",
" <td>PRIVATE CLIENT</td>\n",
" <td>SWITZERLAND</td>\n",
" <td>SWITZERLAND</td>\n",
" <td>FIXED INCOME</td>\n",
" <td>SÉCURITÉ</td>\n",
" <td>...</td>\n",
" <td>NO</td>\n",
" <td>CARMIGNAC PORTFOLIO SÉCURITÉ</td>\n",
" <td>AW &amp; AW-R</td>\n",
" <td>EUR</td>\n",
" <td>LU1299306321</td>\n",
" <td>2020-10-31</td>\n",
" <td>3099.000</td>\n",
" <td>318422.2500</td>\n",
" <td>318422.2500</td>\n",
" <td>False</td>\n",
" </tr>\n",
" <tr>\n",
" <th>5516130</th>\n",
" <td>5516130</td>\n",
" <td>4880295</td>\n",
" <td>PRIVATE CLIENT</td>\n",
" <td>PRIVATE CLIENT</td>\n",
" <td>PRIVATE CLIENT</td>\n",
" <td>PRIVATE CLIENT</td>\n",
" <td>SWITZERLAND</td>\n",
" <td>SWITZERLAND</td>\n",
" <td>FIXED INCOME</td>\n",
" <td>SÉCURITÉ</td>\n",
" <td>...</td>\n",
" <td>NO</td>\n",
" <td>CARMIGNAC PORTFOLIO SÉCURITÉ</td>\n",
" <td>AW &amp; AW-R</td>\n",
" <td>EUR</td>\n",
" <td>LU1299306321</td>\n",
" <td>2021-07-31</td>\n",
" <td>2835.000</td>\n",
" <td>297618.3000</td>\n",
" <td>297618.3000</td>\n",
" <td>False</td>\n",
" </tr>\n",
" <tr>\n",
" <th>5516131</th>\n",
" <td>5516131</td>\n",
" <td>4880295</td>\n",
" <td>PRIVATE CLIENT</td>\n",
" <td>PRIVATE CLIENT</td>\n",
" <td>PRIVATE CLIENT</td>\n",
" <td>PRIVATE CLIENT</td>\n",
" <td>SWITZERLAND</td>\n",
" <td>SWITZERLAND</td>\n",
" <td>FIXED INCOME</td>\n",
" <td>SÉCURITÉ</td>\n",
" <td>...</td>\n",
" <td>NO</td>\n",
" <td>CARMIGNAC PORTFOLIO SÉCURITÉ</td>\n",
" <td>AW &amp; AW-R</td>\n",
" <td>EUR</td>\n",
" <td>LU1299306321</td>\n",
" <td>2021-07-31</td>\n",
" <td>2835.000</td>\n",
" <td>297618.3000</td>\n",
" <td>297618.3000</td>\n",
" <td>False</td>\n",
" </tr>\n",
" <tr>\n",
" <th>5516132</th>\n",
" <td>5516132</td>\n",
" <td>4880296</td>\n",
" <td>PRIVATE CLIENT</td>\n",
" <td>PRIVATE CLIENT</td>\n",
" <td>PRIVATE CLIENT</td>\n",
" <td>PRIVATE CLIENT</td>\n",
" <td>SWITZERLAND</td>\n",
" <td>SWITZERLAND</td>\n",
" <td>FIXED INCOME</td>\n",
" <td>SÉCURITÉ</td>\n",
" <td>...</td>\n",
" <td>NO</td>\n",
" <td>CARMIGNAC PORTFOLIO SÉCURITÉ</td>\n",
" <td>FW &amp; FW-R</td>\n",
" <td>EUR</td>\n",
" <td>LU1792391911</td>\n",
" <td>2020-07-31</td>\n",
" <td>2916.394</td>\n",
" <td>287410.6287</td>\n",
" <td>287410.6287</td>\n",
" <td>False</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"<p>5516133 rows × 21 columns</p>\n",
"</div>"
],
"text/plain": [
" Unnamed: 0.1 Unnamed: 0 Agreement - Code Company - Id \\\n",
"0 0 0 3 166.0 \n",
"1 1 1 3 166.0 \n",
"2 2 2 3 166.0 \n",
"3 3 3 3 166.0 \n",
"4 4 4 3 166.0 \n",
"... ... ... ... ... \n",
"5516128 5516128 4880294 PRIVATE CLIENT PRIVATE CLIENT \n",
"5516129 5516129 4880294 PRIVATE CLIENT PRIVATE CLIENT \n",
"5516130 5516130 4880295 PRIVATE CLIENT PRIVATE CLIENT \n",
"5516131 5516131 4880295 PRIVATE CLIENT PRIVATE CLIENT \n",
"5516132 5516132 4880296 PRIVATE CLIENT PRIVATE CLIENT \n",
"\n",
" Company - Ultimate Parent Id Registrar Account - ID \\\n",
"0 166.0 200000647 \n",
"1 166.0 200000647 \n",
"2 166.0 200000647 \n",
"3 166.0 200000647 \n",
"4 166.0 200000647 \n",
"... ... ... \n",
"5516128 PRIVATE CLIENT PRIVATE CLIENT \n",
"5516129 PRIVATE CLIENT PRIVATE CLIENT \n",
"5516130 PRIVATE CLIENT PRIVATE CLIENT \n",
"5516131 PRIVATE CLIENT PRIVATE CLIENT \n",
"5516132 PRIVATE CLIENT PRIVATE CLIENT \n",
"\n",
" Registrar Account - Region RegistrarAccount - Country \\\n",
"0 FRANCE FRANCE \n",
"1 FRANCE FRANCE \n",
"2 FRANCE FRANCE \n",
"3 FRANCE FRANCE \n",
"4 FRANCE FRANCE \n",
"... ... ... \n",
"5516128 SWITZERLAND SWITZERLAND \n",
"5516129 SWITZERLAND SWITZERLAND \n",
"5516130 SWITZERLAND SWITZERLAND \n",
"5516131 SWITZERLAND SWITZERLAND \n",
"5516132 SWITZERLAND SWITZERLAND \n",
"\n",
" Product - Asset Type Product - Strategy ... Product - Is Dedie ? \\\n",
"0 DIVERSIFIED PATRIMOINE ... NO \n",
"1 DIVERSIFIED PATRIMOINE ... NO \n",
"2 DIVERSIFIED PATRIMOINE ... NO \n",
"3 DIVERSIFIED PATRIMOINE ... NO \n",
"4 DIVERSIFIED PATRIMOINE ... NO \n",
"... ... ... ... ... \n",
"5516128 FIXED INCOME SÉCURITÉ ... NO \n",
"5516129 FIXED INCOME SÉCURITÉ ... NO \n",
"5516130 FIXED INCOME SÉCURITÉ ... NO \n",
"5516131 FIXED INCOME SÉCURITÉ ... NO \n",
"5516132 FIXED INCOME SÉCURITÉ ... NO \n",
"\n",
" Product - Fund Product - Shareclass Type \\\n",
"0 CARMIGNAC PATRIMOINE A \n",
"1 CARMIGNAC PATRIMOINE A \n",
"2 CARMIGNAC PATRIMOINE A \n",
"3 CARMIGNAC PATRIMOINE A \n",
"4 CARMIGNAC PATRIMOINE A \n",
"... ... ... \n",
"5516128 CARMIGNAC PORTFOLIO SÉCURITÉ AW & AW-R \n",
"5516129 CARMIGNAC PORTFOLIO SÉCURITÉ AW & AW-R \n",
"5516130 CARMIGNAC PORTFOLIO SÉCURITÉ AW & AW-R \n",
"5516131 CARMIGNAC PORTFOLIO SÉCURITÉ AW & AW-R \n",
"5516132 CARMIGNAC PORTFOLIO SÉCURITÉ FW & FW-R \n",
"\n",
" Product - Shareclass Currency Product - Isin Centralisation Date \\\n",
"0 EUR FR0010135103 2015-03-31 \n",
"1 EUR FR0010135103 2015-11-30 \n",
"2 EUR FR0010135103 2015-12-31 \n",
"3 EUR FR0010135103 2016-03-31 \n",
"4 EUR FR0010135103 2016-11-30 \n",
"... ... ... ... \n",
"5516128 EUR LU1299306321 2020-10-31 \n",
"5516129 EUR LU1299306321 2020-10-31 \n",
"5516130 EUR LU1299306321 2021-07-31 \n",
"5516131 EUR LU1299306321 2021-07-31 \n",
"5516132 EUR LU1792391911 2020-07-31 \n",
"\n",
" Quantity - AUM Value - AUM CCY Value - AUM € repair_flag \n",
"0 35.368 24648.6666 24648.6666 False \n",
"1 35.368 22413.0553 22413.0553 False \n",
"2 35.368 22051.2406 22051.2406 False \n",
"3 35.368 21626.1173 21626.1173 False \n",
"4 35.368 22489.4502 22489.4502 False \n",
"... ... ... ... ... \n",
"5516128 3099.000 318422.2500 318422.2500 False \n",
"5516129 3099.000 318422.2500 318422.2500 False \n",
"5516130 2835.000 297618.3000 297618.3000 False \n",
"5516131 2835.000 297618.3000 297618.3000 False \n",
"5516132 2916.394 287410.6287 287410.6287 False \n",
"\n",
"[5516133 rows x 21 columns]"
]
},
"execution_count": 3,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df_aum = pd.read_csv(\"AUM_repaired.csv\")\n",
"df_aum"
]
},
{
"cell_type": "code",
"execution_count": 4,
"id": "c4fd5fc2-8977-4f22-bae3-5acb60561042",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>Registrar Account - ID</th>\n",
" <th>n_days</th>\n",
" <th>n_transactions</th>\n",
" <th>total_netflows</th>\n",
" <th>mean_flow</th>\n",
" <th>std_flow</th>\n",
" <th>total_subscription</th>\n",
" <th>total_redemption</th>\n",
" <th>churn_ratio</th>\n",
" <th>churn_flag</th>\n",
" <th>activity_score</th>\n",
" <th>flow_volatility</th>\n",
" <th>inertia_ratio</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>100000028</td>\n",
" <td>3</td>\n",
" <td>3</td>\n",
" <td>-1.092380e+02</td>\n",
" <td>-36.412667</td>\n",
" <td>49.280511</td>\n",
" <td>0.000000e+00</td>\n",
" <td>-1.092380e+02</td>\n",
" <td>-1.092380e+08</td>\n",
" <td>0</td>\n",
" <td>1.386294</td>\n",
" <td>49.280511</td>\n",
" <td>0.998921</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>100000042</td>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>-6.601150e+02</td>\n",
" <td>-660.115000</td>\n",
" <td>NaN</td>\n",
" <td>0.000000e+00</td>\n",
" <td>-6.601150e+02</td>\n",
" <td>-6.601150e+08</td>\n",
" <td>0</td>\n",
" <td>0.693147</td>\n",
" <td>0.000000</td>\n",
" <td>0.999640</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>100000065</td>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>-1.746460e+02</td>\n",
" <td>-174.646000</td>\n",
" <td>NaN</td>\n",
" <td>0.000000e+00</td>\n",
" <td>-1.746460e+02</td>\n",
" <td>-1.746460e+08</td>\n",
" <td>0</td>\n",
" <td>0.693147</td>\n",
" <td>0.000000</td>\n",
" <td>0.999640</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>100000069</td>\n",
" <td>65</td>\n",
" <td>73</td>\n",
" <td>-7.479755e+03</td>\n",
" <td>-102.462397</td>\n",
" <td>2168.971331</td>\n",
" <td>3.332040e+04</td>\n",
" <td>-4.080016e+04</td>\n",
" <td>-1.224480e+00</td>\n",
" <td>0</td>\n",
" <td>4.304065</td>\n",
" <td>2168.971331</td>\n",
" <td>0.976619</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>100000073</td>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>-1.334020e+02</td>\n",
" <td>-133.402000</td>\n",
" <td>NaN</td>\n",
" <td>0.000000e+00</td>\n",
" <td>-1.334020e+02</td>\n",
" <td>-1.334020e+08</td>\n",
" <td>0</td>\n",
" <td>0.693147</td>\n",
" <td>0.000000</td>\n",
" <td>0.999640</td>\n",
" </tr>\n",
" <tr>\n",
" <th>...</th>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>6837</th>\n",
" <td>422905</td>\n",
" <td>208</td>\n",
" <td>212</td>\n",
" <td>-4.329218e+03</td>\n",
" <td>-20.420840</td>\n",
" <td>331.677297</td>\n",
" <td>9.699140e+03</td>\n",
" <td>-1.402836e+04</td>\n",
" <td>-1.446351e+00</td>\n",
" <td>0</td>\n",
" <td>5.361292</td>\n",
" <td>331.677297</td>\n",
" <td>0.925180</td>\n",
" </tr>\n",
" <tr>\n",
" <th>6838</th>\n",
" <td>422906</td>\n",
" <td>1146</td>\n",
" <td>1556</td>\n",
" <td>4.455099e+03</td>\n",
" <td>2.863174</td>\n",
" <td>201.071555</td>\n",
" <td>6.078686e+04</td>\n",
" <td>-5.633177e+04</td>\n",
" <td>-9.267095e-01</td>\n",
" <td>0</td>\n",
" <td>7.350516</td>\n",
" <td>201.071555</td>\n",
" <td>0.587770</td>\n",
" </tr>\n",
" <tr>\n",
" <th>6839</th>\n",
" <td>8307</td>\n",
" <td>204</td>\n",
" <td>221</td>\n",
" <td>2.168303e+04</td>\n",
" <td>98.113249</td>\n",
" <td>2217.940406</td>\n",
" <td>1.204399e+05</td>\n",
" <td>-9.875688e+04</td>\n",
" <td>-8.199681e-01</td>\n",
" <td>0</td>\n",
" <td>5.402677</td>\n",
" <td>2217.940406</td>\n",
" <td>0.926619</td>\n",
" </tr>\n",
" <tr>\n",
" <th>6840</th>\n",
" <td>OFF DISTRIBUTION</td>\n",
" <td>2656</td>\n",
" <td>27679</td>\n",
" <td>1.319043e+08</td>\n",
" <td>4765.499704</td>\n",
" <td>391347.475503</td>\n",
" <td>3.388942e+08</td>\n",
" <td>-2.069900e+08</td>\n",
" <td>-6.107804e-01</td>\n",
" <td>0</td>\n",
" <td>10.228465</td>\n",
" <td>391347.475503</td>\n",
" <td>0.044604</td>\n",
" </tr>\n",
" <tr>\n",
" <th>6841</th>\n",
" <td>PRIVATE CLIENT</td>\n",
" <td>2690</td>\n",
" <td>32363</td>\n",
" <td>-4.181221e+05</td>\n",
" <td>-12.919758</td>\n",
" <td>7572.830139</td>\n",
" <td>1.354277e+07</td>\n",
" <td>-1.396089e+07</td>\n",
" <td>-1.030874e+00</td>\n",
" <td>0</td>\n",
" <td>10.384802</td>\n",
" <td>7572.830139</td>\n",
" <td>0.032374</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"<p>6842 rows × 13 columns</p>\n",
"</div>"
],
"text/plain": [
" Registrar Account - ID n_days n_transactions total_netflows \\\n",
"0 100000028 3 3 -1.092380e+02 \n",
"1 100000042 1 1 -6.601150e+02 \n",
"2 100000065 1 1 -1.746460e+02 \n",
"3 100000069 65 73 -7.479755e+03 \n",
"4 100000073 1 1 -1.334020e+02 \n",
"... ... ... ... ... \n",
"6837 422905 208 212 -4.329218e+03 \n",
"6838 422906 1146 1556 4.455099e+03 \n",
"6839 8307 204 221 2.168303e+04 \n",
"6840 OFF DISTRIBUTION 2656 27679 1.319043e+08 \n",
"6841 PRIVATE CLIENT 2690 32363 -4.181221e+05 \n",
"\n",
" mean_flow std_flow total_subscription total_redemption \\\n",
"0 -36.412667 49.280511 0.000000e+00 -1.092380e+02 \n",
"1 -660.115000 NaN 0.000000e+00 -6.601150e+02 \n",
"2 -174.646000 NaN 0.000000e+00 -1.746460e+02 \n",
"3 -102.462397 2168.971331 3.332040e+04 -4.080016e+04 \n",
"4 -133.402000 NaN 0.000000e+00 -1.334020e+02 \n",
"... ... ... ... ... \n",
"6837 -20.420840 331.677297 9.699140e+03 -1.402836e+04 \n",
"6838 2.863174 201.071555 6.078686e+04 -5.633177e+04 \n",
"6839 98.113249 2217.940406 1.204399e+05 -9.875688e+04 \n",
"6840 4765.499704 391347.475503 3.388942e+08 -2.069900e+08 \n",
"6841 -12.919758 7572.830139 1.354277e+07 -1.396089e+07 \n",
"\n",
" churn_ratio churn_flag activity_score flow_volatility inertia_ratio \n",
"0 -1.092380e+08 0 1.386294 49.280511 0.998921 \n",
"1 -6.601150e+08 0 0.693147 0.000000 0.999640 \n",
"2 -1.746460e+08 0 0.693147 0.000000 0.999640 \n",
"3 -1.224480e+00 0 4.304065 2168.971331 0.976619 \n",
"4 -1.334020e+08 0 0.693147 0.000000 0.999640 \n",
"... ... ... ... ... ... \n",
"6837 -1.446351e+00 0 5.361292 331.677297 0.925180 \n",
"6838 -9.267095e-01 0 7.350516 201.071555 0.587770 \n",
"6839 -8.199681e-01 0 5.402677 2217.940406 0.926619 \n",
"6840 -6.107804e-01 0 10.228465 391347.475503 0.044604 \n",
"6841 -1.030874e+00 0 10.384802 7572.830139 0.032374 \n",
"\n",
"[6842 rows x 13 columns]"
]
},
"execution_count": 4,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df_client = pd.read_csv(\"client_behavior_clean.csv\")\n",
"df_client"
]
},
{
"cell_type": "code",
"execution_count": 6,
"id": "e2233e16-958e-4973-aed4-6c9f2cbe8397",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Flows - AUM: (2574461, 25) (5516133, 21)\n"
]
}
],
"source": [
"print (\"Flows - AUM:\", df_flows.shape , df_aum.shape)"
]
},
{
"cell_type": "code",
"execution_count": 5,
"id": "9bca9076-a961-4004-9b5c-7b55191e2c6e",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Flows - AUM - Client shape: (2574461, 25) (5516133, 21) (6842, 13)\n",
"Flows Columns: Index(['Unnamed: 0', 'Agreement - Code', 'Company - Id',\n",
" 'Company - Ultimate Parent Id', 'Registrar Account - ID',\n",
" 'Registrar Account - Region', 'RegistrarAccount - Country',\n",
" 'Product - Asset Type', 'Product - Strategy', 'Product - Legal Status',\n",
" 'Product - Is Dedie ?', 'Product - Fund', 'Product - Shareclass Type',\n",
" 'Product - Shareclass Currency', 'Product - Isin',\n",
" 'Centralisation Date', 'Quantity - Subscription',\n",
" 'Quantity - Redemption', 'Quantity - NetFlows',\n",
" 'Value Ccy - Subscription', 'Value Ccy - Redemption',\n",
" 'Value Ccy - NetFlows', 'Value € - Subscription',\n",
" 'Value € - Redemption', 'Value € - NetFlows'],\n",
" dtype='object')\n",
"AUM Columns: Index(['Unnamed: 0.1', 'Unnamed: 0', 'Agreement - Code', 'Company - Id',\n",
" 'Company - Ultimate Parent Id', 'Registrar Account - ID',\n",
" 'Registrar Account - Region', 'RegistrarAccount - Country',\n",
" 'Product - Asset Type', 'Product - Strategy', 'Product - Legal Status',\n",
" 'Product - Is Dedie ?', 'Product - Fund', 'Product - Shareclass Type',\n",
" 'Product - Shareclass Currency', 'Product - Isin',\n",
" 'Centralisation Date', 'Quantity - AUM', 'Value - AUM CCY',\n",
" 'Value - AUM €', 'repair_flag'],\n",
" dtype='object')\n",
"Client Columns: Index(['Registrar Account - ID', 'n_days', 'n_transactions', 'total_netflows',\n",
" 'mean_flow', 'std_flow', 'total_subscription', 'total_redemption',\n",
" 'churn_ratio', 'churn_flag', 'activity_score', 'flow_volatility',\n",
" 'inertia_ratio'],\n",
" dtype='object')\n"
]
}
],
"source": [
"print (\"Flows - AUM - Client shape:\", df_flows.shape , df_aum.shape, df_client.shape)\n",
"print (\"Flows Columns:\" , df_flows.columns )\n",
"print (\"AUM Columns:\" , df_aum.columns )\n",
"print (\"Client Columns:\" , df_client.columns )"
]
},
{
"cell_type": "code",
"execution_count": 6,
"id": "c2a1a52f-b0b9-4ecd-bf9f-2f9759e0cd24",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>Unnamed: 0.1</th>\n",
" <th>Unnamed: 0</th>\n",
" <th>Agreement - Code</th>\n",
" <th>Company - Id</th>\n",
" <th>Company - Ultimate Parent Id</th>\n",
" <th>Registrar Account - ID</th>\n",
" <th>Registrar Account - Region</th>\n",
" <th>RegistrarAccount - Country</th>\n",
" <th>Product - Asset Type</th>\n",
" <th>Product - Strategy</th>\n",
" <th>...</th>\n",
" <th>Product - Is Dedie ?</th>\n",
" <th>Product - Fund</th>\n",
" <th>Product - Shareclass Type</th>\n",
" <th>Product - Shareclass Currency</th>\n",
" <th>Product - Isin</th>\n",
" <th>Centralisation Date</th>\n",
" <th>Quantity - AUM</th>\n",
" <th>Value - AUM CCY</th>\n",
" <th>Value - AUM €</th>\n",
" <th>repair_flag</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>3</td>\n",
" <td>166.0</td>\n",
" <td>166.0</td>\n",
" <td>200000647</td>\n",
" <td>FRANCE</td>\n",
" <td>FRANCE</td>\n",
" <td>DIVERSIFIED</td>\n",
" <td>PATRIMOINE</td>\n",
" <td>...</td>\n",
" <td>NO</td>\n",
" <td>CARMIGNAC PATRIMOINE</td>\n",
" <td>A</td>\n",
" <td>EUR</td>\n",
" <td>FR0010135103</td>\n",
" <td>2015-03-31</td>\n",
" <td>35.368</td>\n",
" <td>24648.6666</td>\n",
" <td>24648.6666</td>\n",
" <td>False</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>3</td>\n",
" <td>166.0</td>\n",
" <td>166.0</td>\n",
" <td>200000647</td>\n",
" <td>FRANCE</td>\n",
" <td>FRANCE</td>\n",
" <td>DIVERSIFIED</td>\n",
" <td>PATRIMOINE</td>\n",
" <td>...</td>\n",
" <td>NO</td>\n",
" <td>CARMIGNAC PATRIMOINE</td>\n",
" <td>A</td>\n",
" <td>EUR</td>\n",
" <td>FR0010135103</td>\n",
" <td>2015-11-30</td>\n",
" <td>35.368</td>\n",
" <td>22413.0553</td>\n",
" <td>22413.0553</td>\n",
" <td>False</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>2</td>\n",
" <td>2</td>\n",
" <td>3</td>\n",
" <td>166.0</td>\n",
" <td>166.0</td>\n",
" <td>200000647</td>\n",
" <td>FRANCE</td>\n",
" <td>FRANCE</td>\n",
" <td>DIVERSIFIED</td>\n",
" <td>PATRIMOINE</td>\n",
" <td>...</td>\n",
" <td>NO</td>\n",
" <td>CARMIGNAC PATRIMOINE</td>\n",
" <td>A</td>\n",
" <td>EUR</td>\n",
" <td>FR0010135103</td>\n",
" <td>2015-12-31</td>\n",
" <td>35.368</td>\n",
" <td>22051.2406</td>\n",
" <td>22051.2406</td>\n",
" <td>False</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>3</td>\n",
" <td>3</td>\n",
" <td>3</td>\n",
" <td>166.0</td>\n",
" <td>166.0</td>\n",
" <td>200000647</td>\n",
" <td>FRANCE</td>\n",
" <td>FRANCE</td>\n",
" <td>DIVERSIFIED</td>\n",
" <td>PATRIMOINE</td>\n",
" <td>...</td>\n",
" <td>NO</td>\n",
" <td>CARMIGNAC PATRIMOINE</td>\n",
" <td>A</td>\n",
" <td>EUR</td>\n",
" <td>FR0010135103</td>\n",
" <td>2016-03-31</td>\n",
" <td>35.368</td>\n",
" <td>21626.1173</td>\n",
" <td>21626.1173</td>\n",
" <td>False</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>4</td>\n",
" <td>4</td>\n",
" <td>3</td>\n",
" <td>166.0</td>\n",
" <td>166.0</td>\n",
" <td>200000647</td>\n",
" <td>FRANCE</td>\n",
" <td>FRANCE</td>\n",
" <td>DIVERSIFIED</td>\n",
" <td>PATRIMOINE</td>\n",
" <td>...</td>\n",
" <td>NO</td>\n",
" <td>CARMIGNAC PATRIMOINE</td>\n",
" <td>A</td>\n",
" <td>EUR</td>\n",
" <td>FR0010135103</td>\n",
" <td>2016-11-30</td>\n",
" <td>35.368</td>\n",
" <td>22489.4502</td>\n",
" <td>22489.4502</td>\n",
" <td>False</td>\n",
" </tr>\n",
" <tr>\n",
" <th>5</th>\n",
" <td>5</td>\n",
" <td>5</td>\n",
" <td>3</td>\n",
" <td>166.0</td>\n",
" <td>166.0</td>\n",
" <td>200000647</td>\n",
" <td>FRANCE</td>\n",
" <td>FRANCE</td>\n",
" <td>DIVERSIFIED</td>\n",
" <td>PATRIMOINE</td>\n",
" <td>...</td>\n",
" <td>NO</td>\n",
" <td>CARMIGNAC PATRIMOINE</td>\n",
" <td>A</td>\n",
" <td>EUR</td>\n",
" <td>FR0010135103</td>\n",
" <td>2017-06-30</td>\n",
" <td>35.368</td>\n",
" <td>23225.4582</td>\n",
" <td>23225.4582</td>\n",
" <td>False</td>\n",
" </tr>\n",
" <tr>\n",
" <th>6</th>\n",
" <td>6</td>\n",
" <td>6</td>\n",
" <td>3</td>\n",
" <td>166.0</td>\n",
" <td>166.0</td>\n",
" <td>200000647</td>\n",
" <td>FRANCE</td>\n",
" <td>FRANCE</td>\n",
" <td>DIVERSIFIED</td>\n",
" <td>PATRIMOINE</td>\n",
" <td>...</td>\n",
" <td>NO</td>\n",
" <td>CARMIGNAC PATRIMOINE</td>\n",
" <td>A</td>\n",
" <td>EUR</td>\n",
" <td>FR0010135103</td>\n",
" <td>2017-08-31</td>\n",
" <td>35.368</td>\n",
" <td>22964.0887</td>\n",
" <td>22964.0887</td>\n",
" <td>False</td>\n",
" </tr>\n",
" <tr>\n",
" <th>7</th>\n",
" <td>7</td>\n",
" <td>7</td>\n",
" <td>3</td>\n",
" <td>166.0</td>\n",
" <td>166.0</td>\n",
" <td>200000647</td>\n",
" <td>FRANCE</td>\n",
" <td>FRANCE</td>\n",
" <td>DIVERSIFIED</td>\n",
" <td>PATRIMOINE</td>\n",
" <td>...</td>\n",
" <td>NO</td>\n",
" <td>CARMIGNAC PATRIMOINE</td>\n",
" <td>A</td>\n",
" <td>EUR</td>\n",
" <td>FR0010135103</td>\n",
" <td>2018-03-31</td>\n",
" <td>35.368</td>\n",
" <td>22692.4625</td>\n",
" <td>22692.4625</td>\n",
" <td>False</td>\n",
" </tr>\n",
" <tr>\n",
" <th>8</th>\n",
" <td>8</td>\n",
" <td>8</td>\n",
" <td>3</td>\n",
" <td>166.0</td>\n",
" <td>166.0</td>\n",
" <td>200000647</td>\n",
" <td>FRANCE</td>\n",
" <td>FRANCE</td>\n",
" <td>DIVERSIFIED</td>\n",
" <td>PATRIMOINE</td>\n",
" <td>...</td>\n",
" <td>NO</td>\n",
" <td>CARMIGNAC PATRIMOINE</td>\n",
" <td>A</td>\n",
" <td>EUR</td>\n",
" <td>FR0010135103</td>\n",
" <td>2018-05-31</td>\n",
" <td>35.368</td>\n",
" <td>22749.7586</td>\n",
" <td>22749.7586</td>\n",
" <td>False</td>\n",
" </tr>\n",
" <tr>\n",
" <th>9</th>\n",
" <td>9</td>\n",
" <td>9</td>\n",
" <td>3</td>\n",
" <td>166.0</td>\n",
" <td>166.0</td>\n",
" <td>200000647</td>\n",
" <td>FRANCE</td>\n",
" <td>FRANCE</td>\n",
" <td>DIVERSIFIED</td>\n",
" <td>PATRIMOINE</td>\n",
" <td>...</td>\n",
" <td>NO</td>\n",
" <td>CARMIGNAC PATRIMOINE</td>\n",
" <td>A</td>\n",
" <td>EUR</td>\n",
" <td>FR0010135103</td>\n",
" <td>2019-02-28</td>\n",
" <td>35.368</td>\n",
" <td>20875.9620</td>\n",
" <td>20875.9620</td>\n",
" <td>False</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"<p>10 rows × 21 columns</p>\n",
"</div>"
],
"text/plain": [
" Unnamed: 0.1 Unnamed: 0 Agreement - Code Company - Id \\\n",
"0 0 0 3 166.0 \n",
"1 1 1 3 166.0 \n",
"2 2 2 3 166.0 \n",
"3 3 3 3 166.0 \n",
"4 4 4 3 166.0 \n",
"5 5 5 3 166.0 \n",
"6 6 6 3 166.0 \n",
"7 7 7 3 166.0 \n",
"8 8 8 3 166.0 \n",
"9 9 9 3 166.0 \n",
"\n",
" Company - Ultimate Parent Id Registrar Account - ID \\\n",
"0 166.0 200000647 \n",
"1 166.0 200000647 \n",
"2 166.0 200000647 \n",
"3 166.0 200000647 \n",
"4 166.0 200000647 \n",
"5 166.0 200000647 \n",
"6 166.0 200000647 \n",
"7 166.0 200000647 \n",
"8 166.0 200000647 \n",
"9 166.0 200000647 \n",
"\n",
" Registrar Account - Region RegistrarAccount - Country Product - Asset Type \\\n",
"0 FRANCE FRANCE DIVERSIFIED \n",
"1 FRANCE FRANCE DIVERSIFIED \n",
"2 FRANCE FRANCE DIVERSIFIED \n",
"3 FRANCE FRANCE DIVERSIFIED \n",
"4 FRANCE FRANCE DIVERSIFIED \n",
"5 FRANCE FRANCE DIVERSIFIED \n",
"6 FRANCE FRANCE DIVERSIFIED \n",
"7 FRANCE FRANCE DIVERSIFIED \n",
"8 FRANCE FRANCE DIVERSIFIED \n",
"9 FRANCE FRANCE DIVERSIFIED \n",
"\n",
" Product - Strategy ... Product - Is Dedie ? Product - Fund \\\n",
"0 PATRIMOINE ... NO CARMIGNAC PATRIMOINE \n",
"1 PATRIMOINE ... NO CARMIGNAC PATRIMOINE \n",
"2 PATRIMOINE ... NO CARMIGNAC PATRIMOINE \n",
"3 PATRIMOINE ... NO CARMIGNAC PATRIMOINE \n",
"4 PATRIMOINE ... NO CARMIGNAC PATRIMOINE \n",
"5 PATRIMOINE ... NO CARMIGNAC PATRIMOINE \n",
"6 PATRIMOINE ... NO CARMIGNAC PATRIMOINE \n",
"7 PATRIMOINE ... NO CARMIGNAC PATRIMOINE \n",
"8 PATRIMOINE ... NO CARMIGNAC PATRIMOINE \n",
"9 PATRIMOINE ... NO CARMIGNAC PATRIMOINE \n",
"\n",
" Product - Shareclass Type Product - Shareclass Currency Product - Isin \\\n",
"0 A EUR FR0010135103 \n",
"1 A EUR FR0010135103 \n",
"2 A EUR FR0010135103 \n",
"3 A EUR FR0010135103 \n",
"4 A EUR FR0010135103 \n",
"5 A EUR FR0010135103 \n",
"6 A EUR FR0010135103 \n",
"7 A EUR FR0010135103 \n",
"8 A EUR FR0010135103 \n",
"9 A EUR FR0010135103 \n",
"\n",
" Centralisation Date Quantity - AUM Value - AUM CCY Value - AUM € \\\n",
"0 2015-03-31 35.368 24648.6666 24648.6666 \n",
"1 2015-11-30 35.368 22413.0553 22413.0553 \n",
"2 2015-12-31 35.368 22051.2406 22051.2406 \n",
"3 2016-03-31 35.368 21626.1173 21626.1173 \n",
"4 2016-11-30 35.368 22489.4502 22489.4502 \n",
"5 2017-06-30 35.368 23225.4582 23225.4582 \n",
"6 2017-08-31 35.368 22964.0887 22964.0887 \n",
"7 2018-03-31 35.368 22692.4625 22692.4625 \n",
"8 2018-05-31 35.368 22749.7586 22749.7586 \n",
"9 2019-02-28 35.368 20875.9620 20875.9620 \n",
"\n",
" repair_flag \n",
"0 False \n",
"1 False \n",
"2 False \n",
"3 False \n",
"4 False \n",
"5 False \n",
"6 False \n",
"7 False \n",
"8 False \n",
"9 False \n",
"\n",
"[10 rows x 21 columns]"
]
},
"execution_count": 6,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df_aum.head(10)"
]
},
{
"cell_type": "code",
"execution_count": 7,
"id": "fc435518-e8a1-4ad5-bb72-e482e3cc6d75",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>Registrar Account - ID</th>\n",
" <th>n_days</th>\n",
" <th>n_transactions</th>\n",
" <th>total_netflows</th>\n",
" <th>mean_flow</th>\n",
" <th>std_flow</th>\n",
" <th>total_subscription</th>\n",
" <th>total_redemption</th>\n",
" <th>churn_ratio</th>\n",
" <th>churn_flag</th>\n",
" <th>activity_score</th>\n",
" <th>flow_volatility</th>\n",
" <th>inertia_ratio</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>6832</th>\n",
" <td>422876</td>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>-1.229300e+01</td>\n",
" <td>-12.293000</td>\n",
" <td>NaN</td>\n",
" <td>0.000000e+00</td>\n",
" <td>-1.229300e+01</td>\n",
" <td>-1.229300e+07</td>\n",
" <td>0</td>\n",
" <td>0.693147</td>\n",
" <td>0.000000</td>\n",
" <td>0.999640</td>\n",
" </tr>\n",
" <tr>\n",
" <th>6833</th>\n",
" <td>422877</td>\n",
" <td>9</td>\n",
" <td>10</td>\n",
" <td>-5.860000e+02</td>\n",
" <td>-58.600000</td>\n",
" <td>34788.810510</td>\n",
" <td>1.096470e+05</td>\n",
" <td>-1.102330e+05</td>\n",
" <td>-1.005344e+00</td>\n",
" <td>0</td>\n",
" <td>2.397895</td>\n",
" <td>34788.810510</td>\n",
" <td>0.996763</td>\n",
" </tr>\n",
" <tr>\n",
" <th>6834</th>\n",
" <td>422884</td>\n",
" <td>8</td>\n",
" <td>8</td>\n",
" <td>1.807000e+04</td>\n",
" <td>2258.750000</td>\n",
" <td>7296.392333</td>\n",
" <td>2.044500e+04</td>\n",
" <td>-2.375000e+03</td>\n",
" <td>-1.161653e-01</td>\n",
" <td>0</td>\n",
" <td>2.197225</td>\n",
" <td>7296.392333</td>\n",
" <td>0.997122</td>\n",
" </tr>\n",
" <tr>\n",
" <th>6835</th>\n",
" <td>422891</td>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>-5.358000e+01</td>\n",
" <td>-53.580000</td>\n",
" <td>NaN</td>\n",
" <td>0.000000e+00</td>\n",
" <td>-5.358000e+01</td>\n",
" <td>-5.358000e+07</td>\n",
" <td>0</td>\n",
" <td>0.693147</td>\n",
" <td>0.000000</td>\n",
" <td>0.999640</td>\n",
" </tr>\n",
" <tr>\n",
" <th>6836</th>\n",
" <td>422895</td>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>-1.811520e+02</td>\n",
" <td>-181.152000</td>\n",
" <td>NaN</td>\n",
" <td>0.000000e+00</td>\n",
" <td>-1.811520e+02</td>\n",
" <td>-1.811520e+08</td>\n",
" <td>0</td>\n",
" <td>0.693147</td>\n",
" <td>0.000000</td>\n",
" <td>0.999640</td>\n",
" </tr>\n",
" <tr>\n",
" <th>6837</th>\n",
" <td>422905</td>\n",
" <td>208</td>\n",
" <td>212</td>\n",
" <td>-4.329218e+03</td>\n",
" <td>-20.420840</td>\n",
" <td>331.677297</td>\n",
" <td>9.699140e+03</td>\n",
" <td>-1.402836e+04</td>\n",
" <td>-1.446351e+00</td>\n",
" <td>0</td>\n",
" <td>5.361292</td>\n",
" <td>331.677297</td>\n",
" <td>0.925180</td>\n",
" </tr>\n",
" <tr>\n",
" <th>6838</th>\n",
" <td>422906</td>\n",
" <td>1146</td>\n",
" <td>1556</td>\n",
" <td>4.455099e+03</td>\n",
" <td>2.863174</td>\n",
" <td>201.071555</td>\n",
" <td>6.078686e+04</td>\n",
" <td>-5.633177e+04</td>\n",
" <td>-9.267095e-01</td>\n",
" <td>0</td>\n",
" <td>7.350516</td>\n",
" <td>201.071555</td>\n",
" <td>0.587770</td>\n",
" </tr>\n",
" <tr>\n",
" <th>6839</th>\n",
" <td>8307</td>\n",
" <td>204</td>\n",
" <td>221</td>\n",
" <td>2.168303e+04</td>\n",
" <td>98.113249</td>\n",
" <td>2217.940406</td>\n",
" <td>1.204399e+05</td>\n",
" <td>-9.875688e+04</td>\n",
" <td>-8.199681e-01</td>\n",
" <td>0</td>\n",
" <td>5.402677</td>\n",
" <td>2217.940406</td>\n",
" <td>0.926619</td>\n",
" </tr>\n",
" <tr>\n",
" <th>6840</th>\n",
" <td>OFF DISTRIBUTION</td>\n",
" <td>2656</td>\n",
" <td>27679</td>\n",
" <td>1.319043e+08</td>\n",
" <td>4765.499704</td>\n",
" <td>391347.475503</td>\n",
" <td>3.388942e+08</td>\n",
" <td>-2.069900e+08</td>\n",
" <td>-6.107804e-01</td>\n",
" <td>0</td>\n",
" <td>10.228465</td>\n",
" <td>391347.475503</td>\n",
" <td>0.044604</td>\n",
" </tr>\n",
" <tr>\n",
" <th>6841</th>\n",
" <td>PRIVATE CLIENT</td>\n",
" <td>2690</td>\n",
" <td>32363</td>\n",
" <td>-4.181221e+05</td>\n",
" <td>-12.919758</td>\n",
" <td>7572.830139</td>\n",
" <td>1.354277e+07</td>\n",
" <td>-1.396089e+07</td>\n",
" <td>-1.030874e+00</td>\n",
" <td>0</td>\n",
" <td>10.384802</td>\n",
" <td>7572.830139</td>\n",
" <td>0.032374</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" Registrar Account - ID n_days n_transactions total_netflows \\\n",
"6832 422876 1 1 -1.229300e+01 \n",
"6833 422877 9 10 -5.860000e+02 \n",
"6834 422884 8 8 1.807000e+04 \n",
"6835 422891 1 1 -5.358000e+01 \n",
"6836 422895 1 1 -1.811520e+02 \n",
"6837 422905 208 212 -4.329218e+03 \n",
"6838 422906 1146 1556 4.455099e+03 \n",
"6839 8307 204 221 2.168303e+04 \n",
"6840 OFF DISTRIBUTION 2656 27679 1.319043e+08 \n",
"6841 PRIVATE CLIENT 2690 32363 -4.181221e+05 \n",
"\n",
" mean_flow std_flow total_subscription total_redemption \\\n",
"6832 -12.293000 NaN 0.000000e+00 -1.229300e+01 \n",
"6833 -58.600000 34788.810510 1.096470e+05 -1.102330e+05 \n",
"6834 2258.750000 7296.392333 2.044500e+04 -2.375000e+03 \n",
"6835 -53.580000 NaN 0.000000e+00 -5.358000e+01 \n",
"6836 -181.152000 NaN 0.000000e+00 -1.811520e+02 \n",
"6837 -20.420840 331.677297 9.699140e+03 -1.402836e+04 \n",
"6838 2.863174 201.071555 6.078686e+04 -5.633177e+04 \n",
"6839 98.113249 2217.940406 1.204399e+05 -9.875688e+04 \n",
"6840 4765.499704 391347.475503 3.388942e+08 -2.069900e+08 \n",
"6841 -12.919758 7572.830139 1.354277e+07 -1.396089e+07 \n",
"\n",
" churn_ratio churn_flag activity_score flow_volatility inertia_ratio \n",
"6832 -1.229300e+07 0 0.693147 0.000000 0.999640 \n",
"6833 -1.005344e+00 0 2.397895 34788.810510 0.996763 \n",
"6834 -1.161653e-01 0 2.197225 7296.392333 0.997122 \n",
"6835 -5.358000e+07 0 0.693147 0.000000 0.999640 \n",
"6836 -1.811520e+08 0 0.693147 0.000000 0.999640 \n",
"6837 -1.446351e+00 0 5.361292 331.677297 0.925180 \n",
"6838 -9.267095e-01 0 7.350516 201.071555 0.587770 \n",
"6839 -8.199681e-01 0 5.402677 2217.940406 0.926619 \n",
"6840 -6.107804e-01 0 10.228465 391347.475503 0.044604 \n",
"6841 -1.030874e+00 0 10.384802 7572.830139 0.032374 "
]
},
"execution_count": 7,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df_client.tail(10)"
]
},
{
"cell_type": "code",
"execution_count": 10,
"id": "c61af497-1950-4a94-9b6d-51548466e133",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"491"
]
},
"execution_count": 10,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df_aum['Product - Isin'].nunique()"
]
},
{
"cell_type": "code",
"execution_count": 12,
"id": "7f0dd0a7-5800-46b1-a10c-682d85377385",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"Registrar Account - ID 15491\n",
"Product - Isin 491\n",
"dtype: int64"
]
},
"execution_count": 12,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df_aum[['Registrar Account - ID','Product - Isin']].nunique()"
]
},
{
"cell_type": "code",
"execution_count": 13,
"id": "446e199c-b6a0-47e3-ac3e-50791549353e",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>Registrar Account - ID</th>\n",
" <th>Product - Isin</th>\n",
" <th>n_snapshots</th>\n",
" <th>first_date</th>\n",
" <th>last_date</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>7905</td>\n",
" <td>FR0010135103</td>\n",
" <td>80</td>\n",
" <td>2015-01-31</td>\n",
" <td>2021-08-31</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>7905</td>\n",
" <td>FR0010147603</td>\n",
" <td>80</td>\n",
" <td>2015-01-31</td>\n",
" <td>2021-08-31</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>7905</td>\n",
" <td>FR0010148981</td>\n",
" <td>80</td>\n",
" <td>2015-01-31</td>\n",
" <td>2021-08-31</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>7905</td>\n",
" <td>FR0010148999</td>\n",
" <td>80</td>\n",
" <td>2015-01-31</td>\n",
" <td>2021-08-31</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>7905</td>\n",
" <td>FR0010149096</td>\n",
" <td>80</td>\n",
" <td>2015-01-31</td>\n",
" <td>2021-08-31</td>\n",
" </tr>\n",
" <tr>\n",
" <th>...</th>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>79654</th>\n",
" <td>PRIVATE CLIENT</td>\n",
" <td>LU2715954330</td>\n",
" <td>3</td>\n",
" <td>2025-08-31</td>\n",
" <td>2025-10-31</td>\n",
" </tr>\n",
" <tr>\n",
" <th>79655</th>\n",
" <td>PRIVATE CLIENT</td>\n",
" <td>LU2715954504</td>\n",
" <td>4</td>\n",
" <td>2025-07-31</td>\n",
" <td>2025-10-31</td>\n",
" </tr>\n",
" <tr>\n",
" <th>79656</th>\n",
" <td>PRIVATE CLIENT</td>\n",
" <td>LU2799473124</td>\n",
" <td>13</td>\n",
" <td>2024-10-31</td>\n",
" <td>2025-10-31</td>\n",
" </tr>\n",
" <tr>\n",
" <th>79657</th>\n",
" <td>PRIVATE CLIENT</td>\n",
" <td>LU2809794220</td>\n",
" <td>12</td>\n",
" <td>2024-11-30</td>\n",
" <td>2025-10-31</td>\n",
" </tr>\n",
" <tr>\n",
" <th>79658</th>\n",
" <td>PRIVATE CLIENT</td>\n",
" <td>LU2809794576</td>\n",
" <td>2</td>\n",
" <td>2025-09-30</td>\n",
" <td>2025-10-31</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"<p>79659 rows × 5 columns</p>\n",
"</div>"
],
"text/plain": [
" Registrar Account - ID Product - Isin n_snapshots first_date \\\n",
"0 7905 FR0010135103 80 2015-01-31 \n",
"1 7905 FR0010147603 80 2015-01-31 \n",
"2 7905 FR0010148981 80 2015-01-31 \n",
"3 7905 FR0010148999 80 2015-01-31 \n",
"4 7905 FR0010149096 80 2015-01-31 \n",
"... ... ... ... ... \n",
"79654 PRIVATE CLIENT LU2715954330 3 2025-08-31 \n",
"79655 PRIVATE CLIENT LU2715954504 4 2025-07-31 \n",
"79656 PRIVATE CLIENT LU2799473124 13 2024-10-31 \n",
"79657 PRIVATE CLIENT LU2809794220 12 2024-11-30 \n",
"79658 PRIVATE CLIENT LU2809794576 2 2025-09-30 \n",
"\n",
" last_date \n",
"0 2021-08-31 \n",
"1 2021-08-31 \n",
"2 2021-08-31 \n",
"3 2021-08-31 \n",
"4 2021-08-31 \n",
"... ... \n",
"79654 2025-10-31 \n",
"79655 2025-10-31 \n",
"79656 2025-10-31 \n",
"79657 2025-10-31 \n",
"79658 2025-10-31 \n",
"\n",
"[79659 rows x 5 columns]"
]
},
"execution_count": 13,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df= df_aum.copy()\n",
"snap_count = (\n",
" df.groupby([\"Registrar Account - ID\", \"Product - Isin\"])\n",
" .agg(\n",
" n_snapshots=(\"Centralisation Date\", \"nunique\"),\n",
" first_date=(\"Centralisation Date\", \"min\"),\n",
" last_date=(\"Centralisation Date\", \"max\")\n",
" )\n",
" .reset_index()\n",
")\n",
"\n",
"snap_count\n",
"#pour aum y a a peut pres 12 valeurs par an (certain en ont moins), mais bcp plus de lignes dans df aum que df flows rev c est a cause de quoi"
]
},
{
"cell_type": "markdown",
"id": "514db9b1-5373-4412-8d6b-25abcbc0d741",
"metadata": {},
"source": [
"Pour chaque couple (Account, ISIN) :\n",
"n_snapshots : combien de fois lAUM est observé\n",
"first_date, last_date : période couverte\n",
"\n",
"Avant clustering, je veux savoir si :\n",
"- certains portefeuilles sont très fragmentés,\n",
"- certains comptes nont que 1 ou 2 observations (peu fiables),\n",
"- la couverture temporelle est stable"
]
},
{
"cell_type": "markdown",
"id": "000095ec-b983-4169-be11-28bbb68fc295",
"metadata": {},
"source": [
"## debut etude clustering"
]
},
{
"cell_type": "code",
"execution_count": 14,
"id": "e77598d7-ee1e-4932-91e5-71a190d04e7a",
"metadata": {},
"outputs": [],
"source": [
"# Parse dates\n",
"df_flows[\"Centralisation Date\"] = pd.to_datetime(df_flows[\"Centralisation Date\"], errors=\"coerce\")\n",
"df_aum[\"Centralisation Date\"] = pd.to_datetime(df_aum[\"Centralisation Date\"], errors=\"coerce\")\n",
"\n",
"ID_COL = \"Registrar Account - ID\"\n",
"FLOW_COL = \"Quantity - NetFlows\"\n",
"AUM_COL = \"Quantity - AUM\"\n",
"\n",
"# Month key\n",
"df_flows[\"month\"] = df_flows[\"Centralisation Date\"].dt.to_period(\"M\").dt.to_timestamp(\"M\")\n",
"df_aum[\"month\"] = df_aum[\"Centralisation Date\"].dt.to_period(\"M\").dt.to_timestamp(\"M\")\n",
"# Flows sont journaliers, AUM est mensuel → il faut une granularité commune."
]
},
{
"cell_type": "code",
"execution_count": 12,
"id": "7d933eba-1a21-486f-9518-34b43a6fee51",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"(968429, 6)\n"
]
},
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>Registrar Account - ID</th>\n",
" <th>month</th>\n",
" <th>aum_qty</th>\n",
" <th>net_flow_qty</th>\n",
" <th>gross_flow_qty</th>\n",
" <th>n_tx</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>7905</td>\n",
" <td>2015-01-31</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>7905</td>\n",
" <td>2015-02-28</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>7905</td>\n",
" <td>2015-03-31</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>7905</td>\n",
" <td>2015-04-30</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>7905</td>\n",
" <td>2015-05-31</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" Registrar Account - ID month aum_qty net_flow_qty gross_flow_qty \\\n",
"0 7905 2015-01-31 0.0 0.0 0.0 \n",
"1 7905 2015-02-28 0.0 0.0 0.0 \n",
"2 7905 2015-03-31 0.0 0.0 0.0 \n",
"3 7905 2015-04-30 0.0 0.0 0.0 \n",
"4 7905 2015-05-31 0.0 0.0 0.0 \n",
"\n",
" n_tx \n",
"0 0 \n",
"1 0 \n",
"2 0 \n",
"3 0 \n",
"4 0 "
]
},
"execution_count": 12,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# 1) Monthly aggregation for FLOWS : je fais mon etude mensuel parce que aum valeur mensuel \n",
"\n",
"ID_COL = \"Registrar Account - ID\"\n",
"FLOW_COL = \"Quantity - NetFlows\"\n",
"AUM_COL = \"Quantity - AUM\"\n",
"\n",
"df_flows_m = (\n",
" df_flows\n",
" .dropna(subset=[ID_COL, \"month\", FLOW_COL])\n",
" .assign(gross_flow_qty=lambda x: x[FLOW_COL].abs()) # absolute quantity moved\n",
" .groupby([ID_COL, \"month\"], as_index=False)\n",
" .agg(\n",
" net_flow_qty=(FLOW_COL, \"sum\"), # net quantity change over the month\n",
" gross_flow_qty=(\"gross_flow_qty\", \"sum\"), # total traded quantity (activity intensity)\n",
" n_tx=(FLOW_COL, \"size\"), # number of transactions\n",
" )\n",
")\n",
"\n",
"# 2) Monthly aggregation for AUM (client-month holdings) ---\n",
"df_aum_m = (\n",
" df_aum\n",
" .dropna(subset=[ID_COL, \"month\", AUM_COL])\n",
" .groupby([ID_COL, \"month\"], as_index=False)\n",
" .agg(aum_qty=(AUM_COL, \"sum\")) # total held quantity across ISINs\n",
")\n",
"\n",
"df_month = df_aum_m.merge(df_flows_m, on=[ID_COL, \"month\"], how=\"left\")\n",
"\n",
"# 4) Months without transactions => flows are 0 ---\n",
"df_month[\"net_flow_qty\"] = df_month[\"net_flow_qty\"].fillna(0.0)\n",
"df_month[\"gross_flow_qty\"] = df_month[\"gross_flow_qty\"].fillna(0.0)\n",
"df_month[\"n_tx\"] = df_month[\"n_tx\"].fillna(0).astype(int)\n",
"\n",
"print(df_month.shape)\n",
"df_month.head()"
]
},
{
"cell_type": "code",
"execution_count": 13,
"id": "887c9159-a86f-487e-8dfe-617132a73fbe",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>Registrar Account - ID</th>\n",
" <th>n_months</th>\n",
" <th>n_active_months</th>\n",
" <th>flow_freq</th>\n",
" <th>aum_qty_mean</th>\n",
" <th>aum_qty_median</th>\n",
" <th>net_flow_qty_sum</th>\n",
" <th>gross_flow_qty_sum</th>\n",
" <th>gross_flow_qty_mean</th>\n",
" <th>net_flow_qty_vol</th>\n",
" <th>rel_intensity</th>\n",
" <th>netflow_to_aum</th>\n",
" <th>n_tx_total</th>\n",
" <th>log_aum_qty_mean</th>\n",
" <th>log_gross_flow_qty_mean</th>\n",
" <th>gross_flow_to_aum</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>7905</td>\n",
" <td>80</td>\n",
" <td>0</td>\n",
" <td>0.000000</td>\n",
" <td>0.000000</td>\n",
" <td>0.00</td>\n",
" <td>0.000</td>\n",
" <td>0.00</td>\n",
" <td>0.000000</td>\n",
" <td>0.000000</td>\n",
" <td>0.000000</td>\n",
" <td>0.000000</td>\n",
" <td>0</td>\n",
" <td>0.000000</td>\n",
" <td>0.000000</td>\n",
" <td>0.00000</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>7912</td>\n",
" <td>80</td>\n",
" <td>0</td>\n",
" <td>0.000000</td>\n",
" <td>0.000000</td>\n",
" <td>0.00</td>\n",
" <td>0.000</td>\n",
" <td>0.00</td>\n",
" <td>0.000000</td>\n",
" <td>0.000000</td>\n",
" <td>0.000000</td>\n",
" <td>0.000000</td>\n",
" <td>0</td>\n",
" <td>0.000000</td>\n",
" <td>0.000000</td>\n",
" <td>0.00000</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>7962</td>\n",
" <td>80</td>\n",
" <td>0</td>\n",
" <td>0.000000</td>\n",
" <td>0.000000</td>\n",
" <td>0.00</td>\n",
" <td>0.000</td>\n",
" <td>0.00</td>\n",
" <td>0.000000</td>\n",
" <td>0.000000</td>\n",
" <td>0.000000</td>\n",
" <td>0.000000</td>\n",
" <td>0</td>\n",
" <td>0.000000</td>\n",
" <td>0.000000</td>\n",
" <td>0.00000</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>8307</td>\n",
" <td>130</td>\n",
" <td>77</td>\n",
" <td>0.592308</td>\n",
" <td>21568.649523</td>\n",
" <td>20698.97</td>\n",
" <td>27252.124</td>\n",
" <td>177077.31</td>\n",
" <td>1362.133154</td>\n",
" <td>3508.455222</td>\n",
" <td>0.056109</td>\n",
" <td>0.011473</td>\n",
" <td>161</td>\n",
" <td>9.979042</td>\n",
" <td>7.217541</td>\n",
" <td>8.20994</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>8354</td>\n",
" <td>64</td>\n",
" <td>0</td>\n",
" <td>0.000000</td>\n",
" <td>0.000000</td>\n",
" <td>0.00</td>\n",
" <td>0.000</td>\n",
" <td>0.00</td>\n",
" <td>0.000000</td>\n",
" <td>0.000000</td>\n",
" <td>0.000000</td>\n",
" <td>0.000000</td>\n",
" <td>0</td>\n",
" <td>0.000000</td>\n",
" <td>0.000000</td>\n",
" <td>0.00000</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" Registrar Account - ID n_months n_active_months flow_freq aum_qty_mean \\\n",
"0 7905 80 0 0.000000 0.000000 \n",
"1 7912 80 0 0.000000 0.000000 \n",
"2 7962 80 0 0.000000 0.000000 \n",
"3 8307 130 77 0.592308 21568.649523 \n",
"4 8354 64 0 0.000000 0.000000 \n",
"\n",
" aum_qty_median net_flow_qty_sum gross_flow_qty_sum gross_flow_qty_mean \\\n",
"0 0.00 0.000 0.00 0.000000 \n",
"1 0.00 0.000 0.00 0.000000 \n",
"2 0.00 0.000 0.00 0.000000 \n",
"3 20698.97 27252.124 177077.31 1362.133154 \n",
"4 0.00 0.000 0.00 0.000000 \n",
"\n",
" net_flow_qty_vol rel_intensity netflow_to_aum n_tx_total \\\n",
"0 0.000000 0.000000 0.000000 0 \n",
"1 0.000000 0.000000 0.000000 0 \n",
"2 0.000000 0.000000 0.000000 0 \n",
"3 3508.455222 0.056109 0.011473 161 \n",
"4 0.000000 0.000000 0.000000 0 \n",
"\n",
" log_aum_qty_mean log_gross_flow_qty_mean gross_flow_to_aum \n",
"0 0.000000 0.000000 0.00000 \n",
"1 0.000000 0.000000 0.00000 \n",
"2 0.000000 0.000000 0.00000 \n",
"3 9.979042 7.217541 8.20994 \n",
"4 0.000000 0.000000 0.00000 "
]
},
"execution_count": 13,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"eps = 1e-9 \n",
"\n",
"# 1) Active month indicator: did the client trade this month?\n",
"df_month[\"active_month\"] = (df_month[\"gross_flow_qty\"] > 0).astype(int)\n",
"\n",
"#client avec beaucoup de mois à 0 → “stable / dormant”\n",
"#client actif presque tous les mois → “rebalancer / institutionnel actif”\n",
"\n",
"\n",
"# 2) Monthly relative intensity (turnover proxy in quantity terms) : Mesurer lintensité de trading relativement à la taille et pouvoir ocmparer client petit avec client plus gros\n",
"df_month[\"rel_intensity_m\"] = df_month[\"gross_flow_qty\"] / (df_month[\"aum_qty\"].abs() + eps)\n",
"\n",
"# 3) Monthly net flow ratio (directional change): sert a Capturer la direction de la dynamique\n",
"df_month[\"netflow_to_aum_m\"] = df_month[\"net_flow_qty\"] / (df_month[\"aum_qty\"].abs() + eps)\n",
"\n",
"# 4) Aggregate to client-level features (1 row per client)\n",
"df_client_feat = (\n",
" df_month.groupby(ID_COL, as_index=False)\n",
" .agg(\n",
" # Coverage / activity\n",
" n_months=(\"month\", \"nunique\"),\n",
" n_active_months=(\"active_month\", \"sum\"),\n",
" flow_freq=(\"active_month\", \"mean\"),\n",
"\n",
" # Size in quantity terms\n",
" aum_qty_mean=(\"aum_qty\", \"mean\"),\n",
" aum_qty_median=(\"aum_qty\", \"median\"),\n",
"\n",
" # Flows in quantity terms\n",
" net_flow_qty_sum=(\"net_flow_qty\", \"sum\"),\n",
" gross_flow_qty_sum=(\"gross_flow_qty\", \"sum\"),\n",
" gross_flow_qty_mean=(\"gross_flow_qty\", \"mean\"),\n",
"\n",
" # Dispersion / volatility proxy\n",
" net_flow_qty_vol=(\"net_flow_qty\", \"std\"),\n",
" rel_intensity=(\"rel_intensity_m\", \"mean\"),\n",
" netflow_to_aum=(\"netflow_to_aum_m\", \"mean\"),\n",
"\n",
" # Trading frequency proxy\n",
" n_tx_total=(\"n_tx\", \"sum\"),\n",
" )\n",
")\n",
"\n",
"# 5) Clean NaNs due to std on constant series\n",
"df_client_feat[\"net_flow_qty_vol\"] = df_client_feat[\"net_flow_qty_vol\"].fillna(0.0)\n",
"\n",
"# 6) Log transforms (useful because distributions are heavy-tailed)\n",
"df_client_feat[\"log_aum_qty_mean\"] = np.log1p(df_client_feat[\"aum_qty_mean\"].clip(lower=0))\n",
"df_client_feat[\"log_gross_flow_qty_mean\"] = np.log1p(df_client_feat[\"gross_flow_qty_mean\"].clip(lower=0))\n",
"\n",
"# 7) Global turnover proxy\n",
"df_client_feat[\"gross_flow_to_aum\"] = df_client_feat[\"gross_flow_qty_sum\"] / (df_client_feat[\"aum_qty_mean\"].abs() + eps)\n",
"\n",
"df_client_feat.head()"
]
},
{
"cell_type": "markdown",
"id": "7650f113-fb48-41c0-87a4-5b9509086ffd",
"metadata": {},
"source": [
"Pour faire du clustering, il faut un tableau où :\n",
"- 1 ligne = 1 “objet à classer” (ici : un client / compte)\n",
"- colonnes = variables comparables entre clients\n",
"\n",
"Ce code sert à :\n",
"1- transformer une série temporelle mensuelle (client × mois)\n",
"2- en un vecteur de caractéristiques (client → features)\n",
"! utilisable en clustering."
]
},
{
"cell_type": "code",
"execution_count": 14,
"id": "f07707c2-480c-4dd1-8767-95d3582b3149",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"(15491, 16)"
]
},
"execution_count": 14,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df_client_feat.shape"
]
},
{
"cell_type": "code",
"execution_count": 15,
"id": "5e993274-eace-4527-b824-94144f155688",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Filtered clients: (9743, 16)\n"
]
}
],
"source": [
"dfc = df_client_feat.copy()\n",
"\n",
"# Minimal filters (adjust if needed)\n",
"dfc = dfc[(dfc[\"n_months\"] >= 6)] # at least 6 observed months\n",
"dfc = dfc[(dfc[\"aum_qty_mean\"].abs() > 0)] # avoid zero holdings\n",
"print(\"Filtered clients:\", dfc.shape)\n"
]
},
{
"cell_type": "markdown",
"id": "7ff3ffbc-58d8-482f-a027-80b032897c33",
"metadata": {},
"source": [
"## Baseline clustering 1"
]
},
{
"cell_type": "code",
"execution_count": 16,
"id": "1a9e1bcb-8f5d-4fce-8925-ef6bfea84967",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Q33: 0.05374559928443659 Q66: 25.599990882352966\n",
"seg_quantiles\n",
"High-flow 3313\n",
"Low-flow 3215\n",
"Intermediate-flow 3215\n",
"Name: count, dtype: int64\n"
]
}
],
"source": [
"# Baseline 1 variable: average monthly gross traded quantity\n",
"x = dfc[\"gross_flow_qty_mean\"].copy()\n",
"\n",
"q33, q66 = x.quantile([0.33, 0.66])\n",
"\n",
"dfc[\"seg_quantiles\"] = pd.cut(\n",
" x,\n",
" bins=[-np.inf, q33, q66, np.inf],\n",
" labels=[\"Low-flow\", \"Intermediate-flow\", \"High-flow\"]\n",
")\n",
"\n",
"print(\"Q33:\", q33, \" Q66:\", q66)\n",
"print(dfc[\"seg_quantiles\"].value_counts(dropna=False))"
]
},
{
"cell_type": "code",
"execution_count": 17,
"id": "5236feba-7d3e-4224-9485-0449bf35c86f",
"metadata": {},
"outputs": [
{
"data": {
"image/png": "iVBORw0KGgoAAAANSUhEUgAAAqQAAAHHCAYAAABpzkrAAAAAOnRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjEwLjgsIGh0dHBzOi8vbWF0cGxvdGxpYi5vcmcvwVt1zgAAAAlwSFlzAAAPYQAAD2EBqD+naQAAa19JREFUeJzt3XdYFNf7NvB7aUsH6WABREWwd1GxElHRRGPX2FtiL9FojDUa1MQeo8Yk4tfYE1s0FuwNG3YjWIItCqgoRaXuef/wZX4OCwrs6mTJ/bkuLt0zZ+Y8Z2dn9tkpZ1RCCAEiIiIiIoUYKR0AEREREf23MSElIiIiIkUxISUiIiIiRTEhJSIiIiJFMSElIiIiIkUxISUiIiIiRTEhJSIiIiJFMSElIiIiIkUxISUiIiIiRb3zhHTq1KlQqVTvuhkAQOPGjdG4cWPp9aFDh6BSqfDbb7+9l/Z79+4NLy+v99JWYaWkpKB///5wc3ODSqXCyJEjlQ6J3pOwsDCoVCqcPXv2rXVzbkv0SlxcHDp06ABHR0eoVCosWLBA2s8cOnRI6fDyJSUlBS4uLlizZo3SoehN9mf79u3belvmsmXLUKpUKaSlpeltmblRqVSYOnVqvusOHTr0ncbzb5bzvXoX6z0vXl5eaN269TtvJ7/0Hc/t27ehUqkQFhb21rrvKtcpUEKavfKz/8zNzeHh4YHg4GAsWrQIycnJegnqwYMHmDp1Ki5cuKCX5enTvzm2/Pjmm28QFhaGzz77DKtXr0aPHj2UDon07IcffsjXToUKbtSoUdizZw8mTJiA1atXo0WLFkqHVGALFy6EjY0NunTponQoBfbNN99g69at+aqr63bQu3dvpKenY/ny5YVeRmGcOHECU6dOxbNnz95ru/8Wf/75Z74TdH3766+/MHXq1PeS4JK2Qh0hnT59OlavXo2lS5di2LBhAICRI0eiUqVKuHTpkqzuV199hZcvXxZo+Q8ePMC0adMKnPTt3bsXe/fuLdA8BfWm2FasWIHo6Oh32r6uDhw4gLp162LKlCn45JNPUKNGDaVDIj1jQvruHDhwAB999BE+//xzfPLJJyhfvrzSIRVIRkYGFi5ciP79+8PY2FjpcAosr4S0R48eePnyJTw9PaUyXbcDc3Nz9OrVC/PmzYMQotDLeZuXL1/iq6++kl6fOHEC06ZN+08npNOmTct1Ws73St/++usvTJs2jQmpQgqVkLZs2RKffPIJ+vTpgwkTJmDPnj3Yt28f4uPj8eGHH8oSUBMTE5ibm+st4Ny8ePECAGBmZgYzM7N32tabmJqaQq1WK9Z+fsTHx8Pe3l7pMAosex3Tf8/z58+VDkFiqNtPth07duDRo0fo1KmT0qHolbGxMczNzfV+eVinTp1w584dHDx4UK/LfZ25uTlMTEze2fKLEkN9r/5N+7B/M71dQ9q0aVNMmjQJd+7cwa+//iqV53YNaXh4OBo0aAB7e3tYW1vD19cXX375JYBX133WqlULANCnTx/p8oDsX7qNGzdGxYoVERkZiYYNG8LS0lKaN6/r3rKysvDll1/Czc0NVlZW+PDDD3Hv3j1ZHS8vL/Tu3Vtr3teX+bbYcruu4vnz5xgzZgxKliwJtVoNX19ffPfdd1q/uLOvDdq6dSsqVqwItVqNChUqYPfu3bm/4TnEx8ejX79+cHV1hbm5OapUqYJVq1ZJ07Ovc4uJicHOnTul2N/0S3DlypVo2rQpXFxcoFar4e/vj6VLl8rqtG7dGqVLl851/oCAANSsWVNW9uuvv6JGjRqwsLCAg4MDunTporUu3rSOt23bhpCQEHh4eECtVsPHxwdff/01srKytNpfsmQJSpcuDQsLC9SuXRtHjx7N9TOSlpaGKVOmoEyZMlCr1ShZsiTGjRuXr2vHsmO9dOkSGjVqBEtLS5QpU0a6bvnw4cOoU6cOLCws4Ovri3379mkt4/z582jZsiVsbW1hbW2NZs2a4eTJk7I62ZfLHD9+HKNHj4azszOsrKzQrl07PHr0SKrn5eWFq1ev4vDhw9I6zq2/b1pGTikpKbCyssKIESO0pt2/fx/GxsYIDQ194/v05MkT9OjRA7a2trC3t0evXr1w8eJFrWuWevfuDWtra9y6dQutWrWCjY0NunfvDiD/29Kb9i/ZFi9ejAoVKsDS0hLFihVDzZo1sXbt2jzjz37/hRBYsmSJ9N6+yaZNm6TPupOTEz755BP8888/0vTt27dDpVLJzir9/vvvUKlU+Pjjj2XL8vPzQ+fOnQvUx9xs3boVXl5e8PHxyXVaxYoVYW5ujooVK2LLli1a+7S8rpfN7fqzS5cuoXfv3ihdujTMzc3h5uaGvn374smTJ7J5s78jbt68id69e8Pe3h52dnbo06eP7IeoSqXC8+fPsWrVKun9z95n57yWMK/t4O+//4ZKpcL8+fO1+n/ixAmoVCqsW7dOKqtRowYcHBywbdu2N76vixYtgrGxseyo5ty5c6FSqTB69GipLCsrCzY2Nvjiiy9k/co+RT116lSMHTsWAODt7Z3nfrqw3xP3799H27ZtYWVlBRcXF+kSlJzrND/fhwCQnp6OyZMno0aNGrCzs4OVlRUCAwO1Evjsz8d3332HH3/8ET4+PlCr1ahVqxbOnDkj1evduzeWLFkivS85t7P8Xm+7a9cuBAYGwsrKCjY2NggJCcHVq1ffOE9YWBg6duwIAGjSpInUds7P+rFjx1C7dm2Ym5ujdOnS+N///qe1HJVKhcOHD2Pw4MFwcXFBiRIlChRbbGws+vTpgxIlSkCtVsPd3R0fffRRrt/Xb4sHAP7++2907NgRDg4OsLS0RN26dbFz5843vh/Zctsv5Gb9+vWoUaMGbGxsYGtri0qVKmHhwoX5aiObXn9q9OjRA19++SX27t2LAQMG5Frn6tWraN26NSpXrozp06dDrVbj5s2bOH78OIBXO97p06dj8uTJGDhwIAIDAwEA9erVk5bx5MkTtGzZEl26dMEnn3wCV1fXN8Y1c+ZMqFQqfPHFF4iPj8eCBQsQFBSECxcuwMLCIt/9y09srxNC4MMPP8TBgwfRr18/VK1aFXv27MHYsWPxzz//aO0Ujx07hs2bN2Pw4MGwsbHBokWL0L59e9y9exeOjo55xvXy5Us0btwYN2/exNChQ+Ht7Y1Nmzahd+/eePbsGUaMGAE/Pz+sXr0ao0aNQokSJTBmzBgAgLOzc57LXbp0KSpUqIAPP/wQJiYm+OOPPzB48GBoNBoMGTIEANC5c2f07NkTZ86ckZJ1ALhz5w5OnjyJb7/9ViqbOXMmJk2ahE6dOqF///549OgRFi9ejIYNG+L8+fOyI095reOwsDBYW1tj9OjRsLa2xoEDBzB58mQkJSXJ2lq6dCmGDh2KwMBAjBo1Crdv30bbtm1RrFgx2c5Bo9Hgww8/xLFjxzBw4ED4+fnh8uXLmD9/Pq5fv56v69WePn2K1q1bo0uXLujYsSOWLl2KLl26YM2aNRg5ciQ+/fRTdOvWDd9++y06dOiAe/fuwcbGBsCr7SEwMBC2trYYN24cTE1NsXz5cjRu3FhKZl83bNgwFCtWDFOmTMHt27exYMECDB06FBs2bAAALFiwAMOGDYO1tTUmTpwIAFrbx9uWkZO1tTXatWuHDRs2YN68ebJTvevWrYMQQkoac6PRaNCmTRucPn0an332GcqXL49t27ahV69eudbPzMxEcHAwGjRogO+++w6Wlpb53pbetn8BXl1aM3z4cHTo0AEjRoxAamoqLl26hFOnTqFbt265xtSwYUPpmusPPvgAPXv2zLO/wKvPaZ8+fVCrVi2EhoYiLi4OCxcuxPHjx6XPeoMGDaBSqXDkyBFUrlwZAHD06FEYGRnh2LFj0rIePXqEqKgo6WaW/PQxLydOnED16tW1yvfu3Yv27dvD398foaGhePLkifSFWFjh4eH4+++/0adPH7i5ueHq1av48ccfcfXqVZw8eVIroe/UqRO8vb0RGhqKc+fO4aeffoKLiwtmz54NAFi9ejX69++P2rVrY+DAgQCQa2IN5L0dlC5dGvXr18eaNWswatQ
"text/plain": [
"<Figure size 640x480 with 1 Axes>"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"plt.figure()\n",
"plt.hist(dfc[\"gross_flow_qty_mean\"], bins=100)\n",
"plt.axvline(q33, linestyle=\"--\")\n",
"plt.axvline(q66, linestyle=\"--\")\n",
"plt.xlabel(\"Average monthly gross flow (quantity)\")\n",
"plt.ylabel(\"Count\")\n",
"plt.title(\"Distribution of average monthly gross flows (quantity) with quantile thresholds\")\n",
"plt.show()\n",
"\n",
"#X= activite moyenen mensuelle , Y = combien de client ont cette valeurde X"
]
},
{
"cell_type": "code",
"execution_count": 18,
"id": "4e6a7679-386a-468c-84f8-5179eab21f82",
"metadata": {},
"outputs": [
{
"data": {
"image/png": "iVBORw0KGgoAAAANSUhEUgAAAkQAAAHHCAYAAABeLEexAAAAOnRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjEwLjgsIGh0dHBzOi8vbWF0cGxvdGxpYi5vcmcvwVt1zgAAAAlwSFlzAAAPYQAAD2EBqD+naQAAZeJJREFUeJzt3XdYFNf7NvB7WXpZEKQKgiVRUGzYsBeUKBo1aixE0ViiwUpi+yb2WKKJ3VhS1BiNJYnGiA0bNmJBsUvUYBcwFlYEKbvn/cOX+bkCgggMOvfnuva6ds6cPfPM7uzss2fOzKiEEAJERERECmYkdwBEREREcmNCRERERIrHhIiIiIgUjwkRERERKR4TIiIiIlI8JkRERESkeEyIiIiISPGYEBEREZHiMSEiIiIixWNCpDAqlQqTJk2SpleuXAmVSoVr164V+bL79OkDLy8vafratWtQqVT45ptvinzZADBp0iSoVKpiWVZBZWZmYvTo0fDw8ICRkRE6duwod0hUTPbv3w+VSoXffvstz7ovfpfomeTkZPTv3x8uLi5QqVQYMWKEtJ9ZuXKl3OHli16vR9WqVTFt2jS5Qyk0Wdv2/v37C63NHTt2wNraGvfu3Su0NpkQFZKsxOLEiRNyh1LkUlJSMGnSpELduAtLSY4tP3766SfMnj0bXbp0wapVqzBy5Ei5Q6JCtnbtWsybN0/uMN5K06dPx8qVKzF48GCsXr0avXr1kjukV/brr7/i5s2bGDJkiNyhvLLvvvsu34nn634P3nvvPVSsWBEzZswocBvZCCoUK1asEADE8ePH5Q7lpQCIiRMnStOZmZkiNTVV6PX6fLdx7969bO3kR3p6unj69Kk0HRcXJwCI2bNnv1I7BY0tIyNDpKamFtqyikK3bt1EmTJl5A6DilBQUJDw9PTMVr5v3z4BQGzcuDHPNkJCQnJsQ+nq1asnGjZsaFCWtZ9ZsWKFPEG9ourVq4uBAwfKHUaBVKlSRTRt2jRbuU6nE6mpqUKn00lluX0PXsV3330nLC0thVarfa12srCHSOHUajXMzc2L9FDSkydPAAAmJiYwMzMrsuXkxdjYGObm5rItPz8SExNhZ2cndxivLOszJuUpSZ/9m/r9yXLq1CmcPn0aH374odyhFCojIyOYm5vDyKhwU47OnTsjLS0NGzduLJT2mBAVs1OnTqFNmzbQaDSwtrZGy5Yt8ffff2erd+bMGTRt2hQWFhZwd3fHV199hRUrVuR7vE9aWhpGjhwJR0dH2NjY4P3338etW7ey1ctpDNGJEycQGBiI0qVLw8LCAuXKlcPHH38M4Nm4H0dHRwDA5MmToVKpDMYl9enTB9bW1rh69Sratm0LGxsbBAcHS/NyG/cwd+5ceHp6wsLCAk2bNsW5c+cM5jdr1gzNmjXL9rrn28wrtpzGEGVmZmLq1KmoUKECzMzM4OXlhf/9739IS0szqOfl5YV27drh0KFDqFu3LszNzVG+fHn8/PPPOa7Pi548eYLPPvsMHh4eMDMzQ6VKlfDNN99ACCHFrlKpsG/fPpw/f16K/WWH/v78808EBQXBzc0NZmZmqFChAqZOnQqdTifVGTJkCKytrZGSkpLt9T169ICLi4tB/e3bt6Nx48awsrKCjY0NgoKCcP78eYPXvewzPnjwILp27YqyZcvCzMwMHh4eGDlyJFJTU7Mtf+PGjfDx8YG5uTmqVq2KTZs25biN6PV6zJs3D1WqVIG5uTmcnZ3xySef4OHDh3m+71mx3rhxA+3atYO1tTXKlCmDxYsXAwDOnj2LFi1awMrKCp6enli7dm22Nv7991907doV9vb2sLS0RP369REeHm5QJ2uMxIYNGzBt2jS4u7vD3NwcLVu2xJUrV6R6zZo1Q3h4OK5fvy59xjmt78vaeJEQAl5eXujQoUO2eU+fPoWtrS0++eSTl75PqampGDZsGEqXLi3tL27fvp1tzGHWd+jChQvo2bMnSpUqhUaNGgHI/3fpZfuXLOvWrYOfnx9sbGyg0Wjg6+uL+fPn5xp/1vsfFxeH8PBw6b192b5y79690rZuZ2eHDh064OLFi9L8M2fOQKVSYcuWLVJZdHQ0VCoVatWqZdBWmzZtUK9evVdax5xs3rwZpqamaNKkSbZ5hw4dQp06dWBubo4KFSpg2bJl2fZpLxsv9eJnef36dXz66aeoVKkSLCws4ODggK5du2Z7z7J+Iw4fPoywsDA4OjrCysoKnTp1Mhi/4+XlhfPnzyMyMlJ6/7P22S+OIcrte5CcnAwrKysMHz48W/y3bt2CWq02OETm5OSEatWq4c8//8zzvc0P40JphfLl/PnzaNy4MTQaDUaPHg0TExMsW7YMzZo1Q2RkpPSFun37Npo3bw6VSoVx48bBysoKP/zwwyv1rvTv3x+//PILevbsiQYNGmDv3r0ICgrK83WJiYlo3bo1HB0dMXbsWNjZ2eHatWv4448/AACOjo5YsmQJBg8ejE6dOuGDDz4AAFSrVk1qIzMzE4GBgWjUqBG++eYbWFpavnSZP//8Mx4/fozQ0FA8ffoU8+fPR4sWLXD27Fk4Ozvne53zE9uL+vfvj1WrVqFLly747LPPcPToUcyYMQMXL17Epk2bDOpeuXIFXbp0Qb9+/RASEoKffvoJffr0gZ+fH6pUqZLrMoQQeP/997Fv3z7069cPNWrUwM6dOzFq1Cjcvn0bc+fOhaOjI1avXo1p06YhOTlZ+tJ7e3vn2u7KlSthbW2NsLAwWFtbY+/evZgwYQK0Wi1mz54NAOjWrRsWL16M8PBwdO3aVXptSkoK/vrrL/Tp0wdqtRoAsHr1aoSEhCAwMBBff/01UlJSsGTJEjRq1AinTp0y+OHO7TPeuHEjUlJSMHjwYDg4OODYsWNYuHAhbt26ZfAvLjw8HN26dYOvry9mzJiBhw8fol+/fihTpky29fzkk0+wcuVK9O3bF8OGDUNcXBwWLVqEU6dO4fDhwzAxMcn1PQIAnU6HNm3aoEmTJpg1axbWrFmDIUOGwMrKCl988QWCg4PxwQcfYOnSpejduzf8/f1Rrlw5AEBCQgIaNGiAlJQUDBs2DA4ODli1ahXef/99/Pbbb+jUqZPBsmbOnAkjIyN8/vnnSEpKwqxZsxAcHIyjR48CAL744gskJSXh1q1bmDt3LgDA2tr6ldp4kUqlwkcffYRZs2bhwYMHsLe3l+b99ddf0Gq1+Oijj176HvXp0wcbNmxAr169UL9+fURGRr50f9G1a1e88847mD59upTU5+e7lNf+BQAiIiLQo0cPtGzZEl9//TUA4OLFizh8+HCOP5TAs+/J6tWrMXLkSLi7u+Ozzz4D8GyfkNOg2927d6NNmzYoX748Jk2ahNTUVCxcuBANGzbEyZMn4eXlhapVq8LOzg4HDhzA+++/D+BZwm9kZITTp09Dq9VCo9FAr9fjyJEjGDhwYL7XMTdHjhxB1apVs23TZ8+eldqcNGkSMjMzMXHixFfaP77o+PHjOHLkCLp37w53d3dcu3YNS5YsQbNmzXDhwoVs++2hQ4eiVKlSmDhxIq5du4Z58+ZhyJAhWL9+PQBg3rx5GDp0KKytrfHFF18AQK7x5fY9sLa2RqdOnbB+/XrMmTNH2jcBz8ZWCSGkP19Z/Pz8sHnz5gK/DwYK5cAb5WsMUceOHYWpqam4evWqVHbnzh1hY2MjmjRpIpUNHTpUqFQqcerUKans/v37wt7eXgAQcXFxL40lJiZGABCffvqpQXnPnj2zja/JijurzU2bNuW5Hi8bpxMSEiIAiLFjx+Y47/ljxlnH9i0sLMStW7ek8qNHjwoAYuTIkVJZ06ZNczw2/WKbL4tt4sSJ4vlNPut96t+/v0G9zz//XAAQe/fulco8PT0FAHHgwAGpLDExUZiZmYnPPvss27Ket3nzZgFAfPXVVwblXbp0ESqVSly5csVgPatUqfLS9rKkpKRkK/vkk0+EpaWlNFZLr9eLMmXKiM6dOxvU27Bhg8H6PH78WNjZ2YkBAwYY1IuPjxe2trYG5S/7jHOKacaMGUKlUonr169LZb6+vsLd3V08fvxYKtu
"text/plain": [
"<Figure size 640x480 with 1 Axes>"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"plt.figure()\n",
"plt.hist(np.log1p(dfc[\"gross_flow_qty_mean\"]), bins=100)\n",
"plt.axvline(np.log1p(q33), linestyle=\"--\")\n",
"plt.axvline(np.log1p(q66), linestyle=\"--\")\n",
"plt.xlabel(\"log(1 + avg monthly gross flow) (quantity)\")\n",
"plt.ylabel(\"Count\")\n",
"plt.title(\"Log-distribution of average monthly gross flows (quantity)\")\n",
"plt.show()\n"
]
},
{
"cell_type": "code",
"execution_count": 19,
"id": "658f167f-1e8d-4c34-ae7e-bd1d4759d5c4",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"(array([415., 187., 172., 173., 162., 136., 125., 126., 112., 136., 134.,\n",
" 153., 142., 150., 154., 158., 163., 174., 135., 133., 153., 128.,\n",
" 130., 139., 120., 116., 118., 101., 82., 89., 100., 90., 82.,\n",
" 104., 86., 104., 90., 82., 87., 70., 78., 85., 66., 62.,\n",
" 91., 74., 63., 78., 62., 70., 51., 65., 48., 44., 42.,\n",
" 53., 52., 43., 35., 35., 22., 27., 29., 25., 21., 21.,\n",
" 25., 15., 19., 9., 8., 7., 9., 7., 4., 7., 4.,\n",
" 1., 4., 2., 2., 1., 0., 5., 2., 2., 0., 2.,\n",
" 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0.,\n",
" 1.]),\n",
" array([1.24999219e-05, 1.51704407e-01, 3.03396315e-01, 4.55088222e-01,\n",
" 6.06780130e-01, 7.58472037e-01, 9.10163945e-01, 1.06185585e+00,\n",
" 1.21354776e+00, 1.36523967e+00, 1.51693157e+00, 1.66862348e+00,\n",
" 1.82031539e+00, 1.97200730e+00, 2.12369920e+00, 2.27539111e+00,\n",
" 2.42708302e+00, 2.57877493e+00, 2.73046683e+00, 2.88215874e+00,\n",
" 3.03385065e+00, 3.18554256e+00, 3.33723446e+00, 3.48892637e+00,\n",
" 3.64061828e+00, 3.79231019e+00, 3.94400209e+00, 4.09569400e+00,\n",
" 4.24738591e+00, 4.39907782e+00, 4.55076972e+00, 4.70246163e+00,\n",
" 4.85415354e+00, 5.00584545e+00, 5.15753735e+00, 5.30922926e+00,\n",
" 5.46092117e+00, 5.61261308e+00, 5.76430498e+00, 5.91599689e+00,\n",
" 6.06768880e+00, 6.21938071e+00, 6.37107261e+00, 6.52276452e+00,\n",
" 6.67445643e+00, 6.82614834e+00, 6.97784024e+00, 7.12953215e+00,\n",
" 7.28122406e+00, 7.43291597e+00, 7.58460787e+00, 7.73629978e+00,\n",
" 7.88799169e+00, 8.03968360e+00, 8.19137550e+00, 8.34306741e+00,\n",
" 8.49475932e+00, 8.64645123e+00, 8.79814313e+00, 8.94983504e+00,\n",
" 9.10152695e+00, 9.25321886e+00, 9.40491076e+00, 9.55660267e+00,\n",
" 9.70829458e+00, 9.85998649e+00, 1.00116784e+01, 1.01633703e+01,\n",
" 1.03150622e+01, 1.04667541e+01, 1.06184460e+01, 1.07701379e+01,\n",
" 1.09218298e+01, 1.10735217e+01, 1.12252137e+01, 1.13769056e+01,\n",
" 1.15285975e+01, 1.16802894e+01, 1.18319813e+01, 1.19836732e+01,\n",
" 1.21353651e+01, 1.22870570e+01, 1.24387489e+01, 1.25904408e+01,\n",
" 1.27421327e+01, 1.28938246e+01, 1.30455165e+01, 1.31972085e+01,\n",
" 1.33489004e+01, 1.35005923e+01, 1.36522842e+01, 1.38039761e+01,\n",
" 1.39556680e+01, 1.41073599e+01, 1.42590518e+01, 1.44107437e+01,\n",
" 1.45624356e+01, 1.47141275e+01, 1.48658194e+01, 1.50175113e+01,\n",
" 1.51692032e+01]),\n",
" <BarContainer object of 100 artists>)"
]
},
"execution_count": 19,
"metadata": {},
"output_type": "execute_result"
},
{
"data": {
"image/png": "iVBORw0KGgoAAAANSUhEUgAAAigAAAGdCAYAAAA44ojeAAAAOnRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjEwLjgsIGh0dHBzOi8vbWF0cGxvdGxpYi5vcmcvwVt1zgAAAAlwSFlzAAAPYQAAD2EBqD+naQAAKVRJREFUeJzt3X90VPWd//FXfpCBADNpgskkSwJUrRAISEHDFNdlJRJCSuUQt8VSSJEDR85ghXQpxgUUrAZpVxBXofZ0gT1LFuse0QUXMASMdQk/DM3yS1NhqcGFSawsGYiHJCT3+8cu8+1A+DFhhvlk8nycc8/Jvfczd96fg5159fP53DtRlmVZAgAAMEh0uAsAAAC4EgEFAAAYh4ACAACMQ0ABAADGIaAAAADjEFAAAIBxCCgAAMA4BBQAAGCc2HAX0BFtbW06ffq0evfuraioqHCXAwAAboJlWTp//rzS0tIUHX39MZJOGVBOnz6t9PT0cJcBAAA64NSpU+rbt+9123TKgNK7d29J/9tBu90e5moAAMDN8Hq9Sk9P932PX0+nDCiXp3XsdjsBBQCATuZmlmewSBYAABiHgAIAAIxDQAEAAMYhoAAAAOMQUAAAgHEIKAAAwDgEFAAAYBwCCgAAMA4BBQAAGIeAAgAAjENAAQAAxiGgAAAA4xBQAACAcQgoAADAOLHhLsBE/Z9+z2//j8vzw1QJAABdEyMoAADAOAQUAABgHAIKAAAwDgEFAAAYh4ACAACMQ0ABAADGIaAAAADjEFAAAIBxbimgLF++XFFRUZo3b57v2MWLF+V2u5WUlKRevXqpoKBAdXV1fq+rra1Vfn6+4uPjlZycrAULFujSpUu3UgoAAIggHQ4oBw4c0K9+9SsNHTrU7/j8+fO1ZcsWvfXWW6qoqNDp06c1efJk3/nW1lbl5+erublZe/bs0YYNG7R+/XotWbKk470AAAARpUMB5cKFC5o6dap+/etf6xvf+IbveENDg37zm9/o5Zdf1kMPPaQRI0Zo3bp12rNnj/bu3StJev/993Xs2DH98z//s+69917l5eXp+eef12uvvabm5ubg9AoAAHRqHQoobrdb+fn5ysnJ8TteVVWllpYWv+MDBw5URkaGKisrJUmVlZXKyspSSkqKr01ubq68Xq+OHj3a7vs1NTXJ6/X6bQAAIHIF/GOBmzZt0sGDB3XgwIGrznk8HsXFxSkhIcHveEpKijwej6/Nn4eTy+cvn2tPSUmJli5dGmipAACgkwpoBOXUqVN66qmntHHjRnXv3j1UNV2luLhYDQ0Nvu3UqVO37b0BAMDtF1BAqaqqUn19vb797W8rNjZWsbGxqqio0OrVqxUbG6uUlBQ1Nzfr3Llzfq+rq6uT0+mUJDmdzqvu6rm8f7nNlWw2m+x2u98GAAAiV0ABZezYsTp8+LCqq6t928iRIzV16lTf3926dVN5ebnvNTU1NaqtrZXL5ZIkuVwuHT58WPX19b42ZWVlstvtyszMDFK3AABAZxbQGpTevXtryJAhfsd69uyppKQk3/GZM2eqqKhIiYmJstvtevLJJ+VyuTRq1ChJ0rhx45SZmalp06ZpxYoV8ng8WrRokdxut2w2W5C6BQAAOrOAF8neyMqVKxUdHa2CggI1NTUpNzdXr7/+uu98TEyMtm7dqjlz5sjlcqlnz54qLCzUsmXLgl0KAADopKIsy7LCXUSgvF6vHA6HGhoaQrIepf/T7/nt/3F5ftDfAwCAriaQ729+iwcAABiHgAIAAIxDQAEAAMYhoAAAAOMQUAAAgHEIKAAAwDgEFAAAYBwCCgAAMA4BBQAAGIeAAgAAjENAAQAAxiGgAAAA4xBQAACAcQgoAADAOAQUAABgHAIKAAAwDgEFAAAYh4ACAACMQ0ABAADGIaAAAADjEFAAAIBxCCgAAMA4BBQAAGAcAgoAADAOAQUAABiHgAIAAIxDQAEAAMYhoAAAAOMQUAAAgHEIKAAAwDgEFAAAYJyAAsqaNWs0dOhQ2e122e12uVwubdu2zXd+zJgxioqK8tueeOIJv2vU1tYqPz9f8fHxSk5O1oIFC3Tp0qXg9AYAAESE2EAa9+3bV8uXL9fdd98ty7K0YcMGPfLII/r973+vwYMHS5JmzZqlZcuW+V4THx/v+7u1tVX5+flyOp3as2ePzpw5o+nTp6tbt2568cUXg9QlAADQ2QUUUCZOnOi3/8ILL2jNmjXau3evL6DEx8fL6XS2+/r3339fx44d086dO5WSkqJ7771Xzz//vBYuXKjnnntOcXFxHewGAACIJB1eg9La2qpNmzapsbFRLpfLd3zjxo3q06ePhgwZouLiYn399de+c5WVlcrKylJKSorvWG5urrxer44ePXrN92pqapLX6/XbAABA5ApoBEWSDh8+LJfLpYsXL6pXr17avHmzMjMzJUk//OEP1a9fP6WlpenQoUNauHChampq9Pbbb0uSPB6PXziR5Nv3eDzXfM+SkhItXbo00FIBAEAnFXBAueeee1RdXa2Ghgb967/+qwoLC1VRUaHMzEzNnj3b1y4rK0upqakaO3asTpw4oTvvvLPDRRYXF6uoqMi37/V6lZ6e3uHrAQAAswU8xRMXF6e77rpLI0aMUElJiYYNG6ZXXnml3bbZ2dmSpOPHj0uSnE6n6urq/Npc3r/WuhVJstlsvjuHLm8AACBy3fJzUNra2tTU1NTuuerqaklSamqqJMnlcunw4cOqr6/3tSkrK5PdbvdNEwEAAAQ0xVNcXKy8vDxlZGTo/PnzKi0t1QcffKAdO3boxIkTKi0t1YQJE5SUlKRDhw5p/vz5evDBBzV06FBJ0rhx45SZmalp06ZpxYoV8ng8WrRokdxut2w2W0g6CAAAOp+AAkp9fb2mT5+uM2fOyOFwaOjQodqxY4cefvhhnTp1Sjt37tSqVavU2Nio9PR0FRQUaNGiRb7Xx8TEaOvWrZozZ45cLpd69uypwsJCv+emAAAARFmWZYW7iEB5vV45HA41NDSEZD1K/6ff89v/4/L8oL8HAABdTSDf3/wWDwAAMA4BBQAAGIeAAgAAjENAAQAAxiGgAAAA4xBQAACAcQgoAADAOAQUAABgHAIKAAAwDgEFAAAYh4ACAACMQ0ABAADGIaAAAADjEFAAAIBxCCgAAMA4BBQAAGAcAgoAADAOAQUAABiHgAIAAIxDQAEAAMYhoAAAAOMQUAAAgHEIKAAAwDgEFAAAYBwCCgAAMA4BBQAAGIeAAgAAjENAAQAAxiGgAAAA4xBQAACAcQgoAADAOAQUAABgnIACypo1azR06FDZ7XbZ7Xa5XC5t27bNd/7ixYtyu91KSkpSr169VFBQoLq6Or9r1NbWKj8/X/Hx8UpOTtaCBQt06dKl4PQGAABEhIACSt++fbV8+XJVVVXp448/1kMPPaRHHnlER48elSTNnz9fW7Zs0VtvvaWKigqdPn1akydP9r2+tbVV+fn5am5u1p49e7RhwwatX79eS5YsCW6vAABApxZlWZZ1KxdITEzUL37xCz366KO64447VFpaqkcffVSS9Omnn2rQoEGqrKzUqFGjtG3bNn33u9/V6dOnlZKSIklau3atFi5cqC+//FJxcXE39Z5er1cOh0MNDQ2y2+23Un67+j/9nt/+H5fnB/09AADoagL5/u7wGpTW1lZt2rRJjY2NcrlcqqqqUktLi3JycnxtBg4cqIyMDFVWVkqSKisrlZWV5QsnkpSbmyuv1+sbhWlPU1OTvF6v3wYAACJXwAHl8OHD6tWrl2w2m5544glt3rxZmZmZ8ng8iouLU0JCgl/7lJQUeTweSZLH4/ELJ5fPXz53LSUlJXI4HL4tPT090LIBAEAnEnBAueeee1RdXa19+/Zpzpw5Kiws1LFjx0JRm09xcbEaGhp826lTp0L6fgAAILxiA31BXFyc7rrrLknSiBEjdODAAb3yyiv6wQ9+oObmZp07d85vFKWurk5Op1OS5HQ6tX//fr/rXb7L53Kb9thsNtlstkBLBQAAndQtPwelra1NTU1NGjFihLp166by8nLfuZqaGtXW1srlckmSXC6
"text/plain": [
"<Figure size 640x480 with 1 Axes>"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"plt.hist(np.log1p(dfc.loc[dfc[\"gross_flow_qty_mean\"]>0, \"gross_flow_qty_mean\"]), bins=100)\n"
]
},
{
"cell_type": "code",
"execution_count": 20,
"id": "42eb180e-a512-4d5c-8179-91b88e3067df",
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"/tmp/ipykernel_8776/4281614443.py:3: MatplotlibDeprecationWarning: The 'labels' parameter of boxplot() has been renamed 'tick_labels' since Matplotlib 3.9; support for the old name will be dropped in 3.11.\n",
" plt.boxplot(\n"
]
},
{
"data": {
"image/png": "iVBORw0KGgoAAAANSUhEUgAAAmsAAAGzCAYAAABwyVA7AAAAOnRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjEwLjgsIGh0dHBzOi8vbWF0cGxvdGxpYi5vcmcvwVt1zgAAAAlwSFlzAAAPYQAAD2EBqD+naQAAdshJREFUeJzt3Xl8TPf+P/DXZMgispDIhiQktkgqxBpCQjS1p7HVGlGqbdBKUbRFey0ttbUGl9L0KqU0aKkWITdBehGidkEstSSxZBWJzJzfH34zXyOTZCaZycxkXs/HIw/mnM+c856ZkzPvfFaRIAgCiIiIiMggmek7ACIiIiIqG5M1IiIiIgPGZI2IiIjIgDFZIyIiIjJgTNaIiIiIDBiTNSIiIiIDxmSNiIiIyIAxWSMiIiIyYEzWiIiIiAyYXpI1T09PjBs3TvE4ISEBIpEICQkJ+gin0mJjYyESiXDz5k19h4Jx48bB09NTaZtIJML8+fOVtp08eRKBgYGwtraGSCRCamoqAOCPP/6Av78/LC0tIRKJkJ2dXS1xa9ur11ZZqvLZzZ8/HyKRSPPgtOjmzZsQiUSIjY1Vu+zXX39dYVldvLaMjAwMGTIEDg4OEIlEWLlypUH9zqv6PTFW6l7/ZPj69u2LiRMn6jsMrdHknlUTPX/+HI0bN8aaNWsq9XytJmvXr1/HpEmT0LRpU1haWsLW1hZdu3bFqlWrUFhYqM1TVcqiRYuwe/dufYehN8+fP8fQoUPx+PFjrFixAps3b4aHhwcePXqEYcOGwcrKChKJBJs3b4a1tbW+wyUN/f777waZdEybNg1//vknZs+ejc2bN+ONN97Qd0haY+j3lIsXL2L+/PkG8QdlTXTv3j3Mnz9f8Uevthw7dgwHDhzAxx9/rNXjVoetW7di5cqVapXV9T1LKpXCzc0NIpEI+/fvV1lm3LhxqFu3bpnHqFu3rsrKJZFIhB9//FHlc7p27QqRSARfX1/Fttq1ayMmJgYLFy7Es2fPNH8xgpbs3btXsLKyEuzt7YWpU6cK69evF1avXi289dZbQu3atYWJEycqynp4eAiRkZGKx1KpVCgsLBSkUqm2wlHJ2tpa6bxVVVJSIhQWFgoymUxrx6ysyMhIwcPDQ2lbYWGh8Pz5c8XjS5cuCQCEDRs2KJXbv3+/AEA4ePBgdYSqU8+ePROKi4srLPf9998LAIT09HSNzzFv3jxBi786lSKTyYTCwkKhpKREsS06OlplXOnp6QIAYenSpRUe9/nz50JhYaFWY3V2dhZGjRqltO3IkSMCAOHIkSNaPVdlvPp7oglt31Oq6tXrf8eOHQbzPtdEJ0+eFAAI33//vVaPO2jQIOH111/X6jGrS79+/Up9FwmCZvcsbTlw4IAAQPD09Cx1D5KLjIwUrK2tyzzGq7/j8nuXpaWl0KdPn1Ll5fdbS0tLoXXr1kr7njx5IpibmwsbN27U+LXU0jy9Ky09PR1vvfUWPDw8cPjwYbi6uir2RUdH49q1a9i3b1+ZzzczM4OlpaU2QqlWYrEYYrFY32GU6dX3NDMzEwBgb2+v1vaqKCgo0EvtnIWFRbWfUx9EIpFOfmdq1aqFWrW0cltQyMzM1Oq1pW3GeO8pi6lc/zVZZmYm9u3bh3Xr1uk7FK3S1T2rPD/++CPatWuHyMhIzJkzR6vfS3379sWvv/6Khw8fwtHRUbF969atcHZ2RrNmzfDkyROl59jb2+P1119HbGwsxo8fr9kJNU7vVHj33XcFAMKxY8fUKv9qzVpZf2X/9ddfQlhYmGBraytYWVkJ3bt3F44ePapURl7LkZaWJkRGRgp2dnaCra2tMG7cOKGgoEBRDkCpn4r+Iv7mm28EHx8fRY1hQECAsGXLFsX+V2tn5LFUdC6pVCqsWLFC8PHxESwsLAQnJyfhnXfeER4/fqzW+7dr1y6hdevWgoWFhdC6dWshLi5OZc0aAGHevHmCILz46+HVmHr06CH06NGj3Fg1+QwuXLggjBgxQrC3txf8/f0FQRCE+/fvC+PGjRMaNmwomJubCy4uLsLAgQPLrdHas2ePAEA4e/asYtvOnTsFAMKbb76pVLZly5bCsGHDFI9fvbYEQRDOnz8vhISECJaWlkLDhg2Ff/3rX8LGjRtV1qz9/vvvQrdu3YQ6deoIdevWFfr27SucP39e5euV6969u/Daa6+pfC3Nmzcv9y/kadOmCfXr11eqnZ08ebIAQFi1apVi24MHDwQAwpo1awRB+L+/3uR/0av6fOUxvlyz9u9//1to2rSpYG5uLrRv3144ceJEua9NEF5cR9HR0YrrztzcXPDx8RH2799f5usShP/7/VAVU1m/8z///LPQrl07wdLSUnBwcBBGjRol/PPPP4r9Vbk2yvLy78nL70FV7yn//POPEBUVJTg5OSnes1f/opa/D9u3bxcWLFggNGzYULCwsBB69uwppKWlKZW9evWqEBERITg7OwsWFhZCw4YNheHDhwvZ2dmKMi9f/2W9/0eOHBHGjh0rODg4qKyF7t27t9C8efNy3zN1YhEEQdi8ebPi86xXr54wfPhw4fbt26WOt3r1aqFJkyaCpaWl0KFDByExMVFxf1L1Xs2fP19wc3MT6tatKwwePFjIzs4Wnj17JnzwwQdCgwYNBGtra2HcuHHCs2fPSp1LnZh69OghtG7dWrhw4YIQHBwsWFlZCW5ubsJXX31VKp5Xf+S/k+q+R6/atGmTAEC4efNmqX3q3stevablXr0/Pnr0SPjoo48EX19fwdraWrCxsRHeeOMNITU1Vel56l6nqr5P5N9L6t6zZDKZ4OHhIQwcOLBU/IWFhYKtra3wzjvvlPseCoIgPH36VLCxsRGWLFki3L9/XzAzM1P6/parbM3aDz/8IFhbWyvuyXKtW7cWpkyZoriGXrVq1SpBJBIJjx49qvA1vEwrf0L/9ttvaNq0KQIDA7VxOADA4cOH0adPHwQEBGDevHkwMzPD999/j549eyIpKQkdO3ZUKj9s2DA0adIEixcvxunTp/Hdd9/ByckJX331FQBg8+bNmDBhAjp27Ih33nkHAODl5VXm+Tds2ICpU6diyJAh+OCDD/Ds2TP8/fff+N///oeRI0eqfE5ERAS8vb2VtqWkpGDlypVwcnJSbJs0aRJiY2MRFRWFqVOnIj09HatXr8aZM2dw7Ngx1K5du8y4Dhw4gMGDB8PHxweLFy/Go0ePEBUVhUaNGpX7fk6aNAkNGzbEokWLMHXqVHTo0AHOzs4AgBYtWmD9+vX44osv0KRJE8X7oulnMHToUDRr1gyLFi2CIAgAgMGDB+PChQuYMmUKPD09kZmZiYMHD+L27dulBkTIdevWDSKRCImJiXjttdcAAElJSTAzM8PRo0cV5bKysnD58mVMnjy5zNf94MEDhISEoKSkBLNmzYK1tTXWr18PKyurUmU3b96MyMhIhIWF4auvvsLTp0+xdu1adOvWDWfOnCkz3jFjxmDixIk4f/68Uh+FkydP4urVq/j000/LjC8oKAgrVqzAhQsXFM+Vv9akpCRMnTpVsQ0AunfvrvI4kyZNwr1793Dw4EFs3rxZZZmtW7ciLy8PkyZNgkgkwpIlSxAREYEbN26Ue80BwNGjRxEXF4f3338fNjY2+OabbzB48GDcvn0bDg4OKp/TvXt3bN68GWPGjEHv3r0xduzYcs8h/53o0KEDFi9ejIyMDKxatQrHjh3DmTNnYG9vr9VroyJVuadkZGSgc+fOEIlEmDx5Mho0aID9+/fj7bffRm5uLj788EOlc3355ZcwMzPD9OnTkZOTgyVLlmDUqFH43//+BwAoLi5GWFgYioqKMGXKFLi4uODu3bvYu3cvsrOzYWdnVyr+7t27Y+rUqfjmm28wZ84ctGrVCgDQqlUrjBkzBv/5z3/w559/on///ornPHjwAIcPH8a8efPKfF/UjWXhwoX47LPPMGzYMEyYMAFZWVn49ttv0b17d8XnCQBr167F5MmTERQUhGnTpuHmzZsIDw9HvXr1VN7XFi9eDCsrK8yaNQvXrl3Dt99+i9q1a8PMzAxPnjzB/Pnz8dd
"text/plain": [
"<Figure size 640x480 with 1 Axes>"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"text/plain": [
"' Ce que tu vois sur ton graphe\\n\\nLes trois segments contiennent des outliers énormes.\\nIl y a une forte dispersion dans chaque segment.\\nMême dans Low-flow et Mid-flow, tu as des comptes avec AUM très grandes (points très haut).\\nEt inversement, dans High-flow, tu as aussi des comptes relativement petits.\\n\\nCest ça qui justifie la phrase :\\n“Flow intensity alone does not uniquely define client type because accounts of very different sizes may fall in the same flow segment.”\\n'"
]
},
"execution_count": 20,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"#limite client size \n",
"plt.figure()\n",
"plt.boxplot(\n",
" [dfc.loc[dfc[\"seg_quantiles\"]==s, \"aum_qty_mean\"].dropna()\n",
" for s in [\"Low-flow\",\"Intermediate-flow\",\"High-flow\"]],\n",
" labels=[\"Low\",\"Mid\",\"High\"]\n",
")\n",
"plt.yscale(\"log\")\n",
"plt.ylabel(\"Mean AUM (quantity) [log scale]\")\n",
"plt.title(\"Client size differs widely within flow-intensity segments (quantity AUM)\")\n",
"plt.show()\n",
"\n",
"\n",
"''' Ce que tu vois sur ton graphe\n",
"\n",
"Les trois segments contiennent des outliers énormes.\n",
"Il y a une forte dispersion dans chaque segment.\n",
"Même dans Low-flow et Mid-flow, tu as des comptes avec AUM très grandes (points très haut).\n",
"Et inversement, dans High-flow, tu as aussi des comptes relativement petits.\n",
"\n",
"Cest ça qui justifie la phrase :\n",
"“Flow intensity alone does not uniquely define client type because accounts of very different sizes may fall in the same flow segment.”\n",
"'''"
]
},
{
"cell_type": "markdown",
"id": "ae004a9f-3a5b-4570-a07c-2f4882fff3c0",
"metadata": {},
"source": [
"## Baseline 2"
]
},
{
"cell_type": "code",
"execution_count": 21,
"id": "0e10f5a2-d678-484e-8789-a907635a5a20",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"count 9743.000000\n",
"mean 59.264703\n",
"std 38.537320\n",
"min 6.000000\n",
"25% 26.000000\n",
"50% 50.000000\n",
"75% 80.000000\n",
"max 130.000000\n",
"Name: n_months, dtype: float64"
]
},
"execution_count": 21,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"dfc[\"n_months\"].describe()"
]
},
{
"cell_type": "code",
"execution_count": 22,
"id": "0eb64794-a3b2-4aa3-a43c-62a39eab35e2",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"seg_2D\n",
"Highly active (high int, high freq) 3624\n",
"Dormant (low int, low freq) 3588\n",
"Small rebalancers (low int, high freq) 1283\n",
"Occasional large movers (high int, low freq) 1248\n",
"Name: count, dtype: int64\n",
"thr_int: 1.7333333333210905 thr_freq: 0.041666666666666664\n"
]
}
],
"source": [
"dfc[\"rel_intensity_total\"] = dfc[\"gross_flow_to_aum\"] # turnover proxy\n",
"dfc[\"frequency\"] = dfc[\"flow_freq\"] # share of active months\n",
"\n",
"# Thresholds: medians (simple + explainable)\n",
"thr_int = dfc[\"rel_intensity_total\"].median()\n",
"thr_freq = dfc[\"frequency\"].median()\n",
"\n",
"def quadrant(row):\n",
" low_int = row[\"rel_intensity_total\"] < thr_int\n",
" low_frq = row[\"frequency\"] < thr_freq\n",
"\n",
" if low_int and low_frq:\n",
" return \"Dormant (low int, low freq)\"\n",
" if low_int and (not low_frq):\n",
" return \"Small rebalancers (low int, high freq)\"\n",
" if (not low_int) and low_frq:\n",
" return \"Occasional large movers (high int, low freq)\"\n",
" return \"Highly active (high int, high freq)\"\n",
"\n",
"dfc[\"seg_2D\"] = dfc.apply(quadrant, axis=1)\n",
"\n",
"print(dfc[\"seg_2D\"].value_counts())\n",
"print(\"thr_int:\", thr_int, \"thr_freq:\", thr_freq)\n"
]
},
{
"cell_type": "code",
"execution_count": 26,
"id": "2850c428-c9bf-42fa-87d1-fd8d5d10b016",
"metadata": {},
"outputs": [
{
"data": {
"image/png": "iVBORw0KGgoAAAANSUhEUgAAAkQAAAHHCAYAAABeLEexAAAAOnRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjEwLjgsIGh0dHBzOi8vbWF0cGxvdGxpYi5vcmcvwVt1zgAAAAlwSFlzAAAPYQAAD2EBqD+naQAA/4pJREFUeJzsnXd8FNUWx3+bsum9kpBC70noCkKoIkgRFIKFKogo0lWwUEQFQRQfRUAEfFgIIE2fSok0EQstAQWkhdASCCEJ6cnufX+ss9kyszuzO5vdJOf7+fAhO3tn5szszL3nnnuKgjHGQBAEQRAEUYtxsrcABEEQBEEQ9oYUIoIgCIIgaj2kEBEEQRAEUeshhYggCIIgiFoPKUQEQRAEQdR6SCEiCIIgCKLWQwoRQRAEQRC1HlKICIIgCIKo9ZBCRBAEQRBErYcUIgekW7duaNmypazHVCgUmDRpkqzHFMPGjRuhUCiQnp5e5ecGgNGjRyM2NtYu5yb0SU9Ph0KhwMaNG+0timzY6ppiY2MxevRoWY/pyOe1Bd26dUO3bt3sLYYsbNq0CU2bNoWrqyv8/f3tLU6NhRQiM/z555+YNGkSWrRoAS8vL0RHR2PYsGH4559/jNp269YNCoUCCoUCTk5O8PX1RZMmTTBixAjs27fPDtIThDT+/vtvzJs3zyoF9uuvv8ayZctkk6mm8uuvv2LevHnIzc21tyiysGrVKodWdm/duoV58+bh9OnT9hZFEufPn8fo0aPRoEEDfPbZZ1i7dq29RaqxuNhbAEfngw8+wNGjRzF06FDExcUhMzMTK1asQJs2bfDbb78ZWXLq1q2LhQsXAgAKCwtx6dIlbN++HV9++SWGDRuGL7/8Eq6urva4FLswYsQIDB8+HG5ubvYWhRDB33//jfnz56Nbt24WW9a+/vprnD17FlOnTtXbHhMTg+Li4lr1/Jvi119/xfz58zF69GijWf+FCxfg5FT181Vrzrtq1SoEBwc7jIVp7969ep9v3bqF+fPnIzY2FgkJCfYRygIOHjwItVqNTz75BA0bNrS3ODUaUojMMH36dHz99ddQKpXabUlJSWjVqhUWLVqEL7/8Uq+9n58fnnvuOb1tixYtwuTJk7Fq1SrExsbigw8+qBLZHQFnZ2c4OzvLdryioiJ4enrKdjyi6lAoFHB3d7e3GCZRq9UoKyuzu5z2mkDUpImLbp9dnblz5w4AmF0qY4yhpKQEHh4eVSBVDYURFtGmTRvWpk0bvW2JiYmsRYsWvO0rKipY8+bNmaenJ8vNzTV5bO44x48fZw8//DBzd3dnsbGx7NNPPzVqW1JSwubMmcMaNGjAlEolq1u3Lnv11VdZSUmJXjsA7OWXX2Y7duxgLVq0YEqlkjVv3pz9+OOPeu3S09PZxIkTWePGjZm7uzsLDAxkTz31FLt69aq2zZ9//skAsI0bNxrJ89NPPzEA7LvvvmOMMbZhwwYGQG9/xhhbuXIla968OVMqlaxOnTrspZdeYvfv3xe8D126dGEeHh5sypQpjDHGdu7cyfr168fq1KnDlEolq1+/PnvnnXdYRUWF3jFGjRrFYmJiTNztymt69NFHWVBQkPZ+jxkzRq+NSqViH3/8MWvevDlzc3NjoaGh7IUXXmA5OTlG7ebOncvq1KnDPDw8WLdu3dhff/3FYmJi2KhRo7TtuHtz5MgR9sorr7Dg4GDm5+fHXnjhBVZaWsru37/PRowYwfz9/Zm/vz979dVXmVqttkimmJgY9vjjj7MjR46w9u3bMzc3N1avXj32xRdfGMlj+O/AgQOi73liYqLR/tz9v3r1KgPANmzYoCdbSkoKe+SRR5inpyfz8/NjAwcOZH///bdem7lz5zIA7OLFi2zUqFHMz8+P+fr6stGjR7PCwkK9tnfv3mXnzp0z2s4H9158+eWXrHnz5szFxYXt2LGDMcbYjRs32JgxY1hoaKj2ffn888/19ue7ptTUVDZq1ChWr1495ubmxsLCwtiYMWNYdna20fUY/uPeE91nRcr7JlZuIYSe0V9++YVNmzaNBQcHM09PT/bEE0+wO3fu6O1neC2JiYna7+/fv8+mTJnC6taty5RKJWvQoAFbtGgRU6lURvdyyZIlbM2aNax+/fpMqVSydu3asT/++ENPztu3b7PRo0ezyMhIplQqWXh4OBs4cKBeP5OYmKiV4cCBA7z3e8OGDWzOnDnMxcVF73o4xo8fz/z8/FhxcTHv/VqyZAkDwNLT042+mzVrFnN1ddW+i//88w8bMmQICwsLY25ubiwyMpIlJSWZHA/47uvcuXO13z3++OPsp59+Ym3btmVubm7s448/Fn2/uXajRo1ivr6+zM/Pj40cOZKdOnXK6JnWvZe68PWvcvZJunJOnTqVxcTEMKVSySIjI9mIESPY3bt32YMHD5inpyebPHmy0X7Xr19nTk5O7P333xe8x7qQQmQBarWaRUZGskcffVRvuymFiDHGFixYwACw77//3uTxExMTWUREBAsNDWWTJk1i//nPf9gjjzzCAOh1bCqVij366KPM09OTTZ06la1Zs4ZNmjSJubi4sEGDBukdEwCLj49nderUYQsWLGDLli1j9evXZ56ennod9datW1l8fDybM2cOW7t2LXvjjTdYQEAAi4mJ0Rtg6tevz/r162ck+5gxY1hAQAArKytjjPErRNxg0KtXL7Z8+XI2adIk5uzszNq3b6/dj7sP4eHhLCQkhL3yyitszZo1bOfOnYwxxp544gk2bNgwtmTJEvbpp5+yoUOHMgBs5syZevKIUYiysrJYQEAAa9y4MVuyZAn77LPP2JtvvsmaNWum127cuHHMxcWFjR8/nq1evZq9/vrrzMvLy0ju1157jQFgAwYMYCtWrGDjx49ndevWZcHBwbyDTUJCAnvsscfYypUr2YgRIxgA9tprr7FHHnmEPfPMM2zVqlWsf//+DIBRZyFWppiYGNakSRMWFhbG3njjDbZixQrWpk0bplAo2NmzZxljjF2+fJlNnjyZAWBvvPEG27RpE9u0aRPLzMwUfc/37t3LEhISWHBwsHZ/TsHgUx727dvHXFxcWOPGjdnixYvZ/PnzWXBwMAsICOB9Zlq3bs2GDBnCVq1axcaNG6e9V7pwbTlFzhQAWLNmzVhISAibP38+W7lyJTt16hTLzMxkdevWZVFRUeydd95hn376KRs4cCADoB10hK7pww8/ZF26dGHvvPMOW7t2LZsyZQrz8PBgHTp00Cq0qamp7Omnn9Yej7tXBQUF2t9L91kR+76JlVsIIYWodevWrEePHmz58uVsxowZzNnZmQ0bNkzbbseOHaxu3bqsadOm2mvZu3cvY4yxwsJCFhcXx4KCgtgbb7zBVq9ezUaOHMkUCoV2gqN7L1u3bs0aNmzIPvjgA7Z48WIWHBzM6tatq/c8d+rUifn5+bG33nqLrVu3jr3//vuse/fu7NChQ9o2uoN4ZmYme+eddxgA9sILL2hlvHz5Mrt48SIDwJYvX653L0pLS1lAQAAbO3as4P26du0aUygUbPHixUbf1a9fnz3++OPaY9WrV49FRESwd999l61bt47Nnz+ftW/fnleZ0r2vgwcPZgDYp59+yjZt2sRSU1O1v1XDhg1ZQEAAmzVrFlu9ejU7cOCA6PutVqtZ165dmZOTE3vppZfY8uXLWY8ePVhcXJxVCpGcfRJjjD148IC1bNmSOTs7s/Hjx7NPP/2ULViwgLVv356dOnWKMcbYs88+y8LCwowmxIsXL2YKhYJdu3ZN8B7rQgqRBWzatMlIOWHMvEK0Y8cOBoB98sknJo/PzbKXLl2q3VZaWsoSEhJYaGio9qHatGkTc3JyYkeOHNHbf/Xq1QwAO3r0qHYbAKZUKtmlS5e021JTU406gqKiIiN5jh07xgCw//73v9pts2fP1pv9cDL6+/vrdSCGCtGdO3eYUqlkjz76qN5sZcWKFQwAW79+vdF9WL16tZFMfHJOmDCBeXp66lnHxChE3O/y559/CrY5cuQIA8C++uorve3cDJ3bnpmZyVxcXNgTTzyh127evHkMAO9g06dPHz3Lz8M
"text/plain": [
"<Figure size 640x480 with 1 Axes>"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"plt.figure()\n",
"for name, g in dfc.groupby(\"seg_2D\"):\n",
" plt.scatter(g[\"frequency\"], g[\"rel_intensity_total\"], s=10, label=name)\n",
"\n",
"plt.yscale(\"log\")\n",
"plt.axvline(thr_freq, linestyle=\"--\")\n",
"plt.axhline(thr_int, linestyle=\"--\")\n",
"plt.xlabel(\"Activity frequency (share of active months)\")\n",
"plt.ylabel(\"Gross flow / mean AUM (quantity) [log scale]\")\n",
"plt.title(\"2D behavioral segmentation: relative intensity vs frequency\")\n",
"plt.legend(markerscale=2)\n",
"plt.show()\n"
]
},
{
"cell_type": "code",
"execution_count": 27,
"id": "d05ac724-6aab-4ea9-bce5-d1b0252148a5",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>n_clients</th>\n",
" <th>aum_qty_med</th>\n",
" <th>gross_flow_qty_med</th>\n",
" <th>freq_med</th>\n",
" <th>rel_int_med</th>\n",
" <th>n_tx_med</th>\n",
" </tr>\n",
" <tr>\n",
" <th>seg_2D</th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>Highly active (high int, high freq)</th>\n",
" <td>3624</td>\n",
" <td>823.825696</td>\n",
" <td>105.945544</td>\n",
" <td>0.157379</td>\n",
" <td>5.991354</td>\n",
" <td>17.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>Dormant (low int, low freq)</th>\n",
" <td>3588</td>\n",
" <td>102.161465</td>\n",
" <td>0.000000</td>\n",
" <td>0.000000</td>\n",
" <td>0.000000</td>\n",
" <td>0.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>Small rebalancers (low int, high freq)</th>\n",
" <td>1283</td>\n",
" <td>543.424242</td>\n",
" <td>12.149239</td>\n",
" <td>0.100000</td>\n",
" <td>0.911442</td>\n",
" <td>15.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>Occasional large movers (high int, low freq)</th>\n",
" <td>1248</td>\n",
" <td>39.278192</td>\n",
" <td>4.349875</td>\n",
" <td>0.025000</td>\n",
" <td>4.497671</td>\n",
" <td>1.0</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" n_clients aum_qty_med \\\n",
"seg_2D \n",
"Highly active (high int, high freq) 3624 823.825696 \n",
"Dormant (low int, low freq) 3588 102.161465 \n",
"Small rebalancers (low int, high freq) 1283 543.424242 \n",
"Occasional large movers (high int, low freq) 1248 39.278192 \n",
"\n",
" gross_flow_qty_med freq_med \\\n",
"seg_2D \n",
"Highly active (high int, high freq) 105.945544 0.157379 \n",
"Dormant (low int, low freq) 0.000000 0.000000 \n",
"Small rebalancers (low int, high freq) 12.149239 0.100000 \n",
"Occasional large movers (high int, low freq) 4.349875 0.025000 \n",
"\n",
" rel_int_med n_tx_med \n",
"seg_2D \n",
"Highly active (high int, high freq) 5.991354 17.0 \n",
"Dormant (low int, low freq) 0.000000 0.0 \n",
"Small rebalancers (low int, high freq) 0.911442 15.0 \n",
"Occasional large movers (high int, low freq) 4.497671 1.0 "
]
},
"execution_count": 27,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"profile_2d = (\n",
" dfc.groupby(\"seg_2D\")\n",
" .agg(\n",
" n_clients=(ID_COL, \"count\"),\n",
" aum_qty_med=(\"aum_qty_mean\",\"median\"),\n",
" gross_flow_qty_med=(\"gross_flow_qty_mean\",\"median\"),\n",
" freq_med=(\"frequency\",\"median\"),\n",
" rel_int_med=(\"rel_intensity_total\",\"median\"),\n",
" n_tx_med=(\"n_tx_total\",\"median\"),\n",
" )\n",
" .sort_values(\"n_clients\", ascending=False)\n",
")\n",
"profile_2d\n"
]
},
{
"cell_type": "markdown",
"id": "4ceeeec6-4f52-4b8b-9c14-2419181d0a5f",
"metadata": {},
"source": [
"## Clustering 2 :KMEANS"
]
},
{
"cell_type": "code",
"execution_count": 23,
"id": "6afd24fe-0886-445c-a330-6a537d6aaa1a",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Clustering matrix shape: (9743, 6)\n"
]
}
],
"source": [
"import numpy as np\n",
"import pandas as pd\n",
"import matplotlib.pyplot as plt\n",
"\n",
"from sklearn.preprocessing import StandardScaler\n",
"from sklearn.cluster import KMeans\n",
"from sklearn.metrics import silhouette_score\n",
"from sklearn.mixture import GaussianMixture\n",
"\n",
"# Safety: ensure baseline-2 columns exist\n",
"dfc = dfc.copy()\n",
"dfc[\"frequency\"] = dfc[\"flow_freq\"]\n",
"dfc[\"rel_intensity_total\"] = dfc[\"gross_flow_to_aum\"]\n",
"\n",
"# Choose a compact, interpretable feature set (quantity-based)\n",
"features = [\n",
" \"log_aum_qty_mean\", # size (log)\n",
" \"log_gross_flow_qty_mean\", # activity intensity (log)\n",
" \"frequency\", # activity frequency\n",
" \"rel_intensity_total\", # turnover proxy\n",
" \"net_flow_qty_vol\", # volatility of net flows\n",
" \"n_tx_total\", # total number of transactions\n",
"]\n",
"\n",
"# Build X (drop NaNs/Infs)\n",
"X = (dfc[features]\n",
" .replace([np.inf, -np.inf], np.nan)\n",
" .dropna()\n",
" .copy())\n",
"\n",
"# Keep IDs aligned\n",
"ids = dfc.loc[X.index, ID_COL].copy()\n",
"\n",
"# Standardize (critical for distance-based clustering)\n",
"scaler = StandardScaler()\n",
"X_scaled = scaler.fit_transform(X)\n",
"\n",
"print(\"Clustering matrix shape:\", X_scaled.shape)\n"
]
},
{
"cell_type": "code",
"execution_count": 24,
"id": "d2e58d79-0638-4cfa-9149-f3bfe5f89399",
"metadata": {},
"outputs": [
{
"data": {
"image/png": "iVBORw0KGgoAAAANSUhEUgAAAk0AAAHHCAYAAACiOWx7AAAAOnRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjEwLjgsIGh0dHBzOi8vbWF0cGxvdGxpYi5vcmcvwVt1zgAAAAlwSFlzAAAPYQAAD2EBqD+naQAAeQJJREFUeJzt3Xd8U1X/B/BPkjbpTPektBTKKi1lSSlbqBQsKOJARGQIihRliVAVEHy0gAsHgujzAwSR9QDKtpYNZRUKlD3K7gBK00Fncn5/1EZCCySQcjs+79crL8m9JzefmyL59txzz5EJIQSIiIiI6IHkUgcgIiIiqgpYNBEREREZgUUTERERkRFYNBEREREZgUUTERERkRFYNBEREREZgUUTERERkRFYNBEREREZgUUTERERkRFYNBFVYTKZDJ988on++SeffAKZTIabN29KF6oGycnJwdChQ+Hp6QmZTIbRo0dLHYmIKhCLJqJKZsGCBZDJZPd97N27V+qI9I/PP/8cCxYswDvvvINFixZhwIABFfp+derUQc+ePctsX7RoERQKBbp37478/PwKzUBUk1lIHYCIyjdt2jT4+/uX2R4QECBBGirPli1b0KZNG0yZMkWyDL/99hsGDRqE8PBwrFmzBlZWVpJlIaruWDQRVVI9evRAq1atpI5RpeTm5sLW1vaJvV96ejoCAwPNdrzi4mLodDoolUqj2i9duhQDBw5Ely5d8Mcff7BgIqpgvDxHVA3dvHkTr7zyCtRqNVxcXDBq1Kgyl22Ki4vx6aefol69elCpVKhTpw4+/PBDFBQU6NuMHTsWLi4uEELot7377ruQyWT47rvv9NvS0tIgk8kwZ86ch2ZbvHgxWrduDRsbGzg5OaFjx47466+/9PvvHadVqk6dOhg0aJD+eellzO3bt2PEiBFwd3eHj48PVq5cqd9+r59++gkymQxJSUn6badOncJLL70EZ2dnWFlZoVWrVvjzzz8feA7btm2DTCZDcnIy1q9fr790evHiRQAlxdSbb74JDw8PWFlZISQkBAsXLjQ4xsWLFyGTyfDll19i1qxZ+p/DiRMnHvoZAsDy5cvx+uuvo3Pnzvjzzz8fWjDJZDKMHDkSK1asQGBgIKytrREWFoZjx47pP5uAgABYWVmhc+fO+nO52759+9C9e3c4ODjAxsYGnTp1wu7duw3aXLp0CSNGjEDDhg1hbW0NFxcXvPzyy2WOV/rz2717N8aOHQs3NzfY2trihRdewI0bNwzaHjx4EBEREXB1dYW1tTX8/f0xZMgQoz4nInNiTxNRJaXRaMoM6JbJZHBxcXnoa1955RXUqVMHMTEx2Lt3L7777jvcvn0bv/76q77N0KFDsXDhQrz00ksYN24c9u3bh5iYGJw8eRKrV68GAHTo0AHffPMNjh8/jqCgIADAzp07IZfLsXPnTrz33nv6bQDQsWPHB+aaOnUqPvnkE7Rt2xbTpk2DUqnEvn37sGXLFnTr1s34D+cuI0aMgJubGyZPnozc3FxERkbCzs4Oy5cvR6dOnQzaLlu2DE2aNNGfy/Hjx9GuXTvUqlULEydOhK2tLZYvX47evXvjf//7H1544YVy37Nx48ZYtGgRxowZAx8fH4wbNw4A4Obmhry8PHTu3Bnnzp3DyJEj4e/vjxUrVmDQoEHIzMzEqFGjDI41f/585Ofn46233oJKpYKzs/NDz/l///sf+vfvj44dO2Lt2rWwtrY26rPauXMn/vzzT0RFRQEAYmJi0LNnT3zwwQf48ccfMWLECNy+fRszZ87EkCFDsGXLFv1rt2zZgh49eqBly5aYMmUK5HI55s+fjy5dumDnzp1o3bo1AODAgQPYs2cPXn31Vfj4+ODixYuYM2cOOnfujBMnTsDGxsYg07vvvgsnJydMmTIFFy9exKxZszBy5EgsW7YMQEkB2q1bN7i5uWHixIlwdHTExYsXsWrVKqPOmcisBBFVKvPnzxcAyn2oVCqDtgDElClT9M+nTJkiAIjnnnvOoN2IESMEAHHkyBEhhBCJiYkCgBg6dKhBu/fff18AEFu2bBFCCJGeni4AiB9//FEIIURmZqaQy+Xi5ZdfFh4eHvrXvffee8LZ2VnodLr7ntfZs2eFXC4XL7zwgtBqtQb77n7dvedUys/PTwwcOLDM59S+fXtRXFxs0LZfv37C3d3dYHtKSoqQy+Vi2rRp+m1du3YVwcHBIj8/3yBL27ZtRf369e97LndnioyMNNg2a9YsAUAsXrxYv62wsFCEhYUJOzs7kZWVJYQQIjk5WQAQarVapKenP/S9St/P29tbWFhYiM6dO4vc3FyjXieE0P/9SU5O1m/76aefBADh6empzyWEENHR0QKAvq1OpxP169cXERERBj+rO3fuCH9/f/HMM88YbLtXfHy8ACB+/fVX/bbSn194eLjBMceMGSMUCoXIzMwUQgixevVqAUAcOHDA6HMlqii8PEdUSc2ePRuxsbEGj40bNxr12tKehFLvvvsuAGDDhg0G/x07dqxBu9Iek/Xr1wMo6Tlp1KgRduzYAQDYvXs3FAoFxo8fj7S0NJw9exZASQ9G+/btIZPJ7ptpzZo10Ol0mDx5MuRyw396HvS6hxk2bBgUCoXBtr59+yI9PR3btm3Tb1u5ciV0Oh369u0LAMjIyMCWLVvwyiuvIDs7Gzdv3sTNmzdx69YtRERE4OzZs7h27ZrJeTZs2ABPT0/069dPv83S0hLvvfcecnJyylw2fPHFF+Hm5mb08TMyMlBcXAwfHx+je5hKde3aFXXq1NE/Dw0N1Wewt7cvs/3ChQsAgMTERJw9exavvfYabt26pf+scnNz0bVrV+zYsQM6nQ4ADDIVFRXh1q1bCAgIgKOjIw4dOlQm01tvvWXw8+/QoQO0Wi0uXboEAHB0dAQArFu3DkVFRSadL5G58fIcUSXVunXrRx4IXr9+fYPn9erVg1wu148ruXTpEuRyeZk78Tw9PeHo6Kj/wgJKvsRKi6ydO3eiVatWaNWqFZydnbFz5054eHjgyJEjeO211x6Y6fz585DL5WYdOA2g3DsMS8fdLFu2DF27dgVQcmmuWbNmaNCgAQDg3LlzEEJg0qRJmDRpUrnHTk9PR61atUzKc+nSJdSvX79MYdi4cWP9/oflf5CuXbvC19cXc+bMgbOzM7799lv9Po1Gg7y8PP1zpVJpcLnP19fX4FgODg4AgNq1a5e7/fbt2wCgL44HDhx431wajQZOTk7Iy8tDTEwM5s+fj2vXrhmMh9NoNGVed28mJycng/fu1KkTXnzxRUydOhXffPMNOnfujN69e+O1116DSqW6bx6iisCiiagGuF9PjjE9PO3bt8fPP/+MCxcuYOfOnejQoQNkMhnat2+PnTt3wtvbGzqdDh06dDB3bANarbbc7eX1tqhUKvTu3RurV6/Gjz/+iLS0NOzevRuff/65vk1pz8j777+PiIiIco/9JKZ3MLW3CAB++OEH3L59G9999x2cnJz0A+dHjRplMOC8U6dOBr1t9/bIPWx7acFT+ll98cUXaNasWblt7ezsAJT0as6fPx+jR49GWFgYHBwcIJPJ8Oqrr+qPY8p7y2QyrFy5Env37sXatWuxefNmDBkyBF999RX27t2rf1+iJ4FFE1E1dPbsWYMejHPnzkGn0+kvzfj5+UGn0+Hs2bP6HhCg5C64zMxM+Pn56beVFkOxsbE4cOAAJk6cCKBk0PecOXPg7e0NW1tbtGzZ8oGZ6tWrB51OhxMnTtz3ixco6WnIzMw02FZYWIiUlBRjTl2vb9++WLhwIeLi4nDy5EkIIfSX5gCgbt26AEounYWHh5t07Afx8/PD0aNHodPpDHqbTp06pd//uORyOX799VdoNBpMnToVzs7OeO+99/DBBx/g9ddf17cr7bV5XPXq1QMAqNXqh35WK1euxMCBA/HVV1/pt+Xn55f5mZqqTZs2aNOmDT777DMsWbIE/fv3x9KlSzF06NDHOi6RKTimiagamj17tsHz77//HkDJ3E8A8OyzzwIAZs2aZdDu66+/BgBERkbqt/n7+6NWrVr45ptvUFRUhHb
"text/plain": [
"<Figure size 640x480 with 1 Axes>"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"image/png": "iVBORw0KGgoAAAANSUhEUgAAAkAAAAHHCAYAAABXx+fLAAAAOnRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjEwLjgsIGh0dHBzOi8vbWF0cGxvdGxpYi5vcmcvwVt1zgAAAAlwSFlzAAAPYQAAD2EBqD+naQAAZodJREFUeJzt3XlcVOX+B/DPzMDMsA4iMCwiq4ooriiiuYIiLmlZmtdyK+ualYZmWlfNrFC79dO6lmm5lNdyy9KuS4ZLabihKO6KqKjs+yLbzPn9gUxOgDI4cBj4vF+vedk8Z5nvQYKP53me80gEQRBARERE1IRIxS6AiIiIqL4xABEREVGTwwBERERETQ4DEBERETU5DEBERETU5DAAERERUZPDAERERERNDgMQERERNTkMQERERNTkMAARGYmnpycmTpyoe3/w4EFIJBIcPHhQ19avXz+0b9++/oujevPxxx/D29sbMpkMnTp1ErscIqoGAxDRI8TFxeGZZ56Bh4cHlEol3NzcMHDgQHz++edil1YnLly4gPfeew83btyotO2LL77AunXr6r0mU/Hrr79i9uzZ6NWrF9auXYuPPvqoTj9v4sSJsLa2rtR+9uxZODg4wNPTs8q/RyICzMQugKgh+/PPP9G/f3+0bNkSU6ZMgbOzMxITE3H06FEsX74cr7/+um7fy5cvQyo1/X9TXLhwAQsXLkS/fv3g6empt+2LL76Ag4OD3p0u+sv+/fshlUrxzTffQC6Xi1LDuXPnEBISAisrKxw4cKDS3yERlWMAInqIDz/8ECqVCidOnICdnZ3ettTUVL33CoWiHisjYxIEAUVFRbCwsHis86SmpsLCwsJo4cfQus6fP48BAwbAwsICBw4cgJeXl1HqIGqMTP+fq0R1KD4+Hu3atasUfgDAyclJ7/3fxwA9zIULF9C/f39YWlrCzc0NS5curbRPamoqXnzxRajVaiiVSnTs2BHr16/X26eqcUYAcOPGDUgkkkrdVZcuXcIzzzwDe3t7KJVKBAYGYseOHbrt69atw7PPPgsA6N+/PyQSie78np6eOH/+PA4dOqRr79evn+7Y7OxszJgxA+7u7lAoFPD19cWSJUug1Wof+fU4efIkwsLC4ODgAAsLC3h5eWHy5Ml6+2i1WixfvhwBAQFQKpVwdHTE4MGDcfLkSd0+ZWVlWLRoEXx8fKBQKODp6Yl33nkHxcXFeufy9PTEsGHDsHfvXgQGBsLCwgJfffXVY12HRCLB2rVrUVBQoPv6VHz9jVHXo1y8eBEhISFQKBQ4cOAAvL29H7p/xXi0s2fPom/fvrC0tISvry+2bt0KADh06BCCgoJgYWGBNm3a4Lfffqt0jjt37mDy5MlQq9VQKBRo164d1qxZo7dPSUkJ5s+fj65du0KlUsHKygq9e/fGgQMH9Par+J7997//jVWrVum+Vt26dcOJEyf09k1OTsakSZPQokULKBQKuLi4YMSIEezuI4PwDhDRQ3h4eCA6Ohrnzp0z2uDlrKwsDB48GE8//TRGjx6NrVu34u2330ZAQADCw8MBAPfu3UO/fv1w7do1vPbaa/Dy8sKWLVswceJEZGdnY/r06QZ/7vnz59GrVy+4ublhzpw5sLKywubNmzFy5Ehs27YNTz31FPr06YM33ngDn332Gd555x20bdsWANC2bVssW7YMr7/+OqytrfHuu+8CANRqNQCgsLAQffv2xZ07d/DKK6+gZcuW+PPPPzF37lwkJSVh2bJl1daVmpqKQYMGwdHREXPmzIGdnR1u3LiBH3/8UW+/F198EevWrUN4eDheeukllJWV4Y8//sDRo0cRGBgIAHjppZewfv16PPPMM5g5cyaOHTuGyMhIXLx4Edu3b9c73+XLlzF27Fi88sormDJlCtq0afNY1/Hdd99h1apVOH78OL7++msAQM+ePY1S16NcvnwZAwYMgJmZGQ4cOAAfH59HHgOUfy8OGzYMzz33HJ599ll8+eWXeO655/Df//4XM2bMwD//+U/84x//wMcff4xnnnkGiYmJsLGxAQCkpKSgR48ekEgkeO211+Do6Ijdu3fjxRdfRG5uLmbMmAEAyM3Nxddff42xY8diypQpyMvLwzfffIOwsDAcP3680kDxjRs3Ii8vD6+88gokEgmWLl2Kp59+GtevX4e5uTkAYNSoUTh//jxef/11eHp6IjU1Ffv27cOtW7fY5Uc1JxBRtX799VdBJpMJMplMCA4OFmbPni3s3btXKCkpqbSvh4eHMGHCBN37AwcOCACEAwcO6Nr69u0rABC+/fZbXVtxcbHg7OwsjBo1Ste2bNkyAYCwYcMGXVtJSYkQHBwsWFtbC7m5udV+hiAIQkJCggBAWLt2ra4tJCRECAgIEIqKinRtWq1W6Nmzp9CqVStd25YtW6o8pyAIQrt27YS+fftWal+0aJFgZWUlXLlyRa99zpw5gkwmE27dulXpmArbt28XAAgnTpyodp/9+/cLAIQ33nij0jatVisIgiDExsYKAISXXnpJb/usWbMEAML+/ft1bR4eHgIAYc+ePUa7DkEQhAkTJghWVlZ6bcao62GfZ25uLri4uAiurq6V6n6Yiu/FjRs36touXbokABCkUqlw9OhRXfvevXsrfT+9+OKLgouLi5Cenq533ueee05QqVRCYWGhIAiCUFZWJhQXF+vtk5WVJajVamHy5Mm6torv2ebNmwuZmZm69p9//lkAIOzcuVN3LADh448/rvG1ElWFXWBEDzFw4EBER0fjySefxJkzZ7B06VKEhYXBzc1Nr+vIENbW1nj++ed17+VyObp3747r16/r2nbt2gVnZ2eMHTtW12Zubo433ngD+fn5OHTokEGfmZmZif3792P06NHIy8tDeno60tPTkZGRgbCwMFy9ehV37typ1fUAwJYtW9C7d280a9ZMd+709HSEhoZCo9Hg999/r/bYiu7FX375BaWlpVXus23bNkgkEixYsKDSNolEAqD8awYAERERettnzpwJAPjf//6n1+7l5YWwsDCjXUd1jFHXw2g0GqSnp8Pe3h4ODg4G1WZtbY3nnntO975Nmzaws7ND27ZtERQUpGuv+O+K71FBELBt2zYMHz4cgiDofa3CwsKQk5ODU6dOAQBkMpluTJRWq0VmZibKysoQGBio2+dBY8aMQbNmzXTve/furffZFWOsDh48iKysLIOul+hB7AIjeoRu3brhxx9/RElJCc6cOYPt27fj//7v//DMM88gNjYW/v7+Bp2vRYsWul/aFZo1a4azZ8/q3t+8eROtWrWqNKusokvq5s2bBn3mtWvXIAgC5s2bh3nz5lW5T2pqKtzc3Aw6b4WrV6/i7NmzcHR0rPbc1enbty9GjRqFhQsX4v/+7//Qr18/jBw5Ev/4xz90A8vj4+Ph6uoKe3v7as9z8+ZNSKVS+Pr66rU7OzvDzs6u0tesqgHCj3MddVnXw1hYWODrr7/GuHHjMHToUOzbtw9WVlYAyrtSc3JyKn1uhaq+F1UqFdzd3Su1AdAFjrS0NGRnZ2PVqlVYtWpVlXU9+LVav349PvnkE1y6dEkv5FZ1rS1bttR7XxGGKj5boVBgyZIlmDlzJtRqNXr06IFhw4Zh/PjxetdG9CgMQEQ1JJfL0a1bN3Tr1g2tW7fGpEmTsGXLlirvSjyMTCarsl0QBINr+vsvrwoajUbvfcUA3lmzZlV7d+Hvv6ANodVqMXDgQMyePbvK7a1bt672WIlEgq1bt+Lo0aPYuXMn9u7di8mTJ+OTTz7B0aNHq3zOzcNU9zX5u6pmVj3OddRlXY/y3HPPISsrC6+++iqefvpp7Ny5E3K5HJs2bcKkSZP09n3w+6y678VHfY9WfD89//zzmDBhQpX7dujQAQCwYcMGTJw4ESNHjsRbb70FJycnyGQyREZGIj4+3uDPBoAZM2Zg+PDh+Omnn7B3717MmzcPkZGR2L9/Pzp37lzl8UR/xwBEVAsVg26TkpLq5PweHh44e/YstFqt3l2gS5cu6bYDf/3rODs7W+/4v99VqJgRZG5ujtDQ0Id+9sN+UVe3zcfHB/n5+Y8898P
"text/plain": [
"<Figure size 640x480 with 1 Axes>"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"Best K by silhouette: 3\n"
]
}
],
"source": [
"k_range = range(2, 11)\n",
"inertias = []\n",
"silhouettes = []\n",
"\n",
"for k in k_range:\n",
" km = KMeans(n_clusters=k, n_init=30, random_state=42)\n",
" labels = km.fit_predict(X_scaled)\n",
" inertias.append(km.inertia_)\n",
" silhouettes.append(silhouette_score(X_scaled, labels))\n",
"\n",
"# Elbow plot\n",
"plt.figure()\n",
"plt.plot(list(k_range), inertias, marker=\"o\")\n",
"plt.xlabel(\"Number of clusters K\")\n",
"plt.ylabel(\"Inertia (within-cluster SSE)\")\n",
"plt.title(\"Elbow curve for K-means\")\n",
"plt.show()\n",
"\n",
"# Silhouette plot\n",
"plt.figure()\n",
"plt.plot(list(k_range), silhouettes, marker=\"o\")\n",
"plt.xlabel(\"Number of clusters K\")\n",
"plt.ylabel(\"Silhouette score\")\n",
"plt.title(\"Silhouette score for K-means\")\n",
"plt.show()\n",
"\n",
"best_k = list(k_range)[int(np.argmax(silhouettes))]\n",
"print(\"Best K by silhouette:\", best_k)\n",
"\n",
"\n",
"''' Ce que cest :\n",
"Inertia = somme des distances intra-cluster (SSE).\n",
"Plus elle baisse, plus les clusters sont “serrés”.\n",
"\n",
"Comment lire :\n",
"Quand K augmente, inertia baisse toujours (normal).\n",
"On cherche un “coude” : à partir dun certain K, ajouter des clusters apporte peu\n",
"'''"
]
},
{
"cell_type": "markdown",
"id": "ddfb03e9-a9ff-463d-b653-f752773aacd9",
"metadata": {},
"source": [
"Valeurs Silhouette:\n",
"proche de 1 : très bon\n",
"0 : clusters se chevauchent\n",
"négatif : mauvais clustering"
]
},
{
"cell_type": "code",
"execution_count": 25,
"id": "ea340dca-a76a-4d50-8973-af1dca34c376",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>n_clients</th>\n",
" <th>aum_qty_med</th>\n",
" <th>freq_med</th>\n",
" <th>rel_int_med</th>\n",
" <th>gross_flow_med</th>\n",
" <th>n_tx_med</th>\n",
" <th>vol_med</th>\n",
" </tr>\n",
" <tr>\n",
" <th>cluster_kmeans</th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0.0</th>\n",
" <td>7718</td>\n",
" <td>104.983133</td>\n",
" <td>0.024390</td>\n",
" <td>1.039412e+00</td>\n",
" <td>0.774805</td>\n",
" <td>1.0</td>\n",
" <td>5.080794</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1.0</th>\n",
" <td>2024</td>\n",
" <td>14064.797371</td>\n",
" <td>0.575379</td>\n",
" <td>4.591024e+00</td>\n",
" <td>1100.212569</td>\n",
" <td>198.5</td>\n",
" <td>2291.806790</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2.0</th>\n",
" <td>1</td>\n",
" <td>0.000079</td>\n",
" <td>0.014706</td>\n",
" <td>1.257232e+06</td>\n",
" <td>1.468240</td>\n",
" <td>1.0</td>\n",
" <td>12.107415</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" n_clients aum_qty_med freq_med rel_int_med \\\n",
"cluster_kmeans \n",
"0.0 7718 104.983133 0.024390 1.039412e+00 \n",
"1.0 2024 14064.797371 0.575379 4.591024e+00 \n",
"2.0 1 0.000079 0.014706 1.257232e+06 \n",
"\n",
" gross_flow_med n_tx_med vol_med \n",
"cluster_kmeans \n",
"0.0 0.774805 1.0 5.080794 \n",
"1.0 1100.212569 198.5 2291.806790 \n",
"2.0 1.468240 1.0 12.107415 "
]
},
"execution_count": 25,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"km = KMeans(n_clusters=best_k, n_init=50, random_state=42)\n",
"labels_km = km.fit_predict(X_scaled)\n",
"\n",
"dfc.loc[X.index, \"cluster_kmeans\"] = labels_km\n",
"\n",
"# Profiling table (medians = robust to outliers)\n",
"k_profile = (\n",
" dfc.loc[X.index]\n",
" .groupby(\"cluster_kmeans\")\n",
" .agg(\n",
" n_clients=(ID_COL, \"count\"),\n",
" aum_qty_med=(\"aum_qty_mean\", \"median\"),\n",
" freq_med=(\"frequency\", \"median\"),\n",
" rel_int_med=(\"rel_intensity_total\", \"median\"),\n",
" gross_flow_med=(\"gross_flow_qty_mean\", \"median\"),\n",
" n_tx_med=(\"n_tx_total\", \"median\"),\n",
" vol_med=(\"net_flow_qty_vol\", \"median\"),\n",
" )\n",
" .sort_values(\"n_clients\", ascending=False)\n",
")\n",
"\n",
"k_profile\n"
]
},
{
"cell_type": "markdown",
"id": "7702b855-8c02-486e-9c28-0e8c1b872a73",
"metadata": {},
"source": [
"## PARTIE 3 — GMM (Gaussian Mixture Model)"
]
},
{
"cell_type": "markdown",
"id": "810e73ed-f479-4f76-806c-3c18d281a931",
"metadata": {},
"source": [
"BIC/AIC = critères pour choisir le nombre de composantes (clusters) dans un modèle probabiliste.\n",
"Plus bas = meilleur compromis “fit” vs “complexité”."
]
},
{
"cell_type": "code",
"execution_count": 26,
"id": "ba9df953-cc88-4907-9fe7-b88978f318b3",
"metadata": {},
"outputs": [
{
"data": {
"image/png": "iVBORw0KGgoAAAANSUhEUgAAAmEAAAHHCAYAAAD3WI8lAAAAOnRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjEwLjgsIGh0dHBzOi8vbWF0cGxvdGxpYi5vcmcvwVt1zgAAAAlwSFlzAAAPYQAAD2EBqD+naQAAjfBJREFUeJzs3Xd4FFXbx/Hv7G56D6RSQw1VmkDoSgldRAUBBRHLg1gQHhFsqK9Ks4AF7IAIIqgUQUCkCoRO6N3QSSghnbTd8/4Rsw9LEkhgk0m5P9c1F9mZMzO/2SzszcyZM5pSSiGEEEIIIYqUQe8AQgghhBBlkRRhQgghhBA6kCJMCCGEEEIHUoQJIYQQQuhAijAhhBBCCB1IESaEEEIIoQMpwoQQQgghdCBFmBBCCCGEDqQIE0IIIYTQgRRhQoi7sn79ejRNY/369QVed9asWWiaxqlTp+yeKy9vv/02mqYV2f5upGkab7/9ti77Bnjuuefo3Lmz9fWpU6fQNI1Zs2bplsle7PlZqlq1Kj179rz7UDrLyMigUqVKTJ8+Xe8oIg9ShAlxg6ioKJ5//nlq1aqFq6srrq6u1K1blxEjRrBv3z6bttlf5gaDgbNnz+bYVkJCAi4uLmiaxvPPP2+dn/3Fp2ka7733Xq45Bg0ahKZpuLu72/cARaH7448/dC208hIVFcW3337La6+9pneUMiclJYW333471/+oFObnxcHBgVGjRvH++++TmppaKPsQd0eKMCH+tWzZMurXr8+cOXPo1KkTn3zyCdOmTaNbt2788ccfNGrUiNOnT+dYz8nJiZ9++inH/N9+++2W+3N2ds51veTkZJYsWYKzs/OdH4zQzR9//ME777yT67Lr16/zxhtvFHGiLNOmTSMkJIT77rtPl/0Xtscff5zr169TpUoVvaPkkJKSwjvvvJNnEZbX58Uehg4dypUrV5g3b16h7UPcOSnChABOnjzJo48+SpUqVThy5AjTp0/n2Wef5emnn+ajjz7i+PHjTJ06FYMh51+Z7t2751pMzZs3jx49euS5z+7du3Po0CH27t1rM3/JkiWkp6fbXDYSpYOzszMmk6nI95uRkcHcuXPp169fke/bnpKTk/NcZjQacXZ21u1Sc3GilOL69esAeHt706VLl1Jxybk0kiJMCGDy5MkkJyczc+ZMgoKCciw3mUy8+OKLVKpUKceygQMHEhkZyZEjR6zzoqOjWbt2LQMHDsxzn2FhYYSEhOT4H+rcuXPp2rUrvr6++cr+xBNP4O7uzpkzZ+jZsyfu7u5UqFCBL774AoD9+/dz//334+bmRpUqVXL9H/E///zDI488gq+vL66urrRs2ZLly5fnaHfu3Dn69OmDm5sb/v7+vPzyy6SlpeWaa9u2bXTt2hUvLy9cXV1p3749mzdvztcx3Sw6OpqhQ4dSsWJFnJycCAoK4oEHHsjR/2fFihW0bdsWNzc3PDw86NGjBwcPHszXPn788UeaNm2Ki4sLvr6+PProo7leZt62bRvdu3fHx8cHNzc3GjZsyLRp04Cs30X2+559yfnGoiC3PmF79uyhW7dueHp64u7uTseOHdm6datNm+z+Tps3b2bUqFH4+fnh5ubGgw8+yOXLl297bJs2beLKlSt06tQpX+/F2rVrre+jt7c3DzzwAIcPH7Yu37dvH5qmsXTpUuu8Xbt2oWkaTZo0sdlWt27daNGihc28/Pyesj/XJ0+epHv37nh4eDBo0KA8M+fWJ2znzp2Eh4dTvnx5XFxcCAkJ4cknn8zXewDw559/0qhRI5ydnalbt26uZ7fj4uIYOXIklSpVwsnJiRo1ajBp0iQsFguQ1f3Az88PgHfeecf6mXj77bdv+3mxWCxMnTqVevXq4ezsTEBAAM8++yzXrl2zyZDdh23VqlU0a9YMFxcXvvrqK+vyzp07s2nTJmJjY/N97KJoSBEmBFmXImvUqJHjyyI/2rVrR8WKFW2Km59//hl3d/dbngkDGDBgAPPnz0cpBcCVK1f4888/b1m85cZsNtOtWzcqVarE5MmTqVq1Ks8//zyzZs2ia9euNGvWjEmTJuHh4cHgwYOJioqyrhsTE0OrVq1YtWoVzz33nLX/SO/evVm0aJG13fXr1+nYsSOrVq3i+eef5/XXX+fvv/9mzJgxOfKsXbuWdu3akZCQwPjx4/nggw+Ii4vj/vvvZ/v27QU6NoCHHnqIRYsWMXToUKZPn86LL75IYmIiZ86csbaZM2cOPXr0wN3dnUmTJvHmm29y6NAh2rRpc9vO2u+//z6DBw+mZs2afPzxx4wcOZI1a9bQrl074uLirO1Wr15Nu3btOHToEC+99BIfffQR9913H8uWLQPg2WeftZ7BnDNnjnXKy8GDB2nbti179+5lzJgxvPnmm0RFRdGhQwe2bduWo/0LL7zA3r17GT9+PMOHD+f333+36W+Yly1btqBpGo0bN75t27/++ovw8HAuXbrE22+/zahRo9iyZQutW7e2vo/169fH29ubjRs3Wtf7+++/MRgM7N27l4SEBCCriNiyZQvt2rWztivI7ykzM5Pw8HD8/f358MMPeeihh26bP9ulS5fo0qULp06dYuzYsXz22WcMGjQoR4Gbl+PHj9O/f3+6devGhAkTMJlMPPLII6xevdraJiUlhfbt2/Pjjz8yePBgPv30U1q3bs24ceMYNWoUAH5+fsyYMQOABx980PqZ6Nu3720/L88++yyvvPIKrVu3Ztq0aQwdOpS5c+cSHh5ORkaGTd6jR48yYMAAOnfuzLRp02jUqJF1WdOmTVFKsWXLlny/f6KIKCHKuPj4eAWoPn365Fh27do1dfnyZeuUkpJiXTZ+/HgFqMuXL6v//ve/qkaNGtZl9957rxo6dKhSSilAjRgxwrosKipKAWrKlCnqwIEDClB///23UkqpL774Qrm7u6vk5GQ1ZMgQ5ebmdtv8Q4YMUYD64IMPbHK7uLgoTdPU/PnzrfOPHDmiADV+/HjrvJEjR9pkUEqpxMREFRISoqpWrarMZrNSSqmpU6cqQC1YsMDaLjk5WdWoUUMBat26dUoppSwWi6pZs6YKDw9XFovF2jYlJUWFhISozp07W+fNnDlTASoqKirP47t27Zr1/cpLYmKi8vb2Vk8//bTN/OjoaOXl5WUzP/v3lu3UqVPKaDSq999/32bd/fv3K5PJZJ2fmZmpQkJCVJUqVdS1a9ds2t54nCNGjFB5/dN683vfp08f5ejoqE6ePGmdd+HCBeXh4aHatWtnnZf9PnXq1MlmXy+//LIyGo0qLi4ur7dGKaXUY489psqVK5djfvZncebMmdZ5jRo1Uv7+/urq1avWeXv37lUGg0ENHjzYOq9Hjx6qefPm1td9+/ZVffv2VUajUa1YsUIppdTu3bsVoJYsWaKUKtjvKftzPXbs2FseW7abP0uLFi1SgNqxY0e+1r9RlSpVFKB+/fVX67z4+HgVFBSkGjdubJ33f//3f8rNzU0dO3bMZv2xY8cqo9Gozpw5o5RS6vLlyzl+99ny+rz8/fffClBz5861mb9y5coc87Pzrly5MtfjuXDhggLUpEmTbn/wokjJmTBR5mX/rz23OxE7dOiAn5+fdcq+dHCzgQMHcuLECXbs2GH9Mz9ns+rVq0fDhg2tfcrmzZvHAw88gKura4GP46mnnrL+7O3tTe3atXFzc7PpB1S7dm28vb35559/rPP++OMPmjdvTps2bazz3N3deeaZZzh16hSHDh2ytgsKCuLhhx+2tnN1deWZZ56xyREZGcnx48cZOHAgV69e5cqVK1y5coXk5GQ6duzIxo0brZdq8sPFxQVHR0fWr1+f4zJMttWrVxMXF8eAAQOs+7ty5QpGo5EWLVqwbt26PLf/22+/YbFY6Nevn826gYGB1KxZ07runj17iIqKYuTIkXh7e9ts4076IZnNZv7880/69OlDtWrVrPODgoIYOHAgmzZtsn42sz3zzDM2+2rbti1msznXG0ZudPXqVXx8fG6b6eLFi0RGRvLEE0/YXA5v2LAhnTt35o8//rDZ9+7du639tDZt2kT37t1p1KgRf//
"text/plain": [
"<Figure size 640x480 with 1 Axes>"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"Best K by BIC: 10\n"
]
}
],
"source": [
"bics, aics = [], []\n",
"models = {}\n",
"\n",
"for k in k_range:\n",
" gmm = GaussianMixture(n_components=k, covariance_type=\"full\", random_state=42)\n",
" gmm.fit(X_scaled)\n",
" bics.append(gmm.bic(X_scaled))\n",
" aics.append(gmm.aic(X_scaled))\n",
" models[k] = gmm\n",
"\n",
"plt.figure()\n",
"plt.plot(list(k_range), bics, marker=\"o\", label=\"BIC\")\n",
"plt.plot(list(k_range), aics, marker=\"o\", label=\"AIC\")\n",
"plt.xlabel(\"Number of components K\")\n",
"plt.ylabel(\"Criterion value\")\n",
"plt.title(\"GMM model selection (lower is better)\")\n",
"plt.legend()\n",
"plt.show()\n",
"\n",
"best_k_gmm = list(k_range)[int(np.argmin(bics))]\n",
"print(\"Best K by BIC:\", best_k_gmm)\n"
]
},
{
"cell_type": "markdown",
"id": "ce9b76de-2191-4528-8913-c2bb1911e97c",
"metadata": {},
"source": [
"Graphe :\n",
"- BIC/AIC continuent de baisser jusquà K=10. ==> Donc “Best K by BIC = 10”.\n",
"\n",
"Attention importante :\n",
"Quand BIC baisse toujours avec K, ça peut signifier :\n",
"- les données ont beaucoup de structure fine / multi-groupes\n",
"- OU le modèle “découpe” surtout des outliers / sous-groupes très rares\n",
"- OU le modèle est en train de sur-segmenter des extrêmes (typique finance)\n",
"Donc K=10 nest pas automatiquement “mieux” au sens business, cest “mieux” pour le fit probabiliste.\n"
]
},
{
"cell_type": "code",
"execution_count": 27,
"id": "47e0a0bd-0a26-4e56-91cb-5780a3c1fb86",
"metadata": {},
"outputs": [
{
"data": {
"image/png": "iVBORw0KGgoAAAANSUhEUgAAAkQAAAHHCAYAAABeLEexAAAAOnRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjEwLjgsIGh0dHBzOi8vbWF0cGxvdGxpYi5vcmcvwVt1zgAAAAlwSFlzAAAPYQAAD2EBqD+naQAAUNNJREFUeJzt3XlYVGX/P/D3sA0IzuACDCgCaiqYS6LiuC8kKlY+akpuuG+giSVJuZKKaa65ZRr4qGRZaiWJIm6puJGkovCgolgKmAqjpqz374++nJ8jqEDAoOf9uq65nubcn3PO59zQw7tzzpxRCCEEiIiIiGTMyNANEBERERkaAxERERHJHgMRERERyR4DEREREckeAxERERHJHgMRERERyR4DEREREckeAxERERHJHgMRERERyR4DERGVC2dnZwwfPtzQbdBzLF68GHXr1oWxsTGaN28OoPg/t7CwMCgUCly7dq1ceySqKAxERKWQnJwMf39/NGjQAFWqVEGVKlXg5uYGPz8/nDt3Tq92zpw5UCgUMDIywo0bNwptS6fTwcLCAgqFAv7+/tLya9euQaFQQKFQYN68eUX2MXjwYCgUClhZWZXtAVKpXbx4EXPmzKn0QWHfvn0IDAxEu3btEBoaigULFhi6JSKDMjF0A0Qvm927d2PgwIEwMTHB4MGD0axZMxgZGSEhIQE7duzA2rVrkZycDCcnJ731lEolvvnmGwQGBuot37Fjx3P3Z25ujm+++QYzZszQW/7w4UP8+OOPMDc3L5sDK2OJiYkwMpLff3NdvHgRc+fORefOneHs7Gzodp7pwIEDMDIywsaNG2FmZiYtl+vPjYi/9UQlcOXKFfj4+MDJyQkJCQlYs2YNxo0bhzFjxmDJkiVISkrC8uXLi/yD0qtXL3zzzTeFloeHh8Pb2/uZ++zVqxcuXryI33//XW/5jz/+iOzsbLz55pv//sDKgVKphKmpqaHboGdIT0+HhYWFXhgC+HMj+WIgIiqBRYsW4eHDhwgNDYW9vX2hcRMTE0yePBmOjo6FxgYNGoS4uDgkJCRIy1JTU3HgwAEMGjTomfvUarVwcXFBeHi43vKtW7eiR48eqF69erF6P3fuHIYPH466devC3NwcGo0GI0eOxJ07d/Tq7t+/jylTpsDZ2RlKpRK2trZ488038dtvv0k1SUlJ6NevHzQaDczNzVG7dm34+PggMzNTqinqXpRz586hU6dOsLCwQO3atTFv3jyEhoYWuhfF2dkZvXv3xtGjR9G6dWuYm5ujbt26+O9//6u3vYL7WI4ePYrJkyfDxsYG1tbWGDduHLKzs5GRkYFhw4ahWrVqqFatGgIDAyGE0NtGfn4+li9fjsaNG8Pc3Bx2dnYYN24c7t27p1dXnJ7CwsLw7rvvAgC6dOkiXfI8dOjQc382CQkJGDBgAGxsbGBhYYGGDRvik08+0as5e/YsevbsCZVKBSsrK3Tr1g0nTpwocj6OHTuGqVOnwsbGBpaWlvjPf/6D27dvS3UKhQKhoaF4+PCh1GNYWNgzf27x8fHo2rWr3s8tPz+/yGPZs2cPOnToAEtLS1StWhXe3t6Ij4/Xqxk+fDisrKzw559/ok+fPrCysoKNjQ0+/PBD5OXl6dXm5+djxYoVaNKkCczNzWFjY4MePXrgzJkzenVbtmyBu7s7LCwsUL16dfj4+BR5iZroWXjJjKgEdu/ejfr168PDw6PE63bs2BG1a9dGeHg4goODAQDffvstrKysnnuGCADee+89bNmyBQsXLoRCocBff/2Fffv2YfPmzYiMjCzW/qOionD16lWMGDECGo0G8fHxWL9+PeLj43HixAkoFAoAwPjx4/H999/D398fbm5uuHPnDo4ePYpLly6hRYsWyM7OhpeXF7KysjBp0iRoNBr8+eef2L17NzIyMqBWq4vc/59//imFhKCgIFhaWmLDhg1QKpVF1l++fBn9+/fHqFGj4Ovri6+//hrDhw+Hu7s7GjdurFdb0MfcuXNx4sQJrF+/HtbW1jh+/Djq1KmDBQsW4JdffsHixYvx+uuvY9iwYdK648aNQ1hYGEaMGIHJkycjOTkZq1atwtmzZ3Hs2DG9syUv6qljx46YPHkyVq5ciY8//hiurq4AIP1vUc6dO4cOHTrA1NQUY8eOhbOzM65cuYKff/4Z8+fPB/BPIOnQoQNUKhUCAwNhamqKL7/8Ep07d8bhw4cL/T5OmjQJ1apVw+zZs3Ht2jUsX74c/v7++PbbbwEAmzdvxvr163Hq1Cls2LABANC2bdsi+0tNTUWXLl2Qm5uL6dOnw9LSEuvXr4eFhUWh2s2bN8PX1xdeXl747LPP8Pfff2Pt2rVo3749zp49q3cJMS8vD15eXvDw8MDnn3+O/fv3Y8mSJahXrx4mTJgg1Y0aNQphYWHo2bMnRo8ejdzcXPz66684ceIEWrZsCQCYP38+Zs6ciQEDBmD06NG4ffs2vvjiC3Ts2BFnz56FtbX1M+efSCKIqFgyMzMFANGnT59CY/fu3RO3b9+WXn///bc0Nnv2bAFA3L59W3z44Yeifv360lirVq3EiBEjhBBCABB+fn7SWHJysgAgFi9eLC5cuCAAiF9//VUIIcTq1auFlZWVePjwofD19RWWlpYv7P/Jngp88803AoA4cuSItEytVuv18bSzZ88KAGL79u3P3Z+Tk5Pw9fWV3k+aNEkoFApx9uxZadmdO3dE9erVBQCRnJyst+7TfaWnpwulUik++OADaVloaKgAILy8vER+fr60XKvVCoVCIcaPHy8ty83NFbVr1xadOnWSlv36668CgNi6date75GRkYWWF7en7du3CwDi4MGDz52fAh07dhRVq1YV169f11v+5PH06dNHmJmZiStXrkjLbt68KapWrSo6duxYaD48PT311g8ICBDGxsYiIyNDWvas35unf25TpkwRAMTJkyf1jlutVuv93O7fvy+sra3FmDFj9LaXmpoq1Gq13nJfX18BQAQHB+vVvvHGG8Ld3V16f+DAAQFATJ48uVCfBcd37do1YWxsLObPn683fv78eWFiYlJoOdGz8JIZUTHpdDoAKPITXZ07d4aNjY30Wr16dZHbGDRoEC5fvozTp09L//u8y2UFGjdujKZNm0r3IIWHh+Odd95BlSpVit3/k/9F//jxY/z1119o06YNAOhdDrO2tsbJkydx8+bNIrdTcAZo7969+Pvvv4u9/8jISGi1Wunj3QBQvXp1DB48uMh6Nzc3dOjQQXpvY2ODhg0b4urVq4VqR40aJZ3hAgAPDw8IITBq1ChpmbGxMVq2bKm3/vbt26FWq/Hmm2/ir7/+kl7u7u6wsrLCwYMHS91Tcdy+fRtHjhzByJEjUadOHb2xguPJy8vDvn370KdPH9StW1cat7e3x6BBg3D06FHpd7PA2LFj9eajQ4cOyMvLw/Xr10vc4y+//II2bdqgdevW0jIbG5tCP7eoqChkZGTgvffe05tLY2NjeHh4FJpL4J+zkU/q0KGD3lz+8MMPUCgUmD17dqF1C45vx44dyM/Px4ABA/T2q9Fo8NprrxW5X6Ki8JIZUTFVrVoVAPDgwYNCY19++SXu37+PtLQ0DBky5JnbeOONN9CoUSOEh4fD2toaGo0GXbt2Ldb+Bw0ahCVLliAgIADHjx/Hxx9/XKL+7969i7lz52Lbtm1IT0/XG3vy3p9FixbB19cXjo6OcHd3R69evTBs2DDpj7GLiwumTp2KpUuXYuvWrejQoQPefvttDBky5JmXywDg+vXr0Gq1hZbXr1+/yPqnAwIAVKtWrdC9PUXVFvTx9L1carVab/2kpCRkZmbC1ta2yB6enqeS9FQcBX/8X3/99WfW3L59G3///TcaNmxYaMzV1RX5+fm4ceOG3mXEp/usVq0aAJSqz+vXrxd5ifjpfpKSkgDgmb/PKpVK733B/UBP9/lkj1euXIGDg8Nz75NLSkqCEAKvvfZakeO8QZyKi4GIqJjUajXs7e1x4cKFQmMFfzCK8+yZQYMGYe3atahatSoGDhxY7I84v/feewgKCsKYMWNQo0YNdO/evUT9DxgwAMe
"text/plain": [
"<Figure size 640x480 with 1 Axes>"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"Share of ambiguous assignments (max prob < 0.6): 1.10%\n"
]
}
],
"source": [
"gmm = models[best_k_gmm]\n",
"labels_gmm = gmm.predict(X_scaled)\n",
"proba = gmm.predict_proba(X_scaled)\n",
"max_proba = proba.max(axis=1)\n",
"\n",
"dfc.loc[X.index, \"cluster_gmm\"] = labels_gmm\n",
"dfc.loc[X.index, \"cluster_confidence\"] = max_proba\n",
"\n",
"# Confidence histogram\n",
"plt.figure()\n",
"plt.hist(max_proba, bins=50)\n",
"plt.xlabel(\"Max posterior probability\")\n",
"plt.ylabel(\"Count\")\n",
"plt.title(\"GMM assignment confidence\")\n",
"plt.show()\n",
"\n",
"ambiguous_share = (max_proba < 0.6).mean()\n",
"print(f\"Share of ambiguous assignments (max prob < 0.6): {ambiguous_share:.2%}\")\n"
]
},
{
"cell_type": "code",
"execution_count": 28,
"id": "d4254585-959f-4f79-9c08-73e6d7be25c5",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>n_clients</th>\n",
" <th>aum_qty_med</th>\n",
" <th>freq_med</th>\n",
" <th>rel_int_med</th>\n",
" <th>gross_flow_med</th>\n",
" <th>n_tx_med</th>\n",
" <th>vol_med</th>\n",
" <th>conf_med</th>\n",
" </tr>\n",
" <tr>\n",
" <th>cluster_gmm</th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>9.0</th>\n",
" <td>3509</td>\n",
" <td>7.949166e+01</td>\n",
" <td>0.041667</td>\n",
" <td>3.071429e+00</td>\n",
" <td>6.687053e+00</td>\n",
" <td>2.0</td>\n",
" <td>3.277044e+01</td>\n",
" <td>0.998364</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1.0</th>\n",
" <td>2953</td>\n",
" <td>9.121500e+01</td>\n",
" <td>0.000000</td>\n",
" <td>0.000000e+00</td>\n",
" <td>0.000000e+00</td>\n",
" <td>0.0</td>\n",
" <td>0.000000e+00</td>\n",
" <td>0.999999</td>\n",
" </tr>\n",
" <tr>\n",
" <th>6.0</th>\n",
" <td>1426</td>\n",
" <td>5.339202e+02</td>\n",
" <td>0.224745</td>\n",
" <td>2.483775e+00</td>\n",
" <td>2.762873e+01</td>\n",
" <td>38.0</td>\n",
" <td>7.746854e+01</td>\n",
" <td>0.999916</td>\n",
" </tr>\n",
" <tr>\n",
" <th>5.0</th>\n",
" <td>787</td>\n",
" <td>1.454968e+04</td>\n",
" <td>0.673267</td>\n",
" <td>5.310471e+00</td>\n",
" <td>1.064410e+03</td>\n",
" <td>485.0</td>\n",
" <td>1.867233e+03</td>\n",
" <td>0.999937</td>\n",
" </tr>\n",
" <tr>\n",
" <th>0.0</th>\n",
" <td>568</td>\n",
" <td>5.309168e+03</td>\n",
" <td>0.087500</td>\n",
" <td>6.195235e+00</td>\n",
" <td>6.734335e+02</td>\n",
" <td>5.0</td>\n",
" <td>2.910116e+03</td>\n",
" <td>0.998759</td>\n",
" </tr>\n",
" <tr>\n",
" <th>8.0</th>\n",
" <td>223</td>\n",
" <td>1.840341e+05</td>\n",
" <td>0.680851</td>\n",
" <td>6.853229e+00</td>\n",
" <td>1.521262e+04</td>\n",
" <td>418.0</td>\n",
" <td>3.652430e+04</td>\n",
" <td>1.000000</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4.0</th>\n",
" <td>198</td>\n",
" <td>1.656004e+05</td>\n",
" <td>1.000000</td>\n",
" <td>4.315459e+00</td>\n",
" <td>8.092373e+03</td>\n",
" <td>3570.0</td>\n",
" <td>9.626152e+03</td>\n",
" <td>0.999323</td>\n",
" </tr>\n",
" <tr>\n",
" <th>7.0</th>\n",
" <td>77</td>\n",
" <td>1.109350e+00</td>\n",
" <td>0.130435</td>\n",
" <td>1.462797e+02</td>\n",
" <td>9.281811e+01</td>\n",
" <td>15.0</td>\n",
" <td>2.553362e+02</td>\n",
" <td>1.000000</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2.0</th>\n",
" <td>1</td>\n",
" <td>7.941176e-05</td>\n",
" <td>0.014706</td>\n",
" <td>1.257232e+06</td>\n",
" <td>1.468240e+00</td>\n",
" <td>1.0</td>\n",
" <td>1.210741e+01</td>\n",
" <td>1.000000</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3.0</th>\n",
" <td>1</td>\n",
" <td>1.319632e+08</td>\n",
" <td>1.000000</td>\n",
" <td>3.814096e+00</td>\n",
" <td>3.871695e+06</td>\n",
" <td>27668.0</td>\n",
" <td>1.194728e+07</td>\n",
" <td>1.000000</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" n_clients aum_qty_med freq_med rel_int_med gross_flow_med \\\n",
"cluster_gmm \n",
"9.0 3509 7.949166e+01 0.041667 3.071429e+00 6.687053e+00 \n",
"1.0 2953 9.121500e+01 0.000000 0.000000e+00 0.000000e+00 \n",
"6.0 1426 5.339202e+02 0.224745 2.483775e+00 2.762873e+01 \n",
"5.0 787 1.454968e+04 0.673267 5.310471e+00 1.064410e+03 \n",
"0.0 568 5.309168e+03 0.087500 6.195235e+00 6.734335e+02 \n",
"8.0 223 1.840341e+05 0.680851 6.853229e+00 1.521262e+04 \n",
"4.0 198 1.656004e+05 1.000000 4.315459e+00 8.092373e+03 \n",
"7.0 77 1.109350e+00 0.130435 1.462797e+02 9.281811e+01 \n",
"2.0 1 7.941176e-05 0.014706 1.257232e+06 1.468240e+00 \n",
"3.0 1 1.319632e+08 1.000000 3.814096e+00 3.871695e+06 \n",
"\n",
" n_tx_med vol_med conf_med \n",
"cluster_gmm \n",
"9.0 2.0 3.277044e+01 0.998364 \n",
"1.0 0.0 0.000000e+00 0.999999 \n",
"6.0 38.0 7.746854e+01 0.999916 \n",
"5.0 485.0 1.867233e+03 0.999937 \n",
"0.0 5.0 2.910116e+03 0.998759 \n",
"8.0 418.0 3.652430e+04 1.000000 \n",
"4.0 3570.0 9.626152e+03 0.999323 \n",
"7.0 15.0 2.553362e+02 1.000000 \n",
"2.0 1.0 1.210741e+01 1.000000 \n",
"3.0 27668.0 1.194728e+07 1.000000 "
]
},
"execution_count": 28,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"g_profile = (\n",
" dfc.loc[X.index]\n",
" .groupby(\"cluster_gmm\")\n",
" .agg(\n",
" n_clients=(ID_COL, \"count\"),\n",
" aum_qty_med=(\"aum_qty_mean\", \"median\"),\n",
" freq_med=(\"frequency\", \"median\"),\n",
" rel_int_med=(\"rel_intensity_total\", \"median\"),\n",
" gross_flow_med=(\"gross_flow_qty_mean\", \"median\"),\n",
" n_tx_med=(\"n_tx_total\", \"median\"),\n",
" vol_med=(\"net_flow_qty_vol\", \"median\"),\n",
" conf_med=(\"cluster_confidence\", \"median\"),\n",
" )\n",
" .sort_values(\"n_clients\", ascending=False)\n",
")\n",
"\n",
"g_profile\n"
]
},
{
"cell_type": "code",
"execution_count": 29,
"id": "84d1790a-7901-473e-b367-b8464886ada2",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Silhouette K-means: 0.5887519415047723\n",
"Silhouette GMM: 0.138325887586186\n"
]
}
],
"source": [
"#comapraison kmeans et gmm\n",
"\n",
"from sklearn.metrics import silhouette_score\n",
"\n",
"sil_km = silhouette_score(X_scaled, dfc.loc[X.index, \"cluster_kmeans\"])\n",
"sil_gmm = silhouette_score(X_scaled, dfc.loc[X.index, \"cluster_gmm\"])\n",
"\n",
"print(\"Silhouette K-means:\", sil_km)\n",
"print(\"Silhouette GMM:\", sil_gmm)\n"
]
},
{
"cell_type": "code",
"execution_count": 30,
"id": "c4576042-09e9-4f03-b99b-883d8fa3426f",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"DB index K-means (lower better): 0.5656170003935967\n",
"DB index GMM (lower better): 1.4187135041840173\n"
]
}
],
"source": [
"''' \n",
"DaviesBouldin index (plus bas = mieux)\n",
"Mesure similarité entre clusters.\n",
"Plus bas = clusters plus distincts.'''\n",
"\n",
"from sklearn.metrics import davies_bouldin_score\n",
"\n",
"db_km = davies_bouldin_score(X_scaled, dfc.loc[X.index, \"cluster_kmeans\"])\n",
"db_gmm = davies_bouldin_score(X_scaled, dfc.loc[X.index, \"cluster_gmm\"])\n",
"\n",
"print(\"DB index K-means (lower better):\", db_km)\n",
"print(\"DB index GMM (lower better):\", db_gmm)\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "7d4d8264-359f-40c6-bb4d-4e26d1d74621",
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.13.11"
}
},
"nbformat": 4,
"nbformat_minor": 5
}