Project_Carmignac/clustering_global.ipynb

2065 lines
1.1 MiB
Plaintext
Raw Normal View History

2026-04-05 17:52:42 +02:00
{
"cells": [
{
"cell_type": "markdown",
"id": "f6ea29f1",
"metadata": {},
"source": [
"# Global Clustering \n",
"\n",
"**Sections:**\n",
"0. Imports & Data Loading\n",
"1. Monthly Panel Construction\n",
"2. Feature Engineering (base + enriched)\n",
"3. Global Clustering (all active accounts)"
]
},
{
"cell_type": "markdown",
"id": "e727f666",
"metadata": {},
"source": [
"## 0. Imports & Data Loading"
]
},
{
"cell_type": "code",
"execution_count": 1,
"id": "9314f229-0b5d-4a4c-846c-869847d32c73",
"metadata": {},
"outputs": [],
"source": [
"import os\n",
"import s3fs\n",
"\n",
"# SECURITY: credentials must never be hard-coded in a notebook.\n",
"# They are read from the environment: export AWS_ACCESS_KEY_ID, AWS_SECRET_ACCESS_KEY and\n",
"# (for temporary credentials) AWS_SESSION_TOKEN before starting the kernel.\n",
"# Any key or token that was previously committed in this cell must be revoked.\n",
"S3_ENDPOINT_URL = 'https://minio-simple.lab.groupe-genes.fr'\n",
"\n",
"_missing_vars = [\n",
"    name for name in (\"AWS_ACCESS_KEY_ID\", \"AWS_SECRET_ACCESS_KEY\")\n",
"    if not os.environ.get(name)\n",
"]\n",
"if _missing_vars:\n",
"    raise RuntimeError(\n",
"        \"Missing S3 credentials in the environment: \" + \", \".join(_missing_vars)\n",
"    )\n",
"\n",
"os.environ[\"AWS_DEFAULT_REGION\"] = 'us-east-1'\n",
"fs = s3fs.S3FileSystem(\n",
"    client_kwargs={'endpoint_url': S3_ENDPOINT_URL},\n",
"    key=os.environ[\"AWS_ACCESS_KEY_ID\"],\n",
"    secret=os.environ[\"AWS_SECRET_ACCESS_KEY\"],\n",
"    # The session token is optional: it is passed only when the environment defines it.\n",
"    token=os.environ.get(\"AWS_SESSION_TOKEN\"),\n",
")"
]
},
{
"cell_type": "code",
"execution_count": 2,
"id": "61e33897",
"metadata": {},
"outputs": [],
"source": [
"import warnings\n",
"warnings.filterwarnings(\"ignore\")\n",
"\n",
"import numpy as np\n",
"import pandas as pd\n",
"import matplotlib.pyplot as plt\n",
"import seaborn as sns\n",
"\n",
"from sklearn.preprocessing import RobustScaler\n",
"from sklearn.cluster import KMeans\n",
"from sklearn.metrics import silhouette_score, davies_bouldin_score, pairwise_distances\n",
"from sklearn.linear_model import LinearRegression\n",
"\n",
"sns.set_style(\"whitegrid\")\n",
"pd.set_option(\"display.max_columns\", 100)\n",
"\n",
"# Numerical guard and seed shared by the whole notebook\n",
"EPS = 1e-9\n",
"RANDOM_STATE = 42\n",
"\n",
"# --- Columns of the flows / AUM extracts ---\n",
"ID_COL = \"Registrar Account - ID\"\n",
"REGION_COL = \"Registrar Account - Region\"\n",
"COUNTRY_COL = \"RegistrarAccount - Country\"\n",
"ISIN_COL = \"Product - Isin\"\n",
"FUND_COL = \"Product - Fund\"\n",
"ASSET_COL = \"Product - Asset Type\"\n",
"FLOW_DATE_COL = \"Centralisation Date\"\n",
"AUM_DATE_COL = FLOW_DATE_COL  # both extracts use the same date column name\n",
"FLOW_QTY_COL = \"Quantity - NetFlows\"\n",
"FLOW_SUB_COL = \"Quantity - Subscription\"\n",
"FLOW_RED_COL = \"Quantity - Redemption\"\n",
"AUM_QTY_COL = \"Quantity - AUM\"\n",
"AUM_VAL_COL = \"Value - AUM €\"\n",
"\n",
"# --- Columns of the NAV / benchmark table ---\n",
"NAV_DATE_COL = \"Dat\"\n",
"NAV_ISIN_COL = \"Isin\"\n",
"NAV_PRICE_COL = \"Price (TF PartPrice)\"\n",
"NAV_BENCH_COL = \"PriceBench\"\n",
"\n",
"# --- Columns of the rates table ---\n",
"RATE_DATE_COL = \"Date\"\n",
"RATE_VAL_COL = \"Yld to Maturity\"\n",
"\n",
"# --- External data locations (bucket projet-bdc-data) ---\n",
"_CARMIGNAC_ROOT = \"s3://projet-bdc-data/carmignac/Data Modélisation\"\n",
"PATH_NAV = f\"{_CARMIGNAC_ROOT}/Nav/NAV_Bench_data.csv\"  # product valuation / performance table\n",
"PATH_RATES = f\"{_CARMIGNAC_ROOT}/market data/esterRates.csv\"\n",
"\n",
"# Optional competitor data\n",
"PATH_COMP_FLOWS = f\"{_CARMIGNAC_ROOT}/competitors/daily_estimated_flows.csv\"\n",
"PATH_COMP_PERF = f\"{_CARMIGNAC_ROOT}/competitors/weekly_perf_full.csv\"\n",
"PATH_PEERS = \"s3://projet-bdc-carmignac-g3/peers/CAD_peers.csv\""
]
},
{
"cell_type": "code",
"execution_count": 3,
"id": "eb3b2908",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"flows: (2574461, 26)\n",
"aum: (4880297, 19)\n",
"nav: (623914, 6)\n"
]
}
],
"source": [
"# NOTE(review): flows.csv is read from the working directory while every other input\n",
"# comes from S3 - confirm this is the intended source.\n",
"df_flows = pd.read_csv(\"flows.csv\", low_memory=False)\n",
"df_aum = pd.read_csv(\"s3://projet-bdc-carmignac-g3/paco/AUM_repaired.csv\", low_memory=False)\n",
"df_nav = pd.read_csv(PATH_NAV, sep=\";\")\n",
"df_rates = pd.read_csv(PATH_RATES, sep=\";\")\n",
"\n",
"# Parse each table's date column and derive its month bucket (timestamp of the month start).\n",
"# Unparseable dates are coerced to NaT, which also leaves \"month\" missing for that row.\n",
"for df, col in [\n",
"    (df_flows, FLOW_DATE_COL), (df_aum, AUM_DATE_COL),\n",
"    (df_nav, NAV_DATE_COL), (df_rates, RATE_DATE_COL),\n",
"]:\n",
"    df[col] = pd.to_datetime(df[col], errors=\"coerce\")\n",
"    df[\"month\"] = df[col].dt.to_period(\"M\").dt.to_timestamp()\n",
"\n",
"# Numeric coercion (non-numeric entries become NaN)\n",
"for col in [FLOW_QTY_COL, FLOW_SUB_COL, FLOW_RED_COL]:\n",
"    df_flows[col] = pd.to_numeric(df_flows[col], errors=\"coerce\")\n",
"for col in [AUM_QTY_COL, AUM_VAL_COL]:\n",
"    df_aum[col] = pd.to_numeric(df_aum[col], errors=\"coerce\")\n",
"for col in [NAV_PRICE_COL, NAV_BENCH_COL]:\n",
"    df_nav[col] = pd.to_numeric(df_nav[col], errors=\"coerce\")\n",
"df_rates[RATE_VAL_COL] = pd.to_numeric(df_rates[RATE_VAL_COL], errors=\"coerce\")\n",
"\n",
"# ISIN as stripped string. Missing ISINs are kept as NaN: a plain astype(str) would turn\n",
"# them into the literal \"nan\", which a later dropna(subset=[..., ISIN_COL, ...]) cannot remove.\n",
"for df in [df_flows, df_aum]:\n",
"    df[ISIN_COL] = df[ISIN_COL].astype(str).str.strip().where(df[ISIN_COL].notna())\n",
"df_nav[NAV_ISIN_COL] = (\n",
"    df_nav[NAV_ISIN_COL].astype(str).str.strip().where(df_nav[NAV_ISIN_COL].notna())\n",
")\n",
"\n",
"print(\"flows:\", df_flows.shape)\n",
"print(\"aum: \", df_aum.shape)\n",
"print(\"nav: \", df_nav.shape)"
]
},
{
"cell_type": "markdown",
"id": "5929db69",
"metadata": {},
"source": [
"## 1. Monthly Panel Construction\n",
"\n",
"Build a full outer join of AUM and flows at (account, ISIN, month) granularity,\n",
"then enrich with NAV performance and macro rates."
]
},
{
"cell_type": "code",
"execution_count": 73,
"id": "d36d0a70",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Panel shape: (4791501, 20)\n"
]
}
],
"source": [
"# --- Flows aggregated to monthly: one row per (account, ISIN, month) ---\n",
"# NOTE(review): gross_flow_qty sums |net flow| per transaction row, it is not\n",
"# subscriptions + redemptions; ratios such as sub_qty / gross_flow_qty may therefore\n",
"# fall outside [0, 1] - confirm this definition is intended.\n",
"df_flows_m = (\n",
"    df_flows\n",
"    .dropna(subset=[ID_COL, ISIN_COL, \"month\"])\n",
"    .assign(\n",
"        gross_flow_qty = lambda x: x[FLOW_QTY_COL].abs(),\n",
"        sub_qty = lambda x: x[FLOW_SUB_COL].fillna(0),\n",
"        red_qty = lambda x: x[FLOW_RED_COL].fillna(0)\n",
"    )\n",
"    .groupby([ID_COL, ISIN_COL, \"month\"], as_index=False)\n",
"    .agg(\n",
"        net_flow_qty = (FLOW_QTY_COL, \"sum\"),\n",
"        gross_flow_qty = (\"gross_flow_qty\", \"sum\"),\n",
"        sub_qty = (\"sub_qty\", \"sum\"),\n",
"        red_qty = (\"red_qty\", \"sum\"),\n",
"        n_tx = (FLOW_QTY_COL, \"size\"),\n",
"        region = (REGION_COL, \"last\"),\n",
"        country = (COUNTRY_COL, \"last\")\n",
"    )\n",
")\n",
"\n",
"# --- AUM aggregated to monthly ---\n",
"# NOTE(review): AUM is summed over all rows of the month; this is only a month-end\n",
"# position if the extract holds one snapshot per (account, ISIN, month) - confirm.\n",
"df_aum_m = (\n",
"    df_aum\n",
"    .dropna(subset=[ID_COL, ISIN_COL, \"month\"])\n",
"    .groupby([ID_COL, ISIN_COL, \"month\"], as_index=False)\n",
"    .agg(\n",
"        aum_qty = (AUM_QTY_COL, \"sum\"),\n",
"        aum_val = (AUM_VAL_COL, \"sum\"),\n",
"        fund = (FUND_COL, \"last\"),\n",
"        asset_type = (ASSET_COL, \"last\"),\n",
"        region = (REGION_COL, \"last\"),\n",
"        country = (COUNTRY_COL, \"last\")\n",
"    )\n",
")\n",
"\n",
"# --- Full outer join on the union of (account, ISIN, month) keys ---\n",
"keys = pd.concat([\n",
"    df_flows_m[[ID_COL, ISIN_COL, \"month\"]],\n",
"    df_aum_m[[ID_COL, ISIN_COL, \"month\"]]\n",
"]).drop_duplicates()\n",
"\n",
"df_rel_m = (\n",
"    keys\n",
"    .merge(df_aum_m, on=[ID_COL, ISIN_COL, \"month\"], how=\"left\")\n",
"    .merge(df_flows_m, on=[ID_COL, ISIN_COL, \"month\"], how=\"left\", suffixes=(\"\", \"_flow\"))\n",
")\n",
"\n",
"# A key present on one side only means \"no position\" / \"no flow\" for that month.\n",
"for c in [\"aum_qty\",\"aum_val\",\"net_flow_qty\",\"gross_flow_qty\",\"sub_qty\",\"red_qty\",\"n_tx\"]:\n",
"    df_rel_m[c] = df_rel_m[c].fillna(0)\n",
"\n",
"# Geography comes from the AUM side first, then from the flows side (\"_flow\" suffix).\n",
"df_rel_m[\"region\"] = df_rel_m[\"region\"].fillna(df_rel_m.get(\"region_flow\"))\n",
"df_rel_m[\"country\"] = df_rel_m[\"country\"].fillna(df_rel_m.get(\"country_flow\"))\n",
"\n",
"# --- Active / holding flags ---\n",
"df_rel_m[\"active_rel_month\"] = (df_rel_m[\"gross_flow_qty\"] > 0).astype(int)\n",
"df_rel_m[\"holding_rel_month\"] = (df_rel_m[\"aum_qty\"] > 0).astype(int)\n",
"\n",
"# Ratios are only defined when there is a position to normalise by. Dividing by\n",
"# (|AUM| + EPS) multiplied every flow of a zero-AUM month by about 1e9, and those values\n",
"# then dominated the per-relationship means and standard deviations. NaN is used instead.\n",
"aum_abs = df_rel_m[\"aum_qty\"].abs()\n",
"df_rel_m[\"flow_to_aum_rel\"] = np.where(aum_abs > 0, df_rel_m[\"net_flow_qty\"] / aum_abs, np.nan)\n",
"df_rel_m[\"turnover_rel\"] = np.where(aum_abs > 0, df_rel_m[\"gross_flow_qty\"] / aum_abs, np.nan)\n",
"\n",
"print(\"Panel shape:\", df_rel_m.shape)"
]
},
{
"cell_type": "code",
"execution_count": 74,
"id": "965d2564",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Enriched panel shape: (4791501, 24)\n"
]
}
],
"source": [
"# --- NAV & benchmark returns ---\n",
"# Keep the last quote of each (ISIN, month). Rows are sorted on the actual NAV date:\n",
"# sorting on \"month\" alone left the order inside a month to the file order, so tail(1)\n",
"# was not guaranteed to be the month-end observation.\n",
"df_nav_m = (\n",
"    df_nav\n",
"    .dropna(subset=[NAV_ISIN_COL, \"month\", NAV_PRICE_COL])\n",
"    .sort_values([NAV_ISIN_COL, NAV_DATE_COL])\n",
"    .groupby([NAV_ISIN_COL, \"month\"], as_index=False)\n",
"    .tail(1).copy()\n",
")\n",
"# NOTE(review): pct_change compares consecutive available months per ISIN; if a month is\n",
"# missing for an ISIN the value is a multi-month return - confirm this is acceptable.\n",
"df_nav_m[\"ret_fund_m\"] = df_nav_m.groupby(NAV_ISIN_COL)[NAV_PRICE_COL].pct_change()\n",
"df_nav_m[\"ret_bench_m\"] = df_nav_m.groupby(NAV_ISIN_COL)[NAV_BENCH_COL].pct_change()\n",
"df_nav_m[\"active_return_m\"] = df_nav_m[\"ret_fund_m\"] - df_nav_m[\"ret_bench_m\"]\n",
"df_nav_m = df_nav_m.rename(columns={NAV_ISIN_COL: ISIN_COL})[\n",
"    [ISIN_COL, \"month\", \"ret_fund_m\", \"ret_bench_m\", \"active_return_m\"]\n",
"]\n",
"\n",
"# --- Interest rates: last observation of each month and its month-on-month change ---\n",
"df_rates_m = (\n",
"    df_rates\n",
"    .dropna(subset=[\"month\", RATE_VAL_COL])\n",
"    .sort_values(RATE_DATE_COL)\n",
"    .groupby(\"month\", as_index=False).tail(1).copy()\n",
")\n",
"df_rates_m[\"delta_rate_m\"] = df_rates_m[RATE_VAL_COL].diff()\n",
"df_rates_m = df_rates_m[[\"month\", RATE_VAL_COL, \"delta_rate_m\"]]\n",
"\n",
"# --- Merge into panel ---\n",
"# Columns from a previous run of this cell are dropped first, so that re-running it does\n",
"# not create ret_fund_m_x / ret_fund_m_y duplicates.\n",
"enrich_cols = [\"ret_fund_m\", \"ret_bench_m\", \"active_return_m\", \"delta_rate_m\"]\n",
"df_rel_m = (\n",
"    df_rel_m\n",
"    .drop(columns=enrich_cols, errors=\"ignore\")\n",
"    .merge(df_nav_m, on=[ISIN_COL, \"month\"], how=\"left\")\n",
"    .merge(df_rates_m[[\"month\",\"delta_rate_m\"]], on=\"month\", how=\"left\")\n",
")\n",
"\n",
"# Missing returns / rate changes are treated as 0 (no signal).\n",
"for c in enrich_cols:\n",
"    df_rel_m[c] = df_rel_m[c].fillna(0)\n",
"\n",
"print(\"Enriched panel shape:\", df_rel_m.shape)"
]
},
{
"cell_type": "markdown",
"id": "1ef7bba8",
"metadata": {},
"source": [
"## 2. Feature Engineering\n",
"\n",
"### 2a. Monthly account-level aggregation\n",
"### 2b. ISIN-level features (where / when investors put their money)\n",
"### 2c. Asset type & fund composition features\n",
"### 2d. Rolling flow metrics (3M / 6M, normalised by AUM)\n",
"### 2e. Static client features\n",
"### 2f. Engineered ratios & quality filters"
]
},
{
"cell_type": "code",
"execution_count": 168,
"id": "db5a297c-78ea-4048-98f8-624612fbb60d",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"df_month shape: (931333, 21)\n",
"ISIN-level client features: (12584, 12)\n",
"Asset shares: (7475, 6)\n",
"Fund shares: (6562, 11)\n",
"Rolling features: (12584, 3)\n",
"df_client shape: (12584, 46)\n",
"After outlier removal: 7179 accounts\n",
"dfc shape: (7179, 60)\n",
"gross_flow_to_aum: min=0.0000, max=270698.4708, nan=336, inf=0\n",
"flow_direction_balance: min=-1.0000, max=1.0000, nan=1069, inf=0\n",
"sub_share_mean: min=-0.0985, max=77.5673, nan=1069, inf=0\n",
"redemption_bias: min=-154.1345, max=0.0828, nan=1069, inf=0\n",
"entry_rate_per_isin: min=0.0217, max=5.0000, nan=0, inf=0\n",
"aum_final_to_peak: min=-2.4841, max=1.0000, nan=0, inf=0\n",
"flow_roll3m_norm: min=-4935000000000.0000, max=1400000000000.0000, nan=0, inf=0\n",
"flow_roll6m_norm: min=-8699999999999.9990, max=322428000000.0000, nan=0, inf=0\n"
]
}
],
"source": [
"# ============================================================\n",
"# 2a. Monthly account-level panel\n",
"# ============================================================\n",
"# Collapse the (account, ISIN, month) panel to one row per (account, month).\n",
"tmp = df_rel_m.assign(\n",
"    isin_held_flag=lambda p: p[\"aum_qty\"].gt(0).astype(int),\n",
"    isin_active_flag=lambda p: p[\"gross_flow_qty\"].gt(0).astype(int),\n",
")\n",
"\n",
"account_month_aggs = {\n",
"    \"aum_qty\": (\"aum_qty\", \"sum\"),\n",
"    \"aum_val\": (\"aum_val\", \"sum\"),\n",
"    \"net_flow_qty\": (\"net_flow_qty\", \"sum\"),\n",
"    \"gross_flow_qty\": (\"gross_flow_qty\", \"sum\"),\n",
"    \"sub_qty\": (\"sub_qty\", \"sum\"),\n",
"    \"red_qty\": (\"red_qty\", \"sum\"),\n",
"    \"n_tx\": (\"n_tx\", \"sum\"),\n",
"    \"n_isin_held\": (\"isin_held_flag\", \"sum\"),\n",
"    \"n_isin_active\": (\"isin_active_flag\", \"sum\"),\n",
"    \"delta_rate_m\": (\"delta_rate_m\", \"first\"),\n",
"    \"region\": (\"region\", \"first\"),\n",
"    \"country\": (\"country\", \"first\"),\n",
"}\n",
"df_month = (\n",
"    tmp.groupby([ID_COL, \"month\"], as_index=False)\n",
"    .agg(**account_month_aggs)\n",
"    .sort_values([ID_COL, \"month\"], ignore_index=True)\n",
")\n",
"\n",
"# Ratios are NaN whenever their denominator is not strictly positive.\n",
"acct_abs_aum = df_month[\"aum_qty\"].abs()\n",
"acct_gross = df_month[\"gross_flow_qty\"]\n",
"\n",
"df_month[\"active_month\"] = acct_gross.gt(0).astype(int)\n",
"df_month[\"flow_to_aum_m\"] = (df_month[\"net_flow_qty\"] / acct_abs_aum).where(acct_abs_aum > 0)\n",
"df_month[\"turnover_m\"] = (acct_gross / acct_abs_aum).where(acct_abs_aum > 0)\n",
"df_month[\"sub_share_m\"] = (df_month[\"sub_qty\"] / acct_gross).where(acct_gross > 0)\n",
"df_month[\"red_share_m\"] = (df_month[\"red_qty\"] / acct_gross).where(acct_gross > 0)\n",
"\n",
"# Running maximum of the account's AUM, and the relative drop from that maximum.\n",
"df_month[\"aum_peak_to_date\"] = df_month.groupby(ID_COL)[\"aum_qty\"].cummax()\n",
"acct_peak = df_month[\"aum_peak_to_date\"]\n",
"df_month[\"aum_drawdown\"] = (1 - df_month[\"aum_qty\"] / acct_peak).where(acct_peak > 0)\n",
"\n",
"print(\"df_month shape:\", df_month.shape)\n",
"\n",
"\n",
"# ============================================================\n",
"# 2b. ISIN-level features\n",
"# ============================================================\n",
"# NOTE(review): shift(1) returns the previous panel row of the (account, ISIN) pair; that is\n",
"# the previous calendar month only if the pair has a row for every month - confirm.\n",
"tmp = df_rel_m.sort_values([ID_COL, ISIN_COL, \"month\"]).copy()\n",
"rel_keys = [ID_COL, ISIN_COL]\n",
"\n",
"tmp[\"prev_aum\"] = tmp.groupby(rel_keys)[\"aum_qty\"].shift(1)\n",
"# Entry: a position appears (no previous row counts as no position). Full exit: it disappears.\n",
"tmp[\"entry_event\"] = (tmp[\"aum_qty\"].gt(0) & tmp[\"prev_aum\"].fillna(0).le(0)).astype(int)\n",
"tmp[\"full_exit_event\"] = (tmp[\"prev_aum\"].gt(0) & tmp[\"aum_qty\"].le(0)).astype(int)\n",
"\n",
"# One-row lag of the fund return, so that flows are compared with past performance only.\n",
"tmp[\"ret_fund_m_lag1\"] = tmp.groupby(rel_keys)[\"ret_fund_m\"].shift(1)\n",
"tmp[\"buy_on_perf\"] = (tmp[\"net_flow_qty\"].gt(0) & tmp[\"ret_fund_m_lag1\"].gt(0)).astype(int)\n",
"tmp[\"sell_on_perf\"] = (tmp[\"net_flow_qty\"].lt(0) & tmp[\"ret_fund_m_lag1\"].lt(0)).astype(int)\n",
"\n",
"relationship_aggs = {\n",
"    \"rel_n_months\": (\"month\", \"nunique\"),\n",
"    \"rel_active_months\": (\"active_rel_month\", \"sum\"),\n",
"    \"rel_holding_months\": (\"holding_rel_month\", \"sum\"),\n",
"    \"rel_aum_mean\": (\"aum_qty\", \"mean\"),\n",
"    \"rel_turnover_mean\": (\"turnover_rel\", \"mean\"),\n",
"    \"rel_turnover_vol\": (\"turnover_rel\", \"std\"),\n",
"    \"rel_flow_to_aum_vol\": (\"flow_to_aum_rel\", \"std\"),\n",
"    \"rel_n_tx\": (\"n_tx\", \"sum\"),\n",
"    \"rel_full_exit_count\": (\"full_exit_event\", \"sum\"),\n",
"    \"rel_entry_count\": (\"entry_event\", \"sum\"),\n",
"    \"buy_on_perf_rate\": (\"buy_on_perf\", \"mean\"),\n",
"    \"sell_on_perf_rate\": (\"sell_on_perf\", \"mean\"),\n",
"}\n",
"df_rel_feat = tmp.groupby(rel_keys, as_index=False).agg(**relationship_aggs)\n",
"\n",
"# Weight of each ISIN in the account's average holdings, then Herfindahl concentration index.\n",
"isin_aum = df_rel_feat.groupby(ID_COL)[\"rel_aum_mean\"].transform(\"sum\")\n",
"df_rel_feat[\"isin_weight\"] = (df_rel_feat[\"rel_aum_mean\"] / isin_aum).where(isin_aum > 0)\n",
"hhi_isin = (\n",
"    df_rel_feat.groupby(ID_COL)[\"isin_weight\"]\n",
"    .agg(lambda w: w.pow(2).sum())\n",
"    .reset_index(name=\"hhi_isin\")\n",
")\n",
"\n",
"client_from_rel_aggs = {\n",
"    \"n_isin_total\": (ISIN_COL, \"nunique\"),\n",
"    \"rel_turnover_mean_avg\": (\"rel_turnover_mean\", \"mean\"),\n",
"    \"rel_turnover_vol_avg\": (\"rel_turnover_vol\", \"mean\"),\n",
"    \"rel_flow_to_aum_vol_avg\": (\"rel_flow_to_aum_vol\", \"mean\"),\n",
"    \"full_exit_count\": (\"rel_full_exit_count\", \"sum\"),\n",
"    \"entry_count\": (\"rel_entry_count\", \"sum\"),\n",
"    \"avg_holding_months_per_isin\": (\"rel_holding_months\", \"mean\"),\n",
"    \"max_holding_months_per_isin\": (\"rel_holding_months\", \"max\"),\n",
"    \"buy_on_perf_rate_avg\": (\"buy_on_perf_rate\", \"mean\"),\n",
"    \"sell_on_perf_rate_avg\": (\"sell_on_perf_rate\", \"mean\"),\n",
"}\n",
"df_rel_client = (\n",
"    df_rel_feat.groupby(ID_COL, as_index=False)\n",
"    .agg(**client_from_rel_aggs)\n",
"    .merge(hhi_isin, on=ID_COL, how=\"left\")\n",
")\n",
"\n",
"print(\"ISIN-level client features:\", df_rel_client.shape)\n",
"\n",
"\n",
"# ============================================================\n",
"# 2c. Asset type & fund composition features\n",
"# ============================================================\n",
"# AUM value summed over all rows of df_aum, per account and asset type.\n",
"aum_by_asset = (\n",
"    df_aum.dropna(subset=[ID_COL, ASSET_COL])\n",
"    .groupby([ID_COL, ASSET_COL], as_index=False)[AUM_VAL_COL].sum()\n",
")\n",
"total_aum_acc = aum_by_asset.groupby(ID_COL)[AUM_VAL_COL].sum().rename(\"total_aum\")\n",
"aum_by_asset = aum_by_asset.merge(total_aum_acc, on=ID_COL).assign(\n",
"    share=lambda t: (t[AUM_VAL_COL] / t[\"total_aum\"]).where(t[\"total_aum\"] > 0)\n",
")\n",
"asset_shares = (\n",
"    aum_by_asset.pivot_table(index=ID_COL, columns=ASSET_COL, values=\"share\", aggfunc=\"mean\")\n",
"    .fillna(0)\n",
"    .rename(columns=lambda c: f\"share_asset_{c.lower().replace(' ','_')}\")\n",
"    .rename_axis(columns=None)\n",
"    .reset_index()\n",
")\n",
"\n",
"# Same per fund. NOTE(review): the denominator is total_aum_acc, i.e. the total over rows\n",
"# that have an asset type; rows with a fund but no asset type are not in it - confirm.\n",
"aum_by_fund = (\n",
"    df_aum.dropna(subset=[ID_COL, FUND_COL])\n",
"    .groupby([ID_COL, FUND_COL], as_index=False)[AUM_VAL_COL].sum()\n",
")\n",
"aum_by_fund = aum_by_fund.merge(total_aum_acc, on=ID_COL).assign(\n",
"    share=lambda t: (t[AUM_VAL_COL] / t[\"total_aum\"]).where(t[\"total_aum\"] > 0)\n",
")\n",
"# Only the 10 largest funds (by summed AUM value) get their own share column.\n",
"top_funds = aum_by_fund.groupby(FUND_COL)[AUM_VAL_COL].sum().nlargest(10).index\n",
"fund_shares = (\n",
"    aum_by_fund[aum_by_fund[FUND_COL].isin(top_funds)]\n",
"    .pivot_table(index=ID_COL, columns=FUND_COL, values=\"share\", aggfunc=\"mean\")\n",
"    .fillna(0)\n",
"    .rename(columns=lambda c: f\"share_fund_{c.lower().replace(' ','_')[:30]}\")\n",
"    .rename_axis(columns=None)\n",
"    .reset_index()\n",
")\n",
"\n",
"print(\"Asset shares:\", asset_shares.shape)\n",
"print(\"Fund shares: \", fund_shares.shape)\n",
"\n",
"\n",
"# ============================================================\n",
"# 2d. Rolling metrics - most were removed because they were too sparse (80-90% zeros).\n",
"# Only flow_roll3m and flow_roll6m are kept as recent-trend signals,\n",
"# normalised by AUM.\n",
"# ============================================================\n",
"df_month_s = df_month.sort_values([ID_COL, \"month\"]).copy()\n",
"\n",
"# Below this average |AUM| the ratio is treated as undefined (NaN). Dividing by\n",
"# (mean |AUM| + EPS) produced values of order 1e12 for accounts with flows but no AUM.\n",
"# The floor mirrors the aum_qty_mean > 1 guard used for gross_flow_to_aum in section 2f.\n",
"MIN_AUM_FOR_RATIO = 1.0\n",
"\n",
"# NOTE(review): windows count panel rows per account, which are calendar months only if\n",
"# the account has a row for every month - confirm.\n",
"for w in [3, 6]:\n",
"    flow_sum = (\n",
"        df_month_s.groupby(ID_COL)[\"net_flow_qty\"]\n",
"        .rolling(w, min_periods=1).sum()\n",
"        .reset_index(level=0, drop=True)\n",
"    )\n",
"    aum_avg = (\n",
"        df_month_s[\"aum_qty\"].abs()\n",
"        .groupby(df_month_s[ID_COL])\n",
"        .rolling(w, min_periods=1).mean()\n",
"        .reset_index(level=0, drop=True)\n",
"    )\n",
"    # Both series carry df_month_s's row index again, so the assignment aligns row by row.\n",
"    df_month_s[f\"flow_roll{w}m_norm\"] = (flow_sum / aum_avg).where(aum_avg > MIN_AUM_FOR_RATIO)\n",
"\n",
"# Latest row of each account. tail(1) is used instead of groupby().last(), which would\n",
"# return the last non-null value and silently reach back to an older window.\n",
"rolling_feats = (\n",
"    df_month_s.groupby(ID_COL).tail(1)\n",
"    [[ID_COL, \"flow_roll3m_norm\", \"flow_roll6m_norm\"]]\n",
"    .reset_index(drop=True)\n",
")\n",
"\n",
"print(\"Rolling features:\", rolling_feats.shape)\n",
"\n",
"\n",
"# ============================================================\n",
"# 2e. Static client features\n",
"# ============================================================\n",
"# One row per account, summarising its whole monthly history.\n",
"client_aggs = {\n",
"    \"n_months\": (\"month\", \"nunique\"),\n",
"    \"n_active_months\": (\"active_month\", \"sum\"),\n",
"    \"flow_freq\": (\"active_month\", \"mean\"),\n",
"    \"aum_qty_mean\": (\"aum_qty\", \"mean\"),\n",
"    \"aum_qty_median\": (\"aum_qty\", \"median\"),\n",
"    \"aum_qty_max\": (\"aum_qty\", \"max\"),\n",
"    \"aum_qty_last\": (\"aum_qty\", \"last\"),\n",
"    \"net_flow_qty_sum\": (\"net_flow_qty\", \"sum\"),\n",
"    \"gross_flow_qty_sum\": (\"gross_flow_qty\", \"sum\"),\n",
"    \"sub_qty_sum\": (\"sub_qty\", \"sum\"),\n",
"    \"red_qty_sum\": (\"red_qty\", \"sum\"),\n",
"    \"n_tx_total\": (\"n_tx\", \"sum\"),\n",
"    \"avg_n_isin_held\": (\"n_isin_held\", \"mean\"),\n",
"    \"max_n_isin_held\": (\"n_isin_held\", \"max\"),\n",
"    \"aum_drawdown_last\": (\"aum_drawdown\", \"last\"),\n",
"    \"region\": (\"region\", \"last\"),\n",
"    \"country\": (\"country\", \"last\"),\n",
"}\n",
"df_client = df_month.groupby(ID_COL, as_index=False).agg(**client_aggs)\n",
"\n",
"# Attach the feature tables built in 2b-2d; accounts missing from a table get NaN.\n",
"for feature_table in (df_rel_client, asset_shares, fund_shares, rolling_feats):\n",
"    df_client = df_client.merge(feature_table, on=ID_COL, how=\"left\")\n",
"\n",
"print(\"df_client shape:\", df_client.shape)\n",
"\n",
"\n",
"# ============================================================\n",
"# 2f. Engineered ratios + filters\n",
"# ============================================================\n",
"MIN_HISTORY_MONTHS = 6    # minimum number of panel months required to keep an account\n",
"OUTLIER_QUANTILE = 0.99   # accounts above this quantile of a size variable are removed\n",
"TOP_N_GEO = 10            # countries / regions kept as their own category\n",
"\n",
"dfc = df_client.copy()\n",
"\n",
"dfc[\"log_aum_qty_mean\"] = np.log1p(dfc[\"aum_qty_mean\"].clip(lower=0))\n",
"\n",
"# Each ratio is NaN when its denominator is not meaningful.\n",
"dfc[\"gross_flow_to_aum\"] = np.where(\n",
"    dfc[\"aum_qty_mean\"] > 1,\n",
"    dfc[\"gross_flow_qty_sum\"] / dfc[\"aum_qty_mean\"],\n",
"    np.nan\n",
")\n",
"dfc[\"flow_direction_balance\"] = np.where(\n",
"    dfc[\"gross_flow_qty_sum\"] > 0,\n",
"    dfc[\"net_flow_qty_sum\"] / dfc[\"gross_flow_qty_sum\"],\n",
"    np.nan\n",
")\n",
"dfc[\"sub_share_mean\"] = np.where(\n",
"    dfc[\"gross_flow_qty_sum\"] > 0,\n",
"    dfc[\"sub_qty_sum\"] / dfc[\"gross_flow_qty_sum\"],\n",
"    np.nan\n",
")\n",
"dfc[\"redemption_bias\"] = np.where(\n",
"    dfc[\"gross_flow_qty_sum\"] > 0,\n",
"    (dfc[\"red_qty_sum\"] - dfc[\"sub_qty_sum\"]) / dfc[\"gross_flow_qty_sum\"],\n",
"    np.nan\n",
")\n",
"dfc[\"activity_intensity\"] = np.where(\n",
"    dfc[\"n_months\"] > 0,\n",
"    dfc[\"n_tx_total\"] / dfc[\"n_months\"],\n",
"    np.nan\n",
")\n",
"dfc[\"exit_rate_per_isin\"] = np.where(\n",
"    dfc[\"n_isin_total\"] > 0,\n",
"    dfc[\"full_exit_count\"] / dfc[\"n_isin_total\"],\n",
"    np.nan\n",
")\n",
"dfc[\"entry_rate_per_isin\"] = np.where(\n",
"    dfc[\"n_isin_total\"] > 0,\n",
"    dfc[\"entry_count\"] / dfc[\"n_isin_total\"],\n",
"    np.nan\n",
")\n",
"dfc[\"aum_final_to_peak\"] = np.where(\n",
"    dfc[\"aum_qty_max\"] > 0,\n",
"    dfc[\"aum_qty_last\"] / dfc[\"aum_qty_max\"],\n",
"    np.nan\n",
")\n",
"dfc[\"aum_drawdown_last\"] = dfc[\"aum_drawdown_last\"].clip(0, 1)\n",
"\n",
"# Log-transforms of the remaining raw size variables (not used in clustering).\n",
"# log_aum_qty_mean is already computed above and is not recomputed here.\n",
"for col in [\"gross_flow_qty_sum\", \"n_tx_total\"]:\n",
"    dfc[f\"log_{col}\"] = np.log1p(dfc[col].clip(lower=0))\n",
"\n",
"# Quality filters: enough history and a strictly positive average position.\n",
"dfc = dfc[(dfc[\"n_months\"] >= MIN_HISTORY_MONTHS) & (dfc[\"aum_qty_mean\"] > 0)].copy()\n",
"\n",
"# Outlier removal on size variables.\n",
"# NOTE(review): the cuts are sequential, so each quantile is computed on the accounts that\n",
"# survived the previous cut - confirm this compounding is intended.\n",
"for col in [\"aum_qty_mean\", \"gross_flow_qty_sum\", \"n_tx_total\"]:\n",
"    cap = dfc[col].quantile(OUTLIER_QUANTILE)\n",
"    dfc = dfc[dfc[col] <= cap].copy()\n",
"\n",
"print(f\"After outlier removal: {len(dfc)} accounts\")\n",
"\n",
"# Geographic grouping: the most frequent values keep their label, the rest become \"Other\".\n",
"# Missing values are filled with \"Unknown\" before both the ranking and the membership test;\n",
"# testing the unfilled column sent every missing value to \"Other\" even when \"Unknown\" ranked\n",
"# among the most frequent values.\n",
"country_filled = dfc[\"country\"].fillna(\"Unknown\")\n",
"region_filled = dfc[\"region\"].fillna(\"Unknown\")\n",
"top_countries = country_filled.value_counts().head(TOP_N_GEO).index\n",
"top_regions = region_filled.value_counts().head(TOP_N_GEO).index\n",
"dfc[\"country_grp\"] = np.where(country_filled.isin(top_countries), country_filled, \"Other\")\n",
"dfc[\"region_grp\"] = np.where(region_filled.isin(top_regions), region_filled, \"Other\")\n",
"\n",
"# Recency: number of months between the last month of the panel and the account's last\n",
"# month with a transaction.\n",
"df_last_active = (\n",
"    df_month[df_month[\"active_month\"] == 1]\n",
"    .groupby(ID_COL)[\"month\"]\n",
"    .max()\n",
"    .reset_index(name=\"last_active_month\")\n",
")\n",
"reference_date = df_month[\"month\"].max()\n",
"df_last_active[\"months_since_last_tx\"] = (\n",
"    (reference_date.to_period(\"M\") - df_last_active[\"last_active_month\"].dt.to_period(\"M\"))\n",
"    .apply(lambda x: x.n)\n",
")\n",
"dfc = dfc.merge(df_last_active[[ID_COL, \"months_since_last_tx\"]], on=ID_COL, how=\"left\")\n",
"# Accounts that never transacted get one month more than the largest observed recency.\n",
"max_months = dfc[\"months_since_last_tx\"].max()\n",
"dfc[\"months_since_last_tx\"] = dfc[\"months_since_last_tx\"].fillna(max_months + 1)\n",
"\n",
"print(\"dfc shape:\", dfc.shape)"
]
},
{
"cell_type": "markdown",
"id": "d180b613",
"metadata": {},
"source": [
"## 3. Global Clustering (all active accounts)\n",
"\n",
"Baseline clustering on all accounts with sufficient history."
]
},
{
"cell_type": "markdown",
"id": "55ab41d3-20f6-4559-8e38-68420b4230b1",
"metadata": {},
"source": [
"**Feature set final retenu**\n",
"\n",
"- flow_freq : proportion de mois avec au moins une transaction — mesure la fréquence globale d'activité du compte\n",
"- gross_flow_to_aum : volume brut de flux rapporté à l'AUM moyen — mesure l'intensité des transactions indépendamment de leur direction, après clip p90 et log-transform\n",
"- n_isin_total : nombre total d'ISINs distincts détenus sur toute la période — capte l'étendue du portefeuille exploré\n",
"- avg_holding_months_per_isin : durée moyenne de détention par ISIN — capte la fidélité aux produits\n",
"- exit_rate_per_isin : nombre moyen de sorties complètes par ISIN — mesure la propension à liquider ses positions\n",
"- flow_direction_balance : ratio flux nets sur flux bruts — distingue les acheteurs nets des vendeurs nets sur l'ensemble de la période\n",
"- log_aum_qty_mean : logarithme de l'AUM moyen — seule variable de taille retenue, incluse pour distinguer des comportements identiques sur des niveaux d'engagement très différents\n",
"- months_since_last_tx : nombre de mois écoulés depuis la dernière transaction — variable de récence, la plus discriminante du set (ratio inter/total de 0.89)"
]
},
{
"cell_type": "code",
"execution_count": 197,
"id": "353c7d48-4644-4427-ac4b-02e3f3e31690",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"flow_freq — min=0.0000, max=1.0000, valeurs > 1 : 0\n",
"Accounts: 7179 | Features: 8\n",
"Points > 5 std after scaling: 0 (0.0%)\n",
"\n",
"Features with most extreme values (>5 std):\n",
"Series([], )\n",
"\n",
"K=4 | sil=0.2312 | db=1.5109\n",
"\n",
"=== Tailles des clusters ===\n",
" n_comptes pct\n",
"cluster_k4 \n",
"0 2708 37.7\n",
"1 1174 16.4\n",
"2 1476 20.6\n",
"3 1821 25.4\n"
]
},
{
"data": {
"image/png": "iVBORw0KGgoAAAANSUhEUgAABMkAAAGGCAYAAABhZtaKAAAAOnRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjEwLjgsIGh0dHBzOi8vbWF0cGxvdGxpYi5vcmcvwVt1zgAAAAlwSFlzAAAPYQAAD2EBqD+naQABAABJREFUeJzs3XdYU9cbwPFvWCJ7ioKiAgIOUHDi1lptHa3aumfduOpqHXXUXeve2rr3qIp1a9Vq69aqdaC1jloXMmXLur8/KNEIaEAg/PT9PE8ezbnnnvsektwkb845V6UoioIQQgghhBBCCCGEEO8xPV0HIIQQQgghhBBCCCGErkmSTAghhBBCCCGEEEK89yRJJoQQQgghhBBCCCHee5IkE0IIIYQQQgghhBDvPUmSCSGEEEIIIYQQQoj3niTJhBBCCCGEEEIIIcR7T5JkQgghhBBCCCGEEOK9J0kyIYQQQgghhBBCCPHekySZEEIIIYQQQgghhHjvSZJMCCF0pH79+owYMULXYeSp7du34+HhwYMHD3QdihBCS3/++SflypXj4cOHug5F59LOYVeuXNF1KOI/GzdupG7duiQkJOg6FCGEEO8ASZIJIUQOu3//PmPHjuWDDz7Ay8sLX19f2rZty+rVq4mPj8+TGOLi4pg/fz5nzpzJk+P9v9q1axerVq3SdRj50vPnz1m1ahWtWrWiYsWKeHl50ahRIyZMmMDdu3d1HV6ue1efG0uWLOGXX37J0j6zZ8+mSZMmODk5qcs6depE06ZN09U9deoU5cuXp0WLFkRERLxtuGr379/Hy8vrvU1QHTt2jPnz5+s6jHypZcuWJCYmsmnTJl2HIoQQ4h0gSTIhhMhBv/76K82aNWPfvn3Uq1ePMWPGMHToUBwdHZk+fTqTJ0/Okzji4uJYsGABZ8+ezZPjaevTTz/lzz//1PiyrUu7d+9mzZo1ug4j3wkLC6Ndu3ZMnToVW1tbBg4cqE78HjlyhGbNmuk6xFz3rj43li5dmqUkWWBgICdPnqRt27ZvrHvq1Cn69OlDyZIlWblyJVZWVm8RqaYpU6ZgYGCQY+39vzl27BgLFizQdRj5UoECBWjevDmrVq1CURRdhyOEEOL/3Pv7aUMIIXLYv//+y+DBg3F0dGT16tUUKlRIva1Dhw78888//Prrr7oLMAfExsZiYmKS7f319fXR19fPwYjyp7i4OAoWLKjrMLJt5MiRBAYGMm/ePBo1aqSxbdCgQcyePVtHkeW+t32Ov2u2bduGo6MjFSpUeG29s2fP4u/vT4kSJXI8Qfbbb7/x+++/06NHDxYvXpxj7QKkpKSQmJhIgQIFcrRdkX3Pnz/H0NAQPT3tf8v/+OOPWbZsGadPn8bPzy8XoxNCCPGuk5FkQgiRQ5YtW0ZsbCyTJ0/WSJClKV68OF26dMl0//nz5+Ph4ZGuPKN1vK5cuUL37t2pWrUq3t7e1K9fn5EjRwLw4MED9ZeEBQsW4OHhgYeHh8ZUndu3bzNw4ECqVKmCl5cXLVu25PDhwxke9+zZs3z77bf4+flRp06d1/4N1q5dS5MmTShfvjyVK1emZcuW7Nq167V9SUlJYf78+dSsWZPy5cvTqVMn/v7773RrtqXte+HCBaZOnUq1atWoUKEC/fr1IywsTCOOX375hV69elGzZk3KlStHgwYNWLhwIcnJyeo6nTp14tdff+Xhw4fqv1H9+vUzjRPgzJkzeHh4aExjTZt2dvXqVTp06ED58uWZNWsWAAkJCcybN48PP/yQcuXKUadOHb7//vt0a+ecOHGCdu3aUalSJXx8fGjUqJG6jbx2+fJlfv31Vz7//PN0CTIAIyMjhg8frlF26tQp2rdvT4UKFahUqRL+/v7cvn1bo07a8/vu3bsMGzaMihUrUq1aNebMmYOiKDx+/Bh/f398fX2pUaMGK1as0Ng/7W+/d+9eZs2aRY0aNahQoQJ9+vTh8ePH6eLct28fLVu2xN
vbm6pVqzJs2DCCgoI06owYMQIfHx/u379Pz5498fHxYdiwYa99boD2j6uHhwcTJkxg3759NG7cGG9vb9q0acPNmzcB2LRpEx9++CFeXl506tQpw7X6Ll++TPfu3alYsSLly5enY8eOXLhwIcO/7T///MOIESOoVKkSFStWZOTIkcTFxWnEExsby44dO9T9etO6iIcPH6ZatWqoVKpM65w/f57evXvj7OzMypUrsba2fm2bWZGYmMjkyZPp3Lkzzs7Ob91e2mPy888/06RJE7y8vPjtt98AuH79Oj169MDX1xcfHx+6dOnCpUuXMmwnPj6esWPHUrVqVXx9ffn666959uxZumNlNEXy1XNbYmIiCxYsoGHDhnh5eVG1alXatWvHiRMngNTn6fr169Vtpt0yk/Z8yOimzTqYbzqPAwQFBTFq1Cj1ObZ+/fqMGzdO4zXw77//qt9nypcvT+vWrdP9UJT2ut6zZw+zZ8+mVq1alC9fnujoaEC75z9AuXLlsLKySvc+JoQQQmSVjCQTQogccvToUYoVK4avr2+uHic0NJTu3btjbW1Nr169sLCw4MGDBxw6dAgAGxsbvv32W7799ls+/PBDPvzwQwD1l6pbt27Rrl07HBwc6NmzJyYmJuzbt49+/foxf/58df0048ePx8bGhn79+hEbG5tpXFu2bGHSpEk0atSIzp078/z5c27evMnly5dfOz1v5syZLFu2jHr16lGrVi1u3LhB9+7def78eYb1J02ahIWFBf379+fhw4esXr2aCRMmMGfOHHWdHTt2YGJiwhdffIGJiQmnT59m3rx5REdHqxM8ffr0ISoqiidPnqgTjKampm/462csIiKCnj170qRJEz755BNsbW1JSUnB39+fCxcu0Lp1a1xdXfnrr79YvXo19+7dY9GiRUDq49G7d288PDwYOHAgRkZG/PPPP/zxxx/ZiuVtHTlyBEidGquNkydP0rNnT4oWLUr//v2Jj49n3bp1tGvXju3bt1O0aFGN+oMHD8bV1ZWhQ4dy7NgxFi9ejJWVFZs2baJatWoMGzaMXbt2MW3aNLy8vKhcubLG/osXL0alUtGzZ09CQ0NZvXo1Xbt2ZefOnRgbGwOpSc6RI0fi5eXFkCFDCA0NZc2aNfzxxx8EBARgYWGhbi8pKUn9JXz48OEYGxtjb2+f6XND28c1zfnz5zly5Ajt27cH4IcffqBPnz706NGDDRs20L59e549e8ayZcsYNWqUxhTPU6dO0bNnT8qVK0f//v1RqVRs376dLl26sGHDBry9vTWONWjQIIoWLcqQIUO4fv06W7duxcbGhq+++gqA77//ntGjR+Pt7U3r1q0BXpt4CgoK4tGjR5QpUybTOhcuXFA//qtWrcLGxiZdnaioKBITEzNtI02BAgXSvQZXr15NZGQkffv25eDBg29sQxunT59m3759dOjQAWtra5ycnLh16xYdOnTA1NSUHj16YGBgwObNm+nUqRPr1q2jfPnyGm1MmDBBfR66e/cuGzdu5NGjR6xdu/a1CcWMLFiwgKVLl9KqVSu8vb2Jjo7m6tWrXLt2jRo1atCmTRuePn3KiRMn+P7779/Y3ocffpjucb127RqrV6/O8PF5mTbn8aCgID7//HOioqJo3bo1Li4uBAUFceDAAeLj4zEyMiIkJIS2bdsSFxdHp06dsLa2ZseOHfj7+6sTzC9btGgRhoaGdO/enYSEBAwNDbP8/C9TpozOzptCCCHeIYoQQoi3FhUVpbi7uyv+/v5a71OvXj1l+PDh6vvz5s1T3N3d09Xbtm2b4u7urvz777+KoijKoUOHFHd3d+XPP//MtO3Q0FDF3d1dmTdvXrptXbp0UZo2bao8f/5cXZaSkqK0adNGadiwYbrjtmvXTklKSnpjf/z9/ZUmTZq8ts6rfQkODlbKlCmj9O3bV6Pe/PnzFXd3d42/T9q+Xbt2VVJSUtTlU6ZMUUqXLq1ERkaqy+Li4tIde8yYMUr58u
U1+t2rVy+lXr16b4wzzenTpxV3d3fl9OnT6rKOHTsq7u7uysaNGzXqBgQEKJ6ensq5c+c0yjdu3Ki4u7srFy5cUBR
"text/plain": [
"<Figure size 1400x400 with 2 Axes>"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"\n",
"=== Médianes comportement ===\n",
" gross_flow_to_aum flow_freq flow_direction_balance n_isin_total avg_holding_months_per_isin exit_rate_per_isin log_aum_qty_mean months_since_last_tx\n",
"cluster_k4 \n",
"0 7.884 0.071 0.000 1.0 11.333 1.000 5.280 69.0\n",
"1 5.348 0.617 -0.006 12.0 28.924 0.667 8.768 3.0\n",
"2 1.159 0.043 -1.000 3.0 60.000 0.400 5.167 27.0\n",
"3 1.477 0.012 -1.000 3.0 12.000 0.714 3.407 127.0\n"
]
},
{
"data": {
"image/png": "iVBORw0KGgoAAAANSUhEUgAABB8AAAGGCAYAAAAzaSmEAAAAOnRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjEwLjgsIGh0dHBzOi8vbWF0cGxvdGxpYi5vcmcvwVt1zgAAAAlwSFlzAAAPYQAAD2EBqD+naQABAABJREFUeJzs3XdUFFcbwOHf0sQCSFdRUEHAAoq9K2rs3dh7793YNXaNGiu22HuLPbGbGBNrNLYo9o6KNCkC0ub7g4/VFRZRgZXwPufMgZ25c/edvTuzu3duUSmKoiCEEEIIIYQQQgiRRvR0HYAQQgghhBBCCCH+26TyQQghhBBCCCGEEGlKKh+EEEIIIYQQQgiRpqTyQQghhBBCCCGEEGlKKh+EEEIIIYQQQgiRpqTyQQghhBBCCCGEEGlKKh+EEEIIIYQQQgiRpqTyQQghhBBCCCGEEGlKKh+EEEIIIYQQQgiRpqTyQYivVI0aNRg9erSuw0hXu3fvxsXFhWfPnuk6FJFKRo8eTY0aNTTWubi4sHjxYh1F9HGLFy/GxcVF12HozJeUT3pdt+Li4mjYsCHLli1L8+fSlaTOna9JWpzH2j4DVq1aRc2aNSlcuDBNmjRJ1ef8WnXs2JGOHTvqOozPcv78eVxcXDh//nyaPs/QoUMZPHhwmj6HECJ1SeWDEOnsyZMnTJw4kZo1a+Lm5kbJkiVp06YN69evJzIyMl1iiIiIYPHixWn+xSCjO3DgAOvWrdN1GF+tkJAQ3NzccHFx4f79+7oO55PIOZA+7t27x+LFi1O9QvGXX37hxYsXdOjQQb0u4Yfr9evXNdKGhoby7bff4ubmxqlTp1I1jq5du+Li4sKUKVNSNd//uuXLl3P8+PEUpf3rr7+YM2cOJUuWZObMmQwbNiyNo/s0afUeT2ufUgZfq549e3L06FFu3bql61CEECkklQ9CpKOTJ0/SqFEjDh06hKenJxMmTGD48OHkyZOHOXPmMH369HSJIyIiAi8vLy5cuJAuz5dSTZo04dq1a9jZ2ek6FCD+B86GDRt0HcZX6/Dhw6hUKqytrdm/f7+uw/kkyZ0Dffv25dq1azqIKuM7fPgwU6dOVT++d+8eXl5e+Pj4pOrzrF69mgYNGmBiYpJsurCwMLp168bt27fx8vKiatWqqRbD0aNHuXLlSqrll9Fcu3aNvn37fta+K1asSPKHb1KfAefOnUNPT4/p06fTtGlTqlWr9tkxp4W0eo+nNW1lkBrKlCnDtWvXKFOmTJrkn6BIkSIUK1aMNWvWpOnzCCFSj4GuAxAis3j69ClDhw4lT548rF+/HhsbG/W29u3b8/jxY06ePKm7AFNBeHg42bJl++z99fX10dfXT8WIvk4RERFkzZpV12F8sf3791OtWjXy5MnDL7/8wtChQ3UdUqowMDDAwCDjfTx+6fmXGoyMjNL8OW7evMmtW7c+2r0jLCyM7t274+3tjZeXV6r+aH379i2zZs2iR48eLFq0KNXy/drFxcURHR1NlixZyJIlS6rnn9RnQEBAAMbGxuny3hJf5u3btxgaGqKnp5cm74+k1KtXj8WLF/PmzRuyZ8+eLs8phPh80vJBiHSyatUqwsPDmT59ukbFQwIHBwc6d+6sdX9t/dCT6iN7/fp1unfvTrly5XB3d6dGjRqMGTMGgGfPnlGhQgUAvLy8cHFxSdR39/79+wwaNIiyZcvi5uZG8+bNOXHiRJLPe+HCBSZNmkSFChU++uV+48aNNGjQgOLFi1OmTBmaN2/OgQMHkj2WuLg4Fi9eTOXKlSlevDgdO3bk3r17ifqWJ+x76dIlZs6cSfny5SlRogT9+/cnMDBQI47jx4/Tq1cvKleuTLFixahVqxZLliwhNjZWnaZjx46cPHkSHx8f9WuU0P9aW7/kpPq5duzYkYYNG/Lvv//Svn17ihcvzrx58wCIiopi0aJFfPPNNxQrVoxq1aoxe/ZsoqKiNPI9ffo0bd
u2pXTp0nh4eFCnTh11Hrry/PlzLl68SP369WnQoAHPnj3jn3/++ez8bt68SY8ePShZsiQeHh507tw5ybvKISEhzJgxgxo1alCsWDGqVq3KyJEj1WUcFRXFwoULad68OaVKlaJEiRK0a9eOc+fOqfP42DmQ1LkWExPDkiVLqFWrFsWKFaNGjRrMmzcvUVnVqFGD3r17c/HiRXVT/5o1a7J3796PvgbPnj3DxcWF1atXs27dOjw9PXF3d6dDhw7cuXNHI+3o0aPx8PDgyZMn9OzZEw8PD0aMGAHEV0LMmjWLatWqUaxYMerUqcPq1atRFEUjj6ioKGbMmEH58uXx8PCgT58+vHz5MlFc2sYeSOp1ev+83L17t7o/dqdOndSvc8L5kdx1KjnHjx/H0NCQ0qVLa03z5s0bevTowY0bN1i8eDHVq1f/aL6fYuXKlSiKQvfu3VMlv+PHj9OwYUPc3Nxo2LAhx44dSzJdXFwc69ato0GDBri5uVGxYkUmTpxIcHCwRrqUvLZxcXGsX7+eRo0a4ebmRvny5enevbtGt5WELiX79+9XP+eff/6p3vb+50bC++H+/fsMHjyYkiVLUq5cOaZNm8bbt2818gwPD2fPnj3q98T775n3r60uLi7s3r2b8PBwddrdu3drfR3fv962adNGfexbt25NlDYgIICxY8dSsWJF3NzcaNy4MXv27EmU7tdff6V58+Z4eHhQsmRJGjVqxPr169XxJvceT86+ffv49ttv1Z+H7du356+//tKa/lM+dx49esTAgQOpVKkSbm5uVK1alaFDhxIaGgokXwYAvr6+jBkzhooVK1KsWDEaNGjAzz//nOTz/vrrr8yfP58qVapQvHhxwsLCkv0svHfvHh07dqR48eJUqVKFlStXJjpWHx8f+vTpQ4kSJahQoQIzZszgzz//TPK1rVixIuHh4Zw5c+ajr7kQQvcy3q0dITKo33//nXz58lGyZMk0fZ6AgAC6d++Oubk5vXr1wtTUlGfPnqm/zFpYWDBp0iQmTZrEN998wzfffAOg/hFx9+5d2rZti62tLT179iRbtmwcOnSI/v37s3jxYnX6BJMnT8bCwoL+/fsTHh6uNa4dO3Ywbdo06tSpQ6dOnXj79i23b9/m6tWrNGrUSOt+P/74I6tWrcLT05MqVapw69YtunfvrvFl9n3Tpk3D1NSUAQMG4OPjw/r165kyZQoLFixQp9mzZw/ZsmWja9euZMuWjXPnzrFo0SLCwsIYNWoUAH369CE0NJSXL1+qv7R/7l2V169f07NnTxo0aEDjxo2xtLQkLi6Ovn37cunSJVq1aoWjoyN37txh/fr1PHr0iKVLlwLx5dG7d29cXFwYNGgQRkZGPH78+It+6KeGX375haxZs+Lp6YmxsTH29vYcOHDgs97fd+/epX379mTPnp0ePXpgYGDA9u3b6dixI5s2baJ48eJA/A/K9u3bc//+fVq0aEGRIkUICgrit99+w9fXFwsLC8LCwti5cycNGzakZcuWvHnzhp9//pkePXqwc+dOChcu/NFzICnjx49nz5491KlTh65du3Lt2jVWrFjB/fv3WbJkiUbax48fM3jwYL799luaNWvGrl27GD16NEWLFqVQoUIffT327t3LmzdvaNeuHW/fvmXjxo107tyZAwcOYGVlpU4XExND9+7dKVWqFKNGjcLY2BhFUejbty/nz5/n22+/pXDhwvz555/Mnj0bX19fxo4dq95/3Lhx7N+/n4YNG1KyZEnOnTtHr169PqnsklOmTBk6duzIxo0b6dOnDwULFgTA0dHxo9ep5Fy+fBlnZ2cMDQ2T3B4REUHPnj35999/WbhwIZ6enonSREVFERYWlqLjsLCw0Hj8/PlzVq5cyYwZMzA2Nk5RHsn566+/GDhwIE5OTgwfPpygoCDGjBlDrly5EqWdOHEie/bsoXnz5nTs2JFnz56xefNmbt68ydatWzE0NEzxaztu3Dh2795N1apV+fbbb4
mNjeXixYtcvXoVNzc3dbpz585x6NAh2rdvj7m5+Ue7xQ0ZMgQ7OzuGDx/OlStX2LhxIyEhIcyePRuA2bNnM378eNz
"text/plain": [
"<Figure size 1200x400 with 2 Axes>"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"\n",
"=== Médianes allocation ===\n",
" share_asset_fixed_income share_asset_diversified share_asset_equity share_fund_carmignac_patrimoine share_fund_carmignac_investissement share_fund_carmignac_sécurité share_fund_carmignac_emergents\n",
"cluster_k4 \n",
"0 0.767 0.000 0.000 0.000 0.000 0.000 0.000\n",
"1 0.284 0.207 0.155 0.152 0.011 0.018 0.002\n",
"2 0.000 0.372 0.227 0.255 0.000 0.000 0.000\n",
"3 0.000 0.326 0.099 0.169 0.000 0.000 0.000\n"
]
}
],
"source": [
"# ============================================================\n",
"# Feature selection & preprocessing\n",
"# ============================================================\n",
"# Behavioural features used for clustering. A feature that is absent from dfc\n",
"# is dropped from the list below instead of raising.\n",
"base_features = [\n",
"    \"flow_freq\",\n",
"    \"gross_flow_to_aum\",\n",
"    \"n_isin_total\",\n",
"    \"avg_holding_months_per_isin\",\n",
"    \"exit_rate_per_isin\",\n",
"    \"flow_direction_balance\",\n",
"    \"log_aum_qty_mean\",\n",
"    \"months_since_last_tx\",\n",
"]\n",
"all_features = [c for c in base_features if c in dfc.columns]\n",
"\n",
"# All clipping / log transforms below are applied to this copy, not to dfc.\n",
"dfc_clean = dfc.copy()\n",
"\n",
"# --- Preliminary diagnostic: raw range of flow_freq ---\n",
"# Guarded like the per-column steps below, and NaN-aware so that one missing\n",
"# value does not turn the reported min/max into nan.\n",
"if \"flow_freq\" in dfc_clean.columns:\n",
"    vals_ff = dfc_clean[\"flow_freq\"].to_numpy(dtype=float)\n",
"    print(f\"flow_freq — min={np.nanmin(vals_ff):.4f}, max={np.nanmax(vals_ff):.4f}, \"\n",
"          f\"valeurs > 1 : {(vals_ff > 1).sum()}\")\n",
"\n",
"# --- Impute NaN with 0 ---\n",
"# NOTE(review): for months_since_last_tx a 0 reads as \"transacted this month\";\n",
"# confirm this is the intended value for accounts with a missing recency.\n",
"for col in [\"flow_direction_balance\", \"months_since_last_tx\"]:\n",
"    if col in dfc_clean.columns:\n",
"        dfc_clean[col] = dfc_clean[col].fillna(0)\n",
"\n",
"# --- Group 1: clip to median +/- 3 robust sigmas (MAD scaled by 1.4826) ---\n",
"mad_clip_cols = (\n",
"    \"n_isin_total\",\n",
"    \"exit_rate_per_isin\",\n",
"    \"avg_holding_months_per_isin\",\n",
"    \"months_since_last_tx\",\n",
")\n",
"for col in mad_clip_cols:\n",
"    if col in dfc_clean.columns:\n",
"        vals = dfc_clean[col].to_numpy(dtype=float)\n",
"        med = np.nanmedian(vals)\n",
"        mad = np.nanmedian(np.abs(vals - med)) * 1.4826\n",
"        # Zero MAD (at least half of the values sit on the median): use [0, p95] instead.\n",
"        lower, upper = (med - 3*mad, med + 3*mad) if mad > 0 else (0, np.nanpercentile(vals, 95))\n",
"        dfc_clean[col] = np.clip(vals, lower, upper)\n",
"\n",
"# --- Group 2: floor at 0, then log1p ---\n",
"# gross_flow_to_aum is additionally capped at its 90th percentile; flow_freq has no cap.\n",
"for col, cap_pct in ((\"gross_flow_to_aum\", 90), (\"flow_freq\", None)):\n",
"    if col not in dfc_clean.columns:\n",
"        continue\n",
"    vals = dfc_clean[col].to_numpy(dtype=float)\n",
"    upper = None if cap_pct is None else np.nanpercentile(vals, cap_pct)\n",
"    dfc_clean[col] = np.log1p(np.clip(vals, 0, upper))\n",
"\n",
"# --- Group 3: log_aum_qty_mean, clip to median +/- 3 robust sigmas ---\n",
"col = \"log_aum_qty_mean\"\n",
"if col in dfc_clean.columns:\n",
"    vals = dfc_clean[col].to_numpy(dtype=float)\n",
"    med = np.nanmedian(vals)\n",
"    mad = np.nanmedian(np.abs(vals - med)) * 1.4826\n",
"    # With a zero MAD both bounds equal the median and the clip would flatten the\n",
"    # whole column to a constant, so the column is left unclipped in that case.\n",
"    if mad > 0:\n",
"        dfc_clean[col] = np.clip(vals, med - 3*mad, med + 3*mad)\n",
"\n",
"# --- Group 4: hhi_isin, clip to [0, p90] ---\n",
"# hhi_isin is not listed in base_features, so this only changes dfc_clean.\n",
"col = \"hhi_isin\"\n",
"if col in dfc_clean.columns:\n",
"    vals = dfc_clean[col].to_numpy(dtype=float)\n",
"    dfc_clean[col] = np.clip(vals, 0, np.nanpercentile(vals, 90))\n",
"\n",
"# --- Build the feature matrix X ---\n",
"# Keep the first occurrence of any duplicated column name, then fill the\n",
"# remaining NaN with each column's median.\n",
"selected = dfc_clean[all_features]\n",
"X_num = selected.loc[:, ~selected.columns.duplicated()].copy()\n",
"X_num = X_num.fillna(X_num.median())\n",
"\n",
"# One-hot encoded geography. It is computed here but not added to X:\n",
"# X holds the numeric features only.\n",
"geo_labels = dfc_clean[[\"country_grp\", \"region_grp\"]].fillna(\"Unknown\")\n",
"X_cat = pd.get_dummies(geo_labels, drop_first=True)\n",
"X = X_num.reset_index(drop=True)\n",
"\n",
"# Fit the scaler on X, then transform the same rows.\n",
"scaler = RobustScaler()\n",
"scaler.fit(X)\n",
"X_scaled = scaler.transform(X)\n",
"\n",
"# --- Diagnostic: how many scaled values are still far from the centre? ---\n",
"# X_scaled comes from RobustScaler(), whose default unit is the interquartile\n",
"# range rather than the standard deviation, so the labels say IQR.\n",
"X_df = pd.DataFrame(X_scaled, columns=X.columns)\n",
"is_extreme = X_df.abs() > 5  # computed once, reused for both summaries\n",
"extreme = is_extreme.any(axis=1).sum()\n",
"print(f\"Accounts: {X.shape[0]} | Features: {X.shape[1]}\")\n",
"print(f\"Points > 5 IQR after scaling: {extreme} ({extreme/len(X_df):.1%})\")\n",
"\n",
"# Per-feature count of extreme values, largest first; features with none are hidden.\n",
"extreme_by_feat = is_extreme.sum().sort_values(ascending=False)\n",
"print(\"\\nFeatures with most extreme values (>5 IQR):\")\n",
"print(extreme_by_feat[extreme_by_feat > 0].to_string())\n",
"\n",
"# --- K-Means with K=4: store the labels on dfc, keep the model and its scores ---\n",
"RESULTS = {}\n",
"for k in (4,):\n",
"    label_col = f\"cluster_k{k}\"\n",
"    km = KMeans(n_clusters=k, n_init=50, random_state=RANDOM_STATE)\n",
"    dfc[label_col] = km.fit_predict(X_scaled)\n",
"    sil = silhouette_score(X_scaled, dfc[label_col])\n",
"    dbi = davies_bouldin_score(X_scaled, dfc[label_col])\n",
"    RESULTS[k] = {\"model\": km, \"silhouette\": sil, \"davies_bouldin\": dbi}\n",
"    print(f\"\\nK={k} | sil={sil:.4f} | db={dbi:.4f}\")\n",
"\n",
"# Cluster sizes: number of accounts and share of the total (%) per label.\n",
"print(\"\\n=== Tailles des clusters ===\")\n",
"counts = dfc[\"cluster_k4\"].value_counts().sort_index()\n",
"props = counts.div(counts.sum()).mul(100)\n",
"size_table = pd.DataFrame({\"n_comptes\": counts, \"pct\": props.round(1)})\n",
"print(size_table)\n",
"\n",
"# --- Behaviour heatmap ---\n",
"# Variables profiled per cluster, restricted to the ones present in dfc.\n",
"profile_vars_behavior = [\n",
"    c for c in (\n",
"        \"gross_flow_to_aum\",\n",
"        \"flow_freq\",\n",
"        \"flow_direction_balance\",\n",
"        \"n_isin_total\",\n",
"        \"avg_holding_months_per_isin\",\n",
"        \"exit_rate_per_isin\",\n",
"        \"log_aum_qty_mean\",\n",
"        \"months_since_last_tx\",\n",
"    )\n",
"    if c in dfc.columns\n",
"]\n",
"\n",
"# plot_heatmap is defined elsewhere in the notebook; its return value is the\n",
"# per-cluster table printed below.\n",
"behavior_title = \"Cluster signatures — Comportement (K=4, robust z-score)\"\n",
"prof_behavior = plot_heatmap(\n",
"    dfc,\n",
"    profile_vars_behavior,\n",
"    \"cluster_k4\",\n",
"    title=behavior_title,\n",
"    figsize=(14, 4),\n",
")\n",
"\n",
"print(\"\\n=== Médianes comportement ===\")\n",
"print(prof_behavior.round(3).to_string())\n",
"\n",
"# --- Allocation heatmap (post-clustering, descriptive only) ---\n",
"allocation_candidates = [\n",
"    \"share_asset_fixed_income\",\n",
"    \"share_asset_diversified\",\n",
"    \"share_asset_equity\",\n",
"    \"share_fund_carmignac_patrimoine\",\n",
"    \"share_fund_carmignac_investissement\",\n",
"    \"share_fund_carmignac_sécurité\",\n",
"    \"share_fund_carmignac_emergents\",\n",
"]\n",
"# Keep only the share columns that actually exist in dfc.\n",
"profile_vars_allocation = [c for c in allocation_candidates if c in dfc.columns]\n",
"\n",
"allocation_title = \"Cluster signatures — Allocation produits (K=4, descriptif post-clustering)\"\n",
"prof_allocation = plot_heatmap(\n",
"    dfc,\n",
"    profile_vars_allocation,\n",
"    \"cluster_k4\",\n",
"    title=allocation_title,\n",
"    figsize=(12, 4),\n",
")\n",
"\n",
"print(\"\\n=== Médianes allocation ===\")\n",
"print(prof_allocation.round(3).to_string())"
]
},
{
"cell_type": "code",
"execution_count": 199,
"id": "85747735-d0b4-4aa7-9fc2-adf030f92286",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
" n_comptes pct\n",
"cluster_k4 \n",
"0 2708 37.7\n",
"1 1174 16.4\n",
"2 1476 20.6\n",
"3 1821 25.4\n"
]
}
],
"source": [
"# Size of each K=4 cluster: number of accounts and share of the total (%).\n",
"counts = dfc[\"cluster_k4\"].value_counts().sort_index()\n",
"props = counts.div(counts.sum()).mul(100)\n",
"size_table = pd.DataFrame({\"n_comptes\": counts, \"pct\": props.round(1)})\n",
"print(size_table)"
]
},
{
"cell_type": "code",
"execution_count": 204,
"id": "dc171be2-e066-4352-a0ea-32d7b7b046b0",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
" k inertia silhouette davies_bouldin\n",
" 2 20206.150896 0.422448 0.971224\n",
" 3 16684.870723 0.241253 1.542920\n",
" 4 14655.878131 0.231172 1.510868\n",
" 5 13189.616061 0.228827 1.408857\n",
" 6 11997.575028 0.223735 1.416454\n",
" 7 11089.241350 0.229848 1.419999\n"
]
},
{
"data": {
"image/png": "iVBORw0KGgoAAAANSUhEUgAABdEAAAGGCAYAAACUkchWAAAAOnRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjEwLjgsIGh0dHBzOi8vbWF0cGxvdGxpYi5vcmcvwVt1zgAAAAlwSFlzAAAPYQAAD2EBqD+naQAA3K9JREFUeJzs3XdYU+fbB/BvEvaWrYADkA3iRBG1r4p1z7q1rbWt2jpq9WetHWq11dZRq7bOVuuoo1WpVutq6wYHgoLgABdLZMjeSd4/kGgEBGLgML6f6+KqOXlycucm5Zzcec79iORyuRxERERERERERERERFSKWOgAiIiIiIiIiIiIiIhqKxbRiYiIiIiIiIiIiIjKwSI6EREREREREREREVE5WEQnIiIiIiIiIiIiIioHi+hEREREREREREREROVgEZ2IiIiIiIiIiIiIqBwsohMRERERERERERERlYNFdCIiIiIiIiIiIiKicrCITkRERERERERERERUDhbRiYiIiEgQ3bt3x9y5cxW3L168CGdnZ1y8eFGxbfz48ejfv78Q4dV6MpkM/fv3x7p16xTb1qxZA2dnZ6Smplb4+BfzXxXdu3fHpEmTVHpsTdm/fz+cnZ0RFhYmdCivbNeuXXjttddQUFAgdChEREpKjjsNydy5c9G9e/cKx8XGxsLZ2Rn79+9XbKuufF2/fh0eHh6Ii4tTbBs/fjzGjx+v9ueqjapy/lPbLV++HMOHDxc6DCoDi+hEtZSzszPWrFmjuF2fDgq12Yt5JyKiqrt16xamT5+O//u//4Onpye6dOmCCRMmYPv27UKHVi2ioqKwZs0axMbGlrpv586dSh+e1emvv/5CQkICxo0bVy37p9JOnz5d5nlCbm4u1qxZo/QFkDoNHToUhYWF2L17d7Xsn4jqppIvC0t+PD094efnh4kTJ2Lbtm3IysoSOsRqMXfuXKXX7ebmhm7dumHmzJmIiooSOjxBfP/99+jXrx9sbGyEDqXeOHToELZu3Vpqe2JiItasWYPIyMhqed633noLN2/exD///FMt+yfVsYhOVINePMl58Sc0NFToEF/Z9u3b0bZtWxQWFpY7xtnZGV999VUNRqWsvA/ARET06q5evYphw4bh5s2bGD58OL788ksMHz4cYrEY27ZtUxp79OhRLFq0SKBI1ScqKgpr165Vmv1VYteuXThw4EC1PO/PP/+Mfv36wdDQUKXH15f816TTp09j7dq1pbbn5uZi7dq1uHTpUrU8r7a2NgYPHoytW7dCLpdXy3MQUd01ffp0fPfdd1iwYIFi5vE333yDgQMH4ubNm9X63FOmTMH169er9TnKoqWlhe+++w7fffcdFi9ejCFDhiAwMBCjRo1CYmJijcdTWdWRr8jISFy4cAGjRo1S634bur/++qvUuSsAPH78GGvXrq22IrqFhQV69OiBX375pVr2T6rTEDoAooZo+vTpsLW1LbW9adOmAkSjXqdOnULnzp2hqakpdCjlOn36NHbu3Ilp06aVuu/69euQSCQCREVEVD+sX78ehoaG+OOPP2BkZKR0X0pKitJtLS2tmgytXomIiMDNmzdVbscC1P78FxUVQSaT1fo4q1NOTg709PQAAH369MHmzZsRFBSETp06CRwZEdUmXbt2haenp+L2pEmTEBgYiMmTJ+ODDz7AkSNHoKOjUy3PraGhAQ2Nmi8taWhoYNCgQUrbvL29MWnSJJw+fRojRoyo8ZgqozrytW/fPjRp0gTe3t5q3W9Nys/Ph6amJsTihjvX98Vj/owZMxATEwM7OzuBI6MSDffdSSSgrl27YtCgQaV+TE1NhQ7tleTm5uLy5ct47bXXhA6lTDk5ORWO0dbWFuQkkIiovnj48CEcHR1LFdABwMzMTOl2VXpyR0VFYfz48WjVqhW6dOmCTZs2lRqTkpKCefPmwdfXF56enhg4cGCpWeBl9V0Hyu5bCgDR0dGYPn06OnToAE9PTw
wdOlTp8tr9+/djxowZAIA333xTcXXZxYsX0b17d9y5cweXLl1SbH++N2lGRga+/vprdOvWDR4eHvD398fGjRshk8kqzMfJkyehqamJdu3alXl/ZmYm5s6di3bt2qFt27b49NNPkZubqzSmrPzfvHkT48aNg5eXF7p27YqffvoJ+/btg7Ozc5ntaq5cuYI33ngDnp6e6NGjBwICAkqNqczrLMn/zz//jK1bt6Jnz57w9PREdHR0hbmoSF5eHr788kv4+PigTZs2mDNnDtLT00uNO336NMaMGQNvb2+0bt0a77//Pu7cuaO4f+7cudi5cycAKF1JGBsbqyhqr127VrH9+aveKnofAc+uWLx06RIWLFiATp06oVu3bor7PTw8YGJiwsu7iahSOnXqhA8++ABxcXE4ePCgYnvJF7A9evSAp6cnOnfujE8//RRPnjxRjDl69Kji79GLdu/eDWdnZ9y+fRtA+T2+//zzTwwdOhReXl7o0KEDZs6ciYSEBKUx9+/fx7Rp09C5c2d4enqia9eumDlzJjIzM1V6zebm5gBQalJUTEyM4m9wq1atMGLECJw6dUppTMnf4BePdeWdN7woIyMDc+fORdu2bdGuXTt88sknZb6OsvJVcqX2yZMn0b9/f3h4eKBfv344c+ZMpV73P//8g44dO0IkElU4tjLnSkOGDMHUqVOVtg0YMADOzs5KVzYcOXIEzs7OSsfqxMREfPrpp/D19VW8jj/++ENpXyU5PXz4ML7//nt06dIFrVq1Ukv7oSdPnmDGjBlo06YNfHx8sHjxYuTn55caV9H7c/z48Th16hTi4uIUx/Xu3bvj4sWLeOONNwAAn376qeK+588fr127hokTJ6Jt27Zo1aoVxo0bh+DgYKXnL3kfREVFYdasWWjfvj3GjBmjuN/X1xcAeMyvZVgpIqpjnjx5goULF+Ls2bPQ1NTEgAED8L///Q/a2tqKMUVFRdiwYQMOHDiAR48ewdLSEv3798fUqVMVs7mWLFmCgIAABAUFKQ62ixYtwo4dO/DZZ5/hzTffBAAkJyejc+fOmD9/vtIf9bIEBgaioKAAXbt2rdJrunjxIt588018//33ePDgAXbt2oUnT56gTZs2+Oqrr9CsWTOl8deuXcPq1asRGhqKoqIieHp6YubMmWjbtq1izJo1a7B27VocPnwY69atw5kzZ2BjYwMXFxfFScLzJy+3bt1SbJs6dapilnpcXBw2bdqEwMBAJCQkQFdXFz4+PpgzZ06ZVxMQETV0NjY2CAkJwe3bt+Hk5KSWfaanp+Pdd9+Fv78/+vTpg2PHjmH58uVwcnJSFBnz8vIwfvx4PHz4EGPHjoWtrS2OHj2KuXPnIiMjA2+99VaVn/fOnTsYPXo0rKys8N5770FPTw9///03PvzwQ6xZswb+/v5o3749xo8fj+3bt2Py5Mmwt7cHADg4OGDevHlYtGgR9PT0MHnyZADPPuDn5uZi3LhxSExMxKhRo9C4cWOEhIRg5cqVSEpKwmefffbS2EJCQuDk5FTulV8fffQRbG1t8fHHHyMiIgK///47TE1N8b///a/cfSYmJiry9P7770NPTw+///57uTPBHzx4gBkzZuCNN97AkCFDsG/fPsydOxfu7u5o2bKlSq9z//79yM/Px4gRI6ClpQVjY2MAqPSaMAYGBqXi/eqrr2BkZISpU6fi3r172LVrF+Lj47F9+3bFOVBAQADmzp0LPz8/zJ49G7m5udi1axfGjBmDAwcOwNbWFiNHjsTjx49x/vx5fPfdd4r9m5qaYsGCBViwYAH8/f3h7+8P4Nl5RmXeR89buHAhTE1N8eGHH5aaAODm5oarV69WKhdERIMGDcLKlStx7tw5xczsCxcuICYmBkOHDoWFhQXu3LmDvXv3IioqCnv37oVIJMJrr72m+FvVoUMHpX0eOXIELVu2fOkxft26dfjhhx/Qp08fvPHGG0hNTcWOHTswduxYBAQEwMjICAUFBZg4cS
IKCgowbtw4mJubIzExEadOnUJGRkalWpWVHBtkMhliYmKwfPlymJiY4P/+7/8UY5KTkzFq1Cjk5uZi/PjxaNSoEQ4
"text/plain": [
"<Figure size 1500x400 with 3 Axes>"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"K=4 | sil=0.2312 | db=1.5109\n",
" n_comptes pct\n",
"cluster_k4 \n",
"0 2708 37.7\n",
"1 1174 16.4\n",
"2 1476 20.6\n",
"3 1821 25.4\n",
"\n"
]
},
{
"data": {
"image/png": "iVBORw0KGgoAAAANSUhEUgAABMkAAAGGCAYAAABhZtaKAAAAOnRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjEwLjgsIGh0dHBzOi8vbWF0cGxvdGxpYi5vcmcvwVt1zgAAAAlwSFlzAAAPYQAAD2EBqD+naQABAABJREFUeJzs3XdYU9cbwPFvWCJ7ioKiAgIOUHDi1lptHa3aumfduOpqHXXUXeve2rr3qIp1a9Vq69aqdaC1jloXMmXLur8/KNEIaEAg/PT9PE8ezbnnnvsektwkb845V6UoioIQQgghhBBCCCGEEO8xPV0HIIQQQgghhBBCCCGErkmSTAghhBBCCCGEEEK89yRJJoQQQgghhBBCCCHee5IkE0IIIYQQQgghhBDvPUmSCSGEEEIIIYQQQoj3niTJhBBCCCGEEEIIIcR7T5JkQgghhBBCCCGEEOK9J0kyIYQQQgghhBBCCPHekySZEEIIIYQQQgghhHjvSZJMCCF0pH79+owYMULXYeSp7du34+HhwYMHD3QdihBCS3/++SflypXj4cOHug5F59LOYVeuXNF1KOI/GzdupG7duiQkJOg6FCGEEO8ASZIJIUQOu3//PmPHjuWDDz7Ay8sLX19f2rZty+rVq4mPj8+TGOLi4pg/fz5nzpzJk+P9v9q1axerVq3SdRj50vPnz1m1ahWtWrWiYsWKeHl50ahRIyZMmMDdu3d1HV6ue1efG0uWLOGXX37J0j6zZ8+mSZMmODk5qcs6depE06ZN09U9deoU5cuXp0WLFkRERLxtuGr379/Hy8vrvU1QHTt2jPnz5+s6jHypZcuWJCYmsmnTJl2HIoQQ4h0gSTIhhMhBv/76K82aNWPfvn3Uq1ePMWPGMHToUBwdHZk+fTqTJ0/Okzji4uJYsGABZ8+ezZPjaevTTz/lzz//1PiyrUu7d+9mzZo1ug4j3wkLC6Ndu3ZMnToVW1tbBg4cqE78HjlyhGbNmuk6xFz3rj43li5dmqUkWWBgICdPnqRt27ZvrHvq1Cn69OlDyZIlWblyJVZWVm8RqaYpU6ZgYGCQY+39vzl27BgLFizQdRj5UoECBWjevDmrVq1CURRdhyOEEOL/3Pv7aUMIIXLYv//+y+DBg3F0dGT16tUUKlRIva1Dhw78888//Prrr7oLMAfExsZiYmKS7f319fXR19fPwYjyp7i4OAoWLKjrMLJt5MiRBAYGMm/ePBo1aqSxbdCgQcyePVtHkeW+t32Ov2u2bduGo6MjFSpUeG29s2fP4u/vT4kSJXI8Qfbbb7/x+++/06NHDxYvXpxj7QKkpKSQmJhIgQIFcrRdkX3Pnz/H0NAQPT3tf8v/+OOPWbZsGadPn8bPzy8XoxNCCPGuk5FkQgiRQ5YtW0ZsbCyTJ0/WSJClKV68OF26dMl0//nz5+Ph4ZGuPKN1vK5cuUL37t2pWrUq3t7e1K9fn5EjRwLw4MED9ZeEBQsW4OHhgYeHh8ZUndu3bzNw4ECqVKmCl5cXLVu25PDhwxke9+zZs3z77bf4+flRp06d1/4N1q5dS5MmTShfvjyVK1emZcuW7Nq167V9SUlJYf78+dSsWZPy5cvTqVMn/v7773RrtqXte+HCBaZOnUq1atWoUKEC/fr1IywsTCOOX375hV69elGzZk3KlStHgwYNWLhwIcnJyeo6nTp14tdff+Xhw4fqv1H9+vUzjRPgzJkzeHh4aExjTZt2dvXqVTp06ED58uWZNWsWAAkJCcybN48PP/yQcuXKUadOHb7//vt0a+ecOHGCdu3aUalSJXx8fGjUqJG6jbx2+fJlfv31Vz7//PN0CTIAIyMjhg8frlF26tQp2rdvT4UKFahUqRL+/v7cvn1bo07a8/vu3bsMGzaMihUrUq1aNebMmYOiKDx+/Bh/f398fX2pUaMGK1as0Ng/7W+/d+9eZs2aRY0aNahQoQJ9+vTh8ePH6eLct28fLVu2xN
vbm6pVqzJs2DCCgoI06owYMQIfHx/u379Pz5498fHxYdiwYa99boD2j6uHhwcTJkxg3759NG7cGG9vb9q0acPNmzcB2LRpEx9++CFeXl506tQpw7X6Ll++TPfu3alYsSLly5enY8eOXLhwIcO/7T///MOIESOoVKkSFStWZOTIkcTFxWnEExsby44dO9T9etO6iIcPH6ZatWqoVKpM65w/f57evXvj7OzMypUrsba2fm2bWZGYmMjkyZPp3Lkzzs7Ob91e2mPy888/06RJE7y8vPjtt98AuH79Oj169MDX1xcfHx+6dOnCpUuXMmwnPj6esWPHUrVqVXx9ffn666959uxZumNlNEXy1XNbYmIiCxYsoGHDhnh5eVG1alXatWvHiRMngNTn6fr169Vtpt0yk/Z8yOimzTqYbzqPAwQFBTFq1Cj1ObZ+/fqMGzdO4zXw77//qt9nypcvT+vWrdP9UJT2ut6zZw+zZ8+mVq1alC9fnujoaEC75z9AuXLlsLKySvc+JoQQQmSVjCQTQogccvToUYoVK4avr2+uHic0NJTu3btjbW1Nr169sLCw4MGDBxw6dAgAGxsbvv32W7799ls+/PBDPvzwQwD1l6pbt27Rrl07HBwc6NmzJyYmJuzbt49+/foxf/58df0048ePx8bGhn79+hEbG5tpXFu2bGHSpEk0atSIzp078/z5c27evMnly5dfOz1v5syZLFu2jHr16lGrVi1u3LhB9+7def78eYb1J02ahIWFBf379+fhw4esXr2aCRMmMGfOHHWdHTt2YGJiwhdffIGJiQmnT59m3rx5REdHqxM8ffr0ISoqiidPnqgTjKampm/462csIiKCnj170qRJEz755BNsbW1JSUnB39+fCxcu0Lp1a1xdXfnrr79YvXo19+7dY9GiRUDq49G7d288PDwYOHAgRkZG/PPPP/zxxx/ZiuVtHTlyBEidGquNkydP0rNnT4oWLUr//v2Jj49n3bp1tGvXju3bt1O0aFGN+oMHD8bV1ZWhQ4dy7NgxFi9ejJWVFZs2baJatWoMGzaMXbt2MW3aNLy8vKhcubLG/osXL0alUtGzZ09CQ0NZvXo1Xbt2ZefOnRgbGwOpSc6RI0fi5eXFkCFDCA0NZc2aNfzxxx8EBARgYWGhbi8pKUn9JXz48OEYGxtjb2+f6XND28c1zfnz5zly5Ajt27cH4IcffqBPnz706NGDDRs20L59e549e8ayZcsYNWqUxhTPU6dO0bNnT8qVK0f//v1RqVRs376dLl26sGHDBry9vTWONWjQIIoWLcqQIUO4fv06W7duxcbGhq+++gqA77//ntGjR+Pt7U3r1q0BXpt4CgoK4tGjR5QpUybTOhcuXFA//qtWrcLGxiZdnaioKBITEzNtI02BAgXSvQZXr15NZGQkffv25eDBg29sQxunT59m3759dOjQAWtra5ycnLh16xYdOnTA1NSUHj16YGBgwObNm+nUqRPr1q2jfPnyGm1MmDBBfR66e/cuGzdu5NGjR6xdu/a1CcWMLFiwgKVLl9KqVSu8vb2Jjo7m6tWrXLt2jRo1atCmTRuePn3KiRMn+P7779/Y3ocffpjucb127RqrV6/O8PF5mTbn8aCgID7//HOioqJo3bo1Li4uBAUFceDAAeLj4zEyMiIkJIS2bdsSFxdHp06dsLa2ZseOHfj7+6sTzC9btGgRhoaGdO/enYSEBAwNDbP8/C9TpozOzptCCCHeIYoQQoi3FhUVpbi7uyv+/v5a71OvXj1l+PDh6vvz5s1T3N3d09Xbtm2b4u7urvz777+KoijKoUOHFHd3d+XPP//MtO3Q0FDF3d1dmTdvXrptXbp0UZo2bao8f/5cXZaSkqK0adNGadiwYbrjtmvXTklKSnpjf/z9/ZUmTZq8ts6rfQkODlbKlCmj9O3bV6Pe/PnzFXd3d42/T9q+Xbt2VVJSUtTlU6ZMUUqXLq1ERkaqy+Li4tIde8yYMUr58u
U1+t2rVy+lXr16b4wzzenTpxV3d3fl9OnT6rKOHTsq7u7uysaNGzXqBgQEKJ6ensq5c+c0yjdu3Ki4u7srFy5cUBR
"text/plain": [
"<Figure size 1400x400 with 2 Axes>"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"\n",
"=== Médianes comportement K=4 ===\n",
" gross_flow_to_aum flow_freq flow_direction_balance n_isin_total avg_holding_months_per_isin exit_rate_per_isin log_aum_qty_mean months_since_last_tx\n",
"cluster_k4 \n",
"0 7.884 0.071 0.000 1.0 11.333 1.000 5.280 69.0\n",
"1 5.348 0.617 -0.006 12.0 28.924 0.667 8.768 3.0\n",
"2 1.159 0.043 -1.000 3.0 60.000 0.400 5.167 27.0\n",
"3 1.477 0.012 -1.000 3.0 12.000 0.714 3.407 127.0\n"
]
},
{
"data": {
"image/png": "iVBORw0KGgoAAAANSUhEUgAABB8AAAGGCAYAAAAzaSmEAAAAOnRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjEwLjgsIGh0dHBzOi8vbWF0cGxvdGxpYi5vcmcvwVt1zgAAAAlwSFlzAAAPYQAAD2EBqD+naQABAABJREFUeJzs3XdUFFcbwOHf0sQCSFdRUEHAAoq9K2rs3dh7793YNXaNGiu22HuLPbGbGBNrNLYo9o6KNCkC0ub7g4/VFRZRgZXwPufMgZ25c/edvTuzu3duUSmKoiCEEEIIIYQQQgiRRvR0HYAQQgghhBBCCCH+26TyQQghhBBCCCGEEGlKKh+EEEIIIYQQQgiRpqTyQQghhBBCCCGEEGlKKh+EEEIIIYQQQgiRpqTyQQghhBBCCCGEEGlKKh+EEEIIIYQQQgiRpqTyQQghhBBCCCGEEGlKKh+EEEIIIYQQQgiRpqTyQYivVI0aNRg9erSuw0hXu3fvxsXFhWfPnuk6FJFKRo8eTY0aNTTWubi4sHjxYh1F9HGLFy/GxcVF12HozJeUT3pdt+Li4mjYsCHLli1L8+fSlaTOna9JWpzH2j4DVq1aRc2aNSlcuDBNmjRJ1ef8WnXs2JGOHTvqOozPcv78eVxcXDh//nyaPs/QoUMZPHhwmj6HECJ1SeWDEOnsyZMnTJw4kZo1a+Lm5kbJkiVp06YN69evJzIyMl1iiIiIYPHixWn+xSCjO3DgAOvWrdN1GF+tkJAQ3NzccHFx4f79+7oO55PIOZA+7t27x+LFi1O9QvGXX37hxYsXdOjQQb0u4Yfr9evXNdKGhoby7bff4ubmxqlTp1I1jq5du+Li4sKUKVNSNd//uuXLl3P8+PEUpf3rr7+YM2cOJUuWZObMmQwbNiyNo/s0afUeT2ufUgZfq549e3L06FFu3bql61CEECkklQ9CpKOTJ0/SqFEjDh06hKenJxMmTGD48OHkyZOHOXPmMH369HSJIyIiAi8vLy5cuJAuz5dSTZo04dq1a9jZ2ek6FCD+B86GDRt0HcZX6/Dhw6hUKqytrdm/f7+uw/kkyZ0Dffv25dq1azqIKuM7fPgwU6dOVT++d+8eXl5e+Pj4pOrzrF69mgYNGmBiYpJsurCwMLp168bt27fx8vKiatWqqRbD0aNHuXLlSqrll9Fcu3aNvn37fta+K1asSPKHb1KfAefOnUNPT4/p06fTtGlTqlWr9tkxp4W0eo+nNW1lkBrKlCnDtWvXKFOmTJrkn6BIkSIUK1aMNWvWpOnzCCFSj4GuAxAis3j69ClDhw4lT548rF+/HhsbG/W29u3b8/jxY06ePKm7AFNBeHg42bJl++z99fX10dfXT8WIvk4RERFkzZpV12F8sf3791OtWjXy5MnDL7/8wtChQ3UdUqowMDDAwCDjfTx+6fmXGoyMjNL8OW7evMmtW7c+2r0jLCyM7t274+3tjZeXV6r+aH379i2zZs2iR48eLFq0KNXy/drFxcURHR1NlixZyJIlS6rnn9RnQEBAAMbGxuny3hJf5u3btxgaGqKnp5cm74+k1KtXj8WLF/PmzRuyZ8+eLs8phPh80vJBiHSyatUqwsPDmT59ukbFQwIHBwc6d+6sdX9t/dCT6iN7/fp1unfvTrly5XB3d6dGjRqMGTMGgGfPnlGhQgUAvLy8cHFxSdR39/79+wwaNIiyZcvi5uZG8+bNOXHiRJLPe+HCBSZNmkSFChU++uV+48aNNGjQgOLFi1OmTBmaN2/OgQMHkj2WuLg4Fi9eTOXKlSlevDgdO3bk3r17ifqWJ+x76dIlZs6cSfny5SlRogT9+/cnMDBQI47jx4/Tq1cvKleuTLFixahVqxZLliwhNjZWnaZjx46cPHkSHx8f9WuU0P9aW7/kpPq5duzYkYYNG/Lvv//Svn17ihcvzrx58wCIiopi0aJFfPPNNxQrVoxq1aoxe/ZsoqKiNPI9ffo0bd
u2pXTp0nh4eFCnTh11Hrry/PlzLl68SP369WnQoAHPnj3jn3/++ez8bt68SY8ePShZsiQeHh507tw5ybvKISEhzJgxgxo1alCsWDGqVq3KyJEj1WUcFRXFwoULad68OaVKlaJEiRK0a9eOc+fOqfP42DmQ1LkWExPDkiVLqFWrFsWKFaNGjRrMmzcvUVnVqFGD3r17c/HiRXVT/5o1a7J3796PvgbPnj3DxcWF1atXs27dOjw9PXF3d6dDhw7cuXNHI+3o0aPx8PDgyZMn9OzZEw8PD0aMGAHEV0LMmjWLatWqUaxYMerUqcPq1atRFEUjj6ioKGbMmEH58uXx8PCgT58+vHz5MlFc2sYeSOp1ev+83L17t7o/dqdOndSvc8L5kdx1KjnHjx/H0NCQ0qVLa03z5s0bevTowY0bN1i8eDHVq1f/aL6fYuXKlSiKQvfu3VMlv+PHj9OwYUPc3Nxo2LAhx44dSzJdXFwc69ato0GDBri5uVGxYkUmTpxIcHCwRrqUvLZxcXGsX7+eRo0a4ebmRvny5enevbtGt5WELiX79+9XP+eff/6p3vb+50bC++H+/fsMHjyYkiVLUq5cOaZNm8bbt2818gwPD2fPnj3q98T775n3r60uLi7s3r2b8PBwddrdu3drfR3fv962adNGfexbt25NlDYgIICxY8dSsWJF3NzcaNy4MXv27EmU7tdff6V58+Z4eHhQsmRJGjVqxPr169XxJvceT86+ffv49ttv1Z+H7du356+//tKa/lM+dx49esTAgQOpVKkSbm5uVK1alaFDhxIaGgokXwYAvr6+jBkzhooVK1KsWDEaNGjAzz//nOTz/vrrr8yfP58qVapQvHhxwsLCkv0svHfvHh07dqR48eJUqVKFlStXJjpWHx8f+vTpQ4kSJahQoQIzZszgzz//TPK1rVixIuHh4Zw5c+ajr7kQQvcy3q0dITKo33//nXz58lGyZMk0fZ6AgAC6d++Oubk5vXr1wtTUlGfPnqm/zFpYWDBp0iQmTZrEN998wzfffAOg/hFx9+5d2rZti62tLT179iRbtmwcOnSI/v37s3jxYnX6BJMnT8bCwoL+/fsTHh6uNa4dO3Ywbdo06tSpQ6dOnXj79i23b9/m6tWrNGrUSOt+P/74I6tWrcLT05MqVapw69YtunfvrvFl9n3Tpk3D1NSUAQMG4OPjw/r165kyZQoLFixQp9mzZw/ZsmWja9euZMuWjXPnzrFo0SLCwsIYNWoUAH369CE0NJSXL1+qv7R/7l2V169f07NnTxo0aEDjxo2xtLQkLi6Ovn37cunSJVq1aoWjoyN37txh/fr1PHr0iKVLlwLx5dG7d29cXFwYNGgQRkZGPH78+It+6KeGX375haxZs+Lp6YmxsTH29vYcOHDgs97fd+/epX379mTPnp0ePXpgYGDA9u3b6dixI5s2baJ48eJA/A/K9u3bc//+fVq0aEGRIkUICgrit99+w9fXFwsLC8LCwti5cycNGzakZcuWvHnzhp9//pkePXqwc+dOChcu/NFzICnjx49nz5491KlTh65du3Lt2jVWrFjB/fv3WbJkiUbax48fM3jwYL799luaNWvGrl27GD16NEWLFqVQoUIffT327t3LmzdvaNeuHW/fvmXjxo107tyZAwcOYGVlpU4XExND9+7dKVWqFKNGjcLY2BhFUejbty/nz5/n22+/pXDhwvz555/Mnj0bX19fxo4dq95/3Lhx7N+/n4YNG1KyZEnOnTtHr169PqnsklOmTBk6duzIxo0b6dOnDwULFgTA0dHxo9ep5Fy+fBlnZ2cMDQ2T3B4REUHPnj35999/WbhwIZ6enonSREVFERYWlqLjsLCw0Hj8/PlzVq5cyYwZMzA2Nk5RHsn566+/GDhwIE5OTgwfPpygoCDGjBlDrly5EqWdOHEie/bsoXnz5nTs2JFnz56xefNmbt68ydatWzE0NEzxaztu3Dh2795N1apV+fbbb4
mNjeXixYtcvXoVNzc3dbpz585x6NAh2rdvj7m5+Ue7xQ0ZMgQ7OzuGDx/OlStX2LhxIyEhIcyePRuA2bNnM378eNz
"text/plain": [
"<Figure size 1200x400 with 2 Axes>"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"\n",
"=== Médianes allocation K=4 ===\n",
" share_asset_fixed_income share_asset_diversified share_asset_equity share_fund_carmignac_patrimoine share_fund_carmignac_investissement share_fund_carmignac_sécurité share_fund_carmignac_emergents\n",
"cluster_k4 \n",
"0 0.767 0.000 0.000 0.000 0.000 0.000 0.000\n",
"1 0.284 0.207 0.155 0.152 0.011 0.018 0.002\n",
"2 0.000 0.372 0.227 0.255 0.000 0.000 0.000\n",
"3 0.000 0.326 0.099 0.169 0.000 0.000 0.000\n",
"\n",
"=== Distribution par pays (top 10) ===\n",
"country_grp Belgium FRANCE France Italy Latam Luxembourg Other Spain Switzerland United Kingdom Us Offshore\n",
"cluster_k4 \n",
"0 1.5 0.1 17.1 5.0 12.5 6.1 3.0 5.0 3.6 6.6 39.4\n",
"1 3.9 8.3 32.1 12.7 0.8 4.6 17.7 6.7 4.5 8.3 0.3\n",
"2 0.9 0.5 69.6 2.8 10.0 1.4 3.9 1.6 2.2 6.7 0.5\n",
"3 2.0 0.0 53.0 4.2 2.9 5.1 7.2 2.1 7.3 10.8 5.3\n",
"\n",
"=== Distribution par région ===\n",
"region_grp Belgium FRANCE France Germany Italy LATAM Luxembourg Other Spain Switzerland United Kingdom\n",
"cluster_k4 \n",
"0 1.5 0.1 17.2 0.6 5.0 52.0 6.1 1.9 5.2 3.6 6.7\n",
"1 3.9 8.4 32.5 1.2 12.7 1.2 4.6 15.5 7.2 4.5 8.3\n",
"2 0.9 0.5 69.8 0.9 2.8 10.6 1.4 2.5 1.7 2.2 6.8\n",
"3 2.0 0.0 53.0 4.3 4.2 8.2 5.1 2.6 2.3 7.3 10.9\n"
]
},
{
"data": {
"image/png": "iVBORw0KGgoAAAANSUhEUgAABrcAAAGGCAYAAADRitpgAAAAOnRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjEwLjgsIGh0dHBzOi8vbWF0cGxvdGxpYi5vcmcvwVt1zgAAAAlwSFlzAAAPYQAAD2EBqD+naQABAABJREFUeJzs3XVUFOsfBvCHNghJAxQsQEHEFuy49lWueW2v3d36EwMVW7EDUezu7rp2B7ZSIi0NC+z+/uC6uBIuyLK77PM5h3Pc2ZnZ7zjBPLzvvqMmEolEICIiIiIiIiIiIiIiIlIC6vIugIiIiIiIiIiIiIiIiEhabNwiIiIiIiIiIiIiIiIipcHGLSIiIiIiIiIiIiIiIlIabNwiIiIiIiIiIiIiIiIipcHGLSIiIiIiIiIiIiIiIlIabNwiIiIiIiIiIiIiIiIipcHGLSIiIiIiIiIiIiIiIlIabNwiIiIiIiIiIiIiIiIipcHGLSIiIiIiIiIiIiIiIlIabNwiIiIiIiIiIiJSEK9fv8bq1asRFBQk71KIiIgUFhu3iFTY6tWrYWNjky+f1bt3b/Tu3Vv8+u7du7CxscHZs2fz5fOnTp2Kpk2b5stnUd6Ii4uDk5MTjh8/Lu9SFMLP55AyuX79OqpVq4aIiAh5l0JERESkcJjL6EcxMTEYOXIkoqKiULJkyVytIyAgADY2Njh8+HAeVye9QYMGYebMmXL7fEWSn+d4XouMjISjoyOuXbsm71KIiDJg4xZRAXH48GHY2NiIf6pUqYL69etjwIAB8Pb2RmxsbJ58TnBwMFavXg0fH588WV9eUuTaKOe8vb1RtGhRtG3bVjzt/fv36NGjB6pVq4aOHTvi8ePHGZbz8vJC27ZtkZKSkp/lKr0NGzbg4sWLMll3w4YNUaZMGWzcuFEm6yciIiJSFMxlil2bMpg2bRoqV66M6dOny7uUXHv48CFu3bqFQYMGiadFR0djwoQJqFWrFpo1a4YDBw5kWO758+eoWrUq/P3987NcpXfixAls27ZNJus2NDRE586dsWrVKpmsn4jod7Bxi6iAGT16NBYvXozZs2eLe+QtWLAA7du3x+vXryXmHTZsGJ49e5aj9YeEhGDNmjU5Diqenp7w9PTM0TI5lV1t8+bNy7feiPT7kpOT4e3tjS5dukBDQwMAkJqaipEjR0IoFGLy5MkwNjbG8OHDJf5AEB4ejrVr12LatGnQ1NSUV/lKaePGjTJr3AKAbt26Yd++fXn2Bx0iIiIiRcZcxlyWGwEBAbC3t8eSJUugrp77P9mZm5vj2bNn6NChQx5WJz1PT084OTnB0tJSPG3RokW4d+8eRo0ahSZNmuB///sfHj16JH5fJBJh/vz56Nu3L0qXLi2PspXWyZMn4e3tLbP1d+/eHS9fvsTt27dl9hlERLnBxi2iAqZhw4bo0KEDOnXqhCFDhsDT0xNeXl4IDw/H8OHDkZiYKJ5XU1MTOjo6Mq0nISEBAKCtrQ1tbW2ZflZ2tLS05Pr5shYfHy/vEvLU1atXERERgdatW4unff78GZ8+fcLy5cvRvXt3eHh4ID4+XuLbW8uXL0fNmjVRv359eZSdpZSUFAgEAnmXke+SkpIgFAoBAC1btoRAIOAfM4iIiEglMJdljrks+/ktLCwwdOjQ3z4e1NTUoKOjI+4omJ/Cw8Nx7do1tGrVSmL61atXMWHCBPTp0wczZ85ErVq1cOXKFfH7x48fR2BgIIYMGZLfJWdLJBJJnK+qQigUIikpCQBQvnx5WFtb48iRI3KuiohIEhu3iFSAk5MThg8fjsDAQInnF2U27vOtW7fQvXt31KxZE9WqVUPLli2xfPlyAGnjsXfu3BlA2lAJ34fa+D6Od+/evdGuXTu8ePECPXv2RNWqVcXLZvW8IKFQiOXLl6NevXpwdHTE0KFDMzw0t2nTppg6dWqGZX9c569qy2xs9/j4eLi7u6NRo0awt7dHy5Yt4enpCZ
FIJDGfjY0N5s6di4sXL6Jdu3awt7dH27Ztcf369ez+28V12djY4PTp07/czgcPHmD06NFo3Lgx7O3t0ahRIyxYsCDDjfTUqVNRrVo1+Pn5YdCgQahWrRomTpyYZQ3f9/OHDx8wZswYVK9eHXXq1IGbm5v4ZvW7Q4cOoU+fPnBycoK9vT3atGmD3bt3S8wzZcoU1KlTB8nJyRk+q3///mjZsqX4dXbHU3YuXrwIc3NzlClTRjzte636+voAgMKFC6NQoULi/5+XL1/ixIkTmDZt2i/X/6Mfj9u///4bDg4OaNq0Kfbs2SMxn0AgwKpVq9CxY0fUqFEDjo6O6NGjB+7cuSMx3/fx7T09PbFt2zY0b94cVapUwYcPH7Kt49ixY+jcuTOqVq2KWrVqoWfPnrh582aW838f8iYgIEBi+vdj7u7du+Jpnz9/xqhRo1CvXj1UqVIFDRs2xLhx4xATEwMg7RiPj4/HkSNHxOfOj+dccHAwpk2bBmdnZ/Hxf/DgwUw/99SpU1ixYgUaNGiAqlWrir+pZWxsDBsbG1y6dCnb/wciIiKigoq5jLnMxsYG79+/Fw/P16NHD/H7x44dQ8eOHeHg4IDatWtj3LhxGWoDgF27dqFZs2ZwcHBA586d8eDBgwz7Natnbt2+fRs9evSAo6MjatasiWHDhmXIKd/r9PX1xdSpU1GzZk3UqFED06ZNEzeSZufq1atISUmBs7OzxPTExERxlgMAAwMD8fri4+OxbNkyTJgwAUWLFv3lZ/xca15lXSDtOB8yZAhu3Lgh3h979+7Nto6nT59i0KBBqFWrFhwdHfHnn39i+/btWc6f3TPRbGxssHr1avHr2NhYzJ8/H02bNoW9vT2cnJzwzz//4OXLlwDSzr+rV68iMDBQfL79eI4JBAJ4eHjgjz/+EB/PixcvztD58vv5dfz4cbRt2xZVqlTBjRs3xO87OzvjypUrGc5LIiJ54phNRCqiQ4cOWL58OW7evImuXbtmOs+7d+8wZMgQ2NjYYPTo0dDW1oavr694qIDy5ctj9OjR8PDwQLdu3VCjRg0AQPXq1cXr+PbtGwYNGoS2bduiffv2MDY2zrau9evXQ01NDYMGDUJ4eDi2b9+Ofv364dixYyhUqJDU2ydNbT8SiUQYNmyYOHxVqlQJN27cwOLFixEcHJxhfPOHDx/i/Pnz6NGjB4oWLYodO3Zg9OjRuHLlCgwNDX9ZnzTbefbsWSQmJqJ79+4oVqwYnj17hp07d+Lr16/w8PCQWF9KSgoGDBiAGjVqYMqUKVL9X40dOxbm5uaYMGECnjx5gh07diA6OhqLFy8Wz7Nnzx5UrFgRTZs2haamJq5cuYI5c+ZAJBKhZ8+eANKOpaNHj+LmzZto0qSJeNnQ0FDcuXMHI0aMAPDr4yk7jx8/hp2dncQ0Kysr6OnpYc2aNejduzfOnDmD2NhYVK5cGQDg5uaGnj17Sgx9Ia2oqCgMHjwYrVu3Rtu2bXHmzBnMnj0bWlpa4nAeGxuLAwcOoF27dujSpQvi4uJw8OBBDBw4EAcOHEClSpUk1nn48GEkJSWha9eu0NbWhoGBQZafv2bNGqxevRrVqlXD6NGjoaWlhadPn+LOnTu//S00gUCAAQMGQCAQoFevXjAxMUFwcDCuXr2K6Oho6OnpYfHixZg5cyYcHBzE14fvDYthYWHo2rUr1NTU0LNnTxgZGeH69euYMWMGYmNj0a9fP4nPW7duHbS0tMSfqaWlJX7Pzs5OpkMfEhERESk65jJJqpjLxowZA0tLS4wbN07cULB+/XqsWrUKrVu3RufOnREREYGdO3eiZ8+eOHr0qLhRaPfu3Zg7dy5q1qyJfv36ITAwECNGjIC+vj5KlCiR7ef++++/GDRoECwsLDBy5EgkJiZi586d6N69Ow4fPgwLCwuJ+ceOHQsLCwuMHz8er169woEDB2BkZIRJkyZl+zmPHz9GsWLFYG5uLjG9SpUq8PLyQrly5eDv748bN2
5g3rx5ANKGSC9evHiuh1HMq6z73adPnzBhwgR069YNXbt2RdmyZbP87Fu3bmHIkCEwMzNDnz59YGJigg8fPuDq1av
"text/plain": [
"<Figure size 1800x400 with 4 Axes>"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"# ============================================================\n",
"# K-selection diagnostics\n",
"# ============================================================\n",
"# Fit K-Means for K = 2..7 on the scaled features and record three criteria per K.\n",
"rows = []\n",
"for k in range(2, 8):\n",
"    km = KMeans(n_clusters=k, n_init=50, random_state=RANDOM_STATE)\n",
"    labels = km.fit_predict(X_scaled)\n",
"    rows.append(dict(\n",
"        k=k,\n",
"        inertia=km.inertia_,\n",
"        silhouette=silhouette_score(X_scaled, labels),\n",
"        davies_bouldin=davies_bouldin_score(X_scaled, labels),\n",
"    ))\n",
"df_kdiag = pd.DataFrame(rows)\n",
"print(df_kdiag.to_string(index=False))\n",
"\n",
"# One panel per criterion, with K on the x-axis.\n",
"panel_titles = {\n",
"    \"inertia\": \"Elbow / Inertia\",\n",
"    \"silhouette\": \"Silhouette (higher=better)\",\n",
"    \"davies_bouldin\": \"Davies-Bouldin (lower=better)\",\n",
"}\n",
"fig, axes = plt.subplots(1, 3, figsize=(15, 4))\n",
"for ax, (col, title) in zip(axes, panel_titles.items()):\n",
"    ax.plot(df_kdiag[\"k\"], df_kdiag[col], marker=\"o\")\n",
"    ax.set(title=title, xlabel=\"K\")\n",
"plt.tight_layout()\n",
"plt.show()\n",
"\n",
"# ============================================================\n",
"# Clustering K=4\n",
"# ============================================================\n",
"# Fit K-Means with K=4, write the labels to dfc, and report quality scores\n",
"# together with the size of each cluster.\n",
"RESULTS = {}\n",
"for k in (4,):\n",
"    label_col = f\"cluster_k{k}\"\n",
"    km = KMeans(n_clusters=k, n_init=50, random_state=RANDOM_STATE)\n",
"    dfc[label_col] = km.fit_predict(X_scaled)\n",
"    sil = silhouette_score(X_scaled, dfc[label_col])\n",
"    dbi = davies_bouldin_score(X_scaled, dfc[label_col])\n",
"    RESULTS[k] = {\"model\": km, \"silhouette\": sil, \"davies_bouldin\": dbi}\n",
"    print(f\"K={k} | sil={sil:.4f} | db={dbi:.4f}\")\n",
"    counts = dfc[label_col].value_counts().sort_index()\n",
"    props = counts.div(counts.sum()).mul(100)\n",
"    print(pd.DataFrame({\"n_comptes\": counts, \"pct\": props.round(1)}))\n",
"    print()\n",
"\n",
"# ============================================================\n",
"# Behaviour heatmap\n",
"# ============================================================\n",
"# plot_heatmap is defined elsewhere in the notebook; its return value is the\n",
"# per-cluster table printed right after each figure.\n",
"behavior_title = \"Cluster signatures — Comportement (K=4, robust z-score)\"\n",
"prof_behavior = plot_heatmap(\n",
"    dfc,\n",
"    profile_vars_behavior,\n",
"    \"cluster_k4\",\n",
"    title=behavior_title,\n",
"    figsize=(14, 4),\n",
")\n",
"print(\"\\n=== Médianes comportement K=4 ===\")\n",
"print(prof_behavior.round(3).to_string())\n",
"\n",
"# ============================================================\n",
"# Allocation heatmap (descriptive, post-clustering)\n",
"# ============================================================\n",
"allocation_title = \"Cluster signatures — Allocation produits (K=4, descriptif post-clustering)\"\n",
"prof_allocation = plot_heatmap(\n",
"    dfc,\n",
"    profile_vars_allocation,\n",
"    \"cluster_k4\",\n",
"    title=allocation_title,\n",
"    figsize=(12, 4),\n",
")\n",
"print(\"\\n=== Médianes allocation K=4 ===\")\n",
"print(prof_allocation.round(3).to_string())\n",
"\n",
"# ============================================================\n",
"# Geographic description (post-clustering)\n",
"# ============================================================\n",
"# Row-normalised crosstabs: one row per cluster, values in percent of that cluster.\n",
"# The shares are scaled to percent BEFORE rounding: rounding first and multiplying\n",
"# afterwards re-introduces float noise (e.g. 0.07 * 100 -> 7.000000000000001).\n",
"# Every group is shown (no top-N selection is applied), so the header no longer\n",
"# claims a top 10.\n",
"# NOTE(review): the saved output lists \"FRANCE\" and \"France\" as separate groups;\n",
"# the labels probably need normalising where country_grp / region_grp are built.\n",
"print(\"\\n=== Distribution par pays ===\")\n",
"geo_country = (\n",
"    pd.crosstab(\n",
"        dfc[\"cluster_k4\"],\n",
"        dfc[\"country_grp\"].fillna(\"Unknown\"),\n",
"        normalize=\"index\",\n",
"    ) * 100\n",
").round(1)\n",
"print(geo_country.to_string())\n",
"\n",
"print(\"\\n=== Distribution par région ===\")\n",
"geo_region = (\n",
"    pd.crosstab(\n",
"        dfc[\"cluster_k4\"],\n",
"        dfc[\"region_grp\"].fillna(\"Unknown\"),\n",
"        normalize=\"index\",\n",
"    ) * 100\n",
").round(1)\n",
"print(geo_region.to_string())\n",
"\n",
"# Side-by-side heatmaps of the two geographic crosstabs (country, then region).\n",
"fig, axes = plt.subplots(1, 2, figsize=(18, 4))\n",
"\n",
"geo_panels = (\n",
"    (geo_country, \"Distribution par pays (% par cluster)\", \"Pays\"),\n",
"    (geo_region, \"Distribution par région (% par cluster)\", \"Région\"),\n",
")\n",
"for ax, (table, title, xlabel) in zip(axes, geo_panels):\n",
"    sns.heatmap(\n",
"        table,\n",
"        cmap=\"Blues\", annot=True, fmt=\".1f\",\n",
"        ax=ax,\n",
"        cbar_kws={\"label\": \"%\"}\n",
"    )\n",
"    ax.set_title(title)\n",
"    ax.set_xlabel(xlabel)\n",
"    ax.set_ylabel(\"Cluster\")\n",
"\n",
"plt.tight_layout()\n",
"plt.show()"
]
},
{
"cell_type": "code",
"execution_count": 201,
"id": "50ecf35e-de7e-44ae-afee-404186c4d42c",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
" flow_freq gross_flow_to_aum n_isin_total \\\n",
"flow_freq 1.000 0.043 0.484 \n",
"gross_flow_to_aum 0.043 1.000 0.087 \n",
"n_isin_total 0.484 0.087 1.000 \n",
"avg_holding_months_per_isin 0.074 -0.019 0.027 \n",
"exit_rate_per_isin -0.066 -0.029 -0.103 \n",
"flow_direction_balance 0.182 0.007 0.038 \n",
"log_aum_qty_mean 0.522 -0.047 0.381 \n",
"months_since_last_tx -0.513 -0.012 -0.229 \n",
"\n",
" avg_holding_months_per_isin exit_rate_per_isin \\\n",
"flow_freq 0.074 -0.066 \n",
"gross_flow_to_aum -0.019 -0.029 \n",
"n_isin_total 0.027 -0.103 \n",
"avg_holding_months_per_isin 1.000 -0.257 \n",
"exit_rate_per_isin -0.257 1.000 \n",
"flow_direction_balance -0.163 0.093 \n",
"log_aum_qty_mean 0.140 0.024 \n",
"months_since_last_tx -0.306 0.159 \n",
"\n",
" flow_direction_balance log_aum_qty_mean \\\n",
"flow_freq 0.182 0.522 \n",
"gross_flow_to_aum 0.007 -0.047 \n",
"n_isin_total 0.038 0.381 \n",
"avg_holding_months_per_isin -0.163 0.140 \n",
"exit_rate_per_isin 0.093 0.024 \n",
"flow_direction_balance 1.000 0.298 \n",
"log_aum_qty_mean 0.298 1.000 \n",
"months_since_last_tx -0.119 -0.389 \n",
"\n",
" months_since_last_tx \n",
"flow_freq -0.513 \n",
"gross_flow_to_aum -0.012 \n",
"n_isin_total -0.229 \n",
"avg_holding_months_per_isin -0.306 \n",
"exit_rate_per_isin 0.159 \n",
"flow_direction_balance -0.119 \n",
"log_aum_qty_mean -0.389 \n",
"months_since_last_tx 1.000 \n"
]
}
],
"source": [
"# Sanity check: pairwise correlations between the clustering features\n",
"# (DataFrame.corr() with its default method), rounded to 3 decimals.\n",
"# to_string() prints the matrix as one unwrapped table; a bare print() splits\n",
"# it into several column chunks, which makes rows hard to read across.\n",
"corr_matrix = dfc[base_features].corr().round(3)\n",
"print(corr_matrix.to_string())"
]
},
{
"cell_type": "code",
"execution_count": 202,
"id": "273392b8-c60c-4b19-ab4e-760616d3c246",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"base_features: 8\n",
"share_asset: []\n",
"share_fund: []\n",
"X columns: 8\n",
"X_num columns: 8\n",
"X_cat columns: 20\n"
]
}
],
"source": [
"# Sanity check: size of base_features, any share_asset_* / share_fund_* names\n",
"# present in all_features, and the column counts of the model matrices.\n",
"print(\"base_features:\", len(base_features))\n",
"for prefix in (\"share_asset\", \"share_fund\"):\n",
"    matching = [name for name in all_features if name.startswith(prefix + \"_\")]\n",
"    print(prefix + \":\", matching)\n",
"for label, matrix in ((\"X columns:\", X), (\"X_num columns:\", X_num), (\"X_cat columns:\", X_cat)):\n",
"    print(label, matrix.shape[1])"
]
},
{
"cell_type": "code",
"execution_count": 203,
"id": "d42b5319-c66c-4a7f-aeac-d3044d07f499",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"=== share_fund_ disponibles ===\n",
"['share_fund_carmignac_court_terme', 'share_fund_carmignac_emergents', 'share_fund_carmignac_investissement', 'share_fund_carmignac_patrimoine', 'share_fund_carmignac_portfolio_credit', 'share_fund_carmignac_portfolio_flexible_b', 'share_fund_carmignac_portfolio_global_bon', 'share_fund_carmignac_portfolio_patrimoine', 'share_fund_carmignac_portfolio_sécurité', 'share_fund_carmignac_sécurité']\n",
"\n",
"=== share_asset_ disponibles ===\n",
"['share_asset_alternative', 'share_asset_diversified', 'share_asset_equity', 'share_asset_fixed_income', 'share_asset_private_assets']\n",
"share_fund_carmignac_court_terme: 0.000000\n",
"share_fund_carmignac_emergents: 0.000001\n",
"share_fund_carmignac_investissement: 0.000032\n",
"share_fund_carmignac_patrimoine: 0.011248\n",
"share_fund_carmignac_portfolio_credit: 0.000000\n",
"share_fund_carmignac_portfolio_flexible_b: 0.000000\n",
"share_fund_carmignac_portfolio_global_bon: 0.000000\n",
"share_fund_carmignac_portfolio_patrimoine: 0.000000\n",
"share_fund_carmignac_portfolio_sécurité: 0.000000\n",
"share_fund_carmignac_sécurité: 0.000080\n",
"share_asset_alternative: 0.000000\n",
"share_asset_diversified: 0.027594\n",
"share_asset_equity: 0.009158\n",
"share_asset_fixed_income: 0.130769\n",
"share_asset_private_assets: 0.000000\n"
]
}
],
"source": [
"# Find which share_fund_* / share_asset_* variables differ most across clusters\n",
"top_share_funds = [name for name in dfc.columns if name.startswith(\"share_fund_\")]\n",
"top_share_assets = [name for name in dfc.columns if name.startswith(\"share_asset_\")]\n",
"\n",
"# Show what is available before comparing clusters\n",
"print(\"=== share_fund_ disponibles ===\")\n",
"print(top_share_funds)\n",
"print(\"\\n=== share_asset_ disponibles ===\")\n",
"print(top_share_assets)\n",
"\n",
"# Variance of the per-cluster medians for each share variable: a larger value\n",
"# means the cluster medians are further apart on that variable.\n",
"by_cluster = dfc.groupby(\"cluster_k4\")\n",
"for col in top_share_funds + top_share_assets:\n",
"    var_inter = by_cluster[col].median().var()\n",
"    print(f\"{col}: {var_inter:.6f}\")"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "7de1a58e-37ec-4d13-9807-5b047ec6ff42",
"metadata": {},
"outputs": [],
"source": [
"import subprocess\n",
"\n",
"# Export a notebook to HTML with nbconvert.\n",
"# check=True raises CalledProcessError when nbconvert exits with a non-zero\n",
"# status, instead of silently ignoring a failed export.\n",
"# NOTE(review): the target is \"clustering_clean.ipynb\"; confirm this is the\n",
"# intended notebook file name before relying on the export.\n",
"subprocess.run(\n",
"    [\"jupyter\", \"nbconvert\", \"--to\", \"html\", \"clustering_clean.ipynb\"],\n",
"    check=True,\n",
")"
]
},
{
"cell_type": "code",
"execution_count": 206,
"id": "b56a53c8-c1eb-4117-a028-ba9b6c1c45af",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"=== Types d'assets disponibles ===\n",
"Product - Asset Type\n",
"Equity 2023860\n",
"Diversified 1377424\n",
"Fixed Income 948728\n",
"Alternative 214235\n",
"Private Assets 219\n",
"Name: count, dtype: int64\n",
"\n",
"df_month_asset shape: (1745996, 15)\n",
"df_client_asset shape: (13328, 20)\n",
"\n",
"Comptes par asset type:\n",
"Product - Asset Type\n",
"Diversified 4161\n",
"Fixed Income 3934\n",
"Equity 3901\n",
"Alternative 1319\n",
"Private Assets 13\n",
"Name: Registrar Account - ID, dtype: int64\n",
"\n",
"Asset types retenus (>= 50 comptes) : ['Alternative', 'Diversified', 'Equity', 'Fixed Income']\n",
"\n",
"============================================================\n",
"ASSET TYPE : Alternative\n",
"============================================================\n",
" k silhouette davies_bouldin\n",
" 2 0.4568 0.9959\n",
" 3 0.3416 1.1374\n",
" 4 0.2564 1.3750\n",
" 5 0.2815 1.2393\n",
" 6 0.2640 1.3549\n",
"→ K retenu : 2 (silhouette=0.4568)\n",
" n_comptes pct\n",
"cluster_alternative \n",
"0 311 23.6\n",
"1 1008 76.4\n"
]
},
{
"data": {
"image/png": "iVBORw0KGgoAAAANSUhEUgAABOQAAAGGCAYAAADbxV7qAAAAOnRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjEwLjgsIGh0dHBzOi8vbWF0cGxvdGxpYi5vcmcvwVt1zgAAAAlwSFlzAAAPYQAAD2EBqD+naQAA5opJREFUeJzs3XdUE9nbB/BvQhOUIlUFREUBFWyruKKuith7w7Uj9t67IvZVsaDYFRR7R12xrK5drGtX7A0rTYogCJn3D1/yI0A0RCAYvp9zcg6ZuXPzJExuJk9uEQmCIICIiIiIiIiIiIjyhFjVARARERERERERERUkTMgRERERERERERHlISbkiIiIiIiIiIiI8hATckRERERERERERHmICTkiIiIiIiIiIqI8xIQcERERERERERFRHmJCjoiIiIiIiIiIKA8xIUdERERERERERJSHmJAjIiIiIiIiIiLKQ0zIERGpAVdXV0ycOFHVYeSpffv2wd7eHmFhYaoOhXLIxIkT4erqquowcsXy5cthb2+vssf39vZG7969Vfb4+cnEiRNRtWpVVYdB6YwaNQojRoxQdRhERER5igk5IqJ87NWrV/Dy8kLDhg3h5OSEatWq4c8//8SmTZvw5cuXPIkhMTERy5cvx+XLl/Pk8X5Vhw4dwsaNG1UdRr4VGxsLJycn2Nvb4+nTpwod86ude/k13tevX2PPnj0YMGCAdFtYWBjs7e2xYcMGmbKCIMDLywv29vZYvnz5Tz3u7du3MXPmTLRo0QJVqlRB/fr1MWLECDx//vyn6v1Vbd26Ffv27VN1GPlSv379cPz4cYSGhqo6FCIiojzDhBwRUT51+vRptGrVCkeOHEGDBg0wbdo0jBkzBiVKlMDChQsxZ86cPIkjMTERfn5+uHLlSp48nqLatGmD27dvw9LSUtWhAAD+/vtvBAYGqjqMfOvo0aMQiUQwMzPDwYMHFTomv5578nwv3kGDBuH27dsqiAoIDAyEpaUlfv/99++WEwQB3t7e2LlzJwYPHoxhw4b91OOuX78ex48fR61atTBlyhS4u7vj2rVraN++PR49evRTdf+Ktm/fjv3796s6jHypQoUKcHR0hL+/v6pDISIiyjOaqg6AiIgye/36NUaNGoUSJUpg06ZNMDc3l+7r1q0bXr58idOnT6suwByQkJAAPT09pY/X0NCAhoZGDkaUPyUmJkJXV1fVYfy0gwcPol69eihRogT+/vtvjBo1SmWx/Oy5pwxNTU1oaub9ZdfXr19x6NAh/Pnnnz8sO2vWLOzYsQMDBw7MkeGDHh4e8PHxgba2tnRb8+bN0apVK6xduxY+Pj4//RgAkJSUBC0tLYjF/J05v1DmPdasWTMsX74cnz9/RuHChXMpMiIiovyDVy5ERPnQ+vXrkZCQgDlz5sgk49LY2NigV69eco+XN19VVvOu3blzB3369EHNmjVRqVIluLq6YtKkSQC+DWurVasWAMDPzw/29vaZhrI9ffoUw4cPh7OzM5ycnNC+fXucPHkyy8e9cuUKvL29UatWLdSrV++7r8HmzZvRokULVK5cGTVq1ED79u1x6NCh7z4XiUSC5cuXo06dOqhcuTJ69OiBJ0+eZJpjL+3Y69evY968efj9999RpUoVDBkyBFFRUTJxnDhxAv3790edOnXg6OgINzc3rFixAqmpqdIyPXr0wOnTp/HmzRvpa5Q2F5q8ue4uX74Me3t7meGNPXr0QMuWLXH37l1069YNlStXxuLFiwEAycnJWLZsGRo1agRHR0fUq1cPCxYsQHJysky9Fy5cQJcuXVC9enVUrVoVTZo0kdahKm/fvsW1a9fQvHlztGjRAmFhYfjvv/++e0xenHtpr/eTJ0/Qo0cPVK5cGXXr1sW6detk6khOToavry/at2+P3377DVWqVEHXrl1x6dIlhePN+J5s2bIlevTokel5SyQS1K1bF8OHD5fZtnHjRrRo0QJOTk5wcX
GBl5cXYmJivvsaAsD169cRHR0NFxeX75abPXs2tm7digEDBuRYsrRatWoyyTgAKFWqFMqVK4dnz54pVWfa++bw4cNYsmQJ6tati8qVKyM+Ph4AcOTIEbRv3x6VKlVCzZo1MXbsWHz48CHLul6/fo0+ffqgSpUqqFOnDvz8/CAIQqbHyjgEOW24b/rhp+Hh4Zg0aRL++OMPODo6ok6dOhg0aJD0fe/q6orHjx/jypUr0nMjq/9/mh49ekjLZbz9aNhrfHw85syZA1dXVzg6OqJWrVro3bs37t27J1Pu1q1b6NevH2rUqIEqVaqgVatW2LRpk0yZkJAQdO3aFVWqVEH16tUxaNCgTEPO087tJ0+eYMyYMahRowa6du0q3X/gwAHp/8TZ2RmjRo3Cu3fvMsXt4uKChIQEXLx48bvPj4iISF2whxwRUT506tQpWFtbo1q1arn6OJGRkejTpw+KFi2K/v37w8DAAGFhYfjnn38AAMbGxvD29oa3tzcaNWqERo0aAYA0sfD48WN06dIFFhYW6NevH/T09HDkyBEMGTIEy5cvl5ZPM2PGDBgbG2PIkCFISEiQG9euXbswe/ZsNGnSBD179kRSUhIePnyIW7duoVWrVnKPW7RoEdavX48GDRqgbt26CA0NRZ8+fZCUlJRl+dmzZ8PAwABDhw7FmzdvsGnTJsycORNLly6Vltm/fz/09PTQu3dv6Onp4dKlS1i2bBni4+MxYcIEAMDAgQMRFxeH9+/fS5OZyvbw+PTpE/r164cWLVqgdevWMDExgUQiwaBBg3D9+nW4u7vD1tYWjx49wqZNm/DixQusXLkSwLf/x4ABA2Bvb4/hw4dDW1sbL1++/GHyK7f9/fff0NXVRYMGDVCoUCGULFkShw4d+u75nVfnXkxMDPr27YtGjRqhWbNmOHbsGHx8fGBnZydN3MXHx2P37t1o2bIlOnXqhM+fP2PPnj3o27cvdu/ejfLly/8w3oyaNWsGPz8/hIeHw8zMTLr9+vXr+PjxI5o3by7d5uXlhf3796N9+/bo0aMHwsLCsHXrVty/fx/bt2+HlpaW3Nfxxo0bEIlEqFChgtwyc+fOxebNm9GvXz+MHj06036JRIJPnz7JPT49fX3978YjCAIiIiJQrlw5heqTZ+XKldDS0kKfPn2QnJwMLS0t7Nu3D5MmTYKTkxNGjx6NyMhIBAYG4r///kNQUBAMDAykx6empqJv376oXLkyxo0bh3PnzmH58uVITU1VqnfgsGHD8OTJE3Tv3h2WlpaIiorChQsX8O7dO1hZWWHy5MmYNWsW9PT0MHDgQACAqamp3PoGDhyIjh07ymw7ePAgzp8/DxMTk+/GMn36dBw7dgzdu3eHra0tPn36hOvXr+Pp06eoWLEigG+J+wEDBsDc3Bw9e/aEqakpnj59itOnT0t/7Ll48SL69esHKysrDB06FF++fMGWLVvQpUsX7Nu3D1ZWVjKPO2LECNjY2GDUqFHSxOaqVavg6+uLZs2aoWPHjoiKisKWLVvQrVu3TP+TsmXLolChQvjvv/8yvX+JiIjUkkBERPlKXFycYGdnJwwaNEjhYxo0aCBMmDBBen/ZsmWCnZ1dpnJ79+4V7OzshNevXwuCIAj//POPYGdnJ9y+fVtu3ZGRkYKdnZ2wbNmyTPt69eoltGzZUkhKSpJuk0gkQufOnYXGjRtnetwuXboIKSkpP3w+gwYNElq0aPHdMhmfS3h4uFChQgVh8ODBMuWWL18u2NnZybw+acd6eHgIEolEun3u3LlC+fLlhdjYWOm2xMTETI89bdo0oXLlyjLPu3///kKDBg1+GGeaS5cuCXZ2dsKlS5ek27p37y7Y2dkJ27dvlykbFBQkODg4CFevXpXZvn37dsHOzk64fv26IAiCEBAQINjZ2QmRkZGZ4lClli1bCmPGjJHeX7x4sVCzZk3h69evMuUmTJgg8xrm9rmX9nrv379fui0pKUmoXbu2MGzYMOm2lJQUmc
cRBEGIiYkRXFxchEmTJikUb8b35LNnzwQ7Ozth8+bNMuW8vb2FKlWqSM+7q1evCnZ2dsLBgwdlyp09ezbL7RmNHTt
"text/plain": [
"<Figure size 1400x400 with 2 Axes>"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"\n",
"Médianes — Alternative:\n",
" flow_freq gross_flow_to_aum avg_n_isin_held flow_direction_balance log_aum_qty_mean months_since_last_tx_asset aum_final_to_peak aum_drawdown_last\n",
"cluster_alternative \n",
"0 0.085 1.037 1.000 0.104 5.777 12.0 0.914 0.086\n",
"1 0.069 4.729 0.512 -0.072 5.067 66.0 0.000 1.000\n",
"\n",
"============================================================\n",
"ASSET TYPE : Diversified\n",
"============================================================\n",
" k silhouette davies_bouldin\n",
" 2 0.6029 0.6519\n",
" 3 0.5108 0.8195\n",
" 4 0.4847 0.9830\n",
" 5 0.4697 0.8759\n",
" 6 0.3327 1.1264\n",
"→ K retenu : 2 (silhouette=0.6029)\n",
" n_comptes pct\n",
"cluster_diversified \n",
"0 3368 80.9\n",
"1 793 19.1\n"
]
},
{
"data": {
"image/png": "iVBORw0KGgoAAAANSUhEUgAABOQAAAGGCAYAAADbxV7qAAAAOnRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjEwLjgsIGh0dHBzOi8vbWF0cGxvdGxpYi5vcmcvwVt1zgAAAAlwSFlzAAAPYQAAD2EBqD+naQAA5kBJREFUeJzs3Xd8jef/x/HXyUKEECGIHXtTo1YpqVmjFKWovam9G5vW3rVj791vY5QaRdCqvap2UGJFIiGSnN8ffjnNkUSPCIfk/Xw87scj5z7XfZ3Pfc6d65zzOdcwGI1GIyIiIiIiIiIiIvJO2Fg7ABERERERERERkcRECTkREREREREREZF3SAk5ERERERERERGRd0gJORERERERERERkXdICTkREREREREREZF3SAk5ERERERERERGRd0gJORERERERERERkXdICTkREREREREREZF3SAk5ERERERERERGRd0gJORGRBKJy5coMGDDA2mG8Uxs2bCBPnjz4+flZOxSJR35+fuTJk4cNGzZYO5RYNW/enObNm5vtu3fvHt27d6d06dLkyZOHRYsWcfjwYfLkycPhw4fj7bGnT59Onjx5LC7frl07hgwZEm+P/yFr3rw5n3/+ubXDkCgaNWrEuHHjrB2GiIjIO6eEnIjIe+769et4eXlRpUoVChUqRPHixfnqq69YvHgxT58+fScxhISEMH369HhNKiREP/30E4sWLbJ2GO+dPHnymLb8+fNTqlQp6tevz6hRo/j777+tHV68GTt2LL/99hvt27dn3LhxVKhQwdohcfToUQ4cOEC7du1M+yKThNu2bTMrGxoaSocOHcibNy/r1q17o8f19fVl4MCBVKtWjSJFilClShUGDx7M3bt336jeD9Xs2bPZuXOntcN4L7Vr144VK1bg7+9v7VBERETeKTtrByAiIrHbs2cP3377LQ4ODtStW5fcuXPz/Plzjh49yvjx4/n7778ZOXLkW48jJCSEGTNm0LVrV0qXLv3WH89SdevWpVatWjg4OFg7FAD+97//cfHiRVq2bGntUN475cqVo27duhiNRoKCgjh//jybNm1i5cqV9OnTh1atWpnKuru7c/LkSezs3t+PKQsWLIi279ChQ1SpUoU2bdqY9mXPnp2TJ09ib2//LsMzWbBgAWXKlCFr1qyvLPf8+XO6d+/O3r17GTlyJF9++eUbPe748eMJCAigevXqZMuWjRs3brBs2TL27NnDpk2bSJs27RvV/6GZM2cO1apVw9PT09qhvHeqVKmCk5MTK1as4Ntvv7V2OCIiIu/M+/tJV0Qkkbtx4wY9e/YkY8aMLF68mHTp0pnu+/rrr7l27Rp79uyxXoDxIDg4GEdHxzgfb2tri62tbTxG9H4KCQkhWbJk1g7jjWTLlo26deua7evduzedOnXi+++/J0eOHFSsWBEAg8FAkiRJ3nmMr3M9xpQEvn//PilTpjTbZ2NjY5VziYxn7969DBs27JXlnj9/To8ePdizZw8jRoygYcOGb/zYAwcO5KOPPsLG5t/BGBUqVKBZs2YsW7aMnj17vvFjwJu3IRL/Xvc1sbGxoVq1amzevJnu3btjMBjeYnQiIiLvDw1ZFRF5T82fP5/g4GBGjx5tloyLlDVrVr755ptYj49tnqmY5l07deoUbdq0oXTp0hQuXJjKlSszcOBA4MV8XmXKlAFgxowZpqGH06dPNx1/6dIlunfvTqlSpShUqBD169dn165dMT7ukSNHGDZsGGXKlDElYGKzdOlSatWqRZEiRShZsiT169fnp59+euW5REREMH36dMqXL0+RIkVo3rw5f//9d7Q59iKPPXr0KGPHjuXjjz+maNGidOnShQcPHpjFsXPnTtq3b0/58uUpWLAgnp6ezJw5k/DwcFOZ5s2bs2fPHm7evGl6jipXrhxrnECM84tFznF1+vRpvv76a4oUKcKkSZOAF0MKp02bxmeffUbBggWpWLEi48
aNIzQ01KzeAwcO0KRJE0qUKEGxYsWoVq2aqY73SerUqZk0aRJ2dnb8+OOPpv0vzyG3YMEC8uTJw82bN6PVMXHiRAoWLEhAQIBp34kTJ2jTpg0fffQRRYoUoVmzZhw9etTsuMj/j7///pvevXtTsmRJmjZtCoC/vz8DBw7kk08+oWDBgpQvX55OnTqZvX5R55CLfH2NRiPLly83vf4Q82tsaYwAf/zxBw0aNKBQoUJ4enqyatUqi5/fPXv2EBYWRtmyZWMtExYWRq9evdi1axfDhg2jUaNGFtf/KiVLljRLxkXuS5UqFZcvX45Tnf/VhixfvpxatWqZXrPhw4fz+PHjGOs6ffo0X331lam9W7lyZYyPZcn/7NWrV+nWrRvlypWjUKFCfPLJJ/Ts2ZPAwEDgxZDt4OBgNm7caLo2XjXfZ+XKlc2GeUfd/mvaAEuuXYC9e/fSrFkzihUrRvHixWnQoIFZ2wqwdetW6tevT+HChSldujR9+vThzp07ZmUGDBhAsWLFuH79Ou3ataNYsWL06dMHeNEWL1q0iFq1alGoUCHKli2Ll5eX2f9qpLJly3Lz5k3OnTv3yvMTERFJSNRDTkTkPbV7924yZ85M8eLF3+rj3L9/nzZt2pA6dWrat29PypQp8fPz45dffgHAxcWFYcOGMWzYMD777DM+++wzAFPC4eLFizRp0gQ3NzfatWuHo6MjW7dupUuXLkyfPt1UPtLw4cNxcXGhS5cuBAcHxxrXmjVrGDVqFNWqVaNFixY8e/aMCxcucOLECWrXrh3rcRMnTmT+/Pl8+umnVKhQgfPnz9OmTRuePXsWY/lRo0aRMmVKunbtys2bN1m8eDEjRoxgypQppjIbN27E0dGRVq1a4ejoyKFDh5g2bRpBQUH0798fgI4dOxIYGMg///xjSmYmT578P579mD169Ih27dpRq1Yt6tSpQ5o0aYiIiKBTp04cPXqURo0a4eHhwV9//cXixYu5evUqs2bNAl68Hh06dCBPnjx0794dBwcHrl27xp9//hmnWN62jBkzUrJkSQ4fPkxQUBBOTk7RytSoUYPx48ezdetW2rZta3bf1q1bKVeuHM7OzsCLucvatWtHwYIF6dq1KwaDgQ0bNvDNN9+wYsUKChcubHb8t99+S9asWenZsydGoxGAbt268ffff9OsWTPc3d158OABBw4c4Pbt22TKlClafCVLlmTcuHH069fPNDT3VSyN8cKFC7Rp0wYXFxe6detGWFgY06dPJ02aNBY9t8eOHSNVqlS4u7vHeH94eDi9evXil19+wcvLi6+++ipamefPn5sSS/8lVapU0ZJwUT158oQnT56QOnVqi+qLTUxtyPTp05kxYwZly5alSZMmXLlyhZUrV3Lq1ClWrlxpNmQ4ICCA9u3bU6NGDWrVqsXWrVsZNmwY9vb2rz1UNzQ0lDZt2hAaGkqzZs1wdXXlzp077Nmzh8ePH5MiRQrGjRvHkCFDKFy4sCnhmSVLlljrHDRoEE+ePDHbt3jxYs6dO0eqVKleGY8l1+6GDRsYNGgQuXLlokOHDqRIkYJz587x22+/mdrWDRs2MHDgQAoVKkSvXr24f/8+S5Ys4c8//2TTpk1mPUHDwsJMyeX+/fuTNGlSALy8vNi4cSP169enefPm+Pn5sXz5cs6ePRvtNSlYsCAAf/75J/nz57fw2RcREfmwKSEnIvIeCgoK4s6dO1SpUuWtP9axY8cICAhgwYIFFCpUyLQ/ckiZo6Mj1apVY9iwYeTJkydasmH06NFkyJCB9evXm4bxNW3alCZNmjBhwoRoCTlnZ2cWLVr0n0NN9+zZQ65cuZg2bZrF53Lv3j0WLVpk6sEWacaMGWY9+qJKlSoVCxcuNA2TioiIYOnSpQQGBpIiRQrgRZIv8ksmQJMmTfDy8mLlypX07NkTBwcHypUrx5IlS3j8+PF/JmT+i7+/P8OHDzdLkGzevJmDBw+ydOlSSpQoYdqfK1cuhg4dyp9//k
nx4sU5cOAAz58/Z968ebi4uLxRHO9Krly58PX1xc/Pj7x580a7P2PGjBQtWhQfHx+zhNzJkye5ceMGXbt2BcBoNDJ
"text/plain": [
"<Figure size 1400x400 with 2 Axes>"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"\n",
"Médianes — Diversified:\n",
" flow_freq gross_flow_to_aum avg_n_isin_held flow_direction_balance log_aum_qty_mean months_since_last_tx_asset aum_final_to_peak aum_drawdown_last\n",
"cluster_diversified \n",
"0 0.044 3.045 0.625 -0.577 5.064 80.0 0.000 1.000\n",
"1 0.085 0.218 1.000 -0.682 5.156 12.0 0.907 0.093\n",
"\n",
"============================================================\n",
"ASSET TYPE : Equity\n",
"============================================================\n",
" k silhouette davies_bouldin\n",
" 2 0.3701 1.3831\n",
" 3 0.4248 0.9505\n",
" 4 0.2858 1.3678\n",
" 5 0.2587 1.4438\n",
" 6 0.2779 1.3114\n",
"→ K retenu : 3 (silhouette=0.4248)\n",
" n_comptes pct\n",
"cluster_equity \n",
"0 2384 61.1\n",
"1 769 19.7\n",
"2 748 19.2\n"
]
},
{
"data": {
"image/png": "iVBORw0KGgoAAAANSUhEUgAABM4AAAGGCAYAAACDus3zAAAAOnRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjEwLjgsIGh0dHBzOi8vbWF0cGxvdGxpYi5vcmcvwVt1zgAAAAlwSFlzAAAPYQAAD2EBqD+naQAA85RJREFUeJzs3XVYVGkbBvB7KBEpCRFQUVEQFVQWsQtdsUVdA7sDe9cOVESxCwxUVLATjEXFNXcVuxARXRuTku6Z7w8/ZhkBHYeYEe/fdZ1L55z3vPMcOLwz88wbApFIJAIRERERERERERFJUJJ3AERERERERERERIqIiTMiIiIiIiIiIqI8MHFGRERERERERESUBybOiIiIiIiIiIiI8sDEGRERERERERERUR6YOCMiIiIiIiIiIsoDE2dERERERERERER5YOKMiIiIiIiIiIgoD0ycERERERERERER5YGJMyIiysXBwQEzZsyQdxjF6siRI7C0tERERIS8QyEFNWPGDDg4OBTb823ZsgXt2rWDUCgstudUVJ6enrC0tERMTIy8Q6H/W7FiBXr27CnvMIiIiIocE2dERD+RV69ewdXVFa1bt4a1tTVsbW3Rp08f+Pr6IjU1tVhiSElJgaenJ65du1Ysz/ejOn78OHbs2CHvMBSOpaVlvpurq2uxxlKU93JiYiK2bt2KESNGQEnpv7drlpaWcHNzy1V+06ZNsLS0xMyZMwuUaPvw4QOmTJkCR0dH1KtXD3Z2dvjtt9/g7+8PkUgkc70/Kv4d5m/QoEF49OgRzp49K+9QiIiIipSKvAMgIqLiceHCBUycOBFqamro2rUrLCwskJGRgVu3bmH58uX4999/sXDhwiKPIyUlBV5eXhg3bhwaNGhQ5M8nra5du6Jjx45QU1OTdygAgBMnTuDJkycYPHiwvENROE2aNEHXrl1z7a9SpUqRPu/ChQslkkdFeS8fOnQImZmZ6NSp0zfLbt68GatXr0a3bt2waNEiiUTb94qNjcWHDx/Qrl07GBsbIzMzE5cvX8aMGTPw/Plz/P777zLX/SPi32H+DA0N0bp1a2zbtg2tW7eWdzhERERFhokzIqKfwOvXrzF58mSYmJjA19cX5cqVEx/r168fXr58iQsXLsgvwEKQnJwMDQ0Nmc9XVlaGsrJyIUakmFJSUlC6dGl5h1EglStXzjNxVtRUVVWL7bmOHDkCBwcHlCpV6qvltm7dipUrV8LJyQmLFy8uUNIMAGrUqIGdO3dK7Ovfvz9Gjx6NnTt3YuLEiYXyd5KZmQmhUKgwiWqSrQ1t3749Jk6ciNevX6NixYpFFBkREZF8cagmEdFPYOvWrUhOTsaiRYskkmbZzMzMMGjQoHzPz55f6Et5zQsWEhKCYcOGoUGDBrCxsYGDgwNmzpwJAIiIiECjRo0AAF5eXuIhdp6enuLznz59igkTJsDe3h7W1tbo3r17rqFA2c97/fp1zJ8/H40aNUKLFi2++jPYuXMnOnbsiDp16qB+/fro3r07jh8//tVrEQqF8PT0RNOmTVGnTh0MGDAA//77b6454LLPvXXrFjw8PNCwYUPUrVsXY8eOzTUn019//YWRI0eiadOmqF27Ntq0aYP169cjKytLXGbAgAG4cOEC3rx5I/4ZZc+tld9cbNeuXYOlpaXEsMEBAwagU6dOePDgAfr164c6depg1apVAID09HSsW7cOv/76K2rXro0WLVpg2bJlSE9Pl6j38uXLcHZ2hp2dHerVqwdHR0dxHYpu//79aNOmDWxsbPDbb7/h5s2bGDBgAAYMGCAu8z0/z5xznH3tXj58+DAsLS3x8OHDXDFt2rQJVlZW+PDhQ75xv379GuHh4WjcuPFXr2/79u1Yvnw5unTpAg8PjwInzb7G1NQUKSkpyMjI+O5zIyIiYGlpCR8fH+zYsQNt2rSBtbU1nj59CgAIDg5G3759UbduXdjZ2WHMmDHiY1+KjY3FxIkTYW
triwYNGsDd3R1paWm5nuvIkSO5zv2yrUlMTMSiRYvg4OCA2rVro1GjRhgyZAhCQ0MBfP3vMC8zZszIdxhxzufNS0ZGBry8vNC2bVtYW1ujQYMGcHZ2xuXLlyXKPX36FBMnTkTDhg1hY2MDR0dHrF69WqLMw4cPMXz4cNja2qJevXoYNGgQ7t69K1HmW23oxYsXxb+TevXqYeTIkXjy5EmuuLPvUQ7XJCKikow9zoiIfgLnz59HxYoVYWtrW6TPEx0djWHDhqFs2bIYOXIktLW1ERERgTNnzgAA9PT0MH/+fMyfPx+//vorfv31VwAQJ+WePHkCZ2dnGBkZYcSIEdDQ0MDJkycxduxYeHp6istnW7BgAfT09DB27FgkJyfnG9eBAwfg7u4OR0dHDBw4EGlpaQgPD8e9e/fQuXPnfM9buXIltm7dilatWqFZs2Z49OgRhg0bJvFBPSd3d3doa2tj3LhxePPmDXx9feHm5oY1a9aIy/j7+0NDQwNDhgyBhoYGrl69inXr1iExMRHTp08HAIwePRoJCQl4//69OOlYpkyZb/z08/bp0yeMGDECHTt2RJcuXaCvrw+hUIgxY8bg1q1b6NWrF8zNzfH48WP4+vrixYsX2LBhA4DPv49Ro0bB0tISEyZMgJqaGl6+fInbt2/LFEthSUtLy3OSeE1NTXEPpoMHD8LV1VWcOHj9+jXGjBkDHR0dGBsbFziGr93LFSpUgJubG44fP46aNWtKnHf8+HHY29vDyMgo37rv3LkDALnOzcnX1xdLlixBp06dsGTJkjyTZtJOpJ/z55YtNTUVycnJSE5Oxo0bN3DkyBHUrVsX6urqUtWZlyNHjiAtLQ29evWCmpoadHR0cOXKFYwYMQIVKlTAuHHjkJqail27dsHZ2RlHjhxBhQoVJOqYNGkSTE1N8ccff+Du3bvYuXMn4uPjsWzZsu+OZ968eTh9+jT69+8Pc3NzfPr0Cbdu3cLTp09Rq1at7/477N27tziZmu3vv//G8ePHoaen99VYvLy84O3tjZ49e8LGxgaJiYl48OABQkND0aRJEwDAo0eP0K9fP6ioqKB3794wNTXFq1evcO7cOUyePBnA57/Zfv36oUyZMhg+fDhUVFSwf/9+DBgwALt27UKdOnUknjevNjQgIAAzZsxA06ZNMWXKFKSkpGDv3r3o27cv/P39JX4nWlpaqFSpEm7fvs3hrEREVGIxcUZEVMIlJibiw4cPxTIHzZ07dxAXFwcfHx9YW1uL92d/qNPQ0ICjoyPmz58PS0vLXMPtFi1aBGNjYxw+fFj8Qb5v375wdnbGihUrciXOdHR0sGPHjm8OHbtw4QKqV6+OdevWSX0tUVFR4t4x69evF+/38vLKt/eIrq4utm3bBoFAAOBzj7WdO3ciISEBWlpaAD4n43ImH5ydneHq6oq9e/di8uTJUFNTQ5MmTeDn54f4+PgCD0mMjIzEggUL0KdPH/G+o0eP4sqVK9i5cyfs7OzE+6tXr4558+bh9u3bsLW1xeXLl5GRkYEtW7Z884N/cTp06BAOHTqUa/+qVavQsWNHZGRkYPXq1bCysoKfn5/4XqpWrRrmzp1bKImzb93Lbdq0wYkTJzB16lRxUuvhw4f4999/MWzYsK/W/ezZMwDIlTTKlt0LqlOnTli2bFm+9/+XSZz8eHh4oHv37hL7/Pz8sHLlSom6PDw8pKovP+/fv8eZM2ck7iUXFxfo6Ohg//790NXVBfD5Z9etWzd4enpi6dKlEnVUqFABGzduBPB5mLmmpib27NmDoUOHokaNGt8Vz8WLF9GrVy+J3qMjRowQ//97/w7r1auHevXqiR+/fPkSCxcuRJMmTST+/vJy4cIFtGjR4qvzTLq7u0MkEsHf3x8mJibi/VOmTBH/f82aNcjIyMDevXvFQyednJzQrl07LF++HLt27ZKo88s2NCkpCYsWLULPnj0lYunWrRvatWsHb2/vXDFWrFgR//7771evj4iI6E
fGxBkRUQmXmJgIQPYeS98jOzl04cIF1KhR47vmhPr06ROuXr2KCRMmiGPO1rRpU3h6euLDhw8SPXV69eol1XxL2tr
"text/plain": [
"<Figure size 1400x400 with 2 Axes>"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"\n",
"Médianes — Equity:\n",
" flow_freq gross_flow_to_aum avg_n_isin_held flow_direction_balance log_aum_qty_mean months_since_last_tx_asset aum_final_to_peak aum_drawdown_last\n",
"cluster_equity \n",
"0 0.025 3.296 0.576 -0.835 3.976 90.0 0.000 1.000\n",
"1 0.071 0.064 1.056 -0.935 4.554 12.0 0.975 0.025\n",
"2 0.646 3.610 3.588 -0.099 8.474 0.0 0.154 0.846\n",
"\n",
"============================================================\n",
"ASSET TYPE : Fixed Income\n",
"============================================================\n",
" k silhouette davies_bouldin\n",
" 2 0.6758 0.5130\n",
" 3 0.4226 0.8457\n",
" 4 0.4348 0.9977\n",
" 5 0.4601 0.9193\n",
" 6 0.4386 0.9478\n",
"→ K retenu : 2 (silhouette=0.6758)\n",
" n_comptes pct\n",
"cluster_fixed_income \n",
"0 3142 79.9\n",
"1 792 20.1\n"
]
},
{
"data": {
"image/png": "iVBORw0KGgoAAAANSUhEUgAABOQAAAGGCAYAAADbxV7qAAAAOnRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjEwLjgsIGh0dHBzOi8vbWF0cGxvdGxpYi5vcmcvwVt1zgAAAAlwSFlzAAAPYQAAD2EBqD+naQAA6BVJREFUeJzs3XdUE8vbB/BvQhHpAoKKiooCNhQLNrw27L1hb6goCooNsWEXr2JBsYvYu4J6xd4Lluu1945YaQoIipB9//AlPwJEQyhR/H7OyTlkdnbyhGwmmyczOyJBEAQQERERERERERFRnhCrOgAiIiIiIiIiIqI/CRNyREREREREREREeYgJOSIiIiIiIiIiojzEhBwREREREREREVEeYkKOiIiIiIiIiIgoDzEhR0RERERERERElIeYkCMiIiIiIiIiIspDTMgRERERERERERHlISbkiIiIiIiIiIiI8hATckREv7jGjRvDy8tL1WHkqb1798La2hrh4eGqDoWyoU+fPujTp0+eP66XlxcaN26c54/7u1uzZg1atGgBiUSi6lBUbunSpbC2tkZ0dLSqQ6H/5+vri65du6o6DCIiohzDhBwRkYqEhYXB29sbTZo0QeXKlVGtWjV0794dGzZswJcvX/IkhsTERCxduhSXL1/Ok8f7XR04cADr169XdRi/HGtr60xv9erVU3VoCunTpw/atGmj6jB+CfHx8Vi7di0GDx4Msfh/p4fW1taYMWNGhvorV66EtbU1JkyYkK0E3tOnTzFv3jy0b98ednZ2cHBwgIuLC27fvq10m78z9jXy9evXDw8ePMCJEydUHQoREVGOUFd1AEREf6LTp09j5MiR0NTURPv27WFlZYVv377h2rVrmD9/Pp48eYKZM2fmehyJiYnw9/eHm5sbatWqleuPp6j27dujdevW0NTUVHUoAIB//vkHjx8/Rv/+/VUdyi+nXr16aN++vUyZlpYWACAgIEAVIZESdu/ejeTkZIUSlKtXr8aiRYvQsWNHzJ49WyaBp8zj7t69G82aNUPPnj0RFxeHHTt2oFu3bli7di3q1q2rdNu/I/Y18hUuXBhNmjTBunXr0KRJE1WHQ0RElG1MyBER5bFXr15h1KhRKFasGDZs2ABTU1Pptl69euHly5c4ffq06gLMAQkJCdDW1lZ6fzU1NaipqeVgRL+mxMREFCxYUNVhZEupUqUyJORS/SoJVfq5vXv3onHjxihQoMAP661duxYLFixAhw4dMGfOnGwl4wCgdevWcHNzg46OjrSsc+fOaNWqFZYuXZpjCbnk5GRIJBIek78QZT4nWrZsiZEjR+LVq1coUaJELkVGRESUNzhllYgoj61duxYJCQmYPXu2TDIulYWFBfr16yd3/9RrG6WX2XXXbt++jYEDB6JWrVqwtbVF48aNMWHCBABAeHg46tSpAwDw9/eXTjdcunSpdP+nT59ixIgRsLe3R+XKldGpU6cM04VSH/fKlSuYNm0a6tSpgwYNGvzwf7Bp0ya0bt0aVapUQc2aNdGpUyccOHDgh89FIpFg6dKlcHBwQJUqVdCnTx88efIkwzX2Uve9du0afHx8ULt2bVStWhXDhw/PcD2o48ePw8XFBQ4ODqhUqRIcHR2xbNkypKSkSOv06dMHp0+fxuvXr6X/o9Trk8m71t3ly5dhbW0tMxU4dXrknTt30KtXL1SpUgULFy4EACQlJWHJkiVo2rQpKlWqhAYNGmDevHlISkqSaffChQvo0aMHatSoATs7OzRv3lzaxq8o/TXkxo8fj8qVK+Pp06cy9QYOHIiaNWvi/fv30rIzZ86gZ8+eqFq1Kuzs7ODi4oLHjx9neIzjx4+jTZs2qFy5Mtq0aYNjx45lK+bUKZqp7VaqVAmtW7fG2bNnM9R9//49Jk6cKD1+GjdujKlTp8q8bq9evZK+h6pUqQInJ6cMCffU4yUkJAT+/v6oX78+7OzsMGLECMTFxSEpKQ
mzZ89GnTp1YGdnhwkTJmQ4NgBg37596NSpE2xtbWFvb49Ro0bh7du3P33Or169wsOHD3+a/AoMDMT8+fPRrl07+Pj4ZDsZBwCVKlWSScYBQKFChVCjRg08e/ZMqTbDw8NhbW2NgIAArF+/Ho6OjjLHXWhoqPTYqlGjBlxdXTMck6liYmIwcuRIVKtWDbVq1cKsWbPw9evXDI+1d+/eDPum70/j4+Mxe/ZsNG7cGJUqVUKdOnUwYMAA3L17F8CP+5rMeHl5yZ02nvZxM/Pt2zf4+/ujWbNmqFy5MmrVqoUePXrgwoULMvWePn2KkSNHonbt2rC1tUXz5s2xaNEimTr37t3DoEGDUK1aNdjZ2aFfv364ceOGTJ2ffU4o+n5PPUY5bZWIiPIDjpAjIspjp06dQokSJVCtWrVcfZyoqCgMHDgQhQoVgouLC/T19REeHi5NWBgZGWHatGmYNm0amjZtiqZNmwKANNn3+PFj9OjRA2ZmZhg8eDC0tbVx6NAhDB8+HEuXLpXWTzV9+nQYGRlh+PDhSEhIkBvXzp07MWvWLDRv3hx9+/bF169f8fDhQ9y8eRNt27aVu9+CBQuwdu1aNGrUCPXr18eDBw8wcOBAmS/Hac2aNQv6+vpwc3PD69evsWHDBsyYMQOLFy+W1gkKCoK2tjYGDBgAbW1tXLp0CUuWLEF8fDzGjx8PABg6dCji4uLw7t07aTIzfQJBUR8/fsTgwYPRunVrtGvXDsbGxpBIJHB1dcW1a9fg5OQES0tLPHr0CBs2bMCLFy+wfPlyAN9fjyFDhsDa2hojRoyApqYmXr58if/++0+pWHLK169fMyQ6dXV1Mx2JNGnSJFy6dAnjx4/Hjh07oKamhu3bt+P8+fOYN28ezMzMAADBwcHw8vKCg4MDxo4di8TERGzbtg09e/ZEUFAQihcvDgA4f/483N3dUbZsWYwZMwYxMTGYMGECihQpkq3ndO3aNRw9ehQ9e/aEjo4ONm3ahBEjRuDUqVMoVKgQgO/JuC5duiAuLg5OTk4oU6YM3r9/jyNHjuDLly/Q1NREZGQkunfvjsTERPTp0weFChVCUFAQXF1dpQnYtFavXg0tLS24uLjg5cuX2Lx5M9TV1SESiRAbGws3NzfcvHkTe/fuhbm5Odzc3KT7rlixAn5+fmjZsiW6dOmC6OhobN68Gb169UJwcDD09fXlPt/r168DACpUqCC3zoYNGzB37ly0adMGc+fOzTQZp+gCCPKOj7QiIiJgaGioUHvy7N27F1+/foWTkxM0NTVhYGCAixcvYvDgwShevDjc3Nzw5csXbN68GT169MDevXulx1YqDw8PmJubY8yYMbhx4wY2bdqE2NhYzJs3L8vxTJ06FUeOHEHv3r1haWmJjx8/4tq1a3j69CkqVqyY5b6mW7du0h9VUp07dw4HDhyAkZHRD2Px9/fHqlWr0LVrV9ja2iI+Ph537tzB3bt3pdeAfPDgAXr16gV1dXV069YN5ubmCAsLw8mTJzFq1CgA3/ulXr16QUdHB4MGDYK6ujp27NiBPn36YPPmzahSpYrM42b2OaHo+x0A9PT0ULJkSfz333+c1ktERL8/gYiI8kxcXJxgZWUluLq6KrxPo0aNhPHjx0vvL1myRLCysspQb8+ePYKVlZXw6tUrQRAE4dixY4KVlZVw69YtuW1HRUUJVlZWwpIlSzJs69evn9CmTRvh69ev0jKJRCJ069ZNaNasWYbH7dGjh5CcnPzT5+Pq6iq0bt36h3XSP5eIiAihQoUKwrBhw2TqLV26VLCyspL5/6Tu279/f0EikUjL58yZI5QvX16IjY2VliUmJmZ47ClTpghVqlSRed4uLi5Co0aNfhpnqkuXLglWVlbCpUuXpGW9e/cWrKyshG3btsnUDQ4OFmxsbISrV6/KlG/btk2wsrISrl27JgiCIAQGBgpWVlZCVFRUhjhUxcrKKtPbnj17BEH4/px79+4ts8+5c+cEKysrYfny5U
JYWJhQtWpVmdc1Pj5eqFGjhjB58mSZ/SIiIoTq1avLlLdv316oV6+ezGt6/vx5wcrKKtPXK73evXtnOBatrKyEihU
"text/plain": [
"<Figure size 1400x400 with 2 Axes>"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"\n",
"Médianes — Fixed Income:\n",
" flow_freq gross_flow_to_aum avg_n_isin_held flow_direction_balance log_aum_qty_mean months_since_last_tx_asset aum_final_to_peak aum_drawdown_last\n",
"cluster_fixed_income \n",
"0 0.060 6.231 0.48 0.000 5.142 69.0 0.000 1.000\n",
"1 0.182 2.224 1.50 0.472 7.279 2.0 0.998 0.002\n",
"\n",
"============================================================\n",
"RÉSUMÉ — Clustering par asset type\n",
"============================================================\n",
" Alternative : K=2, sil=0.4568, n=1319\n",
" Diversified : K=2, sil=0.6029, n=4161\n",
" Equity : K=3, sil=0.4248, n=3901\n",
" Fixed Income : K=2, sil=0.6758, n=3934\n"
]
}
],
"source": [
"# ============================================================\n",
"# CLUSTERING BY ASSET TYPE\n",
"# ============================================================\n",
"\n",
"# ── 0. Check which asset types are present ────────────────────────────────\n",
"print(\"=== Types d'assets disponibles ===\")\n",
"print(df_aum[ASSET_COL].value_counts())\n",
"\n",
"# ── 1. Build the monthly panel per account x asset type ───────────────────\n",
"# Quantities are aggregated for every (account, asset type, month).\n",
"\n",
"# Attach the asset type to each row of df_rel_m by looking it up in df_aum.\n",
"# merge() returns a new frame, so df_rel_m itself is not modified.\n",
"# NOTE(review): the lookup key includes the month, so rows without a matching\n",
"# df_aum row get a missing asset type and are dropped by dropna() below;\n",
"# confirm this is intended.\n",
"asset_lookup = df_aum[[ID_COL, ISIN_COL, \"month\", ASSET_COL]].drop_duplicates()\n",
"df_rel_m_asset = df_rel_m.merge(asset_lookup, on=[ID_COL, ISIN_COL, \"month\"], how=\"left\")\n",
"\n",
"# Per-row flags: position held (aum_qty > 0) / any flow (gross_flow_qty > 0)\n",
"tmp_asset = df_rel_m_asset.assign(\n",
"    isin_held_flag=(df_rel_m_asset[\"aum_qty\"] > 0).astype(int),\n",
"    isin_active_flag=(df_rel_m_asset[\"gross_flow_qty\"] > 0).astype(int),\n",
")\n",
"\n",
"# Monthly totals per (account, asset type, month), sorted by the same keys\n",
"panel_keys = [ID_COL, ASSET_COL, \"month\"]\n",
"monthly_aggs = {\n",
"    \"aum_qty\": (\"aum_qty\", \"sum\"),\n",
"    \"net_flow_qty\": (\"net_flow_qty\", \"sum\"),\n",
"    \"gross_flow_qty\": (\"gross_flow_qty\", \"sum\"),\n",
"    \"sub_qty\": (\"sub_qty\", \"sum\"),\n",
"    \"red_qty\": (\"red_qty\", \"sum\"),\n",
"    \"n_tx\": (\"n_tx\", \"sum\"),\n",
"    \"n_isin_held\": (\"isin_held_flag\", \"sum\"),\n",
"}\n",
"df_month_asset = (\n",
"    tmp_asset.dropna(subset=[ASSET_COL])\n",
"    .groupby(panel_keys, as_index=False)\n",
"    .agg(**monthly_aggs)\n",
"    .sort_values(panel_keys)\n",
"    .reset_index(drop=True)\n",
")\n",
"\n",
"# Derived monthly indicators; ratios are NaN when there was no flow that month\n",
"has_flow = df_month_asset[\"gross_flow_qty\"] > 0\n",
"df_month_asset[\"active_month\"] = has_flow.astype(int)\n",
"df_month_asset[\"flow_direction\"] = np.where(\n",
"    has_flow,\n",
"    df_month_asset[\"net_flow_qty\"] / df_month_asset[\"gross_flow_qty\"],\n",
"    np.nan,\n",
")\n",
"df_month_asset[\"sub_share\"] = np.where(\n",
"    has_flow,\n",
"    df_month_asset[\"sub_qty\"] / df_month_asset[\"gross_flow_qty\"],\n",
"    np.nan,\n",
")\n",
"\n",
"# Running maximum of aum_qty within each (account, asset type) series, and the\n",
"# relative drop from that running maximum (NaN while the maximum is not positive)\n",
"df_month_asset[\"aum_peak\"] = df_month_asset.groupby([ID_COL, ASSET_COL])[\"aum_qty\"].cummax()\n",
"has_peak = df_month_asset[\"aum_peak\"] > 0\n",
"df_month_asset[\"aum_drawdown\"] = np.where(\n",
"    has_peak,\n",
"    1 - df_month_asset[\"aum_qty\"] / df_month_asset[\"aum_peak\"],\n",
"    np.nan,\n",
")\n",
"\n",
"print(\"\\ndf_month_asset shape:\", df_month_asset.shape)\n",
"\n",
"# ── 2. Feature engineering per account x asset type ───────────────────────\n",
"# Recency: number of months between the last month with a flow and the most\n",
"# recent month present in the panel.\n",
"reference_date = df_month_asset[\"month\"].max()\n",
"reference_period = reference_date.to_period(\"M\")\n",
"\n",
"active_rows = df_month_asset.loc[df_month_asset[\"active_month\"] == 1]\n",
"last_active_asset = (\n",
"    active_rows.groupby([ID_COL, ASSET_COL])[\"month\"]\n",
"    .max()\n",
"    .reset_index(name=\"last_active_month\")\n",
")\n",
"# Subtracting monthly periods gives month offsets; .n is the month count\n",
"last_active_periods = last_active_asset[\"last_active_month\"].dt.to_period(\"M\")\n",
"last_active_asset[\"months_since_last_tx_asset\"] = (\n",
"    (reference_period - last_active_periods).apply(lambda offset: offset.n)\n",
")\n",
"\n",
"# One row per (account, asset type). The \"last\" aggregations take the last\n",
"# non-null value in row order within each group.\n",
"client_asset_aggs = {\n",
"    \"n_months\": (\"month\", \"nunique\"),\n",
"    \"n_active_months\": (\"active_month\", \"sum\"),\n",
"    \"flow_freq\": (\"active_month\", \"mean\"),\n",
"    \"aum_qty_mean\": (\"aum_qty\", \"mean\"),\n",
"    \"aum_qty_max\": (\"aum_qty\", \"max\"),\n",
"    \"aum_qty_last\": (\"aum_qty\", \"last\"),\n",
"    \"gross_flow_qty_sum\": (\"gross_flow_qty\", \"sum\"),\n",
"    \"net_flow_qty_sum\": (\"net_flow_qty\", \"sum\"),\n",
"    \"n_tx_total\": (\"n_tx\", \"sum\"),\n",
"    \"avg_n_isin_held\": (\"n_isin_held\", \"mean\"),\n",
"    \"flow_direction_mean\": (\"flow_direction\", \"mean\"),\n",
"    \"sub_share_mean\": (\"sub_share\", \"mean\"),\n",
"    \"aum_drawdown_last\": (\"aum_drawdown\", \"last\"),\n",
"}\n",
"df_client_asset = (\n",
"    df_month_asset.groupby([ID_COL, ASSET_COL], as_index=False)\n",
"    .agg(**client_asset_aggs)\n",
")\n",
"\n",
"# Attach the recency feature; pairs with no active month have no match and\n",
"# are filled with (largest observed value + 1).\n",
"recency_cols = [ID_COL, ASSET_COL, \"months_since_last_tx_asset\"]\n",
"df_client_asset = df_client_asset.merge(\n",
"    last_active_asset[recency_cols], on=[ID_COL, ASSET_COL], how=\"left\"\n",
")\n",
"max_months = df_client_asset[\"months_since_last_tx_asset\"].max()\n",
"df_client_asset[\"months_since_last_tx_asset\"] = (\n",
"    df_client_asset[\"months_since_last_tx_asset\"].fillna(max_months + 1)\n",
")\n",
"\n",
"# Guarded ratios: NaN whenever the denominator fails its threshold\n",
"mean_aum = df_client_asset[\"aum_qty_mean\"]\n",
"gross_total = df_client_asset[\"gross_flow_qty_sum\"]\n",
"peak_aum = df_client_asset[\"aum_qty_max\"]\n",
"\n",
"df_client_asset[\"gross_flow_to_aum\"] = np.where(\n",
"    mean_aum > 1, gross_total / mean_aum, np.nan\n",
")\n",
"df_client_asset[\"flow_direction_balance\"] = np.where(\n",
"    gross_total > 0, df_client_asset[\"net_flow_qty_sum\"] / gross_total, np.nan\n",
")\n",
"# Final AUM relative to the maximum AUM, clipped to [0, 1]\n",
"df_client_asset[\"aum_final_to_peak\"] = np.where(\n",
"    peak_aum > 0, np.clip(df_client_asset[\"aum_qty_last\"] / peak_aum, 0, 1), np.nan\n",
")\n",
"df_client_asset[\"log_aum_qty_mean\"] = np.log1p(mean_aum.clip(lower=0))\n",
"\n",
"# Quality filter: at least 6 distinct months and a strictly positive mean AUM\n",
"keep_rows = (df_client_asset[\"n_months\"] >= 6) & (df_client_asset[\"aum_qty_mean\"] > 0)\n",
"df_client_asset = df_client_asset[keep_rows].copy()\n",
"\n",
"print(\"df_client_asset shape:\", df_client_asset.shape)\n",
"print(\"\\nComptes par asset type:\")\n",
"asset_counts = df_client_asset.groupby(ASSET_COL)[ID_COL].nunique()\n",
"print(asset_counts.sort_values(ascending=False))\n",
"\n",
"# ── 3. Keep only asset types with enough accounts ─────────────────────────\n",
"min_accounts = 50\n",
"valid_assets = asset_counts.loc[asset_counts >= min_accounts].index.tolist()\n",
"print(f\"\\nAsset types retenus (>= {min_accounts} comptes) : {valid_assets}\")\n",
"\n",
"# ── 4. Features used for the per-asset clustering ─────────────────────────\n",
"# NOTE(review): aum_drawdown_last looks like exactly 1 - aum_final_to_peak\n",
"# (the drawdown is measured against the running maximum, whose last value is\n",
"# the overall maximum). Keeping both would weight that dimension twice;\n",
"# confirm this is intended.\n",
"asset_features = [\n",
"    \"flow_freq\",\n",
"    \"gross_flow_to_aum\",\n",
"    \"avg_n_isin_held\",\n",
"    \"flow_direction_balance\",\n",
"    \"log_aum_qty_mean\",\n",
"    \"months_since_last_tx_asset\",\n",
"    \"aum_final_to_peak\",\n",
"    \"aum_drawdown_last\",\n",
"]\n",
"\n",
"# ── 5. Clustering per asset type ──────────────────────────────────────────\n",
"ASSET_RESULTS = dict()\n",
"\n",
"for asset in valid_assets:\n",
" print(f\"\\n{'='*60}\")\n",
" print(f\"ASSET TYPE : {asset}\")\n",
" print(f\"{'='*60}\")\n",
"\n",
" df_a = df_client_asset[df_client_asset[ASSET_COL] == asset].copy()\n",
" feats = [c for c in asset_features if c in df_a.columns]\n",
"\n",
" # Preprocessing\n",
" d = df_a.copy()\n",
" d[\"flow_direction_balance\"] = d[\"flow_direction_balance\"].fillna(0)\n",
"\n",
" for col in [\"avg_n_isin_held\", \"months_since_last_tx_asset\",\n",
" \"aum_drawdown_last\", \"aum_final_to_peak\"]:\n",
" if col not in d.columns:\n",
" continue\n",
" vals = d[col].to_numpy(dtype=float)\n",
" med = np.nanmedian(vals)\n",
" mad = np.nanmedian(np.abs(vals - med)) * 1.4826\n",
" if mad > 0:\n",
" d[col] = np.clip(vals, med - 3*mad, med + 3*mad)\n",
" else:\n",
" d[col] = np.clip(vals, 0, np.nanpercentile(vals, 95))\n",
"\n",
" for col in [\"gross_flow_to_aum\"]:\n",
" if col not in d.columns:\n",
" continue\n",
" vals = d[col].to_numpy(dtype=float)\n",
" d[col] = np.log1p(np.clip(vals, 0, np.nanpercentile(vals, 90)))\n",
"\n",
" for col in [\"flow_freq\"]:\n",
" if col not in d.columns:\n",
" continue\n",
" vals = d[col].to_numpy(dtype=float)\n",
" d[col] = np.log1p(np.clip(vals, 0, None))\n",
"\n",
" X_a = d[feats].fillna(d[feats].median()).to_numpy()\n",
" X_a_scaled = RobustScaler().fit_transform(X_a)\n",
"\n",
" # K-selection\n",
" best_k = 2\n",
" best_sil = -1\n",
" rows_k = []\n",
" max_k = min(6, len(df_a) // 50) # K max raisonnable selon taille\n",
"\n",
" for k in range(2, max_k + 1):\n",
" km = KMeans(n_clusters=k, n_init=30, random_state=RANDOM_STATE)\n",
" labels = km.fit_predict(X_a_scaled)\n",
" sil = silhouette_score(X_a_scaled, labels)\n",
" db = davies_bouldin_score(X_a_scaled, labels)\n",
" rows_k.append({\"k\": k, \"silhouette\": round(sil, 4), \"davies_bouldin\": round(db, 4)})\n",
" if sil > best_sil:\n",
" best_sil = sil\n",
" best_k = k\n",
"\n",
" df_k = pd.DataFrame(rows_k)\n",
" print(df_k.to_string(index=False))\n",
" print(f\"→ K retenu : {best_k} (silhouette={best_sil:.4f})\")\n",
"\n",
" # Clustering final\n",
" km_final = KMeans(n_clusters=best_k, n_init=50, random_state=RANDOM_STATE)\n",
" df_a[f\"cluster_{asset.lower().replace(' ','_')}\"] = km_final.fit_predict(X_a_scaled)\n",
" cluster_col = f\"cluster_{asset.lower().replace(' ','_')}\"\n",
"\n",
" # Tailles\n",
" counts = df_a[cluster_col].value_counts().sort_index()\n",
" props = counts / counts.sum() * 100\n",
" print(pd.DataFrame({\"n_comptes\": counts, \"pct\": props.round(1)}))\n",
"\n",
" # Heatmap comportement\n",
" profile_vars_asset = [c for c in asset_features if c in df_a.columns]\n",
" prof = plot_heatmap(\n",
" df_a, profile_vars_asset, cluster_col,\n",
" title=f\"Cluster signatures — {asset} (K={best_k}, robust z-score)\",\n",
" figsize=(14, 4)\n",
" )\n",
" print(f\"\\nMédianes — {asset}:\")\n",
" print(prof.round(3).to_string())\n",
"\n",
" ASSET_RESULTS[asset] = {\n",
" \"df\": df_a,\n",
" \"cluster_col\": cluster_col,\n",
" \"k\": best_k,\n",
" \"silhouette\": best_sil,\n",
" \"profile\": prof,\n",
" }\n",
"\n",
"# ── 6. Résumé global ──────────────────────────────────────────────────────\n",
"print(\"\\n\" + \"=\"*60)\n",
"print(\"RÉSUMÉ — Clustering par asset type\")\n",
"print(\"=\"*60)\n",
"for asset, res in ASSET_RESULTS.items():\n",
" print(f\" {asset:20s} : K={res['k']}, sil={res['silhouette']:.4f}, \"\n",
" f\"n={len(res['df'])}\")"
]
},
{
"cell_type": "code",
"execution_count": 207,
"id": "78b9b46c-577c-4e00-80ef-07e4aeb807be",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Colonnes disponibles : ['Registrar Account - ID', 'cluster_k4', 'cluster_alternative', 'cluster_diversified', 'cluster_equity', 'cluster_fixed_income']\n",
"Shape : (7179, 6)\n"
]
},
{
"data": {
"image/png": "iVBORw0KGgoAAAANSUhEUgAABs8AAATMCAYAAADI7xmhAAAAOnRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjEwLjgsIGh0dHBzOi8vbWF0cGxvdGxpYi5vcmcvwVt1zgAAAAlwSFlzAAAPYQAAD2EBqD+naQABAABJREFUeJzs3Xd8jef/x/F3phUriJLE7okRe28JtVe01O4wW1WlWnRS1PiWtkZrU5SqPWqV2ntVqwQ1agchkpCIJPfvD8n5OTKRnEPyej4eHu257vW573MnOZ/zua/rsjMMwxAAAAAAAAAAAAAA2ds6AAAAAAAAAAAAAOB5QfEMAAAAAAAAAAAAiEHxDAAAAAAAAAAAAIhB8QwAAAAAAAAAAACIQfEMAAAAAAAAAAAAiEHxDAAAAAAAAAAAAIhB8QwAAAAAAAAAAACIQfEMAAAAAAAAAAAAiEHxDAAAAAAAAAAAAIhB8QwAAAAJ6tKli7y8vGwdRro3ceJEeXl5ad++fbYOJVX4+vrK19fX1mGkiGXLlsnLy0vLli17pv14eXmpS5cuKRRVXIMHD5aXl5cuXbqUasdIKdz/AAAAAKzN0dYBAAAAIHHHjh3TggULdPDgQV2/fl3R0dFyc3NT+fLl1bp1a9WsWdPWIaY7ly5dUv369eXn56fRo0c/1T6e5/e1S5cu2r9/v06ePGmzGJ4HkZGR+u2337R+/XodO3ZMt2/flqOjo/LmzavSpUurUaNGql+/vuzteSbxSXH/41mtWLFCgwYNkiQtXrxYZcqUsXFE8Vu2bJmGDBmiUaNGqU2bNkmuv2/fPnXt2jXZ+69SpYrmzZv3LCECAAAgHhTPAAAAnlPR0dEaM2aM5syZI0dHR1WrVk2+vr5ydHTUxYsXtW3bNq1atUrvv/+++vTpkyoxjBkzRmFhYamy7/TqeXhfnzdz5syxdQhxXL58We+9956OHz+unDlzqnr16sqfP7+io6N1+fJl7dixQ6tXr1aDBg00efJkW4f7wuD+j+t5vP9fBEuWLJGdnZ0Mw9DSpUuf2+LZk3J3d9d7771n0RYcHKy5c+fK3d1dfn5+cdYHAABAyqN4BgAA8Jz67rvvNGfOHJUoUUITJkxQgQIFLJaHh4dr/vz5CgoKSrUY8ufPn2r7Tq+eh/f1efP4NbC10NBQdevWTefOnVP37t3Vt29fZcyY0WKdBw8eaM2aNdqyZYuNonwxcf/H9bzd/y+C8+fP68CBA/L19dXZs2f122+/aciQIXF+Tl9EHh4e6tu3r0XbpUuXzMWzx5cBAAAgdTC+CAAAwHPov//+04wZM5QjRw7NmDEj3i9XM2bMqO7du+v99983t8XOY3Tx4kXNmjVLTZs2lbe3twYPHmxe59SpU+rXr5+qV68ub29v+fr6auTIkbp9+3acY8Q351l0dLQWL16s1157TVWqVFGZMmVUp04d9e7dO945iQ4cOKDevXuratWq8vb2VsOGDfXtt9/G6dG2b98+eXl5aeLEiTp8+LC6dOmi8uXLq1q1aho6dKjCw8MlSVu3btXrr7+ucuXKqUaNGho7dqwiIyPjvY6bNm3SG2+8ocqVK6t06dJq3ry5Zs6cqaioKIv1Hp2naufOnWrfvr3Kli2rqlWratCgQRbXZtmyZapfv74kafny5fLy8jL/S2pOpqd9X+Pz6PV63KVLl+Tl5WXxvksPv3AeMmSIfH195e3trSpVqqhly5YaOXKkDMOQ9HCurf3795v/P/bf4/vy9/dX//79VatWLXl7e8vHx0fDhw+Pcx89GsuZM2fUp08fVa1a1WK+rfjmfHp0nqvVq1erVatWKlOmjGrVqqURI0aY74dHRUZGaurUqWrQoIFKly6tV155RVOnTtXFixfjPYeEzJgxQ+fOnZOfn58++uijeL+Qd3Jykp+fn8aPH5+sfUrSoUOH1LNnT1WpUkWlS5dW48aNNWHChER7d167dk0DBgxQ1apVVb
ZsWbVv3167d++Os965c+c0duxY+fn5qWrVquZhJb/55hvdvXs32THGZ9q0afLy8tIXX3yR4LIvv/wyyf1w/78Y9/+jf0emT5+uhg0bqnTp0vL19dWkSZP04MEDi/UjIiI0b948devWTXXr1pW3t7eqV69u7rn5uEd/3/7xxx9q3769ypcv/0Tzvi1dulSS1Lp1a7Vq1UohISFav359vOuGhITo+++/V9OmTVW+fHlVqFBBr7zyigYNGqTLly+b17t//75mzZqlli1bqmLFiipXrpx8fX3Vr18/+fv7x9lvcv6+DB48WEOGDJEkDRkyxOKeelbffvutvLy8tHbt2niXL1myRF5eXpo6daq5LXYuxeT+XpEevr+zZ8+Wn5+fypUrp/Lly6tjx47avHnzM58DAADA84yeZwAAAM+hZcuWKSoqSu3bt1fu3LkTXdfZ2TlO2/Dhw3X06FHVrVtXPj4+ypUrlyTp4MGD6t69ux48eKBGjRrJ3d1df/75p+bOnautW7dq0aJFcnV1TfR448aNM3/x3bx5c2XJkkUBAQE6dOiQdu/erapVq5rXXbBggb766itly5ZNPj4+cnV11bFjxzRlyhTt27dPc+fOjRP/0aNHNX36dNWqVUvt27fXvn37tHDhQoWGhsrX11eDBw9W/fr1Va5cOW3dulUzZ85U5syZ4wxzNW7cOE2bNk158+bVK6+8oqxZs+rgwYMaO3asjh49qgkTJsQ5tz/++ENbt26Vr6+vypcvrwMHDmjFihW6cOGCFi5cKEkqUaKEunbtqrlz56p48eJq0KCBefukhs961vf1WQQEBKht27YKCwtT3bp11bRpU4WFhen8+fNauHChBg0aJEdHR7333ntavny5edjCWCVKlDD//+bNm/XBBx/I3t5e9evX10svvaQzZ85o/vz52rlzp3799Vdlz57d4vj//fef2rVrJ5PJJD8/PwUFBcnJySnJuH/++Wft2LFDvr6+qlatmnbs2KF58+bp9u3bGjdunMW6n3zyiVauXClPT0916tRJERERmjNnjo4cOfJE12rZsmWSpHfffTfJdR0dk5dSrVu3Th9++KGcnZ3VpEkT5cqVS7t27dLkyZO1c+dOzZs3TxkyZLDY5s6dO+rQoYNy5syptm3b6tatW1q3bp26d++uCRMmWNx7v//+u5YuXaqqVauqSpUqio6ONv8sHThwQPPnz0/W9Y5P9+7dtWvXLi1atEh16tQxH/evv/7ShAkTVKxYMXOBIDHc/y/G/R9r5MiROnLkiBo3bqzMmTNry5Ytmjhxok6dOmXx+/POnTv6+uuvValSJdWtW1fZsmXTxYsX9ccff2j79u2aP39+vEMqrl+/Xrt27VK9evXUsWNHhYaGJiuuqKgoLV++XNmzZ5ePj4+8vb01YcIELV26VK1bt7ZY1zAMdevWTUePHlWFChVUu3Zt2dvb6/Lly/rjjz/UqlUr8+/tQYMGad26dfLy8lKbNm3k7Oysa9euad++ffr7779VvHhx836T+/elQYMGCg4O1ubNm1W/fn2L++hZtWvXTtOmTdOSJUvUtGnTOMsXL14sR0fHOPOsPcnvlYiICHXr1k379+9XiRIl9Nprr+nBgwfatm2b3n33XX3++efq3Llzip0TAADAc8UAAADAc6dz586GyWQydu/e/UTbDRo0yDCZTEadOnWMy5cvWyyLiooyGjRoYJhMJmP79u0Wy8aMGWOYTCZjyJAh8cbxqCpVqhi1atUy7t27F+f4t2/fNv//6dOnjZIlSxotW7Y0bt26ZbHe1KlTDZPJZMycOdPctnfvXsNkMhkmk8n4/fffze0RERFGixYtDC8vL6Nq1arG0aNHzctCQkKM6tWrG1WqVDEiIiLM7Tt37jRMJpPx9ttvG3fv3jW3R0dHG1988YVhMpmM9evXm9uXLl1qmEwmo2TJksbBgwfN7ZGRkeZrcOTIEXP7xYsXDZPJZAwaNCjONUjM076vEyZMMEwmk7
F3715zW+z1mjBhQpz144tv7ty5hslkMubMmRNn/Ufft0fjjM+tW7eMChUqGLVr1zYuXbpksWzNmjWGyWQyvvrqqzi
"text/plain": [
"<Figure size 1800x1200 with 8 Axes>"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"\n",
"============================================================\n",
"Global x Alternative\n",
"============================================================\n",
"\n",
"% par cluster global (chaque ligne somme à 100%) :\n",
" Non exposé Asset C0 Asset C1\n",
"Global C0 91.1 1.1 7.8\n",
"Global C1 48.7 10.7 40.5\n",
"Global C2 86.5 6.2 7.3\n",
"Global C3 93.2 0.9 5.9\n",
"\n",
"% par cluster asset (chaque colonne somme à 100%) :\n",
" Non exposé Asset C0 Asset C1\n",
"Global C0 41.0 11.0 23.5\n",
"Global C1 9.5 47.9 52.7\n",
"Global C2 21.2 34.6 12.0\n",
"Global C3 28.2 6.5 11.8\n",
"\n",
"============================================================\n",
"Global x Diversified\n",
"============================================================\n",
"\n",
"% par cluster global (chaque ligne somme à 100%) :\n",
" Non exposé Asset C0 Asset C1\n",
"Global C0 64.2 33.9 1.8\n",
"Global C1 21.1 61.7 17.2\n",
"Global C2 31.6 40.4 27.9\n",
"Global C3 40.9 54.0 5.1\n",
"\n",
"% par cluster asset (chaque colonne somme à 100%) :\n",
" Non exposé Asset C0 Asset C1\n",
"Global C0 54.4 28.5 6.6\n",
"Global C1 7.8 22.5 26.7\n",
"Global C2 14.6 18.5 54.4\n",
"Global C3 23.3 30.5 12.3\n",
"\n",
"============================================================\n",
"Global x Equity\n",
"============================================================\n",
"\n",
"% par cluster global (chaque ligne somme à 100%) :\n",
" Non exposé Asset C0 Asset C1 Asset C2\n",
"Global C0 70.6 25.9 3.2 0.2\n",
"Global C1 18.7 23.1 9.7 48.6\n",
"Global C2 37.4 32.8 29.5 0.3\n",
"Global C3 44.2 49.1 6.8 0.0\n",
"\n",
"% par cluster asset (chaque colonne somme à 100%) :\n",
" Non exposé Asset C0 Asset C1 Asset C2\n",
"Global C0 54.8 29.9 11.6 0.9\n",
"Global C1 6.3 11.5 15.0 98.3\n",
"Global C2 15.8 20.6 57.2 0.9\n",
"Global C3 23.1 38.0 16.2 0.0\n",
"\n",
"============================================================\n",
"Global x Fixed Income\n",
"============================================================\n",
"\n",
"% par cluster global (chaque ligne somme à 100%) :\n",
" Non exposé Asset C0 Asset C1\n",
"Global C0 34.5 61.5 4.0\n",
"Global C1 19.3 52.9 27.9\n",
"Global C2 65.4 21.2 13.3\n",
"Global C3 72.0 24.1 3.9\n",
"\n",
"% par cluster asset (chaque colonne somme à 100%) :\n",
" Non exposé Asset C0 Asset C1\n",
"Global C0 27.2 54.8 15.5\n",
"Global C1 6.6 20.4 46.4\n",
"Global C2 28.1 10.3 28.0\n",
"Global C3 38.2 14.4 10.1\n",
"\n",
"============================================================\n",
"Adjusted Rand Index — cohérence global x asset\n",
"============================================================\n",
"(1 = identiques, 0 = aléatoire, <0 = pire qu'aléatoire)\n",
"\n",
" Alternative : ARI=0.0278 (n=1166 comptes communs)\n",
" Diversified : ARI=0.0347 (n=3980 comptes communs)\n",
" Equity : ARI=0.1587 (n=3691 comptes communs)\n",
" Fixed Income : ARI=0.1106 (n=3743 comptes communs)\n",
"\n",
"============================================================\n",
"Exposition multi-asset par cluster global\n",
"============================================================\n",
"\n",
"Nombre moyen d'asset types par cluster global :\n",
"cluster_k4\n",
"0 1.40\n",
"1 2.92\n",
"2 1.79\n",
"3 1.50\n",
"Name: n_asset_types, dtype: float64\n",
"\n",
"Distribution du nombre d'asset types par cluster global :\n",
" 0 asset(s) 1 asset(s) 2 asset(s) 3 asset(s) 4 asset(s)\n",
"Global C0 0.4 73.9 14.8 7.3 3.5\n",
"Global C1 0.8 17.5 13.6 25.0 43.1\n",
"Global C2 0.0 49.2 29.3 14.8 6.7\n",
"Global C3 0.0 64.7 23.3 9.6 2.4\n"
]
}
],
"source": [
"# ============================================================\n",
"# CROISEMENT — clustering global x clustering par asset type\n",
"# ============================================================\n",
"\n",
"# ── 1. Merge des labels asset dans dfc ────────────────────────────────────\n",
"dfc_cross = dfc[[ID_COL, \"cluster_k4\"]].copy()\n",
"\n",
"for asset, res in ASSET_RESULTS.items():\n",
" cluster_col = res[\"cluster_col\"]\n",
" df_a = res[\"df\"][[ID_COL, cluster_col]].copy()\n",
" dfc_cross = dfc_cross.merge(df_a, on=ID_COL, how=\"left\")\n",
"\n",
"print(\"Colonnes disponibles :\", dfc_cross.columns.tolist())\n",
"print(\"Shape :\", dfc_cross.shape)\n",
"\n",
"# ── 2. Tables de contingence global x asset ───────────────────────────────\n",
"fig, axes = plt.subplots(2, 2, figsize=(18, 12))\n",
"axes = axes.flatten()\n",
"\n",
"for i, (asset, res) in enumerate(ASSET_RESULTS.items()):\n",
" cluster_col = res[\"cluster_col\"]\n",
"\n",
" if cluster_col not in dfc_cross.columns:\n",
" continue\n",
"\n",
" # Table de contingence normalisée par ligne (% par cluster global)\n",
" ct = pd.crosstab(\n",
" dfc_cross[\"cluster_k4\"],\n",
" dfc_cross[cluster_col].fillna(-1).astype(int),\n",
" normalize=\"index\"\n",
" ).round(3) * 100\n",
"\n",
" # Renommer les colonnes\n",
" col_names = {\n",
" c: f\"Asset C{c}\" if c >= 0 else \"Non exposé\"\n",
" for c in ct.columns\n",
" }\n",
" ct = ct.rename(columns=col_names)\n",
" ct.index = [f\"Global C{i}\" for i in ct.index]\n",
"\n",
" sns.heatmap(\n",
" ct,\n",
" cmap=\"Blues\",\n",
" annot=True,\n",
" fmt=\".1f\",\n",
" ax=axes[i],\n",
" cbar_kws={\"label\": \"%\"},\n",
" vmin=0,\n",
" vmax=100,\n",
" )\n",
" axes[i].set_title(f\"Global x {asset} (% par cluster global)\")\n",
" axes[i].set_xlabel(f\"Cluster {asset}\")\n",
" axes[i].set_ylabel(\"Cluster Global\")\n",
"\n",
"plt.suptitle(\"Croisement Clustering Global x Clustering par Asset Type\",\n",
" fontsize=14, y=1.02)\n",
"plt.tight_layout()\n",
"plt.show()\n",
"\n",
"# ── 3. Tables de contingence détaillées ───────────────────────────────────\n",
"for asset, res in ASSET_RESULTS.items():\n",
" cluster_col = res[\"cluster_col\"]\n",
" if cluster_col not in dfc_cross.columns:\n",
" continue\n",
"\n",
" print(f\"\\n{'='*60}\")\n",
" print(f\"Global x {asset}\")\n",
" print(f\"{'='*60}\")\n",
"\n",
" # % par cluster global\n",
" ct_row = pd.crosstab(\n",
" dfc_cross[\"cluster_k4\"],\n",
" dfc_cross[cluster_col].fillna(-1).astype(int),\n",
" normalize=\"index\"\n",
" ).round(3) * 100\n",
" ct_row.index = [f\"Global C{i}\" for i in ct_row.index]\n",
" ct_row.columns = [f\"Asset C{c}\" if c >= 0 else \"Non exposé\"\n",
" for c in ct_row.columns]\n",
" print(\"\\n% par cluster global (chaque ligne somme à 100%) :\")\n",
" print(ct_row.to_string())\n",
"\n",
" # % par cluster asset\n",
" ct_col = pd.crosstab(\n",
" dfc_cross[\"cluster_k4\"],\n",
" dfc_cross[cluster_col].fillna(-1).astype(int),\n",
" normalize=\"columns\"\n",
" ).round(3) * 100\n",
" ct_col.index = [f\"Global C{i}\" for i in ct_col.index]\n",
" ct_col.columns = [f\"Asset C{c}\" if c >= 0 else \"Non exposé\"\n",
" for c in ct_col.columns]\n",
" print(\"\\n% par cluster asset (chaque colonne somme à 100%) :\")\n",
" print(ct_col.to_string())\n",
"\n",
"# ── 4. Indice de Rand Ajusté — mesure de cohérence ────────────────────────\n",
"from sklearn.metrics import adjusted_rand_score\n",
"\n",
"print(\"\\n\" + \"=\"*60)\n",
"print(\"Adjusted Rand Index — cohérence global x asset\")\n",
"print(\"=\"*60)\n",
"print(\"(1 = identiques, 0 = aléatoire, <0 = pire qu'aléatoire)\\n\")\n",
"\n",
"for asset, res in ASSET_RESULTS.items():\n",
" cluster_col = res[\"cluster_col\"]\n",
" if cluster_col not in dfc_cross.columns:\n",
" continue\n",
"\n",
" # Garder seulement les comptes présents dans les deux clusterings\n",
" mask = dfc_cross[cluster_col].notna()\n",
" labels_global = dfc_cross.loc[mask, \"cluster_k4\"].values\n",
" labels_asset = dfc_cross.loc[mask, cluster_col].values\n",
"\n",
" ari = adjusted_rand_score(labels_global, labels_asset)\n",
" n_common = mask.sum()\n",
" print(f\" {asset:20s} : ARI={ari:.4f} (n={n_common} comptes communs)\")\n",
"\n",
"# ── 5. Profil des comptes présents dans plusieurs asset types ─────────────\n",
"print(\"\\n\" + \"=\"*60)\n",
"print(\"Exposition multi-asset par cluster global\")\n",
"print(\"=\"*60)\n",
"\n",
"# Compter le nombre d'asset types par compte\n",
"asset_cols = [res[\"cluster_col\"] for res in ASSET_RESULTS.values()\n",
" if res[\"cluster_col\"] in dfc_cross.columns]\n",
"dfc_cross[\"n_asset_types\"] = dfc_cross[asset_cols].notna().sum(axis=1)\n",
"\n",
"print(\"\\nNombre moyen d'asset types par cluster global :\")\n",
"print(dfc_cross.groupby(\"cluster_k4\")[\"n_asset_types\"].mean().round(2))\n",
"\n",
"print(\"\\nDistribution du nombre d'asset types par cluster global :\")\n",
"ct_multi = pd.crosstab(\n",
" dfc_cross[\"cluster_k4\"],\n",
" dfc_cross[\"n_asset_types\"],\n",
" normalize=\"index\"\n",
").round(3) * 100\n",
"ct_multi.index = [f\"Global C{i}\" for i in ct_multi.index]\n",
"ct_multi.columns = [f\"{c} asset(s)\" for c in ct_multi.columns]\n",
"print(ct_multi.to_string())"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "b78c0c1c-86ff-4b65-b592-57caf37c15a5",
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.13.11"
}
},
"nbformat": 4,
"nbformat_minor": 5
}