Project_Carmignac/.ipynb_checkpoints/global_clustering-checkpoint.ipynb

1285 lines
670 KiB
Plaintext
Raw Permalink Normal View History

2026-04-14 13:22:59 +02:00
{
"cells": [
{
"cell_type": "markdown",
"id": "13c6141d",
"metadata": {},
"source": [
"# Global Clustering of Carmignac Investors\n",
"\n",
"1. Imports & Configuration\n",
"2. Data Loading\n",
"3. Monthly Panel Construction\n",
"4. Feature Engineering\n",
"5. Global Clustering (all accounts)\n",
" - 5a. Feature selection & preprocessing\n",
" - 5b. K-selection & clustering\n",
" - 5c. Cluster profiles (behavioral + allocation)\n",
" - 5d. Asset-type sub-clustering & cross-analysis\n"
]
},
{
"cell_type": "markdown",
"id": "28e588fe",
"metadata": {},
"source": [
"---\n",
"## 1. Imports & Configuration\n"
]
},
{
"cell_type": "code",
"execution_count": 1,
"id": "3bc1ffe0",
"metadata": {},
"outputs": [],
"source": [
"import os\n",
"import s3fs\n",
"import warnings\n",
"warnings.filterwarnings(\"ignore\")\n",
"\n",
"os.environ[\"AWS_ACCESS_KEY_ID\"] = 'UMMV3Z72A70MCCSRV17O'\n",
"os.environ[\"AWS_SECRET_ACCESS_KEY\"] = 'wBFxaez78UPNW3BtchZOf4f238ZNXKnCexeGufaa'\n",
"os.environ[\"AWS_SESSION_TOKEN\"] = 'eyJhbGciOiJIUzUxMiIsInR5cCI6IkpXVCJ9.eyJhY2Nlc3NLZXkiOiJVTU1WM1o3MkE3ME1DQ1NSVjE3TyIsImFjciI6IjAiLCJhbGxvd2VkLW9yaWdpbnMiOlsiKiJdLCJhdWQiOlsibWluaW8iLCJhY2NvdW50Il0sImF1dGhfdGltZSI6MTc3NTEzNTA4NiwiYXpwIjoib255eGlhLW1pbmlvIiwiZW1haWwiOiJzYXJhaC50aG91bXlyZUBlbnNhZS5mciIsImVtYWlsX3ZlcmlmaWVkIjp0cnVlLCJleHAiOjE3NzYzNDQ3NDksImZhbWlseV9uYW1lIjoiVEhPVU1ZUkUiLCJnaXZlbl9uYW1lIjoiU2FyYWgiLCJncm91cHMiOlsiYmRjLWRhdGEiLCJiZGMtY2FybWlnbmFjLWczIl0sImlhdCI6MTc3NTEzNTE0OCwiaXNzIjoiaHR0cHM6Ly9hdXRoLmdyb3VwZS1nZW5lcy5mci9yZWFsbXMvZ2VuZXMiLCJqdGkiOiJlZGY1ZDQ1OC1hYzkxLTQ5NTAtYmI5Ny0zNjMwNWY1MTQwYTIiLCJuYW1lIjoiU2FyYWggVEhPVU1ZUkUiLCJwb2xpY3kiOiJzdHNvbmx5IiwicHJlZmVycmVkX3VzZXJuYW1lIjoic3Rob3VteXJlLWVuc2FlIiwicmVhbG1fYWNjZXNzIjp7InJvbGVzIjpbIm9mZmxpbmVfYWNjZXNzIiwiZGVmYXVsdC1yb2xlcy1nZW5lcyIsInVtYV9hdXRob3JpemF0aW9uIl19LCJyZXNvdXJjZV9hY2Nlc3MiOnsiYWNjb3VudCI6eyJyb2xlcyI6WyJtYW5hZ2UtYWNjb3VudCIsIm1hbmFnZS1hY2NvdW50LWxpbmtzIiwidmlldy1wcm9maWxlIl19fSwic2NvcGUiOiJvcGVuaWQgcHJvZmlsZSBlbWFpbCIsInNpZCI6IjMzMjg4YjJjLTlhMjAtNDNhOS1iMDlhLTdlMjc1OWQ1NjIxNiIsInN1YiI6ImVhYWVkN2QyLWM4MjYtNGIxNC05MzczLTYwYjNhODhlMWFiNiIsInR5cCI6IkJlYXJlciJ9.rffoTJijRiGK2DCDhXj5y8R31DRH1LWkTwuH_1lvU9qN_xJSTmBIM4uGR_zp7XpMnq_ePwVhlkoWN15cNUgjMA'\n",
"os.environ[\"AWS_DEFAULT_REGION\"] = 'us-east-1'\n",
"\n",
"fs = s3fs.S3FileSystem(\n",
" client_kwargs={'endpoint_url': 'https://'+'minio-simple.lab.groupe-genes.fr'},\n",
" key = os.environ[\"AWS_ACCESS_KEY_ID\"], \n",
" secret = os.environ[\"AWS_SECRET_ACCESS_KEY\"], \n",
" token = os.environ[\"AWS_SESSION_TOKEN\"])\n",
"\n",
"import numpy as np\n",
"import pandas as pd\n",
"import matplotlib.pyplot as plt\n",
"import seaborn as sns\n",
"from scipy import stats\n",
"\n",
"from sklearn.preprocessing import RobustScaler\n",
"from sklearn.cluster import KMeans\n",
"from sklearn.metrics import (\n",
" silhouette_score, davies_bouldin_score,\n",
" pairwise_distances, adjusted_rand_score\n",
")\n",
"from sklearn.linear_model import LinearRegression\n",
"\n",
"sns.set_style(\"whitegrid\")\n",
"pd.set_option(\"display.max_columns\", 200)\n",
"pd.set_option(\"display.max_rows\", 200)\n",
"\n",
"EPS = 1e-9\n",
"RANDOM_STATE = 42"
]
},
{
"cell_type": "code",
"execution_count": 2,
"id": "69d2dc25",
"metadata": {},
"outputs": [],
"source": [
"# Column names\n",
"ID_COL = \"Registrar Account - ID\"\n",
"ISIN_COL = \"Product - Isin\"\n",
"FUND_COL = \"Product - Fund\"\n",
"ASSET_COL = \"Product - Asset Type\"\n",
"FLOW_DATE_COL = \"Centralisation Date\"\n",
"AUM_DATE_COL = \"Centralisation Date\"\n",
"FLOW_QTY_COL = \"Quantity - NetFlows\"\n",
"FLOW_SUB_COL = \"Quantity - Subscription\"\n",
"FLOW_RED_COL = \"Quantity - Redemption\"\n",
"AUM_QTY_COL = \"Quantity - AUM\"\n",
"AUM_VAL_COL = \"Value - AUM €\"\n",
"REGION_COL = \"Registrar Account - Region\"\n",
"COUNTRY_COL = \"RegistrarAccount - Country\"\n",
"NAV_DATE_COL = \"Dat\"\n",
"NAV_ISIN_COL = \"Isin\"\n",
"NAV_PRICE_COL = \"Price (TF PartPrice)\"\n",
"NAV_BENCH_COL = \"PriceBench\"\n",
"RATE_DATE_COL = \"Date\"\n",
"RATE_VAL_COL = \"Yld to Maturity\"\n",
"\n",
"PATH_NAV = \"s3://projet-bdc-data/carmignac/Data Modélisation/Nav/NAV_Bench_data.csv\"\n",
"PATH_RATES = \"s3://projet-bdc-data/carmignac/Data Modélisation/market data/esterRates.csv\""
]
},
{
"cell_type": "code",
"execution_count": 10,
"id": "bf5b7a0a",
"metadata": {},
"outputs": [],
"source": [
"# UTILITIES\n",
"def robust_zscore(s):\n",
" med = np.nanmedian(s)\n",
" mad = np.nanmedian(np.abs(s - med))\n",
" if mad == 0 or np.isnan(mad):\n",
" return np.zeros(len(s))\n",
" return (s - med) / (1.4826 * mad)\n",
"\n",
"def plot_heatmap(dfc, profile_vars, cluster_col, title, figsize=(16, 4)):\n",
" \"\"\"Cluster signature heatmap using robust z-scores, capped at ±3 for readability.\"\"\"\n",
" dfc_viz = dfc[profile_vars + [cluster_col]].copy()\n",
" for col in profile_vars:\n",
" vals = pd.to_numeric(dfc_viz[col], errors=\"coerce\").to_numpy(dtype=float)\n",
" lo = np.nanpercentile(vals, 2)\n",
" hi = np.nanpercentile(vals, 98)\n",
" dfc_viz[col] = np.clip(vals, lo, hi)\n",
" prof = dfc_viz.groupby(cluster_col)[profile_vars].median()\n",
" prof_z = prof.apply(lambda col: robust_zscore(col.values), axis=0)\n",
" prof_z = prof_z.clip(-3, 3) # cap for readability\n",
" plt.figure(figsize=figsize)\n",
" sns.heatmap(prof_z, cmap=\"RdBu_r\", center=0, annot=True, fmt=\".2f\",\n",
" xticklabels=profile_vars,\n",
" yticklabels=[f\"Cluster {i}\" for i in range(len(prof))])\n",
" plt.title(title)\n",
" plt.xticks(rotation=45, ha=\"right\")\n",
" plt.tight_layout()\n",
" plt.show()\n",
" return prof\n",
"\n",
"def winsorize_mad(series, n_sigma=3):\n",
" \"\"\"Winsorize using MAD n-sigma rule. Falls back to p95 clip when MAD~0.\"\"\"\n",
" vals = pd.to_numeric(series, errors=\"coerce\").to_numpy(dtype=float)\n",
" med = np.nanmedian(vals)\n",
" mad = np.nanmedian(np.abs(vals - med)) * 1.4826\n",
" if mad > 0:\n",
" return np.clip(vals, med - n_sigma * mad, med + n_sigma * mad)\n",
" else:\n",
" return np.clip(vals, 0, np.nanpercentile(vals, 95))\n",
"\n",
"def add_months_since_last_tx(dfc, df_month, id_col, suffix=\"\"):\n",
" \"\"\"Adds months_since_last_tx[suffix] to dfc.\"\"\"\n",
" col_name = f\"months_since_last_tx{suffix}\"\n",
" reference_date = df_month[\"month\"].max()\n",
" last_active = (\n",
" df_month[df_month[\"active_month\"] == 1]\n",
" .groupby(id_col)[\"month\"]\n",
" .max()\n",
" .reset_index(name=\"last_active_month\")\n",
" )\n",
" last_active[col_name] = (\n",
" (reference_date.to_period(\"M\") -\n",
" last_active[\"last_active_month\"].dt.to_period(\"M\"))\n",
" .apply(lambda x: x.n)\n",
" )\n",
" dfc = dfc.merge(last_active[[id_col, col_name]], on=id_col, how=\"left\")\n",
" max_months = dfc[col_name].max()\n",
" dfc[col_name] = dfc[col_name].fillna(max_months + 1)\n",
" return dfc\n",
"\n",
"def plot_heatmap_annotated(dfc, profile_vars, cluster_col, title, figsize=(16, 5)):\n",
" \"\"\"\n",
" Heatmap colored by robust z-score, annotated with actual cluster medians.\n",
" \"\"\"\n",
" # Real medians\n",
" prof_median = dfc.groupby(cluster_col)[profile_vars].median()\n",
"\n",
" # Robust z-scores for color\n",
" prof_z = prof_median.copy().astype(float)\n",
" for col in profile_vars:\n",
" vals = prof_median[col].values\n",
" med = np.median(vals)\n",
" mad = np.median(np.abs(vals - med)) * 1.4826\n",
" if mad > 0:\n",
" prof_z[col] = (vals - med) / mad\n",
" else:\n",
" prof_z[col] = np.zeros(len(vals))\n",
" prof_z = prof_z.clip(-3, 3)\n",
"\n",
" # Format annotations with actual medians\n",
" def fmt(val):\n",
" if abs(val) >= 1000:\n",
" return f\"{val:,.0f}\"\n",
" elif abs(val) >= 10:\n",
" return f\"{val:.1f}\"\n",
" elif abs(val) >= 0.01:\n",
" return f\"{val:.2f}\"\n",
" else:\n",
" return f\"{val:.3f}\"\n",
"\n",
" annot = prof_median.applymap(fmt)\n",
"\n",
" # Row labels: Cluster 1, 2, ... (1-indexed) with cluster size\n",
" cluster_sizes = dfc[cluster_col].value_counts().sort_index()\n",
" row_labels = [\n",
" f\"Cluster {i} (n={cluster_sizes.get(i, '?'):,})\"\n",
" for i in prof_median.index\n",
" ]\n",
"\n",
" fig, ax = plt.subplots(figsize=figsize)\n",
" sns.heatmap(\n",
" prof_z,\n",
" cmap=\"RdBu_r\",\n",
" center=0,\n",
" annot=annot,\n",
" fmt=\"\",\n",
" linewidths=0.5,\n",
" linecolor=\"white\",\n",
" ax=ax,\n",
" cbar_kws={\"label\": \"Robust z-score\", \"shrink\": 0.8},\n",
" xticklabels=profile_vars,\n",
" yticklabels=row_labels,\n",
" )\n",
" ax.set_title(title, fontsize=13, pad=12)\n",
" ax.tick_params(axis=\"x\", rotation=45, labelsize=9)\n",
" ax.tick_params(axis=\"y\", rotation=0, labelsize=9)\n",
" plt.tight_layout()\n",
" plt.show()\n",
"\n",
" return prof_median"
]
},
{
"cell_type": "markdown",
"id": "312153e6",
"metadata": {},
"source": [
"---\n",
"## 2. Data Loading\n",
"\n",
"Three data sources are used:\n",
"- **AUM** (repaired): monthly share quantities per account and ISIN\n",
"- **Flows**: daily net transactions, aggregated to monthly\n",
"- **NAV / Rates**: fund performance and interest rate data for enrichment\n"
]
},
{
"cell_type": "code",
"execution_count": 4,
"id": "011958df",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"flows: (2574461, 26)\n",
"aum: (4824814, 19)\n",
"nav: (623914, 6)\n"
]
}
],
"source": [
"df_flows = pd.read_csv(\"flows.csv\", low_memory=False)\n",
"df_aum = pd.read_csv(\n",
" \"s3://projet-bdc-carmignac-g3/paco/AUM_repaired.csv\", low_memory=False\n",
")\n",
"df_nav = pd.read_csv(PATH_NAV, sep=\";\")\n",
"df_rates = pd.read_csv(PATH_RATES, sep=\";\")\n",
"\n",
"# Date parsing\n",
"for df, col in [\n",
" (df_flows, FLOW_DATE_COL), (df_aum, AUM_DATE_COL),\n",
" (df_nav, NAV_DATE_COL), (df_rates, RATE_DATE_COL)\n",
"]:\n",
" df[col] = pd.to_datetime(df[col], errors=\"coerce\")\n",
" df[\"month\"] = df[col].dt.to_period(\"M\").dt.to_timestamp()\n",
"\n",
"for col in [FLOW_QTY_COL, FLOW_SUB_COL, FLOW_RED_COL]:\n",
" df_flows[col] = pd.to_numeric(df_flows[col], errors=\"coerce\")\n",
"for col in [AUM_QTY_COL, AUM_VAL_COL]:\n",
" df_aum[col] = pd.to_numeric(df_aum[col], errors=\"coerce\")\n",
"for col in [NAV_PRICE_COL, NAV_BENCH_COL]:\n",
" df_nav[col] = pd.to_numeric(df_nav[col], errors=\"coerce\")\n",
"df_rates[RATE_VAL_COL] = pd.to_numeric(df_rates[RATE_VAL_COL], errors=\"coerce\")\n",
"\n",
"for df in [df_flows, df_aum]:\n",
" df[ISIN_COL] = df[ISIN_COL].astype(str).str.strip()\n",
"df_nav[NAV_ISIN_COL] = df_nav[NAV_ISIN_COL].astype(str).str.strip()\n",
"\n",
"# Remove technical accounts (not investable)\n",
"df_flows = df_flows[~df_flows[ID_COL].isin(\n",
" [\"Off Distribution\", \"Private Clients\", \"Private Client\"]\n",
")]\n",
"df_aum = df_aum[~df_aum[ID_COL].isin(\n",
" [\"Off Distribution\", \"Private Clients\", \"Private Client\"]\n",
")]\n",
"\n",
"print(\"flows:\", df_flows.shape)\n",
"print(\"aum: \", df_aum.shape)\n",
"print(\"nav: \", df_nav.shape)"
]
},
{
"cell_type": "markdown",
"id": "d34f5ecf",
"metadata": {},
"source": [
"---\n",
"## 3. Monthly Panel Construction\n",
"\n",
"A full outer join of AUM and flows at `(account, ISIN, month)` granularity, enriched with NAV returns and interest rate changes.\n"
]
},
{
"cell_type": "code",
"execution_count": 5,
"id": "25f3dce4",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Panel shape: (4754355, 24)\n"
]
}
],
"source": [
"df_flows_m = (\n",
" df_flows\n",
" .dropna(subset=[ID_COL, ISIN_COL, \"month\"])\n",
" .assign(\n",
" gross_flow_qty = lambda x: x[FLOW_QTY_COL].abs(),\n",
" sub_qty = lambda x: x[FLOW_SUB_COL].fillna(0),\n",
" red_qty = lambda x: x[FLOW_RED_COL].fillna(0)\n",
" )\n",
" .groupby([ID_COL, ISIN_COL, \"month\"], as_index=False)\n",
" .agg(\n",
" net_flow_qty = (FLOW_QTY_COL, \"sum\"),\n",
" gross_flow_qty = (\"gross_flow_qty\", \"sum\"),\n",
" sub_qty = (\"sub_qty\", \"sum\"),\n",
" red_qty = (\"red_qty\", \"sum\"),\n",
" n_tx = (FLOW_QTY_COL, \"size\"),\n",
" region = (REGION_COL, \"last\"),\n",
" country = (COUNTRY_COL, \"last\")\n",
" )\n",
")\n",
"\n",
"df_aum_m = (\n",
" df_aum\n",
" .dropna(subset=[ID_COL, ISIN_COL, \"month\"])\n",
" .groupby([ID_COL, ISIN_COL, \"month\"], as_index=False)\n",
" .agg(\n",
" aum_qty = (AUM_QTY_COL, \"sum\"),\n",
" aum_val = (AUM_VAL_COL, \"sum\"),\n",
" fund = (FUND_COL, \"last\"),\n",
" asset_type = (ASSET_COL, \"last\"),\n",
" region = (REGION_COL, \"last\"),\n",
" country = (COUNTRY_COL, \"last\")\n",
" )\n",
")\n",
"\n",
"keys = pd.concat([\n",
" df_flows_m[[ID_COL, ISIN_COL, \"month\"]],\n",
" df_aum_m[[ID_COL, ISIN_COL, \"month\"]]\n",
"]).drop_duplicates()\n",
"\n",
"df_rel_m = (\n",
" keys\n",
" .merge(df_aum_m, on=[ID_COL, ISIN_COL, \"month\"], how=\"left\")\n",
" .merge(df_flows_m, on=[ID_COL, ISIN_COL, \"month\"], how=\"left\",\n",
" suffixes=(\"\", \"_flow\"))\n",
")\n",
"\n",
"for c in [\"aum_qty\",\"aum_val\",\"net_flow_qty\",\"gross_flow_qty\",\n",
" \"sub_qty\",\"red_qty\",\"n_tx\"]:\n",
" df_rel_m[c] = df_rel_m[c].fillna(0)\n",
"\n",
"df_rel_m[\"region\"] = df_rel_m[\"region\"].fillna(df_rel_m.get(\"region_flow\"))\n",
"df_rel_m[\"country\"] = df_rel_m[\"country\"].fillna(df_rel_m.get(\"country_flow\"))\n",
"df_rel_m[\"active_rel_month\"] = (df_rel_m[\"gross_flow_qty\"] > 0).astype(int)\n",
"df_rel_m[\"holding_rel_month\"] = (df_rel_m[\"aum_qty\"] > 0).astype(int)\n",
"df_rel_m[\"flow_to_aum_rel\"] = df_rel_m[\"net_flow_qty\"] / (df_rel_m[\"aum_qty\"].abs() + EPS)\n",
"df_rel_m[\"turnover_rel\"] = df_rel_m[\"gross_flow_qty\"] / (df_rel_m[\"aum_qty\"].abs() + EPS)\n",
"\n",
"# --- NAV returns & interest rates ---\n",
"df_nav_m = (\n",
" df_nav\n",
" .dropna(subset=[NAV_ISIN_COL, \"month\", NAV_PRICE_COL])\n",
" .sort_values([NAV_ISIN_COL, \"month\"])\n",
" .groupby([NAV_ISIN_COL, \"month\"], as_index=False).tail(1).copy()\n",
")\n",
"df_nav_m[\"ret_fund_m\"] = df_nav_m.groupby(NAV_ISIN_COL)[NAV_PRICE_COL].pct_change()\n",
"df_nav_m[\"ret_bench_m\"] = df_nav_m.groupby(NAV_ISIN_COL)[NAV_BENCH_COL].pct_change()\n",
"df_nav_m[\"active_return_m\"] = df_nav_m[\"ret_fund_m\"] - df_nav_m[\"ret_bench_m\"]\n",
"df_nav_m = df_nav_m.rename(columns={NAV_ISIN_COL: ISIN_COL})[\n",
" [ISIN_COL, \"month\", \"ret_fund_m\", \"ret_bench_m\", \"active_return_m\"]\n",
"]\n",
"\n",
"df_rates_m = (\n",
" df_rates.dropna(subset=[\"month\", RATE_VAL_COL])\n",
" .sort_values(RATE_DATE_COL)\n",
" .groupby(\"month\", as_index=False).tail(1).copy()\n",
")\n",
"df_rates_m[\"delta_rate_m\"] = df_rates_m[RATE_VAL_COL].diff()\n",
"df_rates_m = df_rates_m[[\"month\", RATE_VAL_COL, \"delta_rate_m\"]]\n",
"\n",
"df_rel_m = df_rel_m.merge(df_nav_m, on=[ISIN_COL, \"month\"], how=\"left\")\n",
"df_rel_m = df_rel_m.merge(\n",
" df_rates_m[[\"month\", \"delta_rate_m\"]], on=\"month\", how=\"left\"\n",
")\n",
"for c in [\"ret_fund_m\",\"ret_bench_m\",\"active_return_m\",\"delta_rate_m\"]:\n",
" df_rel_m[c] = df_rel_m[c].fillna(0)\n",
"\n",
"print(\"Panel shape:\", df_rel_m.shape)"
]
},
{
"cell_type": "markdown",
"id": "9121da21",
"metadata": {},
"source": [
"---\n",
"## 4. Feature Engineering\n",
"\n",
"Features are built at three levels of granularity:\n",
"- **Account × month**: activity flags, turnover, drawdown\n",
"- **Account × ISIN**: entry/exit events, holding duration, performance reactivity\n",
"- **Account (static)**: aggregated behavioral summary used for clustering\n",
"\n",
"Asset type and fund composition shares are computed separately and used as **descriptive** post-clustering variables only.\n"
]
},
{
"cell_type": "code",
"execution_count": 6,
"id": "d4a01bcc",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Monthly panel shape: (931089, 22)\n",
"ISIN-level client features: (12582, 12)\n",
"Asset shares: (7473, 6)\n",
"Fund shares: (6591, 11)\n",
"df_client_base shape: (12582, 47)\n"
]
}
],
"source": [
"# 4a. Monthly account-level panel\n",
"tmp = df_rel_m.copy()\n",
"tmp[\"isin_held_flag\"] = (tmp[\"aum_qty\"] > 0).astype(int)\n",
"tmp[\"isin_active_flag\"] = (tmp[\"gross_flow_qty\"] > 0).astype(int)\n",
"\n",
"df_month = (\n",
" tmp.groupby([ID_COL, \"month\"], as_index=False)\n",
" .agg(\n",
" aum_qty = (\"aum_qty\", \"sum\"),\n",
" aum_val = (\"aum_val\", \"sum\"),\n",
" net_flow_qty = (\"net_flow_qty\", \"sum\"),\n",
" gross_flow_qty = (\"gross_flow_qty\", \"sum\"),\n",
" sub_qty = (\"sub_qty\", \"sum\"),\n",
" red_qty = (\"red_qty\", \"sum\"),\n",
" n_tx = (\"n_tx\", \"sum\"),\n",
" n_isin_held = (\"isin_held_flag\", \"sum\"),\n",
" n_isin_active = (\"isin_active_flag\", \"sum\"),\n",
" delta_rate_m = (\"delta_rate_m\", \"first\"),\n",
" ret_fund_m = (\"ret_fund_m\", \"mean\"),\n",
" region = (\"region\", \"first\"),\n",
" country = (\"country\", \"first\"),\n",
" )\n",
" .sort_values([ID_COL, \"month\"])\n",
" .reset_index(drop=True)\n",
")\n",
"\n",
"df_month[\"active_month\"] = (df_month[\"gross_flow_qty\"] > 0).astype(int)\n",
"df_month[\"flow_to_aum_m\"] = np.where(\n",
" df_month[\"aum_qty\"].abs() > 0,\n",
" df_month[\"net_flow_qty\"] / df_month[\"aum_qty\"].abs(), np.nan\n",
")\n",
"df_month[\"turnover_m\"] = np.where(\n",
" df_month[\"aum_qty\"].abs() > 0,\n",
" df_month[\"gross_flow_qty\"] / df_month[\"aum_qty\"].abs(), np.nan\n",
")\n",
"df_month[\"sub_share_m\"] = np.where(\n",
" df_month[\"gross_flow_qty\"] > 0,\n",
" df_month[\"sub_qty\"] / df_month[\"gross_flow_qty\"], np.nan\n",
")\n",
"df_month[\"red_share_m\"] = np.where(\n",
" df_month[\"gross_flow_qty\"] > 0,\n",
" df_month[\"red_qty\"] / df_month[\"gross_flow_qty\"], np.nan\n",
")\n",
"df_month[\"aum_peak_to_date\"] = df_month.groupby(ID_COL)[\"aum_qty\"].cummax()\n",
"df_month[\"aum_drawdown\"] = np.where(\n",
" df_month[\"aum_peak_to_date\"] > 0,\n",
" 1 - df_month[\"aum_qty\"] / df_month[\"aum_peak_to_date\"], np.nan\n",
")\n",
"\n",
"print(\"Monthly panel shape:\", df_month.shape)\n",
"\n",
"# 4b. ISIN-level features (entry/exit, performance reactivity)\n",
"tmp = df_rel_m.sort_values([ID_COL, ISIN_COL, \"month\"]).copy()\n",
"tmp[\"prev_aum\"] = tmp.groupby([ID_COL, ISIN_COL])[\"aum_qty\"].shift(1)\n",
"tmp[\"entry_event\"] = ((tmp[\"prev_aum\"].fillna(0) <= 0) & (tmp[\"aum_qty\"] > 0)).astype(int)\n",
"tmp[\"full_exit_event\"] = ((tmp[\"prev_aum\"] > 0) & (tmp[\"aum_qty\"] <= 0)).astype(int)\n",
"tmp[\"ret_fund_m_lag1\"] = tmp.groupby([ID_COL, ISIN_COL])[\"ret_fund_m\"].shift(1)\n",
"tmp[\"buy_on_perf\"] = ((tmp[\"net_flow_qty\"] > 0) & (tmp[\"ret_fund_m_lag1\"] > 0)).astype(int)\n",
"tmp[\"sell_on_perf\"] = ((tmp[\"net_flow_qty\"] < 0) & (tmp[\"ret_fund_m_lag1\"] < 0)).astype(int)\n",
"\n",
"df_rel_feat = (\n",
" tmp.groupby([ID_COL, ISIN_COL], as_index=False)\n",
" .agg(\n",
" rel_n_months = (\"month\", \"nunique\"),\n",
" rel_active_months = (\"active_rel_month\", \"sum\"),\n",
" rel_holding_months = (\"holding_rel_month\", \"sum\"),\n",
" rel_aum_mean = (\"aum_qty\", \"mean\"),\n",
" rel_turnover_mean = (\"turnover_rel\", \"mean\"),\n",
" rel_turnover_vol = (\"turnover_rel\", \"std\"),\n",
" rel_flow_to_aum_vol = (\"flow_to_aum_rel\", \"std\"),\n",
" rel_n_tx = (\"n_tx\", \"sum\"),\n",
" rel_full_exit_count = (\"full_exit_event\", \"sum\"),\n",
" rel_entry_count = (\"entry_event\", \"sum\"),\n",
" buy_on_perf_rate = (\"buy_on_perf\", \"mean\"),\n",
" sell_on_perf_rate = (\"sell_on_perf\", \"mean\"),\n",
" )\n",
")\n",
"\n",
"isin_aum = df_rel_feat.groupby(ID_COL)[\"rel_aum_mean\"].transform(\"sum\")\n",
"df_rel_feat[\"isin_weight\"] = np.where(\n",
" isin_aum > 0, df_rel_feat[\"rel_aum_mean\"] / isin_aum, np.nan\n",
")\n",
"hhi_isin = (\n",
" df_rel_feat.groupby(ID_COL)[\"isin_weight\"]\n",
" .apply(lambda w: np.sum(w**2))\n",
" .reset_index(name=\"hhi_isin\")\n",
")\n",
"\n",
"df_rel_client = (\n",
" df_rel_feat.groupby(ID_COL, as_index=False)\n",
" .agg(\n",
" n_isin_total = (ISIN_COL, \"nunique\"),\n",
" rel_turnover_mean_avg = (\"rel_turnover_mean\", \"mean\"),\n",
" rel_turnover_vol_avg = (\"rel_turnover_vol\", \"mean\"),\n",
" rel_flow_to_aum_vol_avg = (\"rel_flow_to_aum_vol\", \"mean\"), \n",
" full_exit_count = (\"rel_full_exit_count\", \"sum\"),\n",
" entry_count = (\"rel_entry_count\", \"sum\"),\n",
" avg_holding_months_per_isin = (\"rel_holding_months\", \"mean\"),\n",
" max_holding_months_per_isin = (\"rel_holding_months\", \"max\"),\n",
" buy_on_perf_rate_avg = (\"buy_on_perf_rate\", \"mean\"),\n",
" sell_on_perf_rate_avg = (\"sell_on_perf_rate\", \"mean\"),\n",
" )\n",
" .merge(hhi_isin, on=ID_COL, how=\"left\")\n",
")\n",
"\n",
"print(\"ISIN-level client features:\", df_rel_client.shape)\n",
"\n",
"# 4c. Asset type & fund composition shares\n",
"aum_by_asset = (\n",
" df_aum.dropna(subset=[ID_COL, ASSET_COL])\n",
" .groupby([ID_COL, ASSET_COL], as_index=False)[AUM_VAL_COL].sum()\n",
")\n",
"total_aum_acc = aum_by_asset.groupby(ID_COL)[AUM_VAL_COL].sum().rename(\"total_aum\")\n",
"aum_by_asset = aum_by_asset.merge(total_aum_acc, on=ID_COL)\n",
"aum_by_asset[\"share\"] = np.where(\n",
" aum_by_asset[\"total_aum\"] > 0,\n",
" aum_by_asset[AUM_VAL_COL] / aum_by_asset[\"total_aum\"], np.nan\n",
")\n",
"asset_shares = (\n",
" aum_by_asset\n",
" .pivot_table(index=ID_COL, columns=ASSET_COL, values=\"share\", aggfunc=\"mean\")\n",
" .fillna(0).reset_index()\n",
")\n",
"asset_shares.columns = [ID_COL] + [\n",
" f\"share_asset_{c.lower().replace(' ','_')}\" for c in asset_shares.columns[1:]\n",
"]\n",
"\n",
"aum_by_fund = (\n",
" df_aum.dropna(subset=[ID_COL, FUND_COL])\n",
" .groupby([ID_COL, FUND_COL], as_index=False)[AUM_VAL_COL].sum()\n",
")\n",
"aum_by_fund = aum_by_fund.merge(total_aum_acc, on=ID_COL)\n",
"aum_by_fund[\"share\"] = np.where(\n",
" aum_by_fund[\"total_aum\"] > 0,\n",
" aum_by_fund[AUM_VAL_COL] / aum_by_fund[\"total_aum\"], np.nan\n",
")\n",
"top_funds = aum_by_fund.groupby(FUND_COL)[AUM_VAL_COL].sum().nlargest(10).index\n",
"fund_shares = (\n",
" aum_by_fund[aum_by_fund[FUND_COL].isin(top_funds)]\n",
" .pivot_table(index=ID_COL, columns=FUND_COL, values=\"share\", aggfunc=\"mean\")\n",
" .fillna(0).reset_index()\n",
")\n",
"fund_shares.columns = [ID_COL] + [\n",
" f\"share_fund_{c.lower().replace(' ','_')[:30]}\" for c in fund_shares.columns[1:]\n",
"]\n",
"\n",
"print(\"Asset shares:\", asset_shares.shape)\n",
"print(\"Fund shares: \", fund_shares.shape)\n",
"\n",
"# 4d. Static client-level features\n",
"df_client_base = (\n",
" df_month.groupby(ID_COL, as_index=False)\n",
" .agg(\n",
" n_months = (\"month\", \"nunique\"),\n",
" n_active_months = (\"active_month\", \"sum\"),\n",
" flow_freq = (\"active_month\", \"mean\"),\n",
" aum_qty_mean = (\"aum_qty\", \"mean\"),\n",
" aum_qty_median = (\"aum_qty\", \"median\"),\n",
" aum_qty_max = (\"aum_qty\", \"max\"),\n",
" aum_qty_last = (\"aum_qty\", \"last\"),\n",
" net_flow_qty_sum = (\"net_flow_qty\", \"sum\"),\n",
" gross_flow_qty_sum = (\"gross_flow_qty\", \"sum\"),\n",
" gross_flow_qty_mean= (\"gross_flow_qty\", \"mean\"),\n",
" sub_qty_sum = (\"sub_qty\", \"sum\"),\n",
" red_qty_sum = (\"red_qty\", \"sum\"),\n",
" n_tx_total = (\"n_tx\", \"sum\"),\n",
" avg_n_isin_held = (\"n_isin_held\", \"mean\"),\n",
" max_n_isin_held = (\"n_isin_held\", \"max\"),\n",
" net_flow_qty_vol = (\"net_flow_qty\", \"std\"),\n",
" aum_drawdown_last = (\"aum_drawdown\", \"last\"),\n",
" aum_drawdown_max = (\"aum_drawdown\", \"max\"),\n",
" region = (\"region\", \"last\"),\n",
" country = (\"country\", \"last\"),\n",
" )\n",
")\n",
"df_client_base[\"net_flow_qty_vol\"] = df_client_base[\"net_flow_qty_vol\"].fillna(0)\n",
"\n",
"df_client_base = (\n",
" df_client_base\n",
" .merge(df_rel_client, on=ID_COL, how=\"left\")\n",
" .merge(asset_shares, on=ID_COL, how=\"left\")\n",
" .merge(fund_shares, on=ID_COL, how=\"left\")\n",
")\n",
"\n",
"print(\"df_client_base shape:\", df_client_base.shape)"
]
},
{
"cell_type": "markdown",
"id": "c383042d",
"metadata": {},
"source": [
"---\n",
"## 5. Global Clustering (All Accounts)\n",
"\n",
"### Objective\n",
"Segment the full client base into behavioral profiles using 8 selected features. The analysis covers ~7,000 accounts with at least 6 months of history.\n",
"\n",
"### Feature set\n",
"| Feature | Description |\n",
"|---|---|\n",
"| `flow_freq` | Proportion of months with at least one transaction |\n",
"| `gross_flow_to_aum` | Total gross flows relative to mean AUM (clipped p90, log-transformed) |\n",
"| `n_isin_total` | Total number of distinct ISINs held over the period |\n",
"| `avg_holding_months_per_isin` | Average holding duration per ISIN |\n",
"| `exit_rate_per_isin` | Average number of full exits per ISIN |\n",
"| `flow_direction_balance` | Ratio of net to gross flows (buyer vs seller signal) |\n",
"| `aum_qty_mean` | Log mean AUM — only size variable retained |\n",
"| `months_since_last_tx` | Months since last transaction (recency signal, most discriminant feature) |\n",
"\n",
"### Preprocessing\n",
"- MAD winsorization (3σ) for long-tailed distributions\n",
"- Clip p90 + log-transform for `gross_flow_to_aum` and `flow_freq`\n",
"- RobustScaler before K-means\n",
"- Geographic and allocation variables excluded from clustering (used post-hoc as descriptors)\n"
]
},
{
"cell_type": "code",
"execution_count": 7,
"id": "0d8b7276-8213-4667-979c-d97b3729162a",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Accounts after quality filters: 7177\n",
"Accounts: 7177 | Features: 8\n",
"Points > 5 std after scaling: 0 (0.0%)\n",
" k inertia silhouette davies_bouldin\n",
" 2 20240.673342 0.421930 0.973123\n",
" 3 16711.420111 0.241169 1.543030\n",
" 4 14679.824806 0.231005 1.511161\n",
" 5 13213.816987 0.228496 1.409421\n",
" 6 12021.187284 0.223428 1.417110\n",
" 7 11112.958987 0.229601 1.420989\n"
]
},
{
"data": {
"image/png": "iVBORw0KGgoAAAANSUhEUgAABdEAAAGMCAYAAAA1CuswAAAAOnRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjEwLjgsIGh0dHBzOi8vbWF0cGxvdGxpYi5vcmcvwVt1zgAAAAlwSFlzAAAPYQAAD2EBqD+naQAA6WpJREFUeJzs3XdcE/cbB/BPEpZMZYiAGwTZiANFHMVV96obtdZV66rVn6tDrLbSam3rtmpdtY4WpVpRq7Zu3KCouBdDkSF7J/f7gyY1AgIxEMbn/Xrxqrl87/LcQ8pdnnzvOZEgCAKIiIiIiIiIiIiIiKgAsaYDICIiIiIiIiIiIiKqqFhEJyIiIiIiIiIiIiIqAovoRERERERERERERERFYBGdiIiIiIiIiIiIiKgILKITERERERERERERERWBRXQiIiIiIiIiIiIioiKwiE5EREREREREREREVAQW0YmIiIiIiIiIiIiIisAiOhERERERERERERFREVhEJyIiIqJC7d27Fw4ODoiKiir31/b19cXcuXPL/XUrkrlz58LX11eldR0cHPDll1+qLZYLFy7AwcEBFy5cUNs2gcr9e9bk/x9EREREVL5YRCciIiKqYOTFufDwcKXlqampeO+99+Dq6opTp05pKDr1uXr1KlauXImUlBRNh1Jubt++jXnz5sHX1xeurq5o1qwZ+vbti2+//RaRkZGaDk9tnj59ii+++AKdOnWCq6srPD09MXToUGzduhVZWVnlEkNmZiZWrlyp9sI/EREREVU/WpoOgIiIiIiKl5aWhg8++AB37tzBqlWr0L59e02H9NZCQ0OxatUq9O/fH8bGxkrPHT58GCKRSEORlY09e/bA398ftWrVQu/evdG4cWPk5eXh3r17+OOPP7Bt2zZcu3YNEolE06G+lRMnTmD69OnQ0dFB3759YW9vj9zcXFy5cgVLly7F/fv3sWjRojKPIzMzE6tWrcKUKVPg5eWl9u337dsXPXv2hI6Ojtq3TUREREQVC4voRERERBVcWloaxo4di4iICKxatQodOnTQdEhlrqoVJq9evQp/f394enpi3bp1MDQ0VHp+7ty5WLt2rYaiU5/IyEjMmDED1tbW2Lp1K2rXrq14bsSIEXjy5AlOnDihuQDVICMjA/r6+pBIJJX+Cw8iIiIiKhm2cyEiIiKqwNLT0zFu3DjcvHkTK1euRMeOHd84Pi0tDV999RV8fX3h4uKCNm3aYMyYMbh586bSuGvXrmHs2LFo3rw53N3d4efnhytXrpQoppMnT2L48OHw8PBAs2bNMGHCBNy7d6/AuAcPHmD69Olo3bo13Nzc0K1bN3z//fcAgJUrV+Lbb78FAHTq1AkODg5K/aUL65UdGRmJadOmoVWrVnB3d8fgwYMLFGTlvbuDg4Oxdu1atG/fHq6urhg9ejSePHlSov0rC6tXr4ZIJMKyZcsKFNABQFdXFx9//HGxRdmMjAwEBASgQ4cOcHFxQbdu3bBp0yYIglDo+P3796Nbt25wdXXFgAEDcOnSJaXno6Oj4e/vj27dusHNzQ1eXl6YNm2ayn2+N27ciIyMDHz11VdKBXS5Bg0aYPTo0UWuv3LlSjg4OBRYXlj/8fDwcIwdOxZeXl5wc3ODr68v5s2bBwCIiopCmzZtAACrVq1SvL9WrlypWP/BgweK95M8P8ePHy/0dS9evAh/f3+0adNG8SVWYTH5+vpi4sSJuHz5sqL1UqdOnRAUFFRgn27fvg0/Pz+4ubmhffv2WLNmDQIDA9lnnYiIiKgC4kx0IiIiogoqMzMT48ePx40bN/Djjz/inXfeKXadBQsW4MiRI/Dz84OtrS2SkpJw5coVPHjwAM7OzgCAkJAQjB8/Hi4uLpgyZQpEIhH27t2L0aNH49dff4Wbm1uR2w8KCsLcuXPh4+ODWbNmITMzEzt37sTw4cOxb98+1K1bF0B+gXDEiBHQ0tLCkCFDYGNjg6dPn+Lvv//GjBkz0KVLFzx+/Bh//vkn5s2bh1q1agEATE1NC33d+Ph4DB06FJmZmRg5ciRq1aqFffv2YdKkSVixYgW6dOmiNH7Dhg0QiUT44IMPkJaWho0bN2LWrFn47bffSpR7dcrMzMT58+fRqlUr1KlTR+XtCIKASZMm4cKFC3jvvffg6OiI06dP49tvv0VsbCzmz5+vNP7SpUsIDg7GyJEjoaOjg507d2LcuHH47bffYG9vDyC/EB0aGoqePXuiTp06iI6Oxs6dOzFq1CgcPHgQNWrUKFWM//zzD+rVqwdPT0+V97MkEhISMHbsWNSqVQsTJkyAsbExoqKicPToUQD57yN/f3/4+/ujS5cuiveHvEB/7949DBs2DJaWlhg/fjz09fVx6NAhTJ48GStXrizwflq4cCFMTU0xefJkZGRkvDG2J0+eYPr06XjvvffQv39/BAYGYu7cuXB2dkaTJk0AALGxsYovEyZMmAB9fX389ttvVe4KDCIiIqKqgkV0IiIiogpq7ty5ePHiBX744Qd06tSpROucPHkSgwcPVprFPX78eMW/BUGAv78/vLy8sHHjRkXf8aFDh6Jnz5744Ycf8PPPPxe67fT0dHz11VcYNGiQUk/r/v37491338X69esVyxcvXgxBELBv3z5YW1srxs6aNQsA0LRpUzg5OeHPP/9E586dFcX3ovz000+Ij4/Hjh070KJFCwDAoEGD0KdPHyxZsgSdOnWCWPzfRZbZ2dkICgpSFCWNjY3x1Vdf4e7du4oCcnl58uQJ8vLyFAXUVyUlJUEmkykeGxoaFllIPX78OM6fP4+PP/4YkyZNApDfImXatGnYtm0b/Pz8UL9+fcX4u3fvIjAwEC4uLgCAnj174t1338WKFSuwatUqAEDHjh3x7rvvKr3OO++8gyFDhuDIkSPo169fifczLS0NsbGxJX6vvo3Q0FAkJydj06ZNcHV1VSyfMWMGAEBfXx/dunWDv78/HBwc0LdvX6X1v/rqK1hZWSEwMFCR7+HDh2PYsGFYtmxZgSK6iYkJtmzZUqL2LY8ePVJ6n3bv3h0dOnTA3r17MWfOHAD5X/IkJydj3759cHR0BAAMGDAA3bp1UzEjRERERFSW2M6FiIiIqIKKj4+Hjo4OrKysSryOsbExrl27htjY2EKfj4iIwOPHj9G7d2+8fPkSiYmJSExMREZGBtq0aYNLly4pFXVfde7cOaSkpKBnz56K9RITEyEWi+Hu7o4LFy4AABITE3Hp0iUMHDhQqYAOQOWbhZ48eRJubm6KwiQAGBgYYMiQIYiOjsb9+/eVxg8YMECpGC1fLzIyUqXXfxtpaWkA8gu7r+vcuTPatGmj+Pn777+L3M6pU6cgkUgwcuRIpeUffPABBEHAqVOnlJY3a9ZMUUAHAGtra3Tq1AlnzpyBVCoFAOjp6Smez83NxcuXL1G/fn0YGxvj1q1bKu2ngYFBqdZThZGREYD8m5jm5uaWat2kpCScP38e3bt3R1pamuJ9/PLlS/j4+ODx48cF/v8ZPHhwifuf29nZKb1PTU1N0ahRI6X33unTp+Hh4aEooANAzZo10bt371LtCxERERGVD85EJyIiIqqgvvzySyxZsgTjxo3Djh070LhxYwCAVCpFYmKi0lgTExPo6Ohg1qxZmDt3Ljp27AhnZ2d06NAB/fr1Q7169QAAjx8/BgDFjNjCpKamwsTEpMBy+bpF9bSW9/qWFwvVOeM7JiYG7u7uBZbLcxITE6P0eq8X742NjQEAKSkpRb5GTk4OkpOTVYpPW1sbNWvWLPQ5eV4KawOyZs0a5OXl4fbt2/jmm2/e+BrR0dGoXbt2gZ7qtra2iudf1aBBgwLbaNiwITIzM5GYmAgLCwtkZWVh/fr12Lt3L2JjY5V6q6empr4xntfJ40pPTy/Veqpo1aoVunXrhlWrVmHLli1o1aoVOnfujN69exfbEuXp06cQBAE//vgjfvzxx0LHJCQkwNLSUvG4uCslXlXYl14mJiZK763o6Gh4eHgUGPfqlQREREREVHGwiE5ERERUQdna2mLDhg0YPXo0PvjgA+zcuRNWVlZ49uxZgZYZ27Ztg5e
"text/plain": [
"<Figure size 1500x400 with 3 Axes>"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"K=2 | sil=0.4219 | db=0.9731\n",
" n_comptes pct\n",
"cluster_k2 \n",
"1 1269 17.7\n",
"2 5908 82.3\n",
"K=4 | sil=0.2310 | db=1.5112\n",
" n_comptes pct\n",
"cluster_k4 \n",
"1 1478 20.6\n",
"2 1820 25.4\n",
"3 1171 16.3\n",
"4 2708 37.7\n"
]
}
],
"source": [
"# 2f. Engineered ratios\n",
"dfc = df_client_base.copy()\n",
"\n",
"dfc[\"gross_flow_to_aum\"] = np.where(\n",
" dfc[\"aum_qty_mean\"] > 1,\n",
" dfc[\"gross_flow_qty_sum\"] / dfc[\"aum_qty_mean\"], np.nan\n",
")\n",
"dfc[\"flow_direction_balance\"] = np.where(\n",
" dfc[\"gross_flow_qty_sum\"] > 0,\n",
" dfc[\"net_flow_qty_sum\"] / dfc[\"gross_flow_qty_sum\"], np.nan\n",
")\n",
"dfc[\"sub_share_mean\"] = np.where(\n",
" dfc[\"gross_flow_qty_sum\"] > 0,\n",
" dfc[\"sub_qty_sum\"] / dfc[\"gross_flow_qty_sum\"], np.nan\n",
")\n",
"dfc[\"redemption_bias\"] = np.where(\n",
" dfc[\"gross_flow_qty_sum\"] > 0,\n",
" (dfc[\"red_qty_sum\"] - dfc[\"sub_qty_sum\"]) / dfc[\"gross_flow_qty_sum\"], np.nan\n",
")\n",
"dfc[\"exit_rate_per_isin\"] = np.where(\n",
" dfc[\"n_isin_total\"] > 0,\n",
" dfc[\"full_exit_count\"] / dfc[\"n_isin_total\"], np.nan\n",
")\n",
"dfc[\"aum_final_to_peak\"] = np.where(\n",
" dfc[\"aum_qty_max\"] > 0,\n",
" dfc[\"aum_qty_last\"] / dfc[\"aum_qty_max\"], np.nan\n",
")\n",
"dfc[\"aum_drawdown_last\"] = dfc[\"aum_drawdown_last\"].clip(0, 1)\n",
"\n",
"for col in [\"aum_qty_mean\", \"gross_flow_qty_sum\", \"n_tx_total\"]:\n",
" dfc[f\"log_{col}\"] = np.log1p(dfc[col].clip(lower=0))\n",
"\n",
"# Filtres qualité\n",
"dfc = dfc[(dfc[\"n_months\"] >= 6) & (dfc[\"aum_qty_mean\"] > 0)].copy()\n",
"for col in [\"aum_qty_mean\", \"gross_flow_qty_sum\", \"n_tx_total\"]:\n",
" cap = dfc[col].quantile(0.99)\n",
" dfc = dfc[dfc[col] <= cap].copy()\n",
"\n",
"# Géographie\n",
"top_countries = dfc[\"country\"].fillna(\"Unknown\").value_counts().head(10).index\n",
"top_regions = dfc[\"region\"].fillna(\"Unknown\").value_counts().head(10).index\n",
"dfc[\"country_grp\"] = np.where(dfc[\"country\"].isin(top_countries), dfc[\"country\"], \"Other\")\n",
"dfc[\"region_grp\"] = np.where(dfc[\"region\"].isin(top_regions), dfc[\"region\"], \"Other\")\n",
"\n",
"# months_since_last_tx\n",
"dfc = add_months_since_last_tx(dfc, df_month, ID_COL)\n",
"\n",
"print(f\"Accounts after quality filters: {len(dfc)}\")\n",
"\n",
"# 5a. Feature selection & preprocessing\n",
"base_features_global = [\n",
" \"flow_freq\",\n",
" \"gross_flow_to_aum\",\n",
" \"n_isin_total\",\n",
" \"avg_holding_months_per_isin\",\n",
" \"exit_rate_per_isin\",\n",
" \"flow_direction_balance\",\n",
" \"aum_qty_mean\",\n",
" \"months_since_last_tx\",\n",
"]\n",
"all_features_global = [c for c in base_features_global if c in dfc.columns]\n",
"\n",
"dfc_clean = dfc.copy() # working copy for preprocessing\n",
"\n",
"for col in [\"flow_direction_balance\", \"months_since_last_tx\"]:\n",
" if col in dfc_clean.columns:\n",
" dfc_clean[col] = dfc_clean[col].fillna(0)\n",
"\n",
"for col in [\"n_isin_total\", \"exit_rate_per_isin\",\n",
" \"avg_holding_months_per_isin\", \"months_since_last_tx\"]:\n",
" if col in dfc_clean.columns:\n",
" dfc_clean[col] = winsorize_mad(dfc_clean[col], n_sigma=3)\n",
"\n",
"col = \"gross_flow_to_aum\"\n",
"if col in dfc_clean.columns:\n",
" vals = dfc_clean[col].to_numpy(dtype=float)\n",
" vals = np.clip(vals, 0, np.nanpercentile(vals, 90))\n",
" dfc_clean[col] = np.log1p(vals)\n",
"\n",
"col = \"flow_freq\"\n",
"if col in dfc_clean.columns:\n",
" vals = dfc_clean[col].to_numpy(dtype=float)\n",
" dfc_clean[col] = np.log1p(np.clip(vals, 0, None))\n",
"\n",
"col = \"aum_qty_mean\"\n",
"if col in dfc_clean.columns:\n",
" vals = dfc_clean[col].to_numpy(dtype=float)\n",
" dfc_clean[col] = winsorize_mad(\n",
" pd.Series(np.log1p(np.clip(vals, 0, None))), n_sigma=3\n",
" )\n",
" \n",
"X_global = dfc_clean[all_features_global].copy()\n",
"X_global = X_global.loc[:, ~X_global.columns.duplicated()]\n",
"X_global = X_global.fillna(X_global.median())\n",
"\n",
"scaler_global = RobustScaler()\n",
"X_global_scaled = scaler_global.fit_transform(X_global)\n",
"\n",
"# Diagnostic\n",
"X_df = pd.DataFrame(X_global_scaled, columns=X_global.columns)\n",
"extreme = (X_df.abs() > 5).any(axis=1).sum()\n",
"print(f\"Accounts: {X_global.shape[0]} | Features: {X_global.shape[1]}\")\n",
"print(f\"Points > 5 std after scaling: {extreme} ({extreme/len(X_df):.1%})\")\n",
"\n",
"# 5b. K-selection\n",
"rows = []\n",
"for k in range(2, 8):\n",
" km = KMeans(n_clusters=k, n_init=50, random_state=RANDOM_STATE)\n",
" labels = km.fit_predict(X_global_scaled)\n",
" rows.append({\n",
" \"k\": k, \"inertia\": km.inertia_,\n",
" \"silhouette\": silhouette_score(X_global_scaled, labels),\n",
" \"davies_bouldin\": davies_bouldin_score(X_global_scaled, labels),\n",
" })\n",
"df_kdiag_global = pd.DataFrame(rows)\n",
"print(df_kdiag_global.to_string(index=False))\n",
"\n",
"fig, axes = plt.subplots(1, 3, figsize=(15, 4))\n",
"for ax, col, title in zip(axes,\n",
" [\"inertia\", \"silhouette\", \"davies_bouldin\"],\n",
" [\"Elbow / Inertia\", \"Silhouette (higher=better)\", \"Davies-Bouldin (lower=better)\"]):\n",
" ax.plot(df_kdiag_global[\"k\"], df_kdiag_global[col], marker=\"o\")\n",
" ax.set_title(title); ax.set_xlabel(\"K\")\n",
"plt.suptitle(\"K-selection — Global Clustering\")\n",
"plt.tight_layout()\n",
"plt.savefig(\"k_selection_global.png\", dpi=300, bbox_inches=\"tight\")\n",
"plt.show()\n",
"\n",
"# 5c. Final clustering K=4\n",
"RESULTS_GLOBAL = {}\n",
"for k in [2, 4]:\n",
" km = KMeans(n_clusters=k, n_init=50, random_state=RANDOM_STATE)\n",
" dfc[f\"cluster_k{k}\"] = km.fit_predict(X_global_scaled) + 1\n",
" RESULTS_GLOBAL[k] = {\n",
" \"model\": km,\n",
" \"silhouette\": silhouette_score(X_global_scaled, dfc[f\"cluster_k{k}\"]),\n",
" \"davies_bouldin\": davies_bouldin_score(X_global_scaled, dfc[f\"cluster_k{k}\"]),\n",
" }\n",
" print(f\"K={k} | sil={RESULTS_GLOBAL[k]['silhouette']:.4f} \"\n",
" f\"| db={RESULTS_GLOBAL[k]['davies_bouldin']:.4f}\")\n",
" counts = dfc[f\"cluster_k{k}\"].value_counts().sort_index()\n",
" props = counts / counts.sum() * 100\n",
" print(pd.DataFrame({\"n_comptes\": counts, \"pct\": props.round(1)}))"
]
},
{
"cell_type": "code",
"execution_count": 8,
"id": "8188f322-c38b-4f1c-b67f-899b5fb71482",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"\n",
"================================================================================\n",
"PREPROCESSING DIAGNOSTIC — Global clustering (K=4, 8 features)\n",
"================================================================================\n",
"\n",
"--- Check 1: Extreme values after scaling (threshold: 5 std) ---\n",
" n_extreme pct_extreme STATUS\n",
"flow_freq 0 0.0 OK\n",
"gross_flow_to_aum 0 0.0 OK\n",
"n_isin_total 0 0.0 OK\n",
"avg_holding_months_per_isin 0 0.0 OK\n",
"exit_rate_per_isin 0 0.0 OK\n",
"flow_direction_balance 0 0.0 OK\n",
"aum_qty_mean 0 0.0 OK\n",
"months_since_last_tx 0 0.0 OK\n",
"\n",
"→ 0 feature(s) above 1% threshold\n",
"\n",
"--- Check 2 + 3: Discriminant power (η²) and ANOVA significance ---\n",
" A feature passes if η² > 0.05 OR ANOVA p < 0.05\n",
" feature eta2 F_stat p_value note STATUS\n",
" flow_freq 0.8126 10368.87 0.00000 OK\n",
"avg_holding_months_per_isin 0.5713 3185.95 0.00000 OK\n",
" months_since_last_tx 0.4229 1751.79 0.00000 OK\n",
" flow_direction_balance 0.3431 1062.83 0.00000 OK\n",
" n_isin_total 0.2364 740.07 0.00000 OK\n",
" exit_rate_per_isin 0.1653 473.65 0.00000 OK\n",
" aum_qty_mean 0.0797 207.17 0.00000 OK\n",
" gross_flow_to_aum 0.0022 5.01 0.00181 low η² but significant ANOVA — retained OK\n",
"\n",
"→ 0 feature(s) failing both η² and ANOVA criteria\n"
]
}
],
"source": [
"# PREPROCESSING DIAGNOSTIC \n",
"def feature_diagnostic_full(dfc, features, X_scaled, feature_names_scaled,\n",
" cluster_col=None, label=\"\"):\n",
" \"\"\"\n",
" Full preprocessing diagnostic:\n",
" - Check 1: extreme values after scaling (> 5 std)\n",
" - Check 2: variance ratio eta² per feature\n",
" - Check 3: one-way ANOVA F-statistic per feature\n",
" \n",
" A feature passes if:\n",
" - Check 1: pct extreme < 1%\n",
" - Check 2 + 3 combined: eta² > 0.05 OR ANOVA p < 0.05\n",
" (a feature with low eta² but significant ANOVA is retained —\n",
" it may have modest effect size but genuine discriminant power)\n",
" \"\"\"\n",
" print(f\"\\n{'='*80}\")\n",
" print(f\"PREPROCESSING DIAGNOSTIC — {label}\")\n",
" print(f\"{'='*80}\")\n",
"\n",
" # ── Check 1 — Extreme values after scaling ────────────────────────────\n",
" print(\"\\n--- Check 1: Extreme values after scaling (threshold: 5 std) ---\")\n",
" X_df = pd.DataFrame(X_scaled, columns=feature_names_scaled)\n",
" extreme_by_feat = (X_df.abs() > 5).sum().sort_values(ascending=False)\n",
" extreme_pct = extreme_by_feat / len(X_df) * 100\n",
"\n",
" check1 = pd.DataFrame({\n",
" \"n_extreme\": extreme_by_feat,\n",
" \"pct_extreme\": extreme_pct.round(2),\n",
" \"STATUS\": [\"⚠ FAIL\" if p > 1 else \"OK\" for p in extreme_pct]\n",
" })\n",
" print(check1.to_string())\n",
" n_fail_1 = (check1[\"STATUS\"] == \"⚠ FAIL\").sum()\n",
" print(f\"\\n→ {n_fail_1} feature(s) above 1% threshold\")\n",
"\n",
" # ── Check 2 + 3 — η² and ANOVA (combined) ────────────────────────────\n",
" print(\"\\n--- Check 2 + 3: Discriminant power (η²) and ANOVA significance ---\")\n",
" print(\" A feature passes if η² > 0.05 OR ANOVA p < 0.05\")\n",
" \n",
" if cluster_col and cluster_col in dfc.columns:\n",
" rows = []\n",
" for col in features:\n",
" if col not in dfc.columns:\n",
" continue\n",
" vals = pd.to_numeric(dfc[col], errors=\"coerce\").to_numpy(dtype=float)\n",
" labels = dfc[cluster_col].to_numpy()\n",
" finite = np.isfinite(vals)\n",
" if finite.sum() < 10:\n",
" continue\n",
" vals_f = vals[finite]\n",
" labels_f = labels[finite]\n",
"\n",
" # η²\n",
" grand_mean = np.mean(vals_f)\n",
" ss_total = np.sum((vals_f - grand_mean) ** 2)\n",
" ss_between = sum(\n",
" np.sum(labels_f == k) *\n",
" (np.mean(vals_f[labels_f == k]) - grand_mean) ** 2\n",
" for k in np.unique(labels_f)\n",
" )\n",
" eta2 = ss_between / (ss_total + 1e-9)\n",
"\n",
" # ANOVA\n",
" groups = [vals_f[labels_f == k] for k in np.unique(labels_f)\n",
" if (labels_f == k).sum() > 1]\n",
" f_stat, p_val = stats.f_oneway(*groups) if len(groups) >= 2 \\\n",
" else (np.nan, np.nan)\n",
"\n",
" # Combined status: fail only if BOTH criteria fail\n",
" fail = (eta2 < 0.05) and (np.isnan(p_val) or p_val > 0.05)\n",
" note = \"\"\n",
" if eta2 < 0.05 and not fail:\n",
" note = \"low η² but significant ANOVA — retained\"\n",
"\n",
" rows.append({\n",
" \"feature\": col,\n",
" \"eta2\": round(eta2, 4),\n",
" \"F_stat\": round(f_stat, 2) if not np.isnan(f_stat) else np.nan,\n",
" \"p_value\": round(p_val, 6) if not np.isnan(p_val) else np.nan,\n",
" \"note\": note,\n",
" \"STATUS\": \"⚠ FAIL\" if fail else \"OK\",\n",
" })\n",
"\n",
" df_diag = pd.DataFrame(rows).sort_values(\"eta2\", ascending=False)\n",
" print(df_diag.to_string(index=False))\n",
" n_fail_23 = (df_diag[\"STATUS\"] == \"⚠ FAIL\").sum()\n",
" print(f\"\\n→ {n_fail_23} feature(s) failing both η² and ANOVA criteria\")\n",
"\n",
" else:\n",
" print(\"No cluster column provided — skipping.\")\n",
" df_diag = None\n",
" n_fail_23 = 0\n",
"\n",
" return check1, df_diag\n",
"\n",
"\n",
"# ── Run diagnostic on global clustering features ──────────────────────────\n",
"check1_g, diag_g = feature_diagnostic_full(\n",
" dfc = dfc,\n",
" features = base_features_global,\n",
" X_scaled = X_global_scaled,\n",
" feature_names_scaled = list(X_global.columns),\n",
" cluster_col = \"cluster_k4\",\n",
" label = \"Global clustering (K=4, 8 features)\"\n",
")"
]
},
{
"cell_type": "code",
"execution_count": 13,
"id": "1c0ea35a",
"metadata": {},
"outputs": [
{
"data": {
"image/png": "iVBORw0KGgoAAAANSUhEUgAABY4AAAGGCAYAAADcjhWTAAAAOnRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjEwLjgsIGh0dHBzOi8vbWF0cGxvdGxpYi5vcmcvwVt1zgAAAAlwSFlzAAAPYQAAD2EBqD+naQABAABJREFUeJzs3XVcVNn7wPHPkBLSJoqICgaiYCt269q92L12r7pru3bHYqyt2B1rYH5t7MLCWDEpaaTm9wfLrOOAIhLi73m/XvPanXPPPfc5M3fG4Zkzz1UolUolQgghhBBCCCGEEEIIIcS/tDI7ACGEEEIIIYQQQgghhBDfF0kcCyGEEEIIIYQQQgghhFAjiWMhhBBCCCGEEEIIIYQQaiRxLIQQQgghhBBCCCGEEEKNJI6FEEIIIYQQQgghhBBCqJHEsRBCCCGEEEIIIYQQQgg1kjgWQgghhBBCCCGEEEIIoUYSx0IIIYQQQgghhBBCCCHUSOJYCCGEEEIIIYQQQgghhBpJHAshhBAiw3Tq1IlatWpldhjfDV9fXxwcHFi8eHFmhyIyybe+JtLzNeXg4MDo0aPTZezRo0fj4OCQLmNntO/tdezh4YGLiwtBQUGZHcoXeXp64ujoyLNnzzI7FCGEEEIkQRLHQgghhPgmkZGRrF27lp9//pny5ctTokQJKleuTK9evdi1axexsbEZHtPixYvx9PTM8OMm8vLyom/fvtSqVQtHR0cqVapEy5YtmTp1Ki9evMi0uL6Gr68vixcvxtvbO7NDyVL++ecf/vjjD3766SdcXFxwdHTE1dWVXr164eHhQURERGaHmG5iY2PZsWMH3bp1o2LFijg6OlKhQgU6derEhg0biIyMzPCY1q5dy65duzL8uJklNDSUxYsX07VrV8zNzVXtixcvxsHBgdu3b2vss2bNGooWLUr79u0JCQlJ85g8PDxwcHDAwcGBwMBAtW116tTB3t6eOXPmpPlxhRBCCPHtdDI7ACGEEEJkXc+fP6d37948e/aMypUr07t3b8zNzQkICODChQuMGTOGx48fM2rUqAyNa8mSJbRo0YI6depk6HEhIUkyadIk8ufPT/PmzcmTJw+BgYH4+Phw4MABypYtS/78+QGwtrbm1q1baGtrZ3icX/Ly5UuWLFmCtbU1xYoVy+xwsoRdu3YxYcIEdHR0aNCgAe3btydbtmz4+/vj5eXFlClTOH78OKtWrcrsUNNcYGAgv/zyCzdu3KBUqVJ06dKFHDlyEBoaipeXF9OnT+fKlSssXLgwQ+Nav3491tbWtGzZMt2O8T29jj08PAgNDaVjx44p6r9gwQLc3d1xdXVlyZIlGBgYpGk8b9++Ze7cuRgaGib7pUnnzp359ddfefToEUWKFEnT4wshhBDi20jiWAghhBCpEhUVRZ8+fVQrU+vVq6e2vXfv3ty6dSvJFW5ZWVhYGMbGxklui42NZf78+eTNm5c9e/Zo9IuOjlZLnigUCvT19dM13u/V5x7HrOjChQv89ttvFClShJUrV5IrVy617X379uXFixccOnQokyJMP0qlkkGDBnHjxg1+//13OnXqpLa9W7duPHv2jMOHD2dShOkj8Rz+Xl7H8fHxbN26lapVq2JhYfHZvkqlkqlTp7Jx40YaNmzIrFmz0NPTS/OYJk+ejI2NDYULF2bfvn1J9qlbty4TJ05ky5YtjBs3Ls1jEEIIIUTqSakKIYQQQqTK9u3befr0Kd26ddNIGidycnLCzc3ts+PUqlVLI9EEcOnSJRwcHNR+Zv7hwwcWL15M/fr1KVWqFGXLlqVJkybMnDkT+K/WKMDu3btVP4/+tJbq+fPn6d69O2XLlqVkyZI0adKEzZs3JxvbvXv36NGjB2XKlKFp06bJziUoKIiQkBBKliyZZFJUT08PMzMz1f3kaqNGRkYyffp0XF1dcXJyom3btly4cCHJurCJNW7fvn3LsGHDKFeuHKVKlaJHjx48ffpUrW9YWBjz58+nTZs2VKhQAUdHR+rWrcucOXPUygjs2rWLzp07AzBmzBjVY5j4PO3atQsHBwcuXbqkMcekau5+6XF89uwZI0eOxNXVFUdHR2rVqsXMmTM1Vii+fv2aMWPGULNmTVUJkPbt27N7926NODLa7NmzgYQVnJ8mjRPlz5+fPn36pGg8Ly8vunXrRpkyZXBycqJFixZs37492f4vXrzgl19+oUyZMri4uNC/f3+Nsijx8fG4u7vj5uZGlSpVcHR0pEaNGkyYMOGb6uGePHkSLy8vGjVqlORrGcDW1pa+fft+dpzk6jUn9TqJj49n7dq1NGnSBGdnZ1xcXKhfvz5jx44lJiYGSKjR/PLlSy5fvqz2XuDr66sa5/bt2/Tv31/1eqhfvz7u7u4aJXYSY3vx4gWDBg2ifPnylClTJtn4Pm47efIkrVq1omTJkri6ujJz5swkS/gcOXKEpk2bUrJkSWrUqMGSJUs4f/68xvtgcm7dusXLly+pXr36Z/vFxsYyatQoNm7cSNu2bZk3b166JI2PHTvGiRMnmDRp0mdXYxsZGVGmTBmOHDmS5jEIIYQQ4tvIimMhhBBCpEriH/nt2rXLsGNOmjSJnTt30rx5c5ydnYmLi+PZs2eqBKaFhQWzZs1i1KhRlC1blrZt22qMsXXrViZMmEDp0qXp27cvBgYGnD9/nokTJ/LPP//w66+/qvV/9eoVXbp0oUGDBtSrV++zNWqtrKwwNDTEy8uLJ0+eYGdnl6p5Dh48mNOnT1OnTh0qV66Mr68v/fv3J1++fEn2j4iIoGPHjpQqVYqhQ4fi6+vL+vXr6devHwcOHFAlbd6+fcuOHTuoV68eP/30Ezo6Oly+fJm//voLb29vVQmFcuXK0bdvX5YtW0a7du1UCTIrK6tUzQeSfxzv3LlDly5dMDExoV27duTKlYv79++zYcMGrl+/zoYNG9DV1SU2NpZu3brx9u1bfv75Z2xtbQkLC+PBgwdcuXKFFi1apDq2b/XixQvu3r1LuXLlUv2cf+zEiRMMGDAAKysrunXrhrGxMQcPHuT333/H19eXoUOHqvWPiIigU6dOODk5MWzYMJ4/f46Hhwc3b95k9+7d5MiRA4CYmBhWrVpFvXr1qF27NgYGBty+fZudO3dy7do1du7cmaoEYuJ7QVKvt/Ti7u7OokWLqFmzJu3bt0dbWxtfX19OnDhBdHQ0urq6zJo1i+nTp2Nubq6WtE5cjXvq1CkGDBhAgQIF6N69O6ampty4cYNFixbh7e3NokWL1I4ZHh5Ox44dcXFxYciQIRr1epNy+vRpPDw8aN++Pa1ateL48eOsXr0aU1NTtZgOHTrEsGHDsLGxYcCAAWhra7Nnzx5OnDiR4sfk8uXLQMIXdsn58OEDAwcO5MSJE/Ts2ZORI0cm2S88PJwPHz6k6Lj6+voYGRmptYWFhTF58mTatWuHk5MTHh4enx3D2dmZs2fP4uPjQ6FChVJ0XCGEEEKkP0kcCyGEECJVHj16hLGxsapeb0bw9PSkWrVqqhXGnzI0NKRZs2aMGjWK/Pnz06xZM7Xt7969Y+rUqTRu3Ji5c+eq2t3c3Jg6darqIn8fz8nX15epU6fSpk2bL8anUCgYOHAgM2fO5KeffqJ48eKULl0aJycnKlWqpErgfc7p06c5ffo0bdq0YerUqar2ihUr0rt37yT3CQoKokePHvTq1UvVZmFhwezZszl//jxVq1YFEla8njp1Cl1dXbW5J9Y5vXXrFk5OTuTPn5/KlSuzbNkySpcurfE4pkZyj+PYsWPJkSMHO3bsUFulXalSJQYMGMD+/ftp2bIljx8/5unTp4wYMUJtnt+DR48eAVC0aFGNbZGRkRoXhTM3N0ehUCQ5VlxcHFOmTMHQ0JDt27erVi///PPPdO7cmRUrVtCiRQtsbW1V+wQFBdG5c2d+++03VVu5cuUYMGAAixcvZvLkyUDCivezZ8+SLVs2Vb8OHTrg7OzM77//jqenJ40aNUr1/DOyFranpyeFChVi2bJlau0jRoxQ/X+zZs1YuHAhVlZWGufwhw8f+O233yhVqhTr1q1DRyfhz6L27dtTtGh
"text/plain": [
"<Figure size 1600x400 with 2 Axes>"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"# 5d. Cluster profiles\n",
"profile_vars_behavior = [\n",
" \"gross_flow_to_aum\", \"flow_freq\", \"flow_direction_balance\",\n",
" \"n_isin_total\", \"avg_holding_months_per_isin\", \"exit_rate_per_isin\",\n",
" \"aum_qty_mean\", \"months_since_last_tx\",\n",
"]\n",
"profile_vars_behavior = [c for c in profile_vars_behavior if c in dfc.columns]\n",
"\n",
"prof_global = plot_heatmap_annotated(\n",
" dfc,\n",
" profile_vars = profile_vars_behavior,\n",
" cluster_col = \"cluster_k4\",\n",
" title = \"Cluster Signatures — Global Clustering (K=4)\\n\"\n",
" \"Color: robust z-score | Values: cluster medians\",\n",
" figsize = (16, 4)\n",
")"
]
},
{
"cell_type": "markdown",
"id": "9fb2786e",
"metadata": {},
"source": [
"---\n",
"### 5e. Cross-Analysis"
]
},
{
"cell_type": "code",
"execution_count": 14,
"id": "f276e2b4-2290-449e-ad67-1d5953073ec5",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Saved: Figures/donut_geography_global.png\n"
]
},
{
"data": {
"image/png": "iVBORw0KGgoAAAANSUhEUgAABv0AAAIgCAYAAACmv6VnAAAAOnRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjEwLjgsIGh0dHBzOi8vbWF0cGxvdGxpYi5vcmcvwVt1zgAAAAlwSFlzAAAPYQAAD2EBqD+naQABAABJREFUeJzs3Xd8U9X7B/BP0r0HXXQzW0bZW1RAkKGoLAUHiKiooCgqAl/3YigqoiJ7yt6UIUP2KlCgdEHp3rtNd9b9/RGSH6GDFtqmST7v16sv7c0dzy1pT+55znmOSBAEAURERERERERERERERESkt8S6DoCIiIiIiIiIiIiIiIiIHg2TfkRERERERERERERERER6jkk/IiIiIiIiIiIiIiIiIj3HpB8RERERERERERERERGRnmPSj4iIiIiIiIiIiIiIiEjPMelHREREREREREREREREpOeY9CMiIiIiIiIiIiIiIiLSc0z6EREREREREREREREREek5Jv2IiIiIiIiIiIiIiIiI9ByTfkRERERkUAYNGoTXXnutVvvu2rULAQEBuHTpUgNH9WCvvfYaBg0apPNr6iIOXV73XrNnz0ZAQIBOY6hJSkoKAgICsGTJEl2HYnQuXbqEgIAA7Nq1SyfH12TJkiUICAhASkpKvZ/b0N5zTel3vKKiAoMGDcKvv/6q61AeSBAEjBo1CnPmzNF1KERERET0AKa6DoCIiIiIalZRUYEdO3bg33//xe3bt1FUVAQrKyv4+fmhT58+GD16NFq1aqXrMOmu2bNnY/fu3Zrvzc3NYWdnhxYtWqBHjx4YO3YsfHx86u16UVFROHbsGEaNGgVvb+96O29DWLt2Lezt7TF69Ghdh9KkxMfHY926dbh48SIyMjKgVCrRvHlz9OrVC+PGjUOnTp0aNR59ek89LIVCgeDgYAQHByMyMhKFhYUwMzODj48Pevbsieeff77Rf+6Nie853VuzZg0kEgneeOMNre0BAQEYMGAAli1bprU9MzMTb7zxBhISEjBv3jw899xz9RpPWVkZnn32WaSkpOCVV17Bl19+qXlNJBJh+vTpmD59OiZOnIh27drV67WJiIiIqP4w6UdERETUhCUnJ2Pq1KmIjY1Fr1698Prrr8PV1RWlpaWIiorCzp07sXr1apw8eRLu7u66DlfvPP/883jmmWdgZmZW7+f++uuvYW1tDYVCgfz8fISFhWH16tVYtWoVPv74Y0yePFlr/1WrVj3UdaKiovDHH3+gV69ede4sf9hrPqz169fDy8uryqRfY8fSVGzfvh3ffPMNzM3N8eyzzyIwMBCmpqaIj4/HkSNHsG3bNhw4cACtW7dutJge5T2lD/Ly8jBt2jSEhoaiQ4cOmDBhAjw8PFBRUYHY2FgcP34cGzduxJYtW9C1a1ddh1vvjPk999133+Gbb75psPPXVnl5OVatWoXRo0fDwcHhgfsnJiZi8uTJyM3NxZ9//okBAwbUe0y///478vLyqn39qaeegpeXF5YuXYrff/+93q9PRERERPWDST8iIiKiJqq8vBxvv/02kpOT8ccff2DIkCGV9qmoqMDatWsbP7iHVFxcDFtbW12HoWFiYgITE5MGOffQoUPh7OystS0tLQ1Tp07F/Pnz4e7ujhEjRmheMzc3b5A47qdQKCCVSmFlZdVo16yNphRLYzl//jy+/PJLtG7dGitXrqyUuP/444+xceNGHUXXMARBQGlpKWxsbHR2/RkzZiA0NBRffPEFXn311Ur7zJ07F7t27TLI96Sxv+caYoDHw9i/fz8kEgleeOGFB+5769YtTJkyBWVlZVi5ciV69uxZ7/FERERg3bp1+PTTTzF//vxq93vuueewfPlyZGdnw9XVtd7jICIiIqJHxzX9iIiIiJqo7du3Iy4uDlOmTKky4QcAFhYWmDp1aqWO26KiIvz0008YMmQIOnbsiD59+mDmzJlITk6udI68vDx88803ePLJJ9GxY0c8+eST+Oabb5Cfn19p35SUFLz//vvo1q0bunXrhnfffRfJyclVrqMXEBCA2bNn48KFC5gwYQK6du2Kd999F4CqTNn8+fPx/PPPo2fPnggKCsKIESOwfPlyKBQKrfOo1907f/48lixZgoEDB6Jjx44YOXIkDhw4UO3PLzY2Fm+//Ta6du2K7t2744MPPkB2dnaV575/TT+pVIoVK1bg+eefR+fOndG9e3eMHj36kTvDPT098fvvv0MsFldax6mqNe1iYmLwwQcf4PHHH0fHjh3x2GOP4bXXXsPJkycBqNbyUq+xNHHiRAQEBGh+7vfe3/nz5/Hnn39i8ODB6NSpEw4dOlTtNdWSk5Px7rvvonv37ujWrRumTZtW6f1T05qI9587ICAAqampCAkJ0cR57zpk1cVy+fJlTJ48Gd27d0enTp0watQobN++vdrrZWZmYubMmejZsyc6d+6MKVOmID4+vsp7rE5eXh5mzZqF3r17o0uXLpg0aRIiIiI0r+fm5qJjx474+OOPqzz+m2++QWBg4APXWPv5558hCAJ+/fXXKmfqmpqa4vXXX69xxlVNa8VVtX7Zo76nANXvx99//41nnnkGQUFB6NGjB9555x1ERkZWG9s///yDESNGICgoCKtXr67x59KQTpw4gZCQEDz77LNVJvwA1c/9xRdfRIcOHR54vtLSUixatAiDBw/W/DxnzZqF1NTUao/ZsGEDhg4diqCgIAwdOhQbNmyotE9YWBhmz56NoUOHonPnzujatSvGjx+Po0eP1v5mq2Ds77mq4lNvKyoqwldffYW+ffsiKCgI48ePx40bNyrdY35+PubMmYPevXuja9eumDhxIiIjI+u0Lunhw4fh6uqK9u3b17jftWvX8Nprr0GhUGD9+vUNkvBTKBT44osv8Pjjj1f7WUPtiSeegEwmw7Fjx+o9DiIiIiKqH5zpR0RERNRE/fvvvwCAsWPH1um4oqIijB8/HmlpaRgzZgzatGmD7OxsbNq0CePGjcPOnTvh5eWl2XfChAlITEzEmDFj0L59e0RFRWHz5s24ePEitm/frpmZl5+fj1deeQW5ubkYP348WrZsiatXr2LSpEkoLS2tMpbw8HD8+++/ePHFFzFq1CjN9lu3buHIkSMYMmQIfH19IZPJcObMGSxatAgpKSn49ttvK53r559/RmlpKSZMmABAlXCaOXMmKioqKpWLzMzMxMSJEzF48GDMmjUL0dHR2Lp1K4qLix+YcJBKpZgyZQpCQkLQv39/PPfcc7CwsMDt27dx5MiRahMFtaVe2y8kJARxcXFo2bJllfvl5+dj0qRJAIDx48fD09MT+fn5CA8Px40bNzBgwAAMGTIE2dnZ2Lp1K9555x3NuXx9fbXOtWDBAsjlcrz44ouwsbFBixYtaoyxtLQUr732Gjp16oSZM2ciMTERmzZtwo0bN7B79+6HmuGxcOFCzJs3D05OTnjnnXc02++fDXmv//77D9OnT4eLiwsmT54MW1tbHDhwAJ9//jlSUlLw0UcfVYr71VdfRefOnfHRRx8hJSUF69evx3vvvYfg4OBaz+p888034eDggOnTpyMnJwcbN27Eq6++iq1bt6Jt27Zo1qwZBg0ahKNHj0IikcDe3l5zbEVFBYKDg9GvX78ayxQmJycjIiICPXr0aLQyivXxnpLJZJgyZQquXbuG559/Hq+88gqKi4uxbds2TJgwARs3bkRQUJDWddetW4eCggKMGzcOrq6u8PDwaJT7rYr67+q4ceMe+Vzqn0VoaCiGDh2KyZMnIzExEZs3b8a5c+ewc+fOSve6ceNGZGdn46WXXoKtrS2Cg4Px/fffo7CwENOnT9fsd/ToUcTFxWHYsGHw8vJCQUEBdu/ejenTp+Pnn3/GyJEj6xwv33M1mzJlCpydnTFt2jQUFBRgzZo1ePvtt3H8+HFNOyiVSjF58mRERUVh9OjRCAoKwq1btzB58uRalekEVEm20NBQ9OnTp8b9zp07h+nTp8PBwQGrV6+usq1QKpUoKCio1XUBwNHREWKx9tjvtWvXIi4urlYlO9u3bw9zc3OEhIRo2mIiIiIialqY9CM
"text/plain": [
"<Figure size 1800x500 with 4 Axes>"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"Saved: Figures/donut_asset_global.png\n"
]
},
{
"data": {
"image/png": "iVBORw0KGgoAAAANSUhEUgAABv0AAAIgCAYAAACmv6VnAAAAOnRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjEwLjgsIGh0dHBzOi8vbWF0cGxvdGxpYi5vcmcvwVt1zgAAAAlwSFlzAAAPYQAAD2EBqD+naQABAABJREFUeJzs3Xd4U+XbB/DvSZpuyqZA2cOylyIIKMgQFQVkKwgoIIqDnygIvG5BREFkKFsQEJC9p2xKobSMlpYWaEsn3XtmnfePkEhswbYkORnfz3X1Ak5OzrlT0j45536e+xZEURRBRERERERERERERERERDZLJnUARERERERERERERERERPR4mPQjIiIiIiIiIiIiIiIisnFM+hERERERERERERERERHZOCb9iIiIiIiIiIiIiIiIiGwck35ERERERERERERERERENo5JPyIiIiIiIiIiIiIiIiIbx6QfERERERERERERERERkY1j0o+IiIiIiIiIiIiIiIjIxjHpR0RERERERERERERERGTjmPQjIiIiIovr3bs33nzzTanDIDtn7e+zXbt2wdfXF5cuXZI6FIezdOlS+Pr6Ij4+XpLnP8qbb76J3r17m/y4gP2956zpZ/zWrVto1aoV/Pz8pA7lP6WmpqJ9+/bYvXu31KEQERERkYk5SR0AEREREZnGpUuXMHbsWMO/ZTIZPD094e3tjdatW2PAgAF49tlnIQiChFFav6VLl6Jly5bo27dvuZ+r0WjQq1cvpKSk4KOPPsL7779vhggrrjyvrXfv3khISCjTcTds2IAuXbo8bnh2w9/fH1u3bsW1a9eQnp4OhUKBxo0b49lnn8Xrr7+O2rVrWzSev//+Gzdv3sSHH35o0fNaUm5uLrZs2YKTJ08iOjoaeXl58PT0RJMmTdC9e3cMGzbM4t93S+J7Tno//PADOnXqhO7duxu26cflGTNmYMKECUb7BwQE4L333oObmxvWrFmDFi1amDSe8PBwDB06FGq1GosXL8aLL75oeKxmzZoYNWoUFi1ahBdffBFubm4mPTcRERERSYdJPyIiIiI788orr+C5556DKIrIz89HdHQ0Tpw4gT179qBbt25YvHgxvLy8JI3xyJEjkp7/UZYtW4bXXnutQkm/s2fPIiUlBQ0aNMDu3bsxZcoUq0qylue1zZ49G/n5+YZ/R0VFYcWKFejXrx/69etntG/Tpk1NHqst0mq1+PLLL7F9+3b4+PjglVdeQaNGjaBUKhEaGoo///wT27Ztg7+/v0Xj+vvvv7F79267TcDcuHEDU6ZMQUpKCnr27Il33nkHVapUQW5uLkJCQrB27VqsWLECN27ckDpUk3P095y1jCVXr16Fn58ffv311zLtf+rUKUydOhU1atTA+vXr0aBBA5PGo9Vq8cUXX8DZ2RlqtbrUfd5880388ccf2LVrF0aPHm3S8xMRERGRdJj0IyIiIrIzrVq1wqBBg4y2zZo1Cz/99BPWrVuHadOmYc2aNRJFp+Ps7Czp+c1lx44daNCgAWbOnIkpU6bg0qVL6Nq1q9RhVci/E4OXLl3CihUr4OvrW+L9RTpLly7F9u3b8corr2DevHkl3uczZ87EsmXLJIrOPFQqFbRaLVxcXCQ5f1paGiZPnozi4mJs2rQJTz31VIl9cnNzy5yMsTWO/p6zlrFk8+bNqFq1Knr27Pmf++7fvx8zZ85E48aNsXbtWnh7e5s8no0bN+LOnTuYMGECli5dWuo+9erVw1NPPYWtW7cy6UdERERkR9jTj4iIiMgByOVyzJw5E08++STOnTuHwMBAo8fj4+Mxffp0dOvWDW3atEHfvn3x888/o7Cw0Gg/fR+rO3fuYO7cuejRowfat2+PcePGISoqCgBw7NgxvPbaa2jXrh169+6Nv/76q0Q8pfVh0m+LjIzEO++8g44dO+LJJ5/ERx99hNTUVKN9k5OT8cMPP2DQoEHo3Lkz2rZti5dffhmrVq2CRqMx2lffw8rf3x9r165F37590aZNG/Tv39+on1F8fDx8fX0BALt374avr6/hqyzS0tJw+vRpDBo0CD179kT16tWxY8eOUve9cuUKJk6ciO7du6Nt27Z49tlnMWnSJFy7ds2wT1ZWFr7//nv07dsXbdu2RZcuXTBkyJBSE7aHDh3C66+/jo4dO6J9+/YYPny40QqYx31tDzNw4ED06tULWq22xGOHDx+Gr68v9uzZA0CXNPT19cWuXbuwceNG9O/fH23btkX//v2xcePGUo9/9+5dTJ8+HT169ECbNm3Qu3dvzJ8/HwUFBeWKMzQ0FGPHjkXHjh3x9NNP47PPPkN6errh8ePHj8PX1xfbtm0r9fkDBgxAv379IIriQ8+Rnp6OtWvXwsfHB99//32pyQgvLy/Mnj37kbE+qldcaT83p0+fxpgxY9ClSxe0a9cOvXr1wgcffIDo6GgAutU8+vf5g//vu3btMhwjJSUFX331FXr16oU2bdqgR48e+OKLL4y+Rw/Gdvv2bcybNw/PPfcc2rVrZ/S+tbQ1a9YgLS0NM2bMKDXhBwCVKlXCzJkzy3S8sv4u1CssLMScOXPQvXt3tGvXDsOHDy91Vd2hQ4fw7rvvGr7HXbp0wZQpUxAeHl72F/svfM89/lgC6Mpgvv322+jQoQO6dOmCzz77DBkZGfD19S3T+0atVuPvv/9Gt27doFAoHrnv5s2bMX36dLRq1QqbNm0yS8Lv3r17+OWXX/DBBx+gbt26j9z3ueeew61btxAZGWnyOIiIiIhIGlzpR0RERORAhg0bhqCgIJw5c8ZwgzwhIQHDhw9Hbm4u3njjDTRs2BABAQFYuXIlrly5gvXr18PJyfhj42effQZ3d3dMnjwZGRkZWLduHSZOnIiPPvoICxYswKhRozB06FDs2LEDX375JZo2bfrQG/IPSk5OxtixY9G3b1/MmDED4eHh+Ouvv5CXl4fff//dsF9ERASOHTuGfv36oUGDBlCpVDh37hwWLlyI+Ph4fPvttyWOvWjRIhQVFWHkyJFwdnbGli1bMHPmTDRo0ABPPvkkqlWrhh9//NGQPBgxYkS5vrd79uyBRqPB4MGD4eTkhFdffRVbt25Fbm4uKlWqZNgvKioKb7/9NmrUqIGxY8eievXqSE9PR1BQEMLDw9GhQwcAwNSpUxEYGIhRo0bB19cXRUVFiIyMREBAACZOnGj0ulasWIFnn30WU6dOhUwmw/HjxzF16lR8+eWXGD169GO/tocZMWIEvvvuO/j5+eHZZ581emzHjh2oVKmSUR8pANi0aRNSU1MxcuRIeHp64sCBA5gzZw6ys7PxwQcfGPa7ceMGxo0bBy8vL4wcORLe3t4IDw/Hxo0bcfXqVWzcuPE/b7ADQFJSEsaPH48XXngB/fv3R1hYGHbu3IkbN25gx44dcHNzw/PPP4+aNWti586dJb43165dw507d/Dxxx8/slTr6dOnUVxcjEGDBlls1Zu+J1jz5s0xefJkVKpUCSkpKfD390dsbCwaN26Md999F1qtFoGBgfjxxx8Nz+3UqRMAIDExESNHjoRKpcKwYcPQoEEDxMTEYMuWLbh06RJ27txp9P4FgE8//RSurq54++23Aej6g0nl2LFjcHZ2Nsnq04r+LpTJZJg0aRLy8vLw119/YeLEiVi9ejW6detm2G/Tpk2oUqUKRowYgZo1ayI2Nhbbtm3D66+/jt27d6NRo0bljpfvuYcr61hy9+5djB49GlqtFm+++Sa8vb1x5swZo9+x/yU0NBQFBQVo167dI/dbuXIlfv75Z3Tt2hW//fYbPDw8SuyjVCqRl5dXpvPK5XJUrly5xPavv/4a9evXx7hx47Bv375HHkM/3gQEBLBMMxEREZGdYNKPiIiIyIHoV3bdvXvXsO3nn39GRkYGVq1aZShNNnr0aMyfPx+///47du/ejeHDhxsdp2bNmli+fLkhCVK1alXMnTsX3377LQ4ePIg6deoAAF5++WX07NkTmzdvLlPSLyYmBosWLcLLL79s2CaTybB582ZERUWhSZMmAICnn34aJ06cMErCjB8/HtOnT8f27dvxwQcfoFa
"text/plain": [
"<Figure size 1800x500 with 4 Axes>"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"Saved: Figures/donut_fund_global.png\n"
]
},
{
"data": {
"image/png": "iVBORw0KGgoAAAANSUhEUgAABv0AAAIgCAYAAACmv6VnAAAAOnRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjEwLjgsIGh0dHBzOi8vbWF0cGxvdGxpYi5vcmcvwVt1zgAAAAlwSFlzAAAPYQAAD2EBqD+naQABAABJREFUeJzs3Xd8VFX6x/HPpDfSSOi9GHpdmqKrgA17QbFhWSyrrq7YkPVnb7g2BEVBBEFRpCkdpUjvvZcAIQXSe5t2f3/EmSUmQAJJJpN8369XXpA7d+59bjKZM/c85zzHZBiGgYiIiIiIiIiIiIiIiIi4LQ9XByAiIiIiIiIiIiIiIiIiF0dJPxERERERERERERERERE3p6SfiIiIiIiIiIiIiIiIiJtT0k9ERERERERERERERETEzSnpJyIiIiIiIiIiIiIiIuLmlPQTERERERERERERERERcXNK+omIiIiIiIiIiIiIiIi4OSX9RERERERERERERERERNyckn4iIiIiIiIiIiIiIiIibk5JPxERERGp0QYMGMADDzzg6jBqvAceeIABAwacd7+4uDiioqIYO3ZsFUR1YUaOHElUVJSrw6iVyvo6qqznn0tUVBQjR46slGPXpNdcdfsbnz59Oj169CA9Pd3VoZzXsmXL6NSpEydOnHB1KCIiIiLiprxcHYCIiIiIuJ9NmzYxbNgw5/ceHh4EBQVRv359OnbsyA033MDll1+OyWRyYZTV39ixY2nfvj2DBg0q0/5xcXEMHDjwrI9/8skn3HDDDRUVXo1itVr55ZdfWLhwIQcOHCAnJ4fAwEAuueQSrrnmGu688078/f2rNKYpU6YQHBzM7bffXqXnrUonT55k2rRpbNiwgYSEBMxmM6GhobRv356rrrqKW2+9lYCAAFeHWSn0mnO97Oxsxo4dy0MPPURYWJhz+9ixYxk3bhyzZs2ic+fOxZ4zefJkRo8eTbdu3ZgwYQLBwcEVGtP06dN58803AdiwYQPh4eHOxwYNGsQll1zCRx99xLhx4yr0vCIiIiJSOyjpJyIiIiIX7MYbb+SKK67AMAxyc3M5fvw4y5cv55dffuHSSy9lzJgxFd5hWl5Llixx6fnPZdy4cdx2221lTvo5XHbZZdxyyy0ltnfr1q2CIqtZ0tLS+Oc//8nOnTvp2rUrDz74IJGRkWRnZ7Nlyxbef/99tm7dypgxY6o0rqlTp9K4ceMam4CZM2cOr7/+Ol5eXlx33XUMHToUPz8/UlJS2LJlC2+//TbLly9n0qRJrg61wtXm11zjxo3ZvXs3np6elXaOspo+fTrZ2dncf//9Zdr/s88+Y/z48fTv359x48ZVeFI2MTGRjz/+mICAAPLy8krdZ9iwYbz88sscOXKEtm3bVuj5RURERKTmU9JPRERERC5Yhw4dSiSfXnnlFf773/8yefJkRowYwTfffOOi6Ir4+Pi49PyVoUWLFqUm/aQkwzB45pln2LlzJ6+++mqJUq8PP/wwJ06cqNbJ4QuRk5NDUFCQy86/YcMG/vOf/9C2bVsmTpxI/fr1iz3+xBNPEBsby6JFi1wUYeWp7a85k8mEr6+vq8PBbrczY8YMLr/88mKz6UpjGAbvvPMO33//Pddffz0ffvhhpbQdb731Fs2aNaNNmzbMmzev1H2uvvpq3njjDX766Sf+7//+r8JjEBEREZGaTWv6iYiIiEiF8vT0ZOTIkfTs2ZM1a9awdevWYo/HxcXx4osvcumll9KpUycGDRrEJ598Qn5+frH9xo4dS1RUFEePHuXdd9+lf//+zhkzx44dA+C3337jtttuo0uXLgwYMIAZM2aUiKe0Nf0c26Kjo3nsscfo3r07PXv25JlnniE5ObnYvomJiXzwwQfccsst9OrVi86dOzN48GAmTJiAzWYrtu+cOXOIiopiw4YNTJo0iUGDBtGpUyeuvfZa5s6dW+xn4Fi/a+7cuURFRTm/Lta51tNy/Ezj4uKc2xxriWVnZ/P666/Tr18/OnfuzNChQ9m1a1eJY2RmZvLqq6/Sp08funXrxgMPPMDevXsvKNYFCxZw00030blzZ6688krGjh2L1Wp1Pv7OO+8QFRVV6vpWSUlJdOjQgVdeeeWc51i5ciVbtmxh8ODBZ13bsUWLFjzxxBPnPM7Z1oor7edtt9uZMmUKN910E927d6dHjx5ce+21jBo1CovFAhStDxcfH8/mzZuL/f7P/N3s2bOHp556ij59+jhfR+PHjy/2MzozttjYWJ555hl69+5Nz549z3k9le2///0vUDRz6q8JP4emTZvy+OOPl+l4W7Zs4eGHH6Znz5506dKF2267jZkzZ551/9jYWP75z3/Ss2dPevTowVNPPUVsbGyxfex2O+PHj+e+++7jsssuo1OnTlx55ZW8/vrrF7X+W21/zZUW35nbVq5cyR133EHnzp3p378/o0ePLnF8gKVLl3LzzTc73x/GjRvH+vXriYqKYs6cOef82QHs3r2b+Ph4/v73v59zP6vVyksvvcT333/PXXfdxSeffFIpCb/ff/+dFStW8Oabb55zFmRgYCA9e/Zk6dKlFR6DiIiIiNR8muknIiIiIpXizjvvZNu2baxatYq//e1vAMTHxzNkyBCys7O59957ad68OZs3b+brr79m+/btTJkyBS+v4h9RX375ZQICAnj88cdJS0tj8uTJDB8+nGeeeYaPPvqIoUOHcscddzBr1ixee+01Wrdu7TzfuSQmJjJs2DAGDRrESy+9xMGDB5kxYwY5OTl8++23zv0OHTrEb7/9xtVXX02zZs2wWCysWbOGjz/+mLi4ON56660Sx/70008pKCjg7rvvxsfHhx9//JGRI0fSrFkzevbsSXh4OB9++CEvvfQSf/vb37jrrrvK9bMtLCwkLS2t2DZvb2/q1KlTruOc6R//+Afh4eE89dRTZGRkMHnyZB577DGWL1/unDFmsVj4xz/+wZ49e7jlllvo2rUrBw8e5OGHHyY0NLRc51uxYgWxsbHcd999REREsGLFCsaNG0dCQgLvv/8+AHfddRfTpk1j9uzZPP/888We/8svv2Cz2RgyZMg5z+PoOC/vz/hijB8/ns8//5yrrrqKoUOH4unpSVxcHCtWrMBsNuPt7c2HH37I+++/T1hYWLHkj2NG0h9//MHTTz9N8+bNeeSRRwgJCWHnzp18/vnnHDhwgM8//7zYOXNzc7n//vvp0aMH//73v0u8PqpSbGws+/bto1evXrRq1eqij7dixQqefvppIiIiePjhhwkKCmLhwoW8+uqrxMXF8dxzzxXbPy8vjwceeIAuXbowYsQIYmJimD59Ort27WLu3LlERkYCRa/nSZMmcc011zBw4ED8/f3Zs2cPs2fPZvv27cyePfuCkj96zZ3dqlWrmD59uvN9e/ny5Xz77beEhIQUi2nRokWMGDGCZs2a8fTTT+Pp6ckvv/zCihUryvwz2bx5MwBdunQ56z6FhYX861//YsWKFQwfPpwXX3yx1P1yc3MpLCws03l9fX0JDAwsti0nJ4e33nqLu+++my5dujB9+vRzHqN79+6sXbuW6OhoWrduXabzioiIiIiAkn4iIiIiUkkcs9bOnKX1ySefkJaWxoQJE5yzL+677z5Gjx7Nt99+y9y5c0skcSIjIxk/fjwmkwmAsLAw3n33Xd566y0WLlxIw4YNARg8eDB///vfmT59epmSfjExMXz66acMHjzYuc3Dw4Pp06dz7NgxZ7Kid+/eLF++3Hl+gIceeogXX3yRmTNn8vTTT1OvXr1ixzabzcyaNcuZMLjuuusYOHAgP/zwAz179iQgIIBbbrmFl156iaZNm5a7VOesWbOYNWtWsW1du3bl559/LtdxztShQwfeeOMN5/etW7fm3//+NwsWLGDo0KFA0UxGx0ygZ555pti+77//Po0bNy7z+Q4ePMisWbPo2LEjAPfffz9PP/00c+bM4e6776Zbt25ccskldO/enblz5/Lvf/+72OyY2bNn07p1a3r06HHO8xw5cgSA9u3blzm2i7Vs2TJat27NV199VWz7Cy+84Pz/LbfcwpgxY4iIiCj
"text/plain": [
"<Figure size 1800x500 with 4 Axes>"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"# ============================================================\n",
"# DONUT CHARTS — Geographic, Asset-type, Fund distribution\n",
"# ============================================================\n",
"\n",
"def plot_donuts_by_cluster(dfc, var_col, cluster_col, title,\n",
" top_n=6, figsize=(18, 5), save_path=None):\n",
" clusters = sorted(dfc[cluster_col].unique())\n",
" n_clusters = len(clusters)\n",
" fig, axes = plt.subplots(1, n_clusters, figsize=figsize)\n",
"\n",
" all_cats = dfc[var_col].fillna(\"Unknown\").value_counts().head(top_n + 1).index.tolist()\n",
" palette = sns.color_palette(\"tab10\", len(all_cats))\n",
" color_map = {cat: palette[i] for i, cat in enumerate(all_cats)}\n",
"\n",
" for ax, cluster in zip(axes, clusters):\n",
" subset = dfc[dfc[cluster_col] == cluster][var_col].fillna(\"Unknown\")\n",
" counts = subset.value_counts()\n",
" counts = counts.reindex(all_cats, fill_value=0)\n",
" counts = counts[counts > 0]\n",
" colors = [color_map[c] for c in counts.index]\n",
" n = subset.shape[0]\n",
"\n",
" wedges, texts, autotexts = ax.pie(\n",
" counts.values,\n",
" labels=None,\n",
" colors=colors,\n",
" autopct=lambda p: f\"{p:.1f}%\" if p > 5 else \"\",\n",
" pctdistance=0.75,\n",
" wedgeprops=dict(width=0.5, edgecolor=\"white\", linewidth=1.5),\n",
" startangle=90,\n",
" )\n",
" for autotext in autotexts:\n",
" autotext.set_fontsize(8)\n",
"\n",
" ax.set_title(f\"Cluster {cluster}\\n(n={n:,})\", fontsize=11, pad=8)\n",
"\n",
" legend_patches = [\n",
" plt.matplotlib.patches.Patch(color=color_map[c], label=c)\n",
" for c in all_cats if c in dfc[var_col].fillna(\"Unknown\").values\n",
" ]\n",
" fig.legend(\n",
" handles=legend_patches,\n",
" loc=\"lower center\",\n",
" ncol=min(len(legend_patches), 7),\n",
" fontsize=15,\n",
" frameon=False,\n",
" bbox_to_anchor=(0.5, -0.05)\n",
" )\n",
" fig.suptitle(title, fontsize=13, y=1.02)\n",
" plt.tight_layout()\n",
"\n",
" if save_path:\n",
" fig.savefig(save_path, dpi=150, bbox_inches=\"tight\")\n",
" print(f\"Saved: {save_path}\")\n",
"\n",
" plt.show()\n",
"\n",
"\n",
"# ── 1. Geographic distribution ────────────────────────────────────────────\n",
"dfc[\"country_grp_clean\"] = dfc[\"country_grp\"].str.strip().str.title()\n",
"top_countries = dfc[\"country_grp_clean\"].value_counts().head(6).index\n",
"dfc[\"country_grp_clean\"] = np.where(\n",
" dfc[\"country_grp_clean\"].isin(top_countries),\n",
" dfc[\"country_grp_clean\"], \"Other\"\n",
")\n",
"\n",
"plot_donuts_by_cluster(\n",
" dfc,\n",
" var_col = \"country_grp_clean\",\n",
" cluster_col = \"cluster_k4\",\n",
" title = \"Geographic Distribution by Cluster — Global Clustering (K=4)\",\n",
" top_n = 6,\n",
" figsize = (18, 5),\n",
" save_path = \"Figures/donut_geography_global.png\"\n",
")\n",
"\n",
"# ── 2. Asset-type distribution ────────────────────────────────────────────\n",
"asset_cols = [c for c in dfc.columns if c.startswith(\"share_asset_\")]\n",
"\n",
"if asset_cols:\n",
" dfc[\"dominant_asset\"] = (\n",
" dfc[asset_cols]\n",
" .rename(columns=lambda c: c.replace(\"share_asset_\", \"\")\n",
" .replace(\"_\", \" \").title())\n",
" .idxmax(axis=1)\n",
" )\n",
" dfc.loc[dfc[asset_cols].sum(axis=1) == 0, \"dominant_asset\"] = \"Not exposed\"\n",
"\n",
" plot_donuts_by_cluster(\n",
" dfc,\n",
" var_col = \"dominant_asset\",\n",
" cluster_col = \"cluster_k4\",\n",
" title = \"Dominant Asset Type by Cluster — Global Clustering (K=4)\",\n",
" top_n = 6,\n",
" figsize = (18, 5),\n",
" save_path = \"Figures/donut_asset_global.png\"\n",
" )\n",
"\n",
"# ── 3. Fund distribution ──────────────────────────────────────────────────\n",
"fund_name_map = {\n",
" \"share_fund_carmignac_patrimoine\": \"Patrimoine\",\n",
" \"share_fund_carmignac_sécurité\": \"Sécurité\",\n",
" \"share_fund_carmignac_investissement\": \"Investissement\",\n",
" \"share_fund_carmignac_portfolio_sécurité\": \"Port. Sécurité\",\n",
" \"share_fund_carmignac_portfolio_flexible_bond\": \"Port. Flexible Bond\",\n",
" \"share_fund_carmignac_emergents\": \"Emergents\",\n",
" \"share_fund_carmignac_portfolio_patrimoine\": \"Port. Patrimoine\",\n",
" \"share_fund_carmignac_portfolio_global_bond\": \"Port. Global Bond\",\n",
" \"share_fund_carmignac_portfolio_global_bon\": \"Port. Global Bond\",\n",
" \"share_fund_carmignac_portfolio_credit\": \"Port. Credit\",\n",
" \"share_fund_carmignac_portfolio_emerging_patrimoine\": \"Port. Emerg. Patrim.\",\n",
" \"share_fund_carmignac_portfolio_grande_europe\": \"Port. Grande Europe\",\n",
" \"share_fund_carmignac_court_terme\": \"Court Terme\",\n",
" \"share_fund_carmignac_portfolio_long-short_european\": \"Port. L/S Europe\",\n",
" \"share_fund_carmignac_portfolio_climate_transition\": \"Port. Climate Trans.\",\n",
" \"share_fund_carmignac_credit_2027\": \"Credit 2027\",\n",
"}\n",
"\n",
"fund_cols = [c for c in dfc.columns if c.startswith(\"share_fund_\")]\n",
"\n",
"if fund_cols:\n",
" fund_renamed = dfc[fund_cols].rename(columns=fund_name_map)\n",
" dfc[\"dominant_fund\"] = fund_renamed.idxmax(axis=1)\n",
" dfc.loc[dfc[fund_cols].sum(axis=1) == 0, \"dominant_fund\"] = \"Not exposed\"\n",
"\n",
" plot_donuts_by_cluster(\n",
" dfc,\n",
" var_col = \"dominant_fund\",\n",
" cluster_col = \"cluster_k4\",\n",
" title = \"Dominant Fund by Cluster — Global Clustering (K=4)\",\n",
" top_n = 7,\n",
" figsize = (18, 5),\n",
" save_path = \"Figures/donut_fund_global.png\"\n",
" )\n",
" "
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.13.11"
}
},
"nbformat": 4,
"nbformat_minor": 5
}