2026-02-02 00:52:00 +01:00
{
"cells": [
{
"cell_type": "code",
2026-03-10 18:45:51 +01:00
"execution_count": 23,
2026-02-02 00:52:00 +01:00
"id": "338730e2-a6de-4d4f-b438-efe3feb139ab",
"metadata": {},
"outputs": [],
"source": [
"import pandas as pd\n",
"import numpy as np\n",
2026-03-10 18:45:51 +01:00
"import plotly.graph_objects as go\n",
"import matplotlib.pyplot as plt"
2026-02-02 00:52:00 +01:00
]
},
{
"cell_type": "code",
"execution_count": 2,
"id": "cfd11919-0941-400e-a516-72871881f733",
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
2026-03-10 18:45:51 +01:00
"/tmp/ipykernel_1311/1940519970.py:1: DtypeWarning: Columns (1,2,3,4) have mixed types. Specify dtype option on import or set low_memory=False.\n",
2026-02-02 00:52:00 +01:00
" stocks=pd.read_csv('stocks.csv')\n",
2026-03-10 18:45:51 +01:00
"/tmp/ipykernel_1311/1940519970.py:2: DtypeWarning: Columns (1,2,3,4) have mixed types. Specify dtype option on import or set low_memory=False.\n",
2026-02-02 00:52:00 +01:00
" flows = pd.read_csv('flows.csv')\n"
]
}
],
"source": [
"stocks=pd.read_csv('stocks.csv')\n",
"flows = pd.read_csv('flows.csv')"
]
},
{
"cell_type": "code",
2026-03-10 18:45:51 +01:00
"execution_count": 3,
2026-02-02 00:52:00 +01:00
"id": "b99e3402-fe26-4f4e-8c1c-5f07847bce94",
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
2026-03-10 18:45:51 +01:00
"/tmp/ipykernel_1311/3613746644.py:1: DtypeWarning: Columns (1) have mixed types. Specify dtype option on import or set low_memory=False.\n",
" merged = pd.read_csv('merged.csv')\n"
2026-02-02 00:52:00 +01:00
]
}
],
"source": [
"merged = pd.read_csv('merged.csv')"
]
},
{
"cell_type": "code",
2026-03-10 18:45:51 +01:00
"execution_count": 4,
2026-02-02 00:52:00 +01:00
"id": "34e5a815-7269-4312-bfe6-e2cd12595e57",
"metadata": {},
2026-03-10 18:45:51 +01:00
"outputs": [],
2026-02-02 00:52:00 +01:00
"source": [
"# 1. Prepare stock dataset ISIN-by-ISIN\n",
"stocks_isin = stocks[[\n",
" \"Registrar Account - ID\",\n",
" \"Product - Isin\",\n",
" \"Centralisation Date\",\n",
" \"Quantity - AUM\"\n",
"]].copy()\n",
"\n",
"stocks_isin[\"Centralisation Date\"] = pd.to_datetime(stocks_isin[\"Centralisation Date\"])\n",
"\n",
"stocks_isin = stocks_isin.sort_values(\n",
" [\"Registrar Account - ID\", \"Product - Isin\", \"Centralisation Date\"]\n",
")\n",
"\n",
"# 2. Prepare flows dataset ISIN-by-ISIN\n",
"flows_isin = flows[[\n",
" \"Registrar Account - ID\",\n",
" \"Product - Isin\",\n",
" \"Centralisation Date\",\n",
" \"Quantity - NetFlows\"\n",
"]].copy()\n",
"\n",
"flows_isin[\"Centralisation Date\"] = pd.to_datetime(flows_isin[\"Centralisation Date\"])\n",
"\n",
"flows_isin = (\n",
" flows_isin\n",
" .groupby(\n",
" [\"Registrar Account - ID\", \"Product - Isin\", \"Centralisation Date\"]\n",
" )[\"Quantity - NetFlows\"]\n",
" .sum()\n",
" .reset_index()\n",
")\n",
"\n",
"# 3. Merge stocks & flows ISIN-by-ISIN\n",
"merged_isin = stocks_isin.merge(\n",
" flows_isin,\n",
" on=[\"Registrar Account - ID\", \"Product - Isin\", \"Centralisation Date\"],\n",
" how=\"left\"\n",
")\n",
"\n",
"merged_isin[\"Quantity - NetFlows\"] = merged_isin[\"Quantity - NetFlows\"].fillna(0)\n",
"\n",
"# 4. Compute expected stock per ISIN for each account\n",
"merged_isin[\"prev_stock\"] = (\n",
" merged_isin\n",
" .groupby([\"Registrar Account - ID\", \"Product - Isin\"])[\"Quantity - AUM\"]\n",
" .shift(1)\n",
")\n",
"\n",
"merged_isin[\"prev_netflows\"] = (\n",
" merged_isin\n",
" .groupby([\"Registrar Account - ID\", \"Product - Isin\"])[\"Quantity - NetFlows\"]\n",
" .shift(1)\n",
" .fillna(0)\n",
")\n",
"\n",
"merged_isin[\"expected_stock\"] = (\n",
" merged_isin[\"prev_stock\"] + merged_isin[\"prev_netflows\"]\n",
")\n",
"\n",
"# 5. Detect ruptures ISIN-by-ISIN (no aggregation)\n",
"TOL = 1e-6\n",
"\n",
"merged_isin[\"gap\"] = (\n",
" merged_isin[\"Quantity - AUM\"] - merged_isin[\"expected_stock\"]\n",
")\n",
"\n",
"merged_isin[\"rupture_flag\"] = (\n",
" merged_isin[\"prev_stock\"].notna()\n",
" & (merged_isin[\"gap\"].abs() > TOL)\n",
")\n",
"\n",
"# 6. Summarize ruptures per (Account, ISIN)\n",
"rupture_isin_summary = (\n",
" merged_isin\n",
" .groupby([\"Registrar Account - ID\", \"Product - Isin\"])\n",
" .agg(\n",
" n_ruptures=(\"rupture_flag\", \"sum\"),\n",
" obs=(\"rupture_flag\", \"count\"),\n",
" rupture_ratio=(\"rupture_flag\", \"mean\"),\n",
" max_gap=(\"gap\", lambda x: x.abs().max())\n",
" )\n",
" .reset_index()\n",
")\n",
"\n",
"# Sort by worst ISIN trajectories\n",
"rupture_isin_summary = rupture_isin_summary.sort_values(\n",
" \"rupture_ratio\",\n",
" ascending=False\n",
")"
]
},
{
"cell_type": "markdown",
"id": "16213cb2-07d8-4e82-b9bb-252554ec47b9",
"metadata": {},
"source": [
"# Détection des ruptures"
]
},
{
"cell_type": "code",
2026-03-10 18:45:51 +01:00
"execution_count": 5,
2026-02-02 00:52:00 +01:00
"id": "78c3db70-e0b6-4de2-92ca-e29cf5bf6bd1",
"metadata": {},
"outputs": [],
"source": [
"# ============================================================\n",
"# AUM– FLOW CONSISTENCY & RUPTURE DETECTION (FINAL VERSION)\n",
"# ============================================================\n",
"# ------------------------------------------------------------\n",
"# 1. Keep relevant columns\n",
"# ------------------------------------------------------------\n",
"stocks_clean = stocks[[\n",
" \"Registrar Account - ID\",\n",
" \"Product - Isin\",\n",
" \"Centralisation Date\",\n",
" \"Quantity - AUM\"\n",
"]].copy()\n",
"\n",
"flows_clean = flows[[\n",
" \"Registrar Account - ID\",\n",
" \"Product - Isin\",\n",
" \"Centralisation Date\",\n",
" \"Quantity - NetFlows\"\n",
"]].copy()\n",
"\n",
"\n",
"# ------------------------------------------------------------\n",
"# 2. Date formatting\n",
"# ------------------------------------------------------------\n",
"stocks_clean[\"Centralisation Date\"] = pd.to_datetime(stocks_clean[\"Centralisation Date\"])\n",
"flows_clean[\"Centralisation Date\"] = pd.to_datetime(flows_clean[\"Centralisation Date\"])\n",
"\n",
"\n",
"# ------------------------------------------------------------\n",
"# 3. Aggregate flows per day\n",
"# ------------------------------------------------------------\n",
"flows_clean = (\n",
" flows_clean\n",
" .groupby(\n",
" [\"Registrar Account - ID\", \"Product - Isin\", \"Centralisation Date\"],\n",
" as_index=False\n",
" )[\"Quantity - NetFlows\"]\n",
" .sum()\n",
")\n",
"\n",
"# ------------------------------------------------------------\n",
"# 4. Merge stocks and flows\n",
"# ------------------------------------------------------------\n",
"df = stocks_clean.merge(\n",
" flows_clean,\n",
" on=[\"Registrar Account - ID\", \"Product - Isin\", \"Centralisation Date\"],\n",
" how=\"left\"\n",
")\n",
"\n",
"df[\"Quantity - NetFlows\"] = df[\"Quantity - NetFlows\"].fillna(0)\n",
"\n",
"\n",
"# ------------------------------------------------------------\n",
"# 5. Sort and compute expected stock\n",
"# ------------------------------------------------------------\n",
"df = df.sort_values(\n",
" [\"Registrar Account - ID\", \"Product - Isin\", \"Centralisation Date\"]\n",
")\n",
"\n",
"df[\"prev_stock\"] = df.groupby(\n",
" [\"Registrar Account - ID\", \"Product - Isin\"]\n",
")[\"Quantity - AUM\"].shift(1)\n",
"\n",
"df[\"prev_flows\"] = df.groupby(\n",
" [\"Registrar Account - ID\", \"Product - Isin\"]\n",
")[\"Quantity - NetFlows\"].shift(1).fillna(0)\n",
"\n",
"df[\"expected_stock\"] = df[\"prev_stock\"] + df[\"prev_flows\"]\n",
"\n",
"\n",
"# ------------------------------------------------------------\n",
"# 6. Compute gaps\n",
"# ------------------------------------------------------------\n",
"df[\"gap\"] = df[\"Quantity - AUM\"] - df[\"expected_stock\"]\n",
"df[\"gap_abs\"] = df[\"gap\"].abs()\n",
"df[\"gap_rel\"] = df[\"gap_abs\"] / df[\"expected_stock\"].abs().clip(lower=1)\n",
"\n",
"\n",
"# ------------------------------------------------------------\n",
"# 7. Detect ruptures (economic rule)\n",
"# ------------------------------------------------------------\n",
"TAU_ABS = 10.0 # minimum absolute gap (shares)\n",
"TAU_REL = 0.005 # minimum relative gap (0.5%)\n",
"\n",
"df[\"rupture_flag\"] = (\n",
" df[\"prev_stock\"].notna()\n",
" & (df[\"gap_abs\"] > TAU_ABS)\n",
" & (df[\"gap_rel\"] > TAU_REL)\n",
")\n",
"\n",
"\n",
"# ------------------------------------------------------------\n",
"# 8. Remove end-of-sample false positives (edge effects)\n",
"# ------------------------------------------------------------\n",
"last_date = df[\"Centralisation Date\"].max()\n",
"\n",
"df[\"rupture_flag\"] = np.where(\n",
" (df[\"rupture_flag\"]) & (df[\"Centralisation Date\"] == last_date),\n",
" False,\n",
" df[\"rupture_flag\"]\n",
")\n"
]
},
{
"cell_type": "code",
2026-03-10 18:45:51 +01:00
"execution_count": 6,
2026-02-02 00:52:00 +01:00
"id": "a9783dc1-e225-4142-8b6f-6f9e620b4b3d",
"metadata": {},
"outputs": [],
"source": [
"# ------------------------------------------------------------\n",
"# 9. ISIN-level summary (AFTER CLEANING)\n",
"# ------------------------------------------------------------\n",
"rupture_isin_summary = (\n",
" df.groupby([\"Registrar Account - ID\", \"Product - Isin\"])\n",
" .agg(\n",
" n_ruptures=(\"rupture_flag\", \"sum\"),\n",
" total_obs=(\"rupture_flag\", \"count\"),\n",
" rupture_ratio=(\"rupture_flag\", \"mean\"),\n",
" max_gap=(\"gap_abs\", \"max\")\n",
" )\n",
" .reset_index()\n",
")\n",
"\n",
"\n",
"# ------------------------------------------------------------\n",
"# 10. Account-level summary (AFTER CLEANING)\n",
"# ------------------------------------------------------------\n",
"rupture_summary = (\n",
" df.groupby(\"Registrar Account - ID\")\n",
" .agg(\n",
" n_ruptures=(\"rupture_flag\", \"sum\"),\n",
" total_obs=(\"rupture_flag\", \"count\"),\n",
" rupture_ratio=(\"rupture_flag\", \"mean\"),\n",
" max_gap=(\"gap_abs\", \"max\")\n",
" )\n",
" .reset_index()\n",
")\n",
"\n",
"\n",
"# ------------------------------------------------------------\n",
"# 11. Outputs\n",
"# ------------------------------------------------------------\n",
"df.to_csv(\"aum_flow_gaps.csv\", index=False)\n",
"rupture_isin_summary.to_csv(\"rupture_isin_summary.csv\", index=False)\n",
"rupture_summary.to_csv(\"rupture_summary.csv\", index=False)"
]
},
{
"cell_type": "code",
2026-03-10 18:45:51 +01:00
"execution_count": 7,
2026-02-02 00:52:00 +01:00
"id": "f5b62558-c27a-4428-a193-8b97e0ce6b6a",
"metadata": {},
"outputs": [
{
"data": {
"application/vnd.plotly.v1+json": {
"config": {
"plotlyServerURL": "https://plot.ly"
},
"data": [
{
"hole": 0.45,
"hoverinfo": "label+percent",
"labels": [
"Clean / quasi-clean (≤1%)",
"Moderate (1– 10%)",
"High (10– 30%)",
"Severe (>30%)"
],
"textinfo": "percent",
"type": "pie",
"values": {
2026-03-10 18:45:51 +01:00
"bdata": "AAAAAACASEAAAAAAAIBBQAAAAAAAAChAZmZmZmZmEEA=",
2026-02-02 00:52:00 +01:00
"dtype": "f8"
}
}
],
"layout": {
"legend": {
"orientation": "h",
"title": {
"text": "Rupture ratio"
},
"x": 0.5,
"xanchor": "center",
"y": -0.15,
"yanchor": "top"
},
"template": {
"data": {
"bar": [
{
"error_x": {
"color": "#2a3f5f"
},
"error_y": {
"color": "#2a3f5f"
},
"marker": {
"line": {
"color": "#E5ECF6",
"width": 0.5
},
"pattern": {
"fillmode": "overlay",
"size": 10,
"solidity": 0.2
}
},
"type": "bar"
}
],
"barpolar": [
{
"marker": {
"line": {
"color": "#E5ECF6",
"width": 0.5
},
"pattern": {
"fillmode": "overlay",
"size": 10,
"solidity": 0.2
}
},
"type": "barpolar"
}
],
"carpet": [
{
"aaxis": {
"endlinecolor": "#2a3f5f",
"gridcolor": "white",
"linecolor": "white",
"minorgridcolor": "white",
"startlinecolor": "#2a3f5f"
},
"baxis": {
"endlinecolor": "#2a3f5f",
"gridcolor": "white",
"linecolor": "white",
"minorgridcolor": "white",
"startlinecolor": "#2a3f5f"
},
"type": "carpet"
}
],
"choropleth": [
{
"colorbar": {
"outlinewidth": 0,
"ticks": ""
},
"type": "choropleth"
}
],
"contour": [
{
"colorbar": {
"outlinewidth": 0,
"ticks": ""
},
"colorscale": [
[
0,
"#0d0887"
],
[
0.1111111111111111,
"#46039f"
],
[
0.2222222222222222,
"#7201a8"
],
[
0.3333333333333333,
"#9c179e"
],
[
0.4444444444444444,
"#bd3786"
],
[
0.5555555555555556,
"#d8576b"
],
[
0.6666666666666666,
"#ed7953"
],
[
0.7777777777777778,
"#fb9f3a"
],
[
0.8888888888888888,
"#fdca26"
],
[
1,
"#f0f921"
]
],
"type": "contour"
}
],
"contourcarpet": [
{
"colorbar": {
"outlinewidth": 0,
"ticks": ""
},
"type": "contourcarpet"
}
],
"heatmap": [
{
"colorbar": {
"outlinewidth": 0,
"ticks": ""
},
"colorscale": [
[
0,
"#0d0887"
],
[
0.1111111111111111,
"#46039f"
],
[
0.2222222222222222,
"#7201a8"
],
[
0.3333333333333333,
"#9c179e"
],
[
0.4444444444444444,
"#bd3786"
],
[
0.5555555555555556,
"#d8576b"
],
[
0.6666666666666666,
"#ed7953"
],
[
0.7777777777777778,
"#fb9f3a"
],
[
0.8888888888888888,
"#fdca26"
],
[
1,
"#f0f921"
]
],
"type": "heatmap"
}
],
"histogram": [
{
"marker": {
"pattern": {
"fillmode": "overlay",
"size": 10,
"solidity": 0.2
}
},
"type": "histogram"
}
],
"histogram2d": [
{
"colorbar": {
"outlinewidth": 0,
"ticks": ""
},
"colorscale": [
[
0,
"#0d0887"
],
[
0.1111111111111111,
"#46039f"
],
[
0.2222222222222222,
"#7201a8"
],
[
0.3333333333333333,
"#9c179e"
],
[
0.4444444444444444,
"#bd3786"
],
[
0.5555555555555556,
"#d8576b"
],
[
0.6666666666666666,
"#ed7953"
],
[
0.7777777777777778,
"#fb9f3a"
],
[
0.8888888888888888,
"#fdca26"
],
[
1,
"#f0f921"
]
],
"type": "histogram2d"
}
],
"histogram2dcontour": [
{
"colorbar": {
"outlinewidth": 0,
"ticks": ""
},
"colorscale": [
[
0,
"#0d0887"
],
[
0.1111111111111111,
"#46039f"
],
[
0.2222222222222222,
"#7201a8"
],
[
0.3333333333333333,
"#9c179e"
],
[
0.4444444444444444,
"#bd3786"
],
[
0.5555555555555556,
"#d8576b"
],
[
0.6666666666666666,
"#ed7953"
],
[
0.7777777777777778,
"#fb9f3a"
],
[
0.8888888888888888,
"#fdca26"
],
[
1,
"#f0f921"
]
],
"type": "histogram2dcontour"
}
],
"mesh3d": [
{
"colorbar": {
"outlinewidth": 0,
"ticks": ""
},
"type": "mesh3d"
}
],
"parcoords": [
{
"line": {
"colorbar": {
"outlinewidth": 0,
"ticks": ""
}
},
"type": "parcoords"
}
],
"pie": [
{
"automargin": true,
"type": "pie"
}
],
"scatter": [
{
"fillpattern": {
"fillmode": "overlay",
"size": 10,
"solidity": 0.2
},
"type": "scatter"
}
],
"scatter3d": [
{
"line": {
"colorbar": {
"outlinewidth": 0,
"ticks": ""
}
},
"marker": {
"colorbar": {
"outlinewidth": 0,
"ticks": ""
}
},
"type": "scatter3d"
}
],
"scattercarpet": [
{
"marker": {
"colorbar": {
"outlinewidth": 0,
"ticks": ""
}
},
"type": "scattercarpet"
}
],
"scattergeo": [
{
"marker": {
"colorbar": {
"outlinewidth": 0,
"ticks": ""
}
},
"type": "scattergeo"
}
],
"scattergl": [
{
"marker": {
"colorbar": {
"outlinewidth": 0,
"ticks": ""
}
},
"type": "scattergl"
}
],
"scattermap": [
{
"marker": {
"colorbar": {
"outlinewidth": 0,
"ticks": ""
}
},
"type": "scattermap"
}
],
"scattermapbox": [
{
"marker": {
"colorbar": {
"outlinewidth": 0,
"ticks": ""
}
},
"type": "scattermapbox"
}
],
"scatterpolar": [
{
"marker": {
"colorbar": {
"outlinewidth": 0,
"ticks": ""
}
},
"type": "scatterpolar"
}
],
"scatterpolargl": [
{
"marker": {
"colorbar": {
"outlinewidth": 0,
"ticks": ""
}
},
"type": "scatterpolargl"
}
],
"scatterternary": [
{
"marker": {
"colorbar": {
"outlinewidth": 0,
"ticks": ""
}
},
"type": "scatterternary"
}
],
"surface": [
{
"colorbar": {
"outlinewidth": 0,
"ticks": ""
},
"colorscale": [
[
0,
"#0d0887"
],
[
0.1111111111111111,
"#46039f"
],
[
0.2222222222222222,
"#7201a8"
],
[
0.3333333333333333,
"#9c179e"
],
[
0.4444444444444444,
"#bd3786"
],
[
0.5555555555555556,
"#d8576b"
],
[
0.6666666666666666,
"#ed7953"
],
[
0.7777777777777778,
"#fb9f3a"
],
[
0.8888888888888888,
"#fdca26"
],
[
1,
"#f0f921"
]
],
"type": "surface"
}
],
"table": [
{
"cells": {
"fill": {
"color": "#EBF0F8"
},
"line": {
"color": "white"
}
},
"header": {
"fill": {
"color": "#C8D4E3"
},
"line": {
"color": "white"
}
},
"type": "table"
}
]
},
"layout": {
"annotationdefaults": {
"arrowcolor": "#2a3f5f",
"arrowhead": 0,
"arrowwidth": 1
},
"autotypenumbers": "strict",
"coloraxis": {
"colorbar": {
"outlinewidth": 0,
"ticks": ""
}
},
"colorscale": {
"diverging": [
[
0,
"#8e0152"
],
[
0.1,
"#c51b7d"
],
[
0.2,
"#de77ae"
],
[
0.3,
"#f1b6da"
],
[
0.4,
"#fde0ef"
],
[
0.5,
"#f7f7f7"
],
[
0.6,
"#e6f5d0"
],
[
0.7,
"#b8e186"
],
[
0.8,
"#7fbc41"
],
[
0.9,
"#4d9221"
],
[
1,
"#276419"
]
],
"sequential": [
[
0,
"#0d0887"
],
[
0.1111111111111111,
"#46039f"
],
[
0.2222222222222222,
"#7201a8"
],
[
0.3333333333333333,
"#9c179e"
],
[
0.4444444444444444,
"#bd3786"
],
[
0.5555555555555556,
"#d8576b"
],
[
0.6666666666666666,
"#ed7953"
],
[
0.7777777777777778,
"#fb9f3a"
],
[
0.8888888888888888,
"#fdca26"
],
[
1,
"#f0f921"
]
],
"sequentialminus": [
[
0,
"#0d0887"
],
[
0.1111111111111111,
"#46039f"
],
[
0.2222222222222222,
"#7201a8"
],
[
0.3333333333333333,
"#9c179e"
],
[
0.4444444444444444,
"#bd3786"
],
[
0.5555555555555556,
"#d8576b"
],
[
0.6666666666666666,
"#ed7953"
],
[
0.7777777777777778,
"#fb9f3a"
],
[
0.8888888888888888,
"#fdca26"
],
[
1,
"#f0f921"
]
]
},
"colorway": [
"#636efa",
"#EF553B",
"#00cc96",
"#ab63fa",
"#FFA15A",
"#19d3f3",
"#FF6692",
"#B6E880",
"#FF97FF",
"#FECB52"
],
"font": {
"color": "#2a3f5f"
},
"geo": {
"bgcolor": "white",
"lakecolor": "white",
"landcolor": "#E5ECF6",
"showlakes": true,
"showland": true,
"subunitcolor": "white"
},
"hoverlabel": {
"align": "left"
},
"hovermode": "closest",
"mapbox": {
"style": "light"
},
"paper_bgcolor": "white",
"plot_bgcolor": "#E5ECF6",
"polar": {
"angularaxis": {
"gridcolor": "white",
"linecolor": "white",
"ticks": ""
},
"bgcolor": "#E5ECF6",
"radialaxis": {
"gridcolor": "white",
"linecolor": "white",
"ticks": ""
}
},
"scene": {
"xaxis": {
"backgroundcolor": "#E5ECF6",
"gridcolor": "white",
"gridwidth": 2,
"linecolor": "white",
"showbackground": true,
"ticks": "",
"zerolinecolor": "white"
},
"yaxis": {
"backgroundcolor": "#E5ECF6",
"gridcolor": "white",
"gridwidth": 2,
"linecolor": "white",
"showbackground": true,
"ticks": "",
"zerolinecolor": "white"
},
"zaxis": {
"backgroundcolor": "#E5ECF6",
"gridcolor": "white",
"gridwidth": 2,
"linecolor": "white",
"showbackground": true,
"ticks": "",
"zerolinecolor": "white"
}
},
"shapedefaults": {
"line": {
"color": "#2a3f5f"
}
},
"ternary": {
"aaxis": {
"gridcolor": "white",
"linecolor": "white",
"ticks": ""
},
"baxis": {
"gridcolor": "white",
"linecolor": "white",
"ticks": ""
},
"bgcolor": "#E5ECF6",
"caxis": {
"gridcolor": "white",
"linecolor": "white",
"ticks": ""
}
},
"title": {
"x": 0.05
},
"xaxis": {
"automargin": true,
"gridcolor": "white",
"linecolor": "white",
"ticks": "",
"title": {
"standoff": 15
},
"zerolinecolor": "white",
"zerolinewidth": 2
},
"yaxis": {
"automargin": true,
"gridcolor": "white",
"linecolor": "white",
"ticks": "",
"title": {
"standoff": 15
},
"zerolinecolor": "white",
"zerolinewidth": 2
}
}
}
}
},
2026-03-10 18:45:51 +01:00
"image/png": "iVBORw0KGgoAAAANSUhEUgAAAzkAAAFoCAYAAAB0XzViAAAQAElEQVR4AezdB5wcdf3/8c+Wy6WTTggtARJ670QhoFQTqqEjKCCISFVKUIyKoXdEEFD8I/ATREFKaAIWeu+B0CEhCQTSLneX2/Lf94Q5Jpu9u929LVNePPhmdme+853v9/md3Z3PfGfm4ln+QwABBBBAAAEEEEAAAQRCJBA3/kMAgQICzEIAAQQQQAABBBAIqgBBTlB7jnojgAAC9RBgmwgggAACCARAgCAnAJ1EFRFAAAEEEEDA3wLUDgEE/CVAkOOv/qA2CCCAAAIIIIAAAgiERaBu7SDIqRs9G0YAAQQQQAABBBBAAIFqCBDkVEOVMisnQEkIIIAAAggggAACCJQoQJBTIhjZEUAAAT8IUAcEEEAAAQQQ6FiAIKdjG5YggAACCCCAQLAEqC0CCCDgCBDkOAz8gwACCCCAAAIIIIBAWAWi1y6CnOj1OS1GAAEEEEAAAQQQQCDUAgQ5oe7eyjWOkhBAAAEEEEAAAQQQCIoAQU5Qeop6IoCAHwWoEwIIIIAAAgj4UIAgx4edQpUQQAABBBAItgC1RwABBOorQJBTX3+2jgACCCCAAAIIIBAVAdpZMwGCnJpRsyEEEEAAAQQQQAABBBCohQBBTi2UK7cNSkIAAQQQQAABBBBAAIEuBAhyugBiMQIIBEGAOiKAAAIIIIAAAl8LEOR8bcErBBBAAAEEwiVAaxBAAIGIChDkRLTjaTYCCCCAAAIIIBBVAdodfgGCnPD3MS1EAAEEEEAAAQQQQCBSAgQ5ZXU3KyGAAAIIIIAAAggggIBfBQhy/Noz1AuBIApQZwQQQAABBBBAwAcCBDk+6ASqgAACCCAQbgFahwACCCBQWwGCnNp6szUEEEAAAQQQQACBpQL8i0DVBAhyqkZLwQgggAACCCCAAAIIIFAPgWAHOfUQY5sIIIAAAggggAACCCDgawGCHF93D5VDoDwB1kIAAQQQQAABBKIsQJAT5d6n7QgggEC0BGgtAggggEBEBAhyItLRNBMBBBBAAAEEECgswFwEwidAkBO+PqVFCCCAAAIIIIAAAghEWqAiQU6kBWk8AggggAACCCCAAAII+EqAIMdX3UFlQiZAcxBAAAEEEEAAAQTqIECQUwd0NokAAghEW4DWI4AAAgggUF0Bgpzq+lI6AggggAACCCBQnAC5EECgYgIEORWjpCAEEEAAAQQQQAABBBCotEA55RHklKPGOggggAACCCCAAAIIIOBbAYIc33YNFaucACUhgAACCCCAAAIIREmAICdKvU1bEUAAAa8ArxFAAAEEEAipAEFOSDuWZiGAAAIIIIBAeQKshQACwRcgyAl+H9ICBBBAAAEEEEAAAQSqLRCo8glyAtVdVBYBBBBAAAEEEEAAAQS6EiDI6UqI5ZUToCQEEEAAAQQQQAABBGogQJBTA2Q2gQACCHQmwDIEEEAAAQQQqKwAQU5lPSkNAQQQQAABBCojQCkIIIBA2QIEOWXTsSICCCCAAAIIIIAAArUWYHvFCBDkFKNEHgQQQAABBBBAAAEEEAiMAEFOYLqqchWlJAQQQAABBBBAAAEEwixAkBPm3qVtCCBQigB5EUAAAQQQQCAkAgQ5IelImoEAAggggEB1BCgVAQQQCJ4AQU7w+owaI4AAAggggAACCNRbgO37WoAgx9fdQ+UQQAABBBBAAAEEEECgVAGCnFLFKpefkhBAAAEEEEAAAQQQQKAKAgQ5VUClSAQQ6I4A6yKAAAIIIIAAAt0TIMjpnh9rI4AAAgggUBsBtoIAAgggULQAQU7RVGREAAEEEEAAAQQQ8JsA9UGgkABBTiEV5iGAAAIIIIAAAggggEBgBQhyLLB9R8URQAABBBBAAAEEEECggABBTgEUZiGAgJmBgAACCCCAAAIIBFSAICegHUe1EUAAAQTqI8BWEUAAAQT8L0CQ4/8+ooYIIIAAAggggI
DfBagfAr4SIMjxVXdQGQQQQAABBBBAAAEEEOiugH+CnO62hPURQAABBBBAAAEEEEAAgZwAQU4Ogf8R8LMAdUMAAQQQQAABBBAoTYAgpzQvciOAAAII+EOAWiCAAAIIINChAEFOhzQsQAABBBBAAAEEgiZAfRFAQAIEOVIgIYAAAggggAACCCCAQGgElgtyQtMyGoIAAggggAACCCCAAAKRFCDIiWS30+gyBFgFAQQQQAABBBBAICACBDkB6SiqiQACCPhTgFohgAACCCDgPwGCHP/1CTVCAAEEEEAAgaALUH8EEKirAEFOXfnZOAIIIIAAAggggAAC0RGoVUsJcmolzXYQQAABBBBAAAEEEECgJgIEOTVhZiOVE6AkBBBAAAEEEEAAAQQ6FyDI6dyHpQgggEAwBKglAggggAACCLQLEOS0U/ACAQQQQAABBMImQHsQQCCaAgQ50ex3Wo0AAggggAACCCAQXYHQt5wgJ/RdTAMRQAABBBBAAAEEEIiWAEFOtPq7cq2lJAQQQAABBBBAAAEEfCpAkOPTjqFaCCAQTAFqjQACCCCAAAL1FyDIqX8fUAMEEEAAAQTCLkD7EEAAgZoKEOTUlJuNIYAAAggggAACCCDgCjCtlgBBTrVkKRcBBBBAAAEEEEAAAQTqIkCQUxf2ym2UkhBAAAEEEEAAAQQQQGBZAYKcZT14hwAC4RCgFQgggAACCCAQYQGCnAh3Pk1HAAEEEIiaAO1FAAEEoiFAkBONfqaVCCCAAAIIIIAAAh0JMD90AgQ5oetSGoQAAggggAACCCCAQLQFCHIq0/+UggACCCCAAAIIIIAAAj4RIMjxSUdQDQTCKUCrEEAAAQQQQACB2gsQ5NTenC0igAACCERdgPYjgAACCFRVgCCnqrwUjgACCCCAAAIIIFCsAPkQqJQAQU6lJCkHAQQQQAABBBBAAAEEfCEQsiDHF6ZUAgEEEEAAAQQQQAABBOooQJBTR3w2jUDNBNgQAggggAACCCAQIQGCnAh1Nk1FAAEEEFhWgHcIIIAAAuEUIMgJZ7/SKgQQQAABBBBAoFwB1kMg8AIEOYHvQhqAAAIIIIAAAggggAACXoHqBDneLfAaAQQQQAABBBBAAAEEEKihAEFODbHZFAIIIIAAAggggAACCFRfgCCn+sZsAQEEEAilQPqtV63t2f/aksfus9Z7b7OW226w5hsvt8W/+601XXCGLZr8E1v4syNswXH72bwjdrWXzv2jHX9am/3sl232y/NSdt7lKbv82pRd9//SdvPtabvzvrQ98p+Mvfxa1j6ZmbXWJaFko1EIIIAAAjUQiNdgG2wCAQQQQCDIAtmsZT5535Y8eq8tvu4iW3ja923egTvYwl/8yJouPNMWXz3Fmv98hbX87U/Wet/ttuTfU63tuf9Z6o0XLf3hO5b5fLbZ4iaLZTPW0mr25TyzGZ9m7Z33svbqG1l7+vmMPfq/jN3zQMZuuSNtV16Xssnnp+zHP2uzn5ze5ry+8g8pu+VvaXvs8Yx9misuyJzUHYHiBciJAALlCsTLXZH1EEAAAQTCKZBdMM8ZoWm59Vpb9OsTnVGYBaccZot/f64teehOS38w3SyTrknjm1vMGdV5+fWsPfLfjP3ltrT9YkqbnXxWm13zp5Qzb2YuYKpJZdgIAggggIA/BIqoBUFOEUhkQQABBMIukHrtBVt8zXk2/0f72vyjxjsjNC3/uMlSrz1v1rzYd81fuMjsuZeyzujO2eel7MQz2+x3N6ScEaEm/1XXd35UCAEEEAi7AEFO2HuY9hUSYB4CCOQEdClZ81+utvnH7pMbsTnBljxyj2XnzsktCd7/CmxefCXr3NtzSm6UR/f6PPVcxlpbg9cWaowAAggg0H0BgpzuG1ICAgggEBiB7OezreXOm2zBqYc5DwVo/ectlv3is6/qH45JOmPOvT7X35S2k3IBzzU3pk0BUKo2V9iFA5FWIIAAAgEXIMgJeAdSfQQQQKArgeyihc
69NIvOPs7mH7eftdxyrWU+fr+r1UKxvK3N7LkXM86lbCed2WY3/y1tc78IRdNoRC0F2BYCCAROgCAncF1GhRFAAIH
2026-02-02 00:52:00 +01:00
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"# Base\n",
"rs = rupture_summary.copy()\n",
"\n",
"# Classes simplifiées\n",
"bins = [0, 0.01, 0.10, 0.30, 1.01]\n",
"labels = [\n",
" \"Clean / quasi-clean (≤1%)\",\n",
" \"Moderate (1– 10%)\",\n",
" \"High (10– 30%)\",\n",
" \"Severe (>30%)\"\n",
"]\n",
"\n",
"rs[\"rupture_class\"] = pd.cut(\n",
" rs[\"rupture_ratio\"],\n",
" bins=bins,\n",
" labels=labels,\n",
" include_lowest=True\n",
")\n",
"\n",
"# Distribution en %\n",
"dist = (\n",
" rs[\"rupture_class\"]\n",
" .value_counts(normalize=True)\n",
" .sort_index()\n",
" * 100\n",
").round(1)\n",
"\n",
"# Donut chart\n",
"fig = go.Figure(\n",
" data=[go.Pie(\n",
" labels=dist.index,\n",
" values=dist.values,\n",
" hole=0.45,\n",
" textinfo=\"percent\",\n",
" hoverinfo=\"label+percent\"\n",
" )]\n",
")\n",
"\n",
"fig.update_layout(\n",
" legend=dict(\n",
" orientation=\"h\", # horizontale\n",
" yanchor=\"top\",\n",
" y=-0.15, # en dessous du graphe\n",
" xanchor=\"center\",\n",
" x=0.5\n",
" ),\n",
" legend_title_text=\"Rupture ratio\"\n",
")\n",
"\n",
"fig.show()\n"
]
},
{
"cell_type": "markdown",
"id": "e52cd650-df05-490d-af59-e66c058f955d",
"metadata": {},
"source": [
"## AUM– FLOW CONSISTENCY & DISCONTINUITY DETECTION"
]
},
{
"cell_type": "code",
2026-03-10 18:45:51 +01:00
"execution_count": 8,
2026-02-02 00:52:00 +01:00
"id": "a7efe494-f5fa-43f8-8446-942fc2d3bd4c",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Detection threshold epsilon (trimmed 99th percentile): 40.03%\n"
]
}
],
"source": [
"# ------------------------------------------------------------\n",
"# 1. Keep relevant columns\n",
"# ------------------------------------------------------------\n",
"stocks_clean = stocks[\n",
" [\"Registrar Account - ID\", \"Product - Isin\", \"Centralisation Date\", \"Quantity - AUM\"]\n",
"].copy()\n",
"\n",
"flows_clean = flows[\n",
" [\"Registrar Account - ID\", \"Product - Isin\", \"Centralisation Date\", \"Quantity - NetFlows\"]\n",
"].copy()\n",
"\n",
"# ------------------------------------------------------------\n",
"# 2. Date formatting\n",
"# ------------------------------------------------------------\n",
"stocks_clean[\"Centralisation Date\"] = pd.to_datetime(stocks_clean[\"Centralisation Date\"])\n",
"flows_clean[\"Centralisation Date\"] = pd.to_datetime(flows_clean[\"Centralisation Date\"])\n",
"\n",
"# ------------------------------------------------------------\n",
"# 3. Aggregate flows per day\n",
"# ------------------------------------------------------------\n",
"flows_clean = (\n",
" flows_clean\n",
" .groupby(\n",
" [\"Registrar Account - ID\", \"Product - Isin\", \"Centralisation Date\"],\n",
" as_index=False\n",
" )[\"Quantity - NetFlows\"]\n",
" .sum()\n",
")\n",
"\n",
"# ------------------------------------------------------------\n",
"# 4. Merge stocks and flows\n",
"# ------------------------------------------------------------\n",
"df = stocks_clean.merge(\n",
" flows_clean,\n",
" on=[\"Registrar Account - ID\", \"Product - Isin\", \"Centralisation Date\"],\n",
" how=\"left\"\n",
")\n",
"\n",
"df[\"Quantity - NetFlows\"] = df[\"Quantity - NetFlows\"].fillna(0)\n",
"\n",
"# ------------------------------------------------------------\n",
"# 5. Sort and reconstruct expected stock\n",
"# ------------------------------------------------------------\n",
"df = df.sort_values(\n",
" [\"Registrar Account - ID\", \"Product - Isin\", \"Centralisation Date\"]\n",
")\n",
"\n",
"df[\"prev_stock\"] = (\n",
" df.groupby([\"Registrar Account - ID\", \"Product - Isin\"])\n",
" [\"Quantity - AUM\"]\n",
" .shift(1)\n",
")\n",
"\n",
"df[\"prev_flows\"] = (\n",
" df.groupby([\"Registrar Account - ID\", \"Product - Isin\"])\n",
" [\"Quantity - NetFlows\"]\n",
" .shift(1)\n",
" .fillna(0)\n",
")\n",
"\n",
"df[\"expected_stock\"] = df[\"prev_stock\"] + df[\"prev_flows\"]\n",
"\n",
"# ------------------------------------------------------------\n",
"# 6. Compute accounting gaps\n",
"# ------------------------------------------------------------\n",
"df[\"gap\"] = df[\"Quantity - AUM\"] - df[\"expected_stock\"]\n",
"df[\"gap_abs\"] = df[\"gap\"].abs()\n",
"\n",
"# Relative gap normalised by previous stock\n",
"df[\"gap_rel\"] = (\n",
" df[\"gap_abs\"] /\n",
" df[\"prev_stock\"].abs().replace(0, np.nan)\n",
")\n",
"\n",
"# ------------------------------------------------------------\n",
"# 7. Calibration sample (valid regime)\n",
"# ------------------------------------------------------------\n",
"valid_gaps = df.loc[\n",
" df[\"gap_rel\"].notna() & (df[\"prev_stock\"] > 0),\n",
" \"gap_rel\"\n",
"]\n",
"\n",
"# ------------------------------------------------------------\n",
"# 8. Robust, data-driven threshold (epsilon)\n",
"# ------------------------------------------------------------\n",
"# Step 1 — trim extreme breaks to avoid calibrating on resets\n",
"gap_rel_trimmed = valid_gaps[\n",
" valid_gaps <= valid_gaps.quantile(0.90)\n",
"]\n",
"\n",
"# Step 2 — define epsilon on the upper tail of the trimmed distribution\n",
"EPSILON = gap_rel_trimmed.quantile(0.99)\n",
"\n",
"# ------------------------------------------------------------\n",
"# 9. Detect discontinuities (diagnostic rule)\n",
"# ------------------------------------------------------------\n",
"df[\"rupture_flag\"] = (\n",
" df[\"prev_stock\"].notna()\n",
" & (df[\"prev_stock\"] > 0)\n",
" & (df[\"gap_rel\"] > EPSILON)\n",
")\n",
"\n",
"# ------------------------------------------------------------\n",
"# 10. Remove end-of-sample edge effects\n",
"# ------------------------------------------------------------\n",
"last_date = df[\"Centralisation Date\"].max()\n",
"\n",
"df.loc[\n",
" (df[\"rupture_flag\"]) &\n",
" (df[\"Centralisation Date\"] == last_date),\n",
" \"rupture_flag\"\n",
"] = False\n",
"\n",
"# ------------------------------------------------------------\n",
"# 11. ISIN-level summary\n",
"# ------------------------------------------------------------\n",
"rupture_isin_summary = (\n",
" df.groupby([\"Registrar Account - ID\", \"Product - Isin\"])\n",
" .agg(\n",
" n_ruptures=(\"rupture_flag\", \"sum\"),\n",
" total_obs=(\"rupture_flag\", \"count\"),\n",
" rupture_ratio=(\"rupture_flag\", \"mean\"),\n",
" max_gap_abs=(\"gap_abs\", \"max\"),\n",
" max_gap_rel=(\"gap_rel\", \"max\")\n",
" )\n",
" .reset_index()\n",
")\n",
"\n",
"# ------------------------------------------------------------\n",
"# 12. Account-level summary\n",
"# ------------------------------------------------------------\n",
"rupture_summary = (\n",
" df.groupby(\"Registrar Account - ID\")\n",
" .agg(\n",
" n_ruptures=(\"rupture_flag\", \"sum\"),\n",
" total_obs=(\"rupture_flag\", \"count\"),\n",
" rupture_ratio=(\"rupture_flag\", \"mean\"),\n",
" max_gap_abs=(\"gap_abs\", \"max\"),\n",
" max_gap_rel=(\"gap_rel\", \"max\")\n",
" )\n",
" .reset_index()\n",
")\n",
"\n",
"# ------------------------------------------------------------\n",
"# 13. Outputs\n",
"# ------------------------------------------------------------\n",
"df.to_csv(\"aum_flow_gaps.csv\", index=False)\n",
"rupture_isin_summary.to_csv(\"rupture_isin_summary.csv\", index=False)\n",
"rupture_summary.to_csv(\"rupture_summary.csv\", index=False)\n",
"\n",
"print(f\"Detection threshold epsilon (trimmed 99th percentile): {EPSILON:.2%}\")\n"
]
},
{
"cell_type": "code",
2026-03-10 18:45:51 +01:00
"execution_count": 10,
2026-02-02 00:52:00 +01:00
"id": "d7454212-1493-4715-a436-c331931f92fa",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>Registrar Account - ID</th>\n",
" <th>Product - Isin</th>\n",
" <th>n_ruptures</th>\n",
" <th>total_obs</th>\n",
" <th>rupture_ratio</th>\n",
2026-03-10 18:45:51 +01:00
" <th>max_gap_abs</th>\n",
" <th>max_gap_rel</th>\n",
2026-02-02 00:52:00 +01:00
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
2026-03-10 18:45:51 +01:00
" <th>59545</th>\n",
" <td>200127410</td>\n",
" <td>FR0010135103</td>\n",
" <td>384</td>\n",
" <td>436</td>\n",
" <td>0.880734</td>\n",
" <td>295985.42</td>\n",
" <td>3371.158214</td>\n",
2026-02-02 00:52:00 +01:00
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" Registrar Account - ID Product - Isin n_ruptures total_obs \\\n",
2026-03-10 18:45:51 +01:00
"59545 200127410 FR0010135103 384 436 \n",
2026-02-02 00:52:00 +01:00
"\n",
2026-03-10 18:45:51 +01:00
" rupture_ratio max_gap_abs max_gap_rel \n",
"59545 0.880734 295985.42 3371.158214 "
2026-02-02 00:52:00 +01:00
]
},
2026-03-10 18:45:51 +01:00
"execution_count": 10,
2026-02-02 00:52:00 +01:00
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"rupture_isin_summary.sort_values(\"rupture_ratio\").head(1)\n",
2026-03-10 18:45:51 +01:00
"rupture_isin_summary.sort_values(\"rupture_ratio\", ascending=False).head(1)"
2026-02-02 00:52:00 +01:00
]
},
{
"cell_type": "code",
2026-03-10 18:45:51 +01:00
"execution_count": 25,
2026-02-02 00:52:00 +01:00
"id": "b4040847-e0cf-4aa5-966c-d1fbf3935b7d",
"metadata": {},
"outputs": [],
"source": [
"def plot_isin_evolution(df, account_id, isin, title_suffix=\"\"):\n",
"    \"\"\"Plot observed vs. expected stock for one (account, ISIN) pair.\n",
"\n",
"    Rows whose ``rupture_flag`` is true are overlaid as red markers on the\n",
"    observed series. Prints a message and returns without plotting when the\n",
"    pair has no rows in ``df``.\n",
"\n",
"    Parameters\n",
"    ----------\n",
"    df : pandas.DataFrame\n",
"        Must contain ``Registrar Account - ID``, ``Product - Isin``,\n",
"        ``Centralisation Date``, ``Quantity - AUM``, ``expected_stock`` and a\n",
"        boolean ``rupture_flag`` column (it is used directly as a row mask).\n",
"    account_id, isin\n",
"        Values identifying the pair to plot.\n",
"    title_suffix : str, optional\n",
"        Text appended to the figure title.\n",
"    \"\"\"\n",
"    pair_mask = (\n",
"        (df[\"Registrar Account - ID\"] == account_id) &\n",
"        (df[\"Product - Isin\"] == isin)\n",
"    )\n",
"    # Sort chronologically: line plots connect points in row order, so an\n",
"    # unsorted frame would draw a zig-zag instead of a time series.\n",
"    sub = df[pair_mask].sort_values(\"Centralisation Date\")\n",
"\n",
"    if sub.empty:\n",
"        print(\"No data for this (account, ISIN).\")\n",
"        return\n",
"\n",
"    fig, ax = plt.subplots(figsize=(10, 4))\n",
"\n",
"    # Observed stock\n",
"    ax.plot(\n",
"        sub[\"Centralisation Date\"],\n",
"        sub[\"Quantity - AUM\"],\n",
"        label=\"Observed stock\",\n",
"        linewidth=2\n",
"    )\n",
"\n",
"    # Expected stock\n",
"    ax.plot(\n",
"        sub[\"Centralisation Date\"],\n",
"        sub[\"expected_stock\"],\n",
"        label=\"Expected stock\",\n",
"        linestyle=\"--\"\n",
"    )\n",
"\n",
"    # Ruptures, drawn on top of both lines\n",
"    rupt = sub[sub[\"rupture_flag\"]]\n",
"    ax.scatter(\n",
"        rupt[\"Centralisation Date\"],\n",
"        rupt[\"Quantity - AUM\"],\n",
"        color=\"red\",\n",
"        label=\"Rupture\",\n",
"        zorder=5\n",
"    )\n",
"\n",
"    ax.set_title(f\"ISIN {isin} — Account {account_id} {title_suffix}\")\n",
"    ax.set_xlabel(\"Date\")\n",
"    ax.set_ylabel(\"AUM (shares)\")\n",
"    ax.legend()\n",
"    ax.grid(True)\n",
"    fig.tight_layout()\n",
"    plt.show()"
]
},
{
"cell_type": "code",
2026-03-10 18:45:51 +01:00
"execution_count": 27,
2026-02-02 00:52:00 +01:00
"id": "e5d7a5ab-40bd-452d-a6ae-d56e220c592f",
"metadata": {},
"outputs": [
{
2026-03-10 18:45:51 +01:00
"ename": "NameError",
"evalue": "name 'plot_isin_dynamics' is not defined",
"output_type": "error",
"traceback": [
"\u001b[31m---------------------------------------------------------------------------\u001b[39m",
"\u001b[31mNameError\u001b[39m Traceback (most recent call last)",
"\u001b[36mCell\u001b[39m\u001b[36m \u001b[39m\u001b[32mIn[27]\u001b[39m\u001b[32m, line 63\u001b[39m\n\u001b[32m 58\u001b[39m plt.show()\n\u001b[32m 62\u001b[39m \u001b[38;5;28;01mfor\u001b[39;00m _, row \u001b[38;5;129;01min\u001b[39;00m sample_isin.iterrows():\n\u001b[32m---> \u001b[39m\u001b[32m63\u001b[39m \u001b[43mplot_isin_dynamics\u001b[49m(\n\u001b[32m 64\u001b[39m df,\n\u001b[32m 65\u001b[39m row[\u001b[33m\"\u001b[39m\u001b[33mRegistrar Account - ID\u001b[39m\u001b[33m\"\u001b[39m],\n\u001b[32m 66\u001b[39m row[\u001b[33m\"\u001b[39m\u001b[33mProduct - Isin\u001b[39m\u001b[33m\"\u001b[39m]\n\u001b[32m 67\u001b[39m )\n",
"\u001b[31mNameError\u001b[39m: name 'plot_isin_dynamics' is not defined"
]
2026-02-02 00:52:00 +01:00
}
],
"source": [
"# Option B (alternative): the most severe cases\n",
"# sample_isin = problematic_isin.sort_values(\n",
"# \"rupture_ratio\", ascending=False\n",
"# ).head(10)\n",
"\n",
2026-03-10 18:45:51 +01:00
"# Keep the ten (account, ISIN) pairs with the highest rupture ratio.\n",
"sample_isin = (\n",
"    rupture_isin_summary\n",
"    .sort_values(\"rupture_ratio\", ascending=False)\n",
"    .head(10)\n",
")\n",
"\n",
2026-02-02 00:52:00 +01:00
"def plot_isin_dynamics_clean(df, account_id, isin):\n",
"    \"\"\"Draw observed vs. flow-implied AUM for one (account, ISIN) pair.\n",
"\n",
"    The pair's rows are ordered by ``Centralisation Date``; rows whose\n",
"    ``rupture_flag`` is true are highlighted as red dots. Returns silently,\n",
"    without creating a figure, when the pair has no rows in ``df``.\n",
"    \"\"\"\n",
"    is_pair = (\n",
"        (df[\"Registrar Account - ID\"] == account_id) &\n",
"        (df[\"Product - Isin\"] == isin)\n",
"    )\n",
"    history = df[is_pair].sort_values(\"Centralisation Date\")\n",
"    if history.empty:\n",
"        return\n",
"\n",
"    dates = history[\"Centralisation Date\"]\n",
"    fig, ax = plt.subplots(figsize=(7.5, 3))\n",
"\n",
"    # Observed AUM\n",
"    ax.plot(dates, history[\"Quantity - AUM\"],\n",
"            label=\"Observed AUM\", linewidth=2, color=\"black\")\n",
"\n",
"    # Expected (flow-implied) AUM\n",
"    ax.plot(dates, history[\"expected_stock\"],\n",
"            label=\"Flow-implied AUM\", linestyle=\"--\", linewidth=2, color=\"grey\")\n",
"\n",
"    # Flagged discontinuities, drawn above both lines\n",
"    flagged = history[history[\"rupture_flag\"]]\n",
"    ax.scatter(flagged[\"Centralisation Date\"], flagged[\"Quantity - AUM\"],\n",
"               color=\"red\", s=25, zorder=5, label=\"Discontinuity\")\n",
"\n",
"    ax.set_title(f\"Account {account_id} — ISIN {isin}\", fontsize=11)\n",
"    ax.set_xlabel(\"\")\n",
"    ax.set_ylabel(\"AUM (shares)\")\n",
"    ax.legend(loc=\"best\")\n",
"\n",
"    fig.tight_layout()\n",
"    plt.show()\n",
"\n",
"\n",
"\n",
"# Plot every sampled pair with the helper defined in this cell. The previous\n",
"# call targeted `plot_isin_dynamics`, which is not defined and raised NameError.\n",
"for account_id, isin in zip(\n",
"    sample_isin[\"Registrar Account - ID\"],\n",
"    sample_isin[\"Product - Isin\"],\n",
"):\n",
"    plot_isin_dynamics_clean(df, account_id, isin)"
2026-02-02 00:52:00 +01:00
]
},
{
"cell_type": "code",
2026-03-10 18:45:51 +01:00
"execution_count": 28,
2026-02-02 00:52:00 +01:00
"id": "aef8ceb9-28a6-4908-ae24-a88d85b64309",
"metadata": {},
"outputs": [
{
2026-03-10 18:45:51 +01:00
"ename": "KeyError",
"evalue": "\"Column(s) ['rupture_flag'] do not exist\"",
"output_type": "error",
"traceback": [
"\u001b[31m---------------------------------------------------------------------------\u001b[39m",
"\u001b[31mKeyError\u001b[39m Traceback (most recent call last)",
"\u001b[36mCell\u001b[39m\u001b[36m \u001b[39m\u001b[32mIn[28]\u001b[39m\u001b[32m, line 6\u001b[39m\n\u001b[32m 1\u001b[39m \u001b[38;5;66;03m# ------------------------------------------------------------\u001b[39;00m\n\u001b[32m 2\u001b[39m \u001b[38;5;66;03m# 1. Aggregate rupture rate over time\u001b[39;00m\n\u001b[32m 3\u001b[39m \u001b[38;5;66;03m# ------------------------------------------------------------\u001b[39;00m\n\u001b[32m 4\u001b[39m time_stats = (\n\u001b[32m 5\u001b[39m \u001b[43mdf\u001b[49m\u001b[43m.\u001b[49m\u001b[43mgroupby\u001b[49m\u001b[43m(\u001b[49m\u001b[33;43m\"\u001b[39;49m\u001b[33;43mCentralisation Date\u001b[39;49m\u001b[33;43m\"\u001b[39;49m\u001b[43m)\u001b[49m\n\u001b[32m----> \u001b[39m\u001b[32m6\u001b[39m \u001b[43m \u001b[49m\u001b[43m.\u001b[49m\u001b[43magg\u001b[49m\u001b[43m(\u001b[49m\n\u001b[32m 7\u001b[39m \u001b[43m \u001b[49m\u001b[43mtotal_obs\u001b[49m\u001b[43m=\u001b[49m\u001b[43m(\u001b[49m\u001b[33;43m\"\u001b[39;49m\u001b[33;43mrupture_flag\u001b[39;49m\u001b[33;43m\"\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[33;43m\"\u001b[39;49m\u001b[33;43mcount\u001b[39;49m\u001b[33;43m\"\u001b[39;49m\u001b[43m)\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 8\u001b[39m \u001b[43m \u001b[49m\u001b[43mn_ruptures\u001b[49m\u001b[43m=\u001b[49m\u001b[43m(\u001b[49m\u001b[33;43m\"\u001b[39;49m\u001b[33;43mrupture_flag\u001b[39;49m\u001b[33;43m\"\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[33;43m\"\u001b[39;49m\u001b[33;43msum\u001b[39;49m\u001b[33;43m\"\u001b[39;49m\u001b[43m)\u001b[49m\n\u001b[32m 9\u001b[39m \u001b[43m \u001b[49m\u001b[43m)\u001b[49m\n\u001b[32m 10\u001b[39m .reset_index()\n\u001b[32m 11\u001b[39m )\n\u001b[32m 13\u001b[39m time_stats[\u001b[33m\"\u001b[39m\u001b[33mrupture_rate\u001b[39m\u001b[33m\"\u001b[39m] = (\n\u001b[32m 14\u001b[39m time_stats[\u001b[33m\"\u001b[39m\u001b[33mn_ruptures\u001b[39m\u001b[33m\"\u001b[39m] / 
time_stats[\u001b[33m\"\u001b[39m\u001b[33mtotal_obs\u001b[39m\u001b[33m\"\u001b[39m]\n\u001b[32m 15\u001b[39m )\n\u001b[32m 17\u001b[39m \u001b[38;5;66;03m# ------------------------------------------------------------\u001b[39;00m\n\u001b[32m 18\u001b[39m \u001b[38;5;66;03m# 2. Smooth (optional but recommended for readability)\u001b[39;00m\n\u001b[32m 19\u001b[39m \u001b[38;5;66;03m# ------------------------------------------------------------\u001b[39;00m\n",
"\u001b[36mFile \u001b[39m\u001b[32m/opt/python/lib/python3.13/site-packages/pandas/core/groupby/generic.py:1432\u001b[39m, in \u001b[36mDataFrameGroupBy.aggregate\u001b[39m\u001b[34m(self, func, engine, engine_kwargs, *args, **kwargs)\u001b[39m\n\u001b[32m 1429\u001b[39m kwargs[\u001b[33m\"\u001b[39m\u001b[33mengine_kwargs\u001b[39m\u001b[33m\"\u001b[39m] = engine_kwargs\n\u001b[32m 1431\u001b[39m op = GroupByApply(\u001b[38;5;28mself\u001b[39m, func, args=args, kwargs=kwargs)\n\u001b[32m-> \u001b[39m\u001b[32m1432\u001b[39m result = \u001b[43mop\u001b[49m\u001b[43m.\u001b[49m\u001b[43magg\u001b[49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m\n\u001b[32m 1433\u001b[39m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m is_dict_like(func) \u001b[38;5;129;01mand\u001b[39;00m result \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m:\n\u001b[32m 1434\u001b[39m \u001b[38;5;66;03m# GH #52849\u001b[39;00m\n\u001b[32m 1435\u001b[39m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;28mself\u001b[39m.as_index \u001b[38;5;129;01mand\u001b[39;00m is_list_like(func):\n",
"\u001b[36mFile \u001b[39m\u001b[32m/opt/python/lib/python3.13/site-packages/pandas/core/apply.py:190\u001b[39m, in \u001b[36mApply.agg\u001b[39m\u001b[34m(self)\u001b[39m\n\u001b[32m 187\u001b[39m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28mself\u001b[39m.apply_str()\n\u001b[32m 189\u001b[39m \u001b[38;5;28;01mif\u001b[39;00m is_dict_like(func):\n\u001b[32m--> \u001b[39m\u001b[32m190\u001b[39m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28;43mself\u001b[39;49m\u001b[43m.\u001b[49m\u001b[43magg_dict_like\u001b[49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m\n\u001b[32m 191\u001b[39m \u001b[38;5;28;01melif\u001b[39;00m is_list_like(func):\n\u001b[32m 192\u001b[39m \u001b[38;5;66;03m# we require a list, but not a 'str'\u001b[39;00m\n\u001b[32m 193\u001b[39m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28mself\u001b[39m.agg_list_like()\n",
"\u001b[36mFile \u001b[39m\u001b[32m/opt/python/lib/python3.13/site-packages/pandas/core/apply.py:423\u001b[39m, in \u001b[36mApply.agg_dict_like\u001b[39m\u001b[34m(self)\u001b[39m\n\u001b[32m 415\u001b[39m \u001b[38;5;28;01mdef\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[34magg_dict_like\u001b[39m(\u001b[38;5;28mself\u001b[39m) -> DataFrame | Series:\n\u001b[32m 416\u001b[39m \u001b[38;5;250m \u001b[39m\u001b[33;03m\"\"\"\u001b[39;00m\n\u001b[32m 417\u001b[39m \u001b[33;03m Compute aggregation in the case of a dict-like argument.\u001b[39;00m\n\u001b[32m 418\u001b[39m \n\u001b[32m (...)\u001b[39m\u001b[32m 421\u001b[39m \u001b[33;03m Result of aggregation.\u001b[39;00m\n\u001b[32m 422\u001b[39m \u001b[33;03m \"\"\"\u001b[39;00m\n\u001b[32m--> \u001b[39m\u001b[32m423\u001b[39m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28;43mself\u001b[39;49m\u001b[43m.\u001b[49m\u001b[43magg_or_apply_dict_like\u001b[49m\u001b[43m(\u001b[49m\u001b[43mop_name\u001b[49m\u001b[43m=\u001b[49m\u001b[33;43m\"\u001b[39;49m\u001b[33;43magg\u001b[39;49m\u001b[33;43m\"\u001b[39;49m\u001b[43m)\u001b[49m\n",
"\u001b[36mFile \u001b[39m\u001b[32m/opt/python/lib/python3.13/site-packages/pandas/core/apply.py:1603\u001b[39m, in \u001b[36mGroupByApply.agg_or_apply_dict_like\u001b[39m\u001b[34m(self, op_name)\u001b[39m\n\u001b[32m 1598\u001b[39m kwargs.update({\u001b[33m\"\u001b[39m\u001b[33mengine\u001b[39m\u001b[33m\"\u001b[39m: engine, \u001b[33m\"\u001b[39m\u001b[33mengine_kwargs\u001b[39m\u001b[33m\"\u001b[39m: engine_kwargs})\n\u001b[32m 1600\u001b[39m \u001b[38;5;28;01mwith\u001b[39;00m com.temp_setattr(\n\u001b[32m 1601\u001b[39m obj, \u001b[33m\"\u001b[39m\u001b[33mas_index\u001b[39m\u001b[33m\"\u001b[39m, \u001b[38;5;28;01mTrue\u001b[39;00m, condition=\u001b[38;5;28mhasattr\u001b[39m(obj, \u001b[33m\"\u001b[39m\u001b[33mas_index\u001b[39m\u001b[33m\"\u001b[39m)\n\u001b[32m 1602\u001b[39m ):\n\u001b[32m-> \u001b[39m\u001b[32m1603\u001b[39m result_index, result_data = \u001b[38;5;28;43mself\u001b[39;49m\u001b[43m.\u001b[49m\u001b[43mcompute_dict_like\u001b[49m\u001b[43m(\u001b[49m\n\u001b[32m 1604\u001b[39m \u001b[43m \u001b[49m\u001b[43mop_name\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mselected_obj\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mselection\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mkwargs\u001b[49m\n\u001b[32m 1605\u001b[39m \u001b[43m \u001b[49m\u001b[43m)\u001b[49m\n\u001b[32m 1606\u001b[39m result = \u001b[38;5;28mself\u001b[39m.wrap_results_dict_like(selected_obj, result_index, result_data)\n\u001b[32m 1607\u001b[39m \u001b[38;5;28;01mreturn\u001b[39;00m result\n",
"\u001b[36mFile \u001b[39m\u001b[32m/opt/python/lib/python3.13/site-packages/pandas/core/apply.py:462\u001b[39m, in \u001b[36mApply.compute_dict_like\u001b[39m\u001b[34m(self, op_name, selected_obj, selection, kwargs)\u001b[39m\n\u001b[32m 460\u001b[39m is_groupby = \u001b[38;5;28misinstance\u001b[39m(obj, (DataFrameGroupBy, SeriesGroupBy))\n\u001b[32m 461\u001b[39m func = cast(AggFuncTypeDict, \u001b[38;5;28mself\u001b[39m.func)\n\u001b[32m--> \u001b[39m\u001b[32m462\u001b[39m func = \u001b[38;5;28;43mself\u001b[39;49m\u001b[43m.\u001b[49m\u001b[43mnormalize_dictlike_arg\u001b[49m\u001b[43m(\u001b[49m\u001b[43mop_name\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mselected_obj\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mfunc\u001b[49m\u001b[43m)\u001b[49m\n\u001b[32m 464\u001b[39m is_non_unique_col = (\n\u001b[32m 465\u001b[39m selected_obj.ndim == \u001b[32m2\u001b[39m\n\u001b[32m 466\u001b[39m \u001b[38;5;129;01mand\u001b[39;00m selected_obj.columns.nunique() < \u001b[38;5;28mlen\u001b[39m(selected_obj.columns)\n\u001b[32m 467\u001b[39m )\n\u001b[32m 469\u001b[39m \u001b[38;5;28;01mif\u001b[39;00m selected_obj.ndim == \u001b[32m1\u001b[39m:\n\u001b[32m 470\u001b[39m \u001b[38;5;66;03m# key only used for output\u001b[39;00m\n",
"\u001b[36mFile \u001b[39m\u001b[32m/opt/python/lib/python3.13/site-packages/pandas/core/apply.py:663\u001b[39m, in \u001b[36mApply.normalize_dictlike_arg\u001b[39m\u001b[34m(self, how, obj, func)\u001b[39m\n\u001b[32m 661\u001b[39m cols = Index(\u001b[38;5;28mlist\u001b[39m(func.keys())).difference(obj.columns, sort=\u001b[38;5;28;01mTrue\u001b[39;00m)\n\u001b[32m 662\u001b[39m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28mlen\u001b[39m(cols) > \u001b[32m0\u001b[39m:\n\u001b[32m--> \u001b[39m\u001b[32m663\u001b[39m \u001b[38;5;28;01mraise\u001b[39;00m \u001b[38;5;167;01mKeyError\u001b[39;00m(\u001b[33mf\u001b[39m\u001b[33m\"\u001b[39m\u001b[33mColumn(s) \u001b[39m\u001b[38;5;132;01m{\u001b[39;00m\u001b[38;5;28mlist\u001b[39m(cols)\u001b[38;5;132;01m}\u001b[39;00m\u001b[33m do not exist\u001b[39m\u001b[33m\"\u001b[39m)\n\u001b[32m 665\u001b[39m aggregator_types = (\u001b[38;5;28mlist\u001b[39m, \u001b[38;5;28mtuple\u001b[39m, \u001b[38;5;28mdict\u001b[39m)\n\u001b[32m 667\u001b[39m \u001b[38;5;66;03m# if we have a dict of any non-scalars\u001b[39;00m\n\u001b[32m 668\u001b[39m \u001b[38;5;66;03m# eg. {'A' : ['mean']}, normalize all to\u001b[39;00m\n\u001b[32m 669\u001b[39m \u001b[38;5;66;03m# be list-likes\u001b[39;00m\n\u001b[32m 670\u001b[39m \u001b[38;5;66;03m# Cannot use func.values() because arg may be a Series\u001b[39;00m\n",
"\u001b[31mKeyError\u001b[39m: \"Column(s) ['rupture_flag'] do not exist\""
]
2026-02-02 00:52:00 +01:00
}
],
"source": [
"# ------------------------------------------------------------\n",
"# 1. Share of flagged observations per centralisation date\n",
"# ------------------------------------------------------------\n",
"time_stats = df.groupby(\"Centralisation Date\", as_index=False).agg(\n",
"    total_obs=(\"rupture_flag\", \"count\"),\n",
"    n_ruptures=(\"rupture_flag\", \"sum\"),\n",
")\n",
"time_stats[\"rupture_rate\"] = time_stats[\"n_ruptures\"].div(time_stats[\"total_obs\"])\n",
"\n",
"# ------------------------------------------------------------\n",
"# 2. Smoothing for readability: centred rolling mean over 6 rows\n",
"#    (about half a year if there is one row per month - TODO confirm)\n",
"# ------------------------------------------------------------\n",
"time_stats[\"rupture_rate_ma\"] = (\n",
"    time_stats[\"rupture_rate\"].rolling(window=6, center=True).mean()\n",
")\n",
"\n",
"# ------------------------------------------------------------\n",
"# 3. Raw rate (light grey) with its moving average (blue) on top\n",
"# ------------------------------------------------------------\n",
"plt.figure(figsize=(12, 5))\n",
"\n",
"plt.plot(time_stats[\"Centralisation Date\"], time_stats[\"rupture_rate\"] * 100,\n",
"         color=\"lightgray\", linewidth=1, alpha=0.6,\n",
"         label=\"Monthly rupture rate\")\n",
"plt.plot(time_stats[\"Centralisation Date\"], time_stats[\"rupture_rate_ma\"] * 100,\n",
"         color=\"#1f77b4\", linewidth=2.5,\n",
"         label=\"6-month moving average\")\n",
"\n",
"plt.ylabel(\"Rupture rate (%)\")\n",
"plt.xlabel(\"Date\")\n",
"plt.grid(True, linestyle=\"--\", alpha=0.4)\n",
"plt.legend(frameon=False)\n",
"\n",
"plt.tight_layout()\n",
"plt.show()\n"
]
},
{
"cell_type": "code",
2026-03-10 18:45:51 +01:00
"execution_count": 14,
2026-02-02 00:52:00 +01:00
"id": "d6ee0c24-e14e-4c40-97d4-49879229790c",
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
2026-03-10 18:45:51 +01:00
"/tmp/ipykernel_1311/1047489516.py:6: FutureWarning:\n",
2026-02-02 00:52:00 +01:00
"\n",
"DataFrameGroupBy.apply operated on the grouping columns. This behavior is deprecated, and in a future version of pandas the grouping columns will be excluded from the operation. Either pass `include_groups=False` to exclude the groupings or explicitly select the grouping columns after groupby to silence this warning.\n",
"\n"
]
},
{
"data": {
"text/plain": [
"has_reset\n",
"True 64192\n",
"False 15545\n",
"Name: count, dtype: int64"
]
},
2026-03-10 18:45:51 +01:00
"execution_count": 14,
2026-02-02 00:52:00 +01:00
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"EPS = 1e-6  # numerical threshold below which a quantity is treated as zero\n",
"\n",
"# Row-level mask: observed AND expected stock are both (numerically) zero.\n",
"# Computing it once and reducing with groupby.any() avoids calling a Python\n",
"# lambda per group through groupby.apply, which also operated on the grouping\n",
"# columns (deprecated in pandas, see the FutureWarning it used to emit).\n",
"both_zero = (\n",
"    df[\"Quantity - AUM\"].abs().lt(EPS) &\n",
"    df[\"expected_stock\"].abs().lt(EPS)\n",
")\n",
"\n",
"# One row per (account, ISIN): True when at least one observation is a reset.\n",
"reset_candidates = (\n",
"    both_zero\n",
"    .groupby([df[\"Registrar Account - ID\"], df[\"Product - Isin\"]])\n",
"    .any()\n",
"    .reset_index(name=\"has_reset\")\n",
")\n",
"\n",
"reset_candidates[\"has_reset\"].value_counts()\n"
]
},
{
"cell_type": "code",
2026-03-10 18:45:51 +01:00
"execution_count": 47,
"id": "601f61b8-0115-431d-97de-6ec5a0f1d4f4",
2026-02-02 00:52:00 +01:00
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
" Before repair After repair Repaired points\n",
"0 756392 22357 18440\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
2026-03-10 18:45:51 +01:00
"/tmp/ipykernel_1311/3061846510.py:66: FutureWarning:\n",
2026-02-02 00:52:00 +01:00
"\n",
"DataFrameGroupBy.apply operated on the grouping columns. This behavior is deprecated, and in a future version of pandas the grouping columns will be excluded from the operation. Either pass `include_groups=False` to exclude the groupings or explicitly select the grouping columns after groupby to silence this warning.\n",
"\n"
]
}
],
"source": [
"# Thresholds used by the repair step below.\n",
"GAP_TOL = 1e-6        # absolute tolerance when comparing gaps / flows\n",
"REL_GAP_THR = 0.05    # minimum relative gap for a shift candidate\n",
"MIN_PERSISTENCE = 3   # length of the window over which the gap must persist\n",
"\n",
"# Working frame: one chronologically ordered block per (account, ISIN), with the\n",
"# corrected series initialised to the observed one and nothing flagged yet.\n",
"df = (\n",
"    merged_isin\n",
"    .sort_values(\n",
"        [\"Registrar Account - ID\", \"Product - Isin\", \"Centralisation Date\"]\n",
"    )\n",
"    .assign(\n",
"        corrected_aum=lambda frame: frame[\"Quantity - AUM\"],\n",
"        repair_flag=False,\n",
"    )\n",
")\n",
"\n",
"def repair_group(g, gap_tol=None, rel_gap_thr=None, min_persistence=None):\n",
"    \"\"\"Repair the first persistent AUM discontinuity of one (account, ISIN) group.\n",
"\n",
"    The expected stock at position ``t`` is ``AUM[t-1] + NetFlows[t-1]``. The\n",
"    first position whose relative gap exceeds ``rel_gap_thr`` and whose gap and\n",
"    flows stay constant (within ``gap_tol``) over ``min_persistence``\n",
"    observations is taken as the start of a shift; that gap is then subtracted\n",
"    from the observed AUM from there to the end of the group.\n",
"\n",
"    Parameters\n",
"    ----------\n",
"    g : pandas.DataFrame\n",
"        Rows of a single group with the columns ``\"Quantity - AUM\"`` and\n",
"        ``\"Quantity - NetFlows\"``; expected to be in chronological order\n",
"        (the caller sorts by date before grouping).\n",
"    gap_tol, rel_gap_thr, min_persistence : optional\n",
"        Overrides for the notebook constants ``GAP_TOL``, ``REL_GAP_THR`` and\n",
"        ``MIN_PERSISTENCE``, which are used when these are left as ``None``.\n",
"\n",
"    Returns\n",
"    -------\n",
"    pandas.DataFrame\n",
"        Copy of ``g`` with ``corrected_aum``, ``expected_stock_corr`` and\n",
"        ``repair_flag`` populated for every row, whether or not a repair was\n",
"        applied.\n",
"    \"\"\"\n",
"    if gap_tol is None:\n",
"        gap_tol = GAP_TOL\n",
"    if rel_gap_thr is None:\n",
"        rel_gap_thr = REL_GAP_THR\n",
"    if min_persistence is None:\n",
"        min_persistence = MIN_PERSISTENCE\n",
"\n",
"    g = g.copy()\n",
"    n_obs = len(g)\n",
"\n",
"    # Work on float arrays: the expected path starts with NaN, which an\n",
"    # integer-typed array (as produced by np.empty_like on int input) cannot hold.\n",
"    observed = g[\"Quantity - AUM\"].astype(float).to_numpy()\n",
"    net_flows = g[\"Quantity - NetFlows\"].astype(float).to_numpy()\n",
"\n",
"    def _expected_path(levels):\n",
"        # expected[t] = levels[t-1] + net_flows[t-1]; the first observation has\n",
"        # no predecessor and stays NaN.\n",
"        path = np.full(n_obs, np.nan)\n",
"        path[1:] = levels[:-1] + net_flows[:-1]\n",
"        return path\n",
"\n",
"    expected = _expected_path(observed)\n",
"    gap = observed - expected\n",
"    rel_gap = np.abs(gap) / np.maximum(np.abs(expected), 1.0)\n",
"\n",
"    # First position where a large gap persists with unchanged flows.\n",
"    # NOTE(review): the upper bound skips the last full window (start\n",
"    # n_obs - min_persistence); kept as-is, confirm whether that is intended.\n",
"    shift_start = None\n",
"    for i in range(1, n_obs - min_persistence):\n",
"        window = slice(i, i + min_persistence)\n",
"        if (\n",
"            rel_gap[i] > rel_gap_thr\n",
"            and np.all(np.abs(gap[window] - gap[i]) < gap_tol)\n",
"            and np.all(np.abs(np.diff(net_flows[window])) < gap_tol)\n",
"        ):\n",
"            shift_start = i\n",
"            break\n",
"\n",
"    corrected = observed.copy()\n",
"    if \"repair_flag\" not in g.columns:\n",
"        g[\"repair_flag\"] = False\n",
"\n",
"    if shift_start is not None:\n",
"        # Remove the level shift from its first occurrence onwards.\n",
"        corrected[shift_start:] -= gap[shift_start]\n",
"        # Positional assignment: a label slice (g.loc[g.index[i]:]) flags the\n",
"        # wrong rows or raises when the index contains duplicate labels.\n",
"        g.iloc[shift_start:, g.columns.get_loc(\"repair_flag\")] = True\n",
"\n",
"    # Always emit both columns. Groups without a repair previously came back\n",
"    # without `expected_stock_corr`, so their post-repair gap was NaN and their\n",
"    # ruptures silently dropped out of the \"after repair\" count.\n",
"    g[\"corrected_aum\"] = corrected\n",
"    g[\"expected_stock_corr\"] = _expected_path(corrected)\n",
"\n",
"    return g\n",
"\n",
2026-03-10 18:45:51 +01:00
"\n",
2026-02-02 00:52:00 +01:00
"# Apply the repair group by group. The columns are selected explicitly so that\n",
"# `repair_group` still receives the grouping columns without relying on the\n",
"# deprecated behaviour of groupby.apply operating on them implicitly.\n",
"df = (\n",
"    df\n",
"    .groupby([\"Registrar Account - ID\", \"Product - Isin\"], group_keys=False)[\n",
"        df.columns.tolist()\n",
"    ]\n",
"    .apply(repair_group)\n",
")\n",
"\n",
"# Recompute gaps & ruptures on the raw and on the corrected series\n",
"df[\"gap_before\"] = df[\"Quantity - AUM\"] - df[\"expected_stock\"]\n",
"df[\"gap_after\"] = df[\"corrected_aum\"] - df[\"expected_stock_corr\"]\n",
"\n",
"# A NaN gap (e.g. the first observation of a group) compares as False here,\n",
"# i.e. it is not counted as a rupture.\n",
"df[\"rupture_before\"] = df[\"gap_before\"].abs() > GAP_TOL\n",
"df[\"rupture_after\"] = df[\"gap_after\"].abs() > GAP_TOL\n",
"\n",
"summary = pd.DataFrame({\n",
"    \"Before repair\": [df[\"rupture_before\"].sum()],\n",
"    \"After repair\": [df[\"rupture_after\"].sum()],\n",
"    \"Repaired points\": [df[\"repair_flag\"].sum()]\n",
"})\n",
"\n",
"print(summary)"
2026-02-02 00:52:00 +01:00
]
},
{
"cell_type": "code",
2026-03-10 18:45:51 +01:00
"execution_count": 30,
2026-02-02 00:52:00 +01:00
"id": "62583cfe-a6e7-4931-a63e-4273dca97ff7",
"metadata": {},
"outputs": [
{
2026-03-10 18:45:51 +01:00
"ename": "NameError",
"evalue": "name 'df_final' is not defined",
"output_type": "error",
"traceback": [
"\u001b[31m---------------------------------------------------------------------------\u001b[39m",
"\u001b[31mNameError\u001b[39m Traceback (most recent call last)",
"\u001b[36mCell\u001b[39m\u001b[36m \u001b[39m\u001b[32mIn[30]\u001b[39m\u001b[32m, line 4\u001b[39m\n\u001b[32m 1\u001b[39m \u001b[38;5;28;01mimport\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[34;01mplotly\u001b[39;00m\u001b[34;01m.\u001b[39;00m\u001b[34;01mgraph_objects\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[38;5;28;01mas\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[34;01mgo\u001b[39;00m\n\u001b[32m 2\u001b[39m \u001b[38;5;28;01mimport\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[34;01mpandas\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[38;5;28;01mas\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[34;01mpd\u001b[39;00m\n\u001b[32m----> \u001b[39m\u001b[32m4\u001b[39m df_final = \u001b[43mdf_final\u001b[49m.rename(columns={\n\u001b[32m 5\u001b[39m \u001b[33m\"\u001b[39m\u001b[33mQuantity - AUM\u001b[39m\u001b[33m\"\u001b[39m: \u001b[33m\"\u001b[39m\u001b[33maum_raw\u001b[39m\u001b[33m\"\u001b[39m,\n\u001b[32m 6\u001b[39m \u001b[33m\"\u001b[39m\u001b[33mcorrected_aum\u001b[39m\u001b[33m\"\u001b[39m: \u001b[33m\"\u001b[39m\u001b[33maum_repaired\u001b[39m\u001b[33m\"\u001b[39m,\n\u001b[32m 7\u001b[39m \u001b[33m\"\u001b[39m\u001b[33mQuantity - NetFlows\u001b[39m\u001b[33m\"\u001b[39m: \u001b[33m\"\u001b[39m\u001b[33mflows\u001b[39m\u001b[33m\"\u001b[39m,\n\u001b[32m 8\u001b[39m \u001b[33m\"\u001b[39m\u001b[33mexpected_stock\u001b[39m\u001b[33m\"\u001b[39m: \u001b[33m\"\u001b[39m\u001b[33mexpected_aum_raw\u001b[39m\u001b[33m\"\u001b[39m,\n\u001b[32m 9\u001b[39m \u001b[33m\"\u001b[39m\u001b[33mexpected_stock_corr\u001b[39m\u001b[33m\"\u001b[39m: \u001b[33m\"\u001b[39m\u001b[33mexpected_aum_repaired\u001b[39m\u001b[33m\"\u001b[39m\n\u001b[32m 10\u001b[39m })\n\u001b[32m 12\u001b[39m df[\u001b[33m\"\u001b[39m\u001b[33mgap_before\u001b[39m\u001b[33m\"\u001b[39m] = df[\u001b[33m\"\u001b[39m\u001b[33mQuantity - AUM\u001b[39m\u001b[33m\"\u001b[39m] - df[\u001b[33m\"\u001b[39m\u001b[33mexpected_stock\u001b[39m\u001b[33m\"\u001b[39m]\n\u001b[32m 13\u001b[39m 
df[\u001b[33m\"\u001b[39m\u001b[33mgap_after\u001b[39m\u001b[33m\"\u001b[39m] = df[\u001b[33m\"\u001b[39m\u001b[33mcorrected_aum\u001b[39m\u001b[33m\"\u001b[39m] - df[\u001b[33m\"\u001b[39m\u001b[33mexpected_stock_corr\u001b[39m\u001b[33m\"\u001b[39m]\n",
"\u001b[31mNameError\u001b[39m: name 'df_final' is not defined"
]
}
],
"source": [
"# `pd` and `go` (plotly.graph_objects) are imported in the notebook's first cell.\n",
"\n",
"# ============================================================\n",
"# Parameters (fixed epsilon)\n",
"# ============================================================\n",
"GAP_EPS = 100  # fixed tolerance for accounting identity\n",
"\n",
"# Rupture intensity classes (fixed bins on the per-pair rupture ratio)\n",
"bins = [0.0, 0.01, 0.10, 0.30, 1.0]\n",
"labels = [\n",
"    \"Clean / quasi-clean (≤1%)\",\n",
"    \"Moderate (1– 10%)\",\n",
"    \"High (10– 30%)\",\n",
"    \"Severe (>30%)\"\n",
"]\n",
"\n",
"\n",
"def _rupture_class_distribution(frame, flag_col):\n",
"    \"\"\"Summarise one rupture flag per (account, ISIN) pair and bin the ratios.\n",
"\n",
"    Returns a tuple ``(per_pair, class_share)``: the per-pair summary with the\n",
"    columns ``n_obs``, ``n_ruptures``, ``rupture_ratio`` and ``rupture_class``,\n",
"    and the percentage of pairs falling in each class (rounded to one decimal,\n",
"    ordered by class).\n",
"    \"\"\"\n",
"    per_pair = (\n",
"        frame\n",
"        .groupby([\"Registrar Account - ID\", \"Product - Isin\"])\n",
"        .agg(\n",
"            n_obs=(flag_col, \"count\"),\n",
"            n_ruptures=(flag_col, \"sum\")\n",
"        )\n",
"        .reset_index()\n",
"    )\n",
"    per_pair[\"rupture_ratio\"] = per_pair[\"n_ruptures\"] / per_pair[\"n_obs\"]\n",
"    per_pair[\"rupture_class\"] = pd.cut(\n",
"        per_pair[\"rupture_ratio\"],\n",
"        bins=bins,\n",
"        labels=labels,\n",
"        include_lowest=True\n",
"    )\n",
"    class_share = (\n",
"        per_pair[\"rupture_class\"]\n",
"        .value_counts(normalize=True)\n",
"        .sort_index()\n",
"        * 100\n",
"    ).round(1)\n",
"    return per_pair, class_share\n",
"\n",
"\n",
"def _donut_trace(class_share, name, x_domain):\n",
"    \"\"\"Build one donut trace occupying the horizontal band ``x_domain``.\"\"\"\n",
"    return go.Pie(\n",
"        labels=class_share.index,\n",
"        values=class_share.values,\n",
"        hole=0.45,\n",
"        name=name,\n",
"        domain=dict(x=x_domain),\n",
"        textinfo=\"percent\",\n",
"        hoverinfo=\"label+percent\"\n",
"    )\n",
"\n",
"\n",
"# ============================================================\n",
"# 1. Define ruptures using a FIXED epsilon\n",
"# ============================================================\n",
"# assign() returns a new frame, so the flags are written to a fresh copy of df.\n",
"df = df.assign(\n",
"    rupture_before=df[\"gap_before\"].abs() > GAP_EPS,\n",
"    rupture_after=df[\"gap_after\"].abs() > GAP_EPS,\n",
")\n",
"\n",
"# ============================================================\n",
"# 2. Rupture ratios, classes and class distribution (%),\n",
"#    BEFORE and AFTER repair\n",
"# ============================================================\n",
"rupture_summary_before, dist_before = _rupture_class_distribution(df, \"rupture_before\")\n",
"rupture_summary_after, dist_after = _rupture_class_distribution(df, \"rupture_after\")\n",
"\n",
"# ============================================================\n",
"# 3. Donut chart: BEFORE vs AFTER (fixed epsilon)\n",
"# ============================================================\n",
"fig = go.Figure()\n",
"fig.add_trace(_donut_trace(dist_before, \"Before repair\", [0.0, 0.48]))\n",
"fig.add_trace(_donut_trace(dist_after, \"After repair\", [0.52, 1.0]))\n",
"\n",
"fig.update_layout(\n",
"    title=\"Distribution of AUM– flow rupture intensity before vs after repair (fixed ε)\",\n",
"    annotations=[\n",
"        dict(text=\"Before repair\", x=0.24, y=0.5, showarrow=False),\n",
"        dict(text=\"After repair\", x=0.76, y=0.5, showarrow=False),\n",
"    ],\n",
"    legend=dict(\n",
"        orientation=\"h\",\n",
"        yanchor=\"top\",\n",
"        y=-0.15,\n",
"        xanchor=\"center\",\n",
"        x=0.5\n",
"    ),\n",
"    legend_title_text=\"Rupture ratio\"\n",
")\n",
"\n",
"fig.show()\n"
]
},
{
"cell_type": "code",
"execution_count": 31,
"id": "70cf0a99-bd19-41a9-9574-88647fde09ca",
"metadata": {},
"outputs": [
{
"ename": "KeyError",
"evalue": "\"['Quantity - AUM', 'corrected_aum', 'Quantity - NetFlows', 'expected_stock', 'expected_stock_corr', 'gap_before', 'gap_after'] not in index\"",
"output_type": "error",
"traceback": [
"\u001b[31m---------------------------------------------------------------------------\u001b[39m",
"\u001b[31mKeyError\u001b[39m Traceback (most recent call last)",
"\u001b[36mCell\u001b[39m\u001b[36m \u001b[39m\u001b[32mIn[31]\u001b[39m\u001b[32m, line 10\u001b[39m\n\u001b[32m 5\u001b[39m df_final = df.copy()\n\u001b[32m 7\u001b[39m \u001b[38;5;66;03m# ------------------------------------------------------------\u001b[39;00m\n\u001b[32m 8\u001b[39m \u001b[38;5;66;03m# Core variables (before / after)\u001b[39;00m\n\u001b[32m 9\u001b[39m \u001b[38;5;66;03m# ------------------------------------------------------------\u001b[39;00m\n\u001b[32m---> \u001b[39m\u001b[32m10\u001b[39m df_final = \u001b[43mdf_final\u001b[49m\u001b[43m[\u001b[49m\u001b[43m[\u001b[49m\n\u001b[32m 11\u001b[39m \u001b[43m \u001b[49m\u001b[33;43m\"\u001b[39;49m\u001b[33;43mRegistrar Account - ID\u001b[39;49m\u001b[33;43m\"\u001b[39;49m\u001b[43m,\u001b[49m\n\u001b[32m 12\u001b[39m \u001b[43m \u001b[49m\u001b[33;43m\"\u001b[39;49m\u001b[33;43mProduct - Isin\u001b[39;49m\u001b[33;43m\"\u001b[39;49m\u001b[43m,\u001b[49m\n\u001b[32m 13\u001b[39m \u001b[43m \u001b[49m\u001b[33;43m\"\u001b[39;49m\u001b[33;43mCentralisation Date\u001b[39;49m\u001b[33;43m\"\u001b[39;49m\u001b[43m,\u001b[49m\n\u001b[32m 14\u001b[39m \u001b[43m \u001b[49m\u001b[33;43m\"\u001b[39;49m\u001b[33;43mQuantity - AUM\u001b[39;49m\u001b[33;43m\"\u001b[39;49m\u001b[43m,\u001b[49m\n\u001b[32m 15\u001b[39m \u001b[43m \u001b[49m\u001b[33;43m\"\u001b[39;49m\u001b[33;43mcorrected_aum\u001b[39;49m\u001b[33;43m\"\u001b[39;49m\u001b[43m,\u001b[49m\n\u001b[32m 16\u001b[39m \u001b[43m \u001b[49m\u001b[33;43m\"\u001b[39;49m\u001b[33;43mQuantity - NetFlows\u001b[39;49m\u001b[33;43m\"\u001b[39;49m\u001b[43m,\u001b[49m\n\u001b[32m 17\u001b[39m \u001b[43m \u001b[49m\u001b[33;43m\"\u001b[39;49m\u001b[33;43mexpected_stock\u001b[39;49m\u001b[33;43m\"\u001b[39;49m\u001b[43m,\u001b[49m\n\u001b[32m 18\u001b[39m \u001b[43m \u001b[49m\u001b[33;43m\"\u001b[39;49m\u001b[33;43mexpected_stock_corr\u001b[39;49m\u001b[33;43m\"\u001b[39;49m\u001b[43m,\u001b[49m\n\u001b[32m 19\u001b[39m \u001b[43m 
\u001b[49m\u001b[33;43m\"\u001b[39;49m\u001b[33;43mgap_before\u001b[39;49m\u001b[33;43m\"\u001b[39;49m\u001b[43m,\u001b[49m\n\u001b[32m 20\u001b[39m \u001b[43m \u001b[49m\u001b[33;43m\"\u001b[39;49m\u001b[33;43mgap_after\u001b[39;49m\u001b[33;43m\"\u001b[39;49m\u001b[43m,\u001b[49m\n\u001b[32m 21\u001b[39m \u001b[43m \u001b[49m\u001b[33;43m\"\u001b[39;49m\u001b[33;43mrepair_flag\u001b[39;49m\u001b[33;43m\"\u001b[39;49m\n\u001b[32m 22\u001b[39m \u001b[43m]\u001b[49m\u001b[43m]\u001b[49m.rename(columns={\n\u001b[32m 23\u001b[39m \u001b[33m\"\u001b[39m\u001b[33mQuantity - AUM\u001b[39m\u001b[33m\"\u001b[39m: \u001b[33m\"\u001b[39m\u001b[33maum_raw\u001b[39m\u001b[33m\"\u001b[39m,\n\u001b[32m 24\u001b[39m \u001b[33m\"\u001b[39m\u001b[33mcorrected_aum\u001b[39m\u001b[33m\"\u001b[39m: \u001b[33m\"\u001b[39m\u001b[33maum_repaired\u001b[39m\u001b[33m\"\u001b[39m,\n\u001b[32m 25\u001b[39m \u001b[33m\"\u001b[39m\u001b[33mQuantity - NetFlows\u001b[39m\u001b[33m\"\u001b[39m: \u001b[33m\"\u001b[39m\u001b[33mflows\u001b[39m\u001b[33m\"\u001b[39m,\n\u001b[32m 26\u001b[39m \u001b[33m\"\u001b[39m\u001b[33mexpected_stock\u001b[39m\u001b[33m\"\u001b[39m: \u001b[33m\"\u001b[39m\u001b[33mexpected_aum_raw\u001b[39m\u001b[33m\"\u001b[39m,\n\u001b[32m 27\u001b[39m \u001b[33m\"\u001b[39m\u001b[33mexpected_stock_corr\u001b[39m\u001b[33m\"\u001b[39m: \u001b[33m\"\u001b[39m\u001b[33mexpected_aum_repaired\u001b[39m\u001b[33m\"\u001b[39m\n\u001b[32m 28\u001b[39m })\n\u001b[32m 30\u001b[39m \u001b[38;5;66;03m# ------------------------------------------------------------\u001b[39;00m\n\u001b[32m 31\u001b[39m \u001b[38;5;66;03m# Relative gaps\u001b[39;00m\n\u001b[32m 32\u001b[39m \u001b[38;5;66;03m# ------------------------------------------------------------\u001b[39;00m\n\u001b[32m 33\u001b[39m df_final[\u001b[33m\"\u001b[39m\u001b[33mgap_rel_before\u001b[39m\u001b[33m\"\u001b[39m] = (\n\u001b[32m 34\u001b[39m df_final[\u001b[33m\"\u001b[39m\u0
"\u001b[36mFile \u001b[39m\u001b[32m/opt/python/lib/python3.13/site-packages/pandas/core/frame.py:4119\u001b[39m, in \u001b[36mDataFrame.__getitem__\u001b[39m\u001b[34m(self, key)\u001b[39m\n\u001b[32m 4117\u001b[39m \u001b[38;5;28;01mif\u001b[39;00m is_iterator(key):\n\u001b[32m 4118\u001b[39m key = \u001b[38;5;28mlist\u001b[39m(key)\n\u001b[32m-> \u001b[39m\u001b[32m4119\u001b[39m indexer = \u001b[38;5;28;43mself\u001b[39;49m\u001b[43m.\u001b[49m\u001b[43mcolumns\u001b[49m\u001b[43m.\u001b[49m\u001b[43m_get_indexer_strict\u001b[49m\u001b[43m(\u001b[49m\u001b[43mkey\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[33;43m\"\u001b[39;49m\u001b[33;43mcolumns\u001b[39;49m\u001b[33;43m\"\u001b[39;49m\u001b[43m)\u001b[49m[\u001b[32m1\u001b[39m]\n\u001b[32m 4121\u001b[39m \u001b[38;5;66;03m# take() does not accept boolean indexers\u001b[39;00m\n\u001b[32m 4122\u001b[39m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28mgetattr\u001b[39m(indexer, \u001b[33m\"\u001b[39m\u001b[33mdtype\u001b[39m\u001b[33m\"\u001b[39m, \u001b[38;5;28;01mNone\u001b[39;00m) == \u001b[38;5;28mbool\u001b[39m:\n",
"\u001b[36mFile \u001b[39m\u001b[32m/opt/python/lib/python3.13/site-packages/pandas/core/indexes/base.py:6212\u001b[39m, in \u001b[36mIndex._get_indexer_strict\u001b[39m\u001b[34m(self, key, axis_name)\u001b[39m\n\u001b[32m 6209\u001b[39m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[32m 6210\u001b[39m keyarr, indexer, new_indexer = \u001b[38;5;28mself\u001b[39m._reindex_non_unique(keyarr)\n\u001b[32m-> \u001b[39m\u001b[32m6212\u001b[39m \u001b[38;5;28;43mself\u001b[39;49m\u001b[43m.\u001b[49m\u001b[43m_raise_if_missing\u001b[49m\u001b[43m(\u001b[49m\u001b[43mkeyarr\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mindexer\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43maxis_name\u001b[49m\u001b[43m)\u001b[49m\n\u001b[32m 6214\u001b[39m keyarr = \u001b[38;5;28mself\u001b[39m.take(indexer)\n\u001b[32m 6215\u001b[39m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28misinstance\u001b[39m(key, Index):\n\u001b[32m 6216\u001b[39m \u001b[38;5;66;03m# GH 42790 - Preserve name from an Index\u001b[39;00m\n",
"\u001b[36mFile \u001b[39m\u001b[32m/opt/python/lib/python3.13/site-packages/pandas/core/indexes/base.py:6264\u001b[39m, in \u001b[36mIndex._raise_if_missing\u001b[39m\u001b[34m(self, key, indexer, axis_name)\u001b[39m\n\u001b[32m 6261\u001b[39m \u001b[38;5;28;01mraise\u001b[39;00m \u001b[38;5;167;01mKeyError\u001b[39;00m(\u001b[33mf\u001b[39m\u001b[33m\"\u001b[39m\u001b[33mNone of [\u001b[39m\u001b[38;5;132;01m{\u001b[39;00mkey\u001b[38;5;132;01m}\u001b[39;00m\u001b[33m] are in the [\u001b[39m\u001b[38;5;132;01m{\u001b[39;00maxis_name\u001b[38;5;132;01m}\u001b[39;00m\u001b[33m]\u001b[39m\u001b[33m\"\u001b[39m)\n\u001b[32m 6263\u001b[39m not_found = \u001b[38;5;28mlist\u001b[39m(ensure_index(key)[missing_mask.nonzero()[\u001b[32m0\u001b[39m]].unique())\n\u001b[32m-> \u001b[39m\u001b[32m6264\u001b[39m \u001b[38;5;28;01mraise\u001b[39;00m \u001b[38;5;167;01mKeyError\u001b[39;00m(\u001b[33mf\u001b[39m\u001b[33m\"\u001b[39m\u001b[38;5;132;01m{\u001b[39;00mnot_found\u001b[38;5;132;01m}\u001b[39;00m\u001b[33m not in index\u001b[39m\u001b[33m\"\u001b[39m)\n",
"\u001b[31mKeyError\u001b[39m: \"['Quantity - AUM', 'corrected_aum', 'Quantity - NetFlows', 'expected_stock', 'expected_stock_corr', 'gap_before', 'gap_after'] not in index\""
]
}
],
"source": [
"# ============================================================\n",
"# FINAL DATASETS AFTER REPAIR\n",
"# ============================================================\n",
"\n",
"df_final = df.copy()\n",
"\n",
"# ------------------------------------------------------------\n",
"# Core variables (before / after)\n",
"# ------------------------------------------------------------\n",
"df_final = df_final[[\n",
" \"Registrar Account - ID\",\n",
" \"Product - Isin\",\n",
" \"Centralisation Date\",\n",
" \"Quantity - AUM\",\n",
" \"corrected_aum\",\n",
" \"Quantity - NetFlows\",\n",
" \"expected_stock\",\n",
" \"expected_stock_corr\",\n",
" \"gap_before\",\n",
" \"gap_after\",\n",
" \"repair_flag\"\n",
"]].rename(columns={\n",
" \"Quantity - AUM\": \"aum_raw\",\n",
" \"corrected_aum\": \"aum_repaired\",\n",
" \"Quantity - NetFlows\": \"flows\",\n",
" \"expected_stock\": \"expected_aum_raw\",\n",
" \"expected_stock_corr\": \"expected_aum_repaired\"\n",
"})\n",
"\n",
"# ------------------------------------------------------------\n",
"# Relative gaps\n",
"# ------------------------------------------------------------\n",
"df_final[\"gap_rel_before\"] = (\n",
" df_final[\"gap_before\"].abs() /\n",
" df_final[\"expected_aum_raw\"].abs().clip(lower=1)\n",
")\n",
"\n",
"df_final[\"gap_rel_after\"] = (\n",
" df_final[\"gap_after\"].abs() /\n",
" df_final[\"expected_aum_repaired\"].abs().clip(lower=1)\n",
")\n",
"df_final.to_csv('df_repaired.csv')"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "befb2962-73fb-4cb8-b86e-3218ec103204",
"metadata": {},
"outputs": [],
"source": [
"# ============================================================\n",
"# TYPE 3 REPAIR — TEMPORARY RESET TO ZERO (ONE BLOCK)\n",
"# ============================================================\n",
"\n",
"df_type3 = df_repaired.copy()\n",
"df_type3 = df_type3.sort_values(\n",
" [\"Registrar Account - ID\", \"Product - Isin\", \"Centralisation Date\"]\n",
")\n",
"\n",
"# Create lead/lag variables\n",
"df_type3[\"aum_prev\"] = df_type3.groupby(\n",
" [\"Registrar Account - ID\", \"Product - Isin\"]\n",
")[\"Quantity - AUM\"].shift(1)\n",
"\n",
"df_type3[\"aum_next\"] = df_type3.groupby(\n",
" [\"Registrar Account - ID\", \"Product - Isin\"]\n",
")[\"Quantity - AUM\"].shift(-1)\n",
"\n",
"df_type3[\"flow_prev\"] = df_type3.groupby(\n",
" [\"Registrar Account - ID\", \"Product - Isin\"]\n",
")[\"Quantity - NetFlows\"].shift(1)\n",
"\n",
"df_type3[\"flow_next\"] = df_type3.groupby(\n",
" [\"Registrar Account - ID\", \"Product - Isin\"]\n",
")[\"Quantity - NetFlows\"].shift(-1)\n",
"\n",
"# ------------------------------------------------------------\n",
"# Detection of temporary reset\n",
"# ------------------------------------------------------------\n",
"df_type3[\"type3_flag\"] = (\n",
" (df_type3[\"Quantity - AUM\"] == 0)\n",
" & (df_type3[\"aum_prev\"] > 0)\n",
" & (df_type3[\"aum_next\"] == df_type3[\"aum_prev\"])\n",
" & (df_type3[\"flow_prev\"].fillna(0) == 0)\n",
" & (df_type3[\"Quantity - NetFlows\"] == 0)\n",
" & (df_type3[\"flow_next\"].fillna(0) == 0)\n",
")\n",
"\n",
"# ------------------------------------------------------------\n",
"# Repair: smooth the glitch (replace 0 by previous stock)\n",
"# ------------------------------------------------------------\n",
"df_type3.loc[df_type3[\"type3_flag\"], \"Quantity - AUM\"] = (\n",
" df_type3.loc[df_type3[\"type3_flag\"], \"aum_prev\"]\n",
")\n",
"\n",
"# ------------------------------------------------------------\n",
"# Recompute temporal chain AFTER repair\n",
"# ------------------------------------------------------------\n",
"df_type3[\"prev_stock\"] = df_type3.groupby(\n",
" [\"Registrar Account - ID\", \"Product - Isin\"]\n",
")[\"Quantity - AUM\"].shift(1)\n",
"\n",
"df_type3[\"prev_flows\"] = df_type3.groupby(\n",
" [\"Registrar Account - ID\", \"Product - Isin\"]\n",
")[\"Quantity - NetFlows\"].shift(1).fillna(0)\n",
"\n",
"df_type3[\"expected_stock\"] = (\n",
" df_type3[\"prev_stock\"] + df_type3[\"prev_flows\"]\n",
")\n",
"\n",
"df_type3[\"gap\"] = df_type3[\"Quantity - AUM\"] - df_type3[\"expected_stock\"]\n",
"df_type3[\"gap_abs\"] = df_type3[\"gap\"].abs()\n",
"df_type3[\"gap_rel\"] = (\n",
" df_type3[\"gap_abs\"] /\n",
" df_type3[\"expected_stock\"].abs().clip(lower=1)\n",
")\n",
"\n",
"df_type3[\"rupture_flag\"] = (\n",
" df_type3[\"prev_stock\"].notna()\n",
" & (df_type3[\"gap_abs\"] > TAU_ABS)\n",
" & (df_type3[\"gap_rel\"] > TAU_REL)\n",
")\n",
"\n",
"# ------------------------------------------------------------\n",
"# Diagnostic output\n",
"# ------------------------------------------------------------\n",
"n_type3 = df_type3[\"type3_flag\"].sum()\n",
"print(f\"Temporary reset glitches repaired (Type 3): {n_type3}\")\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "1fc44ed4-829f-4a8a-985a-31350bdbdf6d",
"metadata": {},
"outputs": [],
"source": [
"import numpy as np\n",
"import pandas as pd\n",
"\n",
"# ------------------------------------------------------------\n",
"# 1. Sélection des ISIN avec exactement 1 rupture\n",
"# ------------------------------------------------------------\n",
"one_rupture_isin = rupture_isin_summary[\n",
" rupture_isin_summary[\"n_ruptures\"] == 1\n",
"][[\"Registrar Account - ID\", \"Product - Isin\"]].head(100)\n",
"\n",
"results = []\n",
"\n",
"# ------------------------------------------------------------\n",
"# 2. Boucle de correction test\n",
"# ------------------------------------------------------------\n",
"for _, row in one_rupture_isin.iterrows():\n",
" acc = row[\"Registrar Account - ID\"]\n",
" isin = row[\"Product - Isin\"]\n",
"\n",
" sub = df[\n",
" (df[\"Registrar Account - ID\"] == acc) &\n",
" (df[\"Product - Isin\"] == isin)\n",
" ].sort_values(\"Centralisation Date\").copy()\n",
"\n",
" # Localiser la rupture\n",
" rupture_idx = sub.index[sub[\"rupture_flag\"]]\n",
"\n",
" if sub.index.get_loc(rupture_idx[0]) > 1:\n",
" #print(sub[[\"Centralisation Date\", \"Quantity - AUM\", \"expected_stock\", \"gap\", \"rupture_flag\"]].head(100))\n",
" continue\n",
"\n",
" # Vérifier si la rupture est à la première date\n",
" first_idx = sub.index[0]\n",
" if rupture_idx[0] != first_idx:\n",
" continue\n",
"\n",
" # ----- Réparation : décaler expected_stock -----\n",
" sub[\"expected_stock_fixed\"] = sub[\"expected_stock\"].shift(-1)\n",
"\n",
" # Recalcul des gaps\n",
" sub[\"gap_fixed\"] = sub[\"Quantity - AUM\"] - sub[\"expected_stock_fixed\"]\n",
" sub[\"gap_abs_fixed\"] = sub[\"gap_fixed\"].abs()\n",
" sub[\"gap_rel_fixed\"] = sub[\"gap_abs_fixed\"] / sub[\"expected_stock_fixed\"].abs().clip(lower=1)\n",
"\n",
" # Recalcul rupture\n",
" sub[\"rupture_fixed\"] = (\n",
" sub[\"expected_stock_fixed\"].notna()\n",
" & (sub[\"gap_abs_fixed\"] > TAU_ABS)\n",
" & (sub[\"gap_rel_fixed\"] > TAU_REL)\n",
" )\n",
"\n",
" results.append({\n",
" \"Registrar Account - ID\": acc,\n",
" \"Product - Isin\": isin,\n",
" \"ruptures_before\": sub[\"rupture_flag\"].sum(),\n",
" \"ruptures_after\": sub[\"rupture_fixed\"].sum()\n",
" })\n",
"\n",
"# ------------------------------------------------------------\n",
"# 3. Résultats agrégés\n",
"# ------------------------------------------------------------\n",
"repair_test = pd.DataFrame(results)\n",
"\n",
"summary = repair_test.groupby(\n",
" [\"ruptures_before\", \"ruptures_after\"]\n",
").size().reset_index(name=\"count\")\n",
"\n",
"repair_test, summary\n"
]
},
{
"cell_type": "code",
"execution_count": 50,
"id": "d85728ca-55ba-4266-b881-23536eee4ba3",
"metadata": {},
"outputs": [
{
"ename": "KeyError",
"evalue": "\"['corrected_aum'] not in index\"",
"output_type": "error",
"traceback": [
"\u001b[31m---------------------------------------------------------------------------\u001b[39m",
"\u001b[31mKeyError\u001b[39m Traceback (most recent call last)",
"\u001b[36mCell\u001b[39m\u001b[36m \u001b[39m\u001b[32mIn[50]\u001b[39m\u001b[32m, line 16\u001b[39m\n\u001b[32m 10\u001b[39m stocks_repaired[\u001b[33m\"\u001b[39m\u001b[33mCentralisation Date\u001b[39m\u001b[33m\"\u001b[39m] = pd.to_datetime(\n\u001b[32m 11\u001b[39m stocks_repaired[\u001b[33m\"\u001b[39m\u001b[33mCentralisation Date\u001b[39m\u001b[33m\"\u001b[39m]\n\u001b[32m 12\u001b[39m )\n\u001b[32m 14\u001b[39m \u001b[38;5;66;03m# 2. Build repair map\u001b[39;00m\n\u001b[32m 15\u001b[39m repair_map = (\n\u001b[32m---> \u001b[39m\u001b[32m16\u001b[39m \u001b[43mdf\u001b[49m\u001b[43m[\u001b[49m\u001b[43m[\u001b[49m\n\u001b[32m 17\u001b[39m \u001b[43m \u001b[49m\u001b[33;43m\"\u001b[39;49m\u001b[33;43mRegistrar Account - ID\u001b[39;49m\u001b[33;43m\"\u001b[39;49m\u001b[43m,\u001b[49m\n\u001b[32m 18\u001b[39m \u001b[43m \u001b[49m\u001b[33;43m\"\u001b[39;49m\u001b[33;43mProduct - Isin\u001b[39;49m\u001b[33;43m\"\u001b[39;49m\u001b[43m,\u001b[49m\n\u001b[32m 19\u001b[39m \u001b[43m \u001b[49m\u001b[33;43m\"\u001b[39;49m\u001b[33;43mCentralisation Date\u001b[39;49m\u001b[33;43m\"\u001b[39;49m\u001b[43m,\u001b[49m\n\u001b[32m 20\u001b[39m \u001b[43m \u001b[49m\u001b[33;43m\"\u001b[39;49m\u001b[33;43mcorrected_aum\u001b[39;49m\u001b[33;43m\"\u001b[39;49m\u001b[43m,\u001b[49m\n\u001b[32m 21\u001b[39m \u001b[43m \u001b[49m\u001b[33;43m\"\u001b[39;49m\u001b[33;43mrepair_flag\u001b[39;49m\u001b[33;43m\"\u001b[39;49m\n\u001b[32m 22\u001b[39m \u001b[43m \u001b[49m\u001b[43m]\u001b[49m\u001b[43m]\u001b[49m\n\u001b[32m 23\u001b[39m .rename(columns={\u001b[33m\"\u001b[39m\u001b[33mcorrected_aum\u001b[39m\u001b[33m\"\u001b[39m: \u001b[33m\"\u001b[39m\u001b[33mQuantity - AUM repaired\u001b[39m\u001b[33m\"\u001b[39m})\n\u001b[32m 24\u001b[39m )\n\u001b[32m 26\u001b[39m \u001b[38;5;66;03m# 3. 
Merge repaired quantities\u001b[39;00m\n\u001b[32m 27\u001b[39m stocks_repaired = stocks_repaired.merge(\n\u001b[32m 28\u001b[39m repair_map,\n\u001b[32m 29\u001b[39m on=[\u001b[33m\"\u001b[39m\u001b[33mRegistrar Account - ID\u001b[39m\u001b[33m\"\u001b[39m, \u001b[33m\"\u001b[39m\u001b[33mProduct - Isin\u001b[39m\u001b[33m\"\u001b[39m, \u001b[33m\"\u001b[39m\u001b[33mCentralisation Date\u001b[39m\u001b[33m\"\u001b[39m],\n\u001b[32m 30\u001b[39m how=\u001b[33m\"\u001b[39m\u001b[33mleft\u001b[39m\u001b[33m\"\u001b[39m\n\u001b[32m 31\u001b[39m )\n",
"\u001b[36mFile \u001b[39m\u001b[32m/opt/python/lib/python3.13/site-packages/pandas/core/frame.py:4119\u001b[39m, in \u001b[36mDataFrame.__getitem__\u001b[39m\u001b[34m(self, key)\u001b[39m\n\u001b[32m 4117\u001b[39m \u001b[38;5;28;01mif\u001b[39;00m is_iterator(key):\n\u001b[32m 4118\u001b[39m key = \u001b[38;5;28mlist\u001b[39m(key)\n\u001b[32m-> \u001b[39m\u001b[32m4119\u001b[39m indexer = \u001b[38;5;28;43mself\u001b[39;49m\u001b[43m.\u001b[49m\u001b[43mcolumns\u001b[49m\u001b[43m.\u001b[49m\u001b[43m_get_indexer_strict\u001b[49m\u001b[43m(\u001b[49m\u001b[43mkey\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[33;43m\"\u001b[39;49m\u001b[33;43mcolumns\u001b[39;49m\u001b[33;43m\"\u001b[39;49m\u001b[43m)\u001b[49m[\u001b[32m1\u001b[39m]\n\u001b[32m 4121\u001b[39m \u001b[38;5;66;03m# take() does not accept boolean indexers\u001b[39;00m\n\u001b[32m 4122\u001b[39m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28mgetattr\u001b[39m(indexer, \u001b[33m\"\u001b[39m\u001b[33mdtype\u001b[39m\u001b[33m\"\u001b[39m, \u001b[38;5;28;01mNone\u001b[39;00m) == \u001b[38;5;28mbool\u001b[39m:\n",
"\u001b[36mFile \u001b[39m\u001b[32m/opt/python/lib/python3.13/site-packages/pandas/core/indexes/base.py:6212\u001b[39m, in \u001b[36mIndex._get_indexer_strict\u001b[39m\u001b[34m(self, key, axis_name)\u001b[39m\n\u001b[32m 6209\u001b[39m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[32m 6210\u001b[39m keyarr, indexer, new_indexer = \u001b[38;5;28mself\u001b[39m._reindex_non_unique(keyarr)\n\u001b[32m-> \u001b[39m\u001b[32m6212\u001b[39m \u001b[38;5;28;43mself\u001b[39;49m\u001b[43m.\u001b[49m\u001b[43m_raise_if_missing\u001b[49m\u001b[43m(\u001b[49m\u001b[43mkeyarr\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mindexer\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43maxis_name\u001b[49m\u001b[43m)\u001b[49m\n\u001b[32m 6214\u001b[39m keyarr = \u001b[38;5;28mself\u001b[39m.take(indexer)\n\u001b[32m 6215\u001b[39m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28misinstance\u001b[39m(key, Index):\n\u001b[32m 6216\u001b[39m \u001b[38;5;66;03m# GH 42790 - Preserve name from an Index\u001b[39;00m\n",
"\u001b[36mFile \u001b[39m\u001b[32m/opt/python/lib/python3.13/site-packages/pandas/core/indexes/base.py:6264\u001b[39m, in \u001b[36mIndex._raise_if_missing\u001b[39m\u001b[34m(self, key, indexer, axis_name)\u001b[39m\n\u001b[32m 6261\u001b[39m \u001b[38;5;28;01mraise\u001b[39;00m \u001b[38;5;167;01mKeyError\u001b[39;00m(\u001b[33mf\u001b[39m\u001b[33m\"\u001b[39m\u001b[33mNone of [\u001b[39m\u001b[38;5;132;01m{\u001b[39;00mkey\u001b[38;5;132;01m}\u001b[39;00m\u001b[33m] are in the [\u001b[39m\u001b[38;5;132;01m{\u001b[39;00maxis_name\u001b[38;5;132;01m}\u001b[39;00m\u001b[33m]\u001b[39m\u001b[33m\"\u001b[39m)\n\u001b[32m 6263\u001b[39m not_found = \u001b[38;5;28mlist\u001b[39m(ensure_index(key)[missing_mask.nonzero()[\u001b[32m0\u001b[39m]].unique())\n\u001b[32m-> \u001b[39m\u001b[32m6264\u001b[39m \u001b[38;5;28;01mraise\u001b[39;00m \u001b[38;5;167;01mKeyError\u001b[39;00m(\u001b[33mf\u001b[39m\u001b[33m\"\u001b[39m\u001b[38;5;132;01m{\u001b[39;00mnot_found\u001b[38;5;132;01m}\u001b[39;00m\u001b[33m not in index\u001b[39m\u001b[33m\"\u001b[39m)\n",
"\u001b[31mKeyError\u001b[39m: \"['corrected_aum'] not in index\""
]
}
],
"source": [
"import pandas as pd\n",
"import numpy as np\n",
"\n",
"# ============================================================\n",
"# Rebuild STOCKS dataset using repaired AUM quantities\n",
"# ============================================================\n",
"\n",
"# 1. Copy original stocks\n",
"stocks_repaired = stocks.copy()\n",
"stocks_repaired[\"Centralisation Date\"] = pd.to_datetime(\n",
" stocks_repaired[\"Centralisation Date\"]\n",
")\n",
"\n",
"# 2. Build repair map\n",
"repair_map = (\n",
" df[[\n",
" \"Registrar Account - ID\",\n",
" \"Product - Isin\",\n",
" \"Centralisation Date\",\n",
" \"corrected_aum\",\n",
" \"repair_flag\"\n",
" ]]\n",
" .rename(columns={\"corrected_aum\": \"Quantity - AUM repaired\"})\n",
")\n",
"\n",
"# 3. Merge repaired quantities\n",
"stocks_repaired = stocks_repaired.merge(\n",
" repair_map,\n",
" on=[\"Registrar Account - ID\", \"Product - Isin\", \"Centralisation Date\"],\n",
" how=\"left\"\n",
")\n",
"\n",
"# 4. Store original quantity\n",
"stocks_repaired[\"Quantity - AUM original\"] = stocks_repaired[\"Quantity - AUM\"]\n",
"\n",
"# 5. Replace Quantity - AUM where repaired\n",
"stocks_repaired[\"Quantity - AUM\"] = np.where(\n",
" stocks_repaired[\"repair_flag\"] == True,\n",
" stocks_repaired[\"Quantity - AUM repaired\"],\n",
" stocks_repaired[\"Quantity - AUM\"]\n",
")\n",
"\n",
"# 6. Recompute monetary values (unit value unchanged)\n",
"stocks_repaired[\"nav_ccy\"] = (\n",
" stocks_repaired[\"Value - AUM CCY\"] /\n",
" stocks_repaired[\"Quantity - AUM original\"]\n",
")\n",
"\n",
"stocks_repaired[\"nav_eur\"] = (\n",
" stocks_repaired[\"Value - AUM €\"] /\n",
" stocks_repaired[\"Quantity - AUM original\"]\n",
")\n",
"\n",
"stocks_repaired[\"Value - AUM CCY\"] = (\n",
" stocks_repaired[\"Quantity - AUM\"] *\n",
" stocks_repaired[\"nav_ccy\"]\n",
")\n",
"\n",
"stocks_repaired[\"Value - AUM €\"] = (\n",
" stocks_repaired[\"Quantity - AUM\"] *\n",
" stocks_repaired[\"nav_eur\"]\n",
")\n",
"\n",
"# 7. Cleanup helper columns\n",
"stocks_repaired = stocks_repaired.drop(\n",
" columns=[\n",
" \"Quantity - AUM repaired\",\n",
" \"Quantity - AUM original\",\n",
" \"nav_ccy\",\n",
" \"nav_eur\"\n",
" ]\n",
")\n",
"\n",
"# ============================================================\n",
"# Sanity checks (CORRECT WAY)\n",
"# ============================================================\n",
"\n",
"# Share of observations repaired\n",
"repair_share = stocks_repaired[\"repair_flag\"].mean()\n",
"\n",
"# Ensure only repaired points were modified\n",
"n_modified = stocks_repaired[\"repair_flag\"].sum()\n",
"\n",
"print(f\"Share of repaired observations: {repair_share:.4%}\")\n",
"print(f\"Number of repaired rows: {n_modified:,}\")\n",
"\n",
"stocks_repaired.to_csv('AUM_repaired.csv')"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "5f262605-49e8-4304-b11e-38c8bcfc6e3f",
"metadata": {},
"outputs": [],
"source": [
"print(stocks[\"Registrar Account - ID\"].nunique())\n",
"print(df[\"Registrar Account - ID\"].nunique())"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "37e9b599-aa51-4e03-b23c-2dd24e77fe38",
"metadata": {},
"outputs": [],
"source": [
"import pandas as pd\n",
"\n",
"df = pd.read_csv(\"AUM_repaired.csv\")\n",
"\n",
"print(df.columns)"
]
},
{
"cell_type": "code",
"execution_count": 20,
"id": "5cfb4526-7435-4e4a-ae48-0a8d40e39d81",
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"/tmp/ipykernel_1311/55327206.py:8: DtypeWarning:\n",
"\n",
"Columns (1,2,3,4) have mixed types. Specify dtype option on import or set low_memory=False.\n",
"\n",
"/tmp/ipykernel_1311/55327206.py:9: DtypeWarning:\n",
"\n",
"Columns (2,3,4,5) have mixed types. Specify dtype option on import or set low_memory=False.\n",
"\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"Merged dataset size: (9033269, 6)\n",
"\n",
"NUMBER OF MODIFIED OBSERVATIONS: 2263602\n",
"Share modified: 25.06 %\n",
"\n",
"NEGATIVE AUM\n",
"Before repair: 34374\n",
"After repair : 36320\n",
"\n",
"RAW AUM DISTRIBUTION\n",
"count 9.033269e+06\n",
"mean 9.106935e+03\n",
"std 1.915018e+05\n",
"min -9.918641e+06\n",
"25% 0.000000e+00\n",
"50% 0.000000e+00\n",
"75% 3.091340e+02\n",
"max 4.256300e+07\n",
"Name: Quantity - AUM_raw, dtype: float64\n",
"\n",
"REPAIRED AUM DISTRIBUTION\n",
"count 9.033269e+06\n",
"mean 9.104329e+03\n",
"std 1.914988e+05\n",
"min -9.918641e+06\n",
"25% 0.000000e+00\n",
"50% 0.000000e+00\n",
"75% 3.088430e+02\n",
"max 4.256300e+07\n",
"Name: Quantity - AUM_repaired, dtype: float64\n",
"\n",
"TOTAL AUM\n",
"Raw total : 82265397351.45718\n",
"Repaired total : 82241848877.5126\n",
"\n",
"TOP 20 AUM CHANGES\n",
" Registrar Account - ID Product - Isin Centralisation Date \\\n",
"8532368 OFF DISTRIBUTION LU0992627611 2021-12-31 \n",
"8532369 OFF DISTRIBUTION LU0992627611 2021-12-31 \n",
"8532370 OFF DISTRIBUTION LU0992627611 2021-12-31 \n",
"8477988 OFF DISTRIBUTION LU0992627611 2021-12-31 \n",
"8477987 OFF DISTRIBUTION LU0992627611 2021-12-31 \n",
"8477986 OFF DISTRIBUTION LU0992627611 2021-12-31 \n",
"8477989 OFF DISTRIBUTION LU0992627611 2021-12-31 \n",
"8532371 OFF DISTRIBUTION LU0992627611 2021-12-31 \n",
"8477994 OFF DISTRIBUTION LU0992627611 2021-12-31 \n",
"8477996 OFF DISTRIBUTION LU0992627611 2021-12-31 \n",
"8477997 OFF DISTRIBUTION LU0992627611 2021-12-31 \n",
"8928641 OFF DISTRIBUTION LU0992627611 2021-12-31 \n",
"8928642 OFF DISTRIBUTION LU0992627611 2021-12-31 \n",
"8928643 OFF DISTRIBUTION LU0992627611 2021-12-31 \n",
"8928644 OFF DISTRIBUTION LU0992627611 2021-12-31 \n",
"8477995 OFF DISTRIBUTION LU0992627611 2021-12-31 \n",
"8532359 OFF DISTRIBUTION LU0992627611 2021-11-30 \n",
"8713983 OFF DISTRIBUTION LU0992627611 2021-11-30 \n",
"8713984 OFF DISTRIBUTION LU0992627611 2021-11-30 \n",
"8532357 OFF DISTRIBUTION LU0992627611 2021-11-30 \n",
"\n",
" Quantity - AUM_raw Quantity - AUM_repaired aum_diff \n",
"8532368 41251.971 5298781.613 5257529.642 \n",
"8532369 41251.971 5298781.613 5257529.642 \n",
"8532370 41251.971 5298781.613 5257529.642 \n",
"8477988 5298781.613 41251.971 -5257529.642 \n",
"8477987 5298781.613 41251.971 -5257529.642 \n",
"8477986 5298781.613 41251.971 -5257529.642 \n",
"8477989 5298781.613 41251.971 -5257529.642 \n",
"8532371 41251.971 5298781.613 5257529.642 \n",
"8477994 5298781.613 128141.894 -5170639.719 \n",
"8477996 5298781.613 128141.894 -5170639.719 \n",
"8477997 5298781.613 128141.894 -5170639.719 \n",
"8928641 128141.894 5298781.613 5170639.719 \n",
"8928642 128141.894 5298781.613 5170639.719 \n",
"8928643 128141.894 5298781.613 5170639.719 \n",
"8928644 128141.894 5298781.613 5170639.719 \n",
"8477995 5298781.613 128141.894 -5170639.719 \n",
"8532359 41251.971 5059704.980 5018453.009 \n",
"8713983 5059704.980 41251.971 -5018453.009 \n",
"8713984 5059704.980 41251.971 -5018453.009 \n",
"8532357 41251.971 5059704.980 5018453.009 \n",
"\n",
"ISIN WITH MOST MODIFICATIONS\n",
"Product - Isin\n",
"LU1623762769 0.535539\n",
"LU2490324410 0.525588\n",
"FR0013516044 0.524862\n",
"LU2931971050 0.500000\n",
"LU2931971217 0.500000\n",
"FR001400TVB3 0.500000\n",
"FR001400TU23 0.500000\n",
"FR00140139F6 0.500000\n",
"FR001400TVD9 0.500000\n",
"LU2931971134 0.500000\n",
"Name: aum_diff, dtype: float64\n",
"\n",
"REPAIR FLAG ERRORS: 2260454\n",
"\n",
"==============================\n",
"COMPARISON COMPLETED\n",
"==============================\n"
]
}
],
"source": [
"import pandas as pd\n",
"import numpy as np\n",
"\n",
"# ============================================================\n",
"# LOAD DATA\n",
"# ============================================================\n",
"\n",
"aum_raw = pd.read_csv(\"stocks.csv\") # fichier original\n",
"aum_rep = pd.read_csv(\"AUM_repaired.csv\") # fichier réparé\n",
"\n",
"aum_raw[\"Centralisation Date\"] = pd.to_datetime(aum_raw[\"Centralisation Date\"])\n",
"aum_rep[\"Centralisation Date\"] = pd.to_datetime(aum_rep[\"Centralisation Date\"])\n",
"\n",
"\n",
"# ============================================================\n",
"# KEEP SAME KEYS\n",
"# ============================================================\n",
"\n",
"keys = [\n",
" \"Registrar Account - ID\",\n",
" \"Product - Isin\",\n",
" \"Centralisation Date\"\n",
"]\n",
"\n",
"aum_raw = aum_raw[keys + [\"Quantity - AUM\"]]\n",
"aum_rep = aum_rep[keys + [\"Quantity - AUM\", \"repair_flag\"]]\n",
"\n",
"\n",
"# ============================================================\n",
"# MERGE DATASETS\n",
"# ============================================================\n",
"\n",
"df = aum_raw.merge(\n",
" aum_rep,\n",
" on=keys,\n",
" how=\"inner\",\n",
" suffixes=(\"_raw\", \"_repaired\")\n",
")\n",
"\n",
"print(\"Merged dataset size:\", df.shape)\n",
"\n",
"\n",
"# ============================================================\n",
"# 1. HOW MANY VALUES CHANGED\n",
"# ============================================================\n",
"\n",
"df[\"aum_diff\"] = df[\"Quantity - AUM_repaired\"] - df[\"Quantity - AUM_raw\"]\n",
"\n",
"n_changed = (df[\"aum_diff\"] != 0).sum()\n",
"\n",
"print(\"\\nNUMBER OF MODIFIED OBSERVATIONS:\", n_changed)\n",
"print(\"Share modified:\", round(n_changed / len(df) * 100, 2), \"%\")\n",
"\n",
"\n",
"# ============================================================\n",
"# 2. NEGATIVE AUM BEFORE / AFTER\n",
"# ============================================================\n",
"\n",
"neg_before = (df[\"Quantity - AUM_raw\"] < 0).sum()\n",
"neg_after = (df[\"Quantity - AUM_repaired\"] < 0).sum()\n",
"\n",
"print(\"\\nNEGATIVE AUM\")\n",
"print(\"Before repair:\", neg_before)\n",
"print(\"After repair :\", neg_after)\n",
"\n",
"\n",
"# ============================================================\n",
"# 3. DISTRIBUTION COMPARISON\n",
"# ============================================================\n",
"\n",
"print(\"\\nRAW AUM DISTRIBUTION\")\n",
"print(df[\"Quantity - AUM_raw\"].describe())\n",
"\n",
"print(\"\\nREPAIRED AUM DISTRIBUTION\")\n",
"print(df[\"Quantity - AUM_repaired\"].describe())\n",
"\n",
"\n",
"# ============================================================\n",
"# 4. TOTAL AUM COMPARISON\n",
"# ============================================================\n",
"\n",
"print(\"\\nTOTAL AUM\")\n",
"\n",
"print(\"Raw total :\", df[\"Quantity - AUM_raw\"].sum())\n",
"print(\"Repaired total :\", df[\"Quantity - AUM_repaired\"].sum())\n",
"\n",
"\n",
"# ============================================================\n",
"# 5. LARGEST MODIFICATIONS\n",
"# ============================================================\n",
"\n",
"largest_changes = df.sort_values(\n",
" \"aum_diff\",\n",
" key=lambda x: x.abs(),\n",
" ascending=False\n",
").head(20)\n",
"\n",
"print(\"\\nTOP 20 AUM CHANGES\")\n",
"\n",
"print(\n",
" largest_changes[\n",
" [\n",
" \"Registrar Account - ID\",\n",
" \"Product - Isin\",\n",
" \"Centralisation Date\",\n",
" \"Quantity - AUM_raw\",\n",
" \"Quantity - AUM_repaired\",\n",
" \"aum_diff\"\n",
" ]\n",
" ]\n",
")\n",
"\n",
"\n",
"# ============================================================\n",
"# 6. WHICH ISIN WERE MOST MODIFIED\n",
"# ============================================================\n",
"\n",
"isin_changes = (\n",
" df.groupby(\"Product - Isin\")[\"aum_diff\"]\n",
" .apply(lambda x: (x != 0).mean())\n",
" .sort_values(ascending=False)\n",
" .head(10)\n",
")\n",
"\n",
"print(\"\\nISIN WITH MOST MODIFICATIONS\")\n",
"print(isin_changes)\n",
"\n",
"\n",
"# ============================================================\n",
"# 7. CHECK REPAIR FLAG CONSISTENCY\n",
"# ============================================================\n",
"\n",
"if \"repair_flag\" in df.columns:\n",
"\n",
" repair_flag_errors = (\n",
" (df[\"repair_flag\"] == False) &\n",
" (df[\"Quantity - AUM_raw\"] != df[\"Quantity - AUM_repaired\"])\n",
" ).sum()\n",
"\n",
" print(\"\\nREPAIR FLAG ERRORS:\", repair_flag_errors)\n",
"\n",
"\n",
"print(\"\\n==============================\")\n",
"print(\"COMPARISON COMPLETED\")\n",
"print(\"==============================\")"
]
},
{
"cell_type": "code",
"execution_count": 48,
"id": "976dd82c-5c16-44e6-aa5d-65d085714b25",
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"/tmp/ipykernel_1311/1498669893.py:8: DtypeWarning:\n",
"\n",
"Columns (2,3,4,5) have mixed types. Specify dtype option on import or set low_memory=False.\n",
"\n"
]
}
],
"source": [
"import pandas as pd\n",
"import numpy as np\n",
"import plotly.graph_objects as go\n",
"\n",
"# ============================================================\n",
"# 1. LOAD DATA\n",
"# ============================================================\n",
"aum = pd.read_csv(\"AUM_repaired.csv\")\n",
"\n",
"flows[\"Centralisation Date\"] = pd.to_datetime(flows[\"Centralisation Date\"])\n",
"aum[\"Centralisation Date\"] = pd.to_datetime(aum[\"Centralisation Date\"])"
]
},
{
"cell_type": "code",
"execution_count": 49,
"id": "66c011b5-aed1-428e-bd18-44d8d814c283",
"metadata": {},
"outputs": [
{
"data": {
"application/vnd.plotly.v1+json": {
"config": {
"plotlyServerURL": "https://plot.ly"
},
"data": [
{
"hole": 0.45,
"hoverinfo": "label+percent",
"labels": [
"Clean / quasi-clean (≤1%)",
"Moderate (1– 10%)",
"High (10– 30%)",
"Severe (>30%)"
],
"textinfo": "percent",
"type": "pie",
"values": {
"bdata": "mpmZmZlZR0BmZmZmZmY7QM3MzMzMzCpAAAAAAAAAKUA=",
"dtype": "f8"
}
}
],
"layout": {
"legend": {
"orientation": "h",
"title": {
"text": "Rupture ratio"
},
"x": 0.5,
"xanchor": "center",
"y": -0.15,
"yanchor": "top"
},
"template": {
"data": {
"bar": [
2026-02-02 00:52:00 +01:00
{
2026-03-10 18:45:51 +01:00
"error_x": {
"color": "#2a3f5f"
},
"error_y": {
"color": "#2a3f5f"
},
2026-02-02 00:52:00 +01:00
"marker": {
2026-03-10 18:45:51 +01:00
"line": {
"color": "#E5ECF6",
"width": 0.5
},
2026-02-02 00:52:00 +01:00
"pattern": {
"fillmode": "overlay",
"size": 10,
"solidity": 0.2
}
},
2026-03-10 18:45:51 +01:00
"type": "bar"
2026-02-02 00:52:00 +01:00
}
],
2026-03-10 18:45:51 +01:00
"barpolar": [
2026-02-02 00:52:00 +01:00
{
"marker": {
2026-03-10 18:45:51 +01:00
"line": {
"color": "#E5ECF6",
"width": 0.5
},
"pattern": {
"fillmode": "overlay",
"size": 10,
"solidity": 0.2
2026-02-02 00:52:00 +01:00
}
},
2026-03-10 18:45:51 +01:00
"type": "barpolar"
2026-02-02 00:52:00 +01:00
}
],
2026-03-10 18:45:51 +01:00
"carpet": [
2026-02-02 00:52:00 +01:00
{
2026-03-10 18:45:51 +01:00
"aaxis": {
"endlinecolor": "#2a3f5f",
"gridcolor": "white",
"linecolor": "white",
"minorgridcolor": "white",
"startlinecolor": "#2a3f5f"
2026-02-02 00:52:00 +01:00
},
2026-03-10 18:45:51 +01:00
"baxis": {
"endlinecolor": "#2a3f5f",
"gridcolor": "white",
"linecolor": "white",
"minorgridcolor": "white",
"startlinecolor": "#2a3f5f"
},
"type": "carpet"
2026-02-02 00:52:00 +01:00
}
],
2026-03-10 18:45:51 +01:00
"choropleth": [
2026-02-02 00:52:00 +01:00
{
2026-03-10 18:45:51 +01:00
"colorbar": {
"outlinewidth": 0,
"ticks": ""
2026-02-02 00:52:00 +01:00
},
2026-03-10 18:45:51 +01:00
"type": "choropleth"
2026-02-02 00:52:00 +01:00
}
],
2026-03-10 18:45:51 +01:00
"contour": [
2026-02-02 00:52:00 +01:00
{
2026-03-10 18:45:51 +01:00
"colorbar": {
"outlinewidth": 0,
"ticks": ""
2026-02-02 00:52:00 +01:00
},
2026-03-10 18:45:51 +01:00
"colorscale": [
[
0,
"#0d0887"
],
[
0.1111111111111111,
"#46039f"
],
[
0.2222222222222222,
"#7201a8"
],
[
0.3333333333333333,
"#9c179e"
],
[
0.4444444444444444,
"#bd3786"
],
[
0.5555555555555556,
"#d8576b"
],
[
0.6666666666666666,
"#ed7953"
],
[
0.7777777777777778,
"#fb9f3a"
],
[
0.8888888888888888,
"#fdca26"
],
[
1,
"#f0f921"
]
],
"type": "contour"
2026-02-02 00:52:00 +01:00
}
],
2026-03-10 18:45:51 +01:00
"contourcarpet": [
2026-02-02 00:52:00 +01:00
{
2026-03-10 18:45:51 +01:00
"colorbar": {
"outlinewidth": 0,
"ticks": ""
2026-02-02 00:52:00 +01:00
},
2026-03-10 18:45:51 +01:00
"type": "contourcarpet"
2026-02-02 00:52:00 +01:00
}
],
2026-03-10 18:45:51 +01:00
"heatmap": [
2026-02-02 00:52:00 +01:00
{
"colorbar": {
"outlinewidth": 0,
"ticks": ""
},
"colorscale": [
[
0,
"#0d0887"
],
[
0.1111111111111111,
"#46039f"
],
[
0.2222222222222222,
"#7201a8"
],
[
0.3333333333333333,
"#9c179e"
],
[
0.4444444444444444,
"#bd3786"
],
[
0.5555555555555556,
"#d8576b"
],
[
0.6666666666666666,
"#ed7953"
],
[
0.7777777777777778,
"#fb9f3a"
],
[
0.8888888888888888,
"#fdca26"
],
[
1,
"#f0f921"
]
],
2026-03-10 18:45:51 +01:00
"type": "heatmap"
2026-02-02 00:52:00 +01:00
}
],
2026-03-10 18:45:51 +01:00
"histogram": [
2026-02-02 00:52:00 +01:00
{
2026-03-10 18:45:51 +01:00
"marker": {
"pattern": {
"fillmode": "overlay",
"size": 10,
"solidity": 0.2
2026-02-02 00:52:00 +01:00
}
},
2026-03-10 18:45:51 +01:00
"type": "histogram"
2026-02-02 00:52:00 +01:00
}
2026-03-10 18:45:51 +01:00
],
"histogram2d": [
{
"colorbar": {
"outlinewidth": 0,
"ticks": ""
},
"colorscale": [
[
0,
"#0d0887"
],
[
0.1111111111111111,
"#46039f"
],
[
0.2222222222222222,
"#7201a8"
],
[
0.3333333333333333,
"#9c179e"
],
[
0.4444444444444444,
"#bd3786"
],
[
0.5555555555555556,
"#d8576b"
],
[
0.6666666666666666,
"#ed7953"
],
[
0.7777777777777778,
"#fb9f3a"
],
[
0.8888888888888888,
"#fdca26"
],
[
1,
"#f0f921"
]
2026-02-02 00:52:00 +01:00
],
2026-03-10 18:45:51 +01:00
"type": "histogram2d"
}
],
"histogram2dcontour": [
{
"colorbar": {
"outlinewidth": 0,
"ticks": ""
},
"colorscale": [
[
0,
"#0d0887"
],
[
0.1111111111111111,
"#46039f"
],
[
0.2222222222222222,
"#7201a8"
],
[
0.3333333333333333,
"#9c179e"
],
[
0.4444444444444444,
"#bd3786"
],
[
0.5555555555555556,
"#d8576b"
],
[
0.6666666666666666,
"#ed7953"
],
[
0.7777777777777778,
"#fb9f3a"
],
[
0.8888888888888888,
"#fdca26"
],
[
1,
"#f0f921"
]
2026-02-02 00:52:00 +01:00
],
2026-03-10 18:45:51 +01:00
"type": "histogram2dcontour"
}
2026-02-02 00:52:00 +01:00
],
2026-03-10 18:45:51 +01:00
"mesh3d": [
{
"colorbar": {
"outlinewidth": 0,
"ticks": ""
},
"type": "mesh3d"
2026-02-02 00:52:00 +01:00
}
2026-03-10 18:45:51 +01:00
],
"parcoords": [
{
"line": {
"colorbar": {
"outlinewidth": 0,
"ticks": ""
}
},
"type": "parcoords"
2026-02-02 00:52:00 +01:00
}
2026-03-10 18:45:51 +01:00
],
"pie": [
{
"automargin": true,
"type": "pie"
2026-02-02 00:52:00 +01:00
}
2026-03-10 18:45:51 +01:00
],
"scatter": [
{
"fillpattern": {
"fillmode": "overlay",
"size": 10,
"solidity": 0.2
},
"type": "scatter"
}
],
"scatter3d": [
{
"line": {
"colorbar": {
"outlinewidth": 0,
"ticks": ""
}
},
"marker": {
"colorbar": {
"outlinewidth": 0,
"ticks": ""
}
},
"type": "scatter3d"
}
],
"scattercarpet": [
{
"marker": {
"colorbar": {
"outlinewidth": 0,
"ticks": ""
}
},
"type": "scattercarpet"
2026-02-02 00:52:00 +01:00
}
2026-03-10 18:45:51 +01:00
],
"scattergeo": [
{
"marker": {
"colorbar": {
"outlinewidth": 0,
"ticks": ""
}
},
"type": "scattergeo"
}
],
"scattergl": [
{
"marker": {
"colorbar": {
"outlinewidth": 0,
"ticks": ""
}
},
"type": "scattergl"
}
],
"scattermap": [
{
"marker": {
"colorbar": {
"outlinewidth": 0,
"ticks": ""
}
},
"type": "scattermap"
}
],
"scattermapbox": [
{
"marker": {
"colorbar": {
"outlinewidth": 0,
"ticks": ""
}
},
"type": "scattermapbox"
}
],
"scatterpolar": [
{
"marker": {
"colorbar": {
"outlinewidth": 0,
"ticks": ""
}
},
"type": "scatterpolar"
}
],
"scatterpolargl": [
{
"marker": {
"colorbar": {
"outlinewidth": 0,
"ticks": ""
}
},
"type": "scatterpolargl"
}
],
"scatterternary": [
{
"marker": {
"colorbar": {
"outlinewidth": 0,
"ticks": ""
}
},
"type": "scatterternary"
}
],
"surface": [
{
"colorbar": {
"outlinewidth": 0,
"ticks": ""
},
"colorscale": [
[
0,
"#0d0887"
],
[
0.1111111111111111,
"#46039f"
],
[
0.2222222222222222,
"#7201a8"
],
[
0.3333333333333333,
"#9c179e"
],
[
0.4444444444444444,
"#bd3786"
],
[
0.5555555555555556,
"#d8576b"
],
[
0.6666666666666666,
"#ed7953"
],
[
0.7777777777777778,
"#fb9f3a"
],
[
0.8888888888888888,
"#fdca26"
],
[
1,
"#f0f921"
]
],
"type": "surface"
}
],
"table": [
{
"cells": {
"fill": {
"color": "#EBF0F8"
},
"line": {
"color": "white"
}
},
"header": {
"fill": {
"color": "#C8D4E3"
},
"line": {
"color": "white"
}
},
"type": "table"
}
]
},
"layout": {
"annotationdefaults": {
"arrowcolor": "#2a3f5f",
"arrowhead": 0,
"arrowwidth": 1
},
"autotypenumbers": "strict",
"coloraxis": {
"colorbar": {
"outlinewidth": 0,
"ticks": ""
}
},
"colorscale": {
"diverging": [
[
0,
"#8e0152"
],
[
0.1,
"#c51b7d"
],
[
0.2,
"#de77ae"
],
[
0.3,
"#f1b6da"
],
[
0.4,
"#fde0ef"
],
[
0.5,
"#f7f7f7"
],
[
0.6,
"#e6f5d0"
],
[
0.7,
"#b8e186"
],
[
0.8,
"#7fbc41"
],
[
0.9,
"#4d9221"
],
[
1,
"#276419"
]
],
"sequential": [
[
0,
"#0d0887"
],
[
0.1111111111111111,
"#46039f"
],
[
0.2222222222222222,
"#7201a8"
],
[
0.3333333333333333,
"#9c179e"
],
[
0.4444444444444444,
"#bd3786"
],
[
0.5555555555555556,
"#d8576b"
],
[
0.6666666666666666,
"#ed7953"
],
[
0.7777777777777778,
"#fb9f3a"
],
[
0.8888888888888888,
"#fdca26"
],
[
1,
"#f0f921"
]
],
"sequentialminus": [
[
0,
"#0d0887"
],
[
0.1111111111111111,
"#46039f"
],
[
0.2222222222222222,
"#7201a8"
],
[
0.3333333333333333,
"#9c179e"
],
[
0.4444444444444444,
"#bd3786"
],
[
0.5555555555555556,
"#d8576b"
],
[
0.6666666666666666,
"#ed7953"
],
[
0.7777777777777778,
"#fb9f3a"
],
[
0.8888888888888888,
"#fdca26"
],
[
1,
"#f0f921"
]
]
},
"colorway": [
"#636efa",
"#EF553B",
"#00cc96",
"#ab63fa",
"#FFA15A",
"#19d3f3",
"#FF6692",
"#B6E880",
"#FF97FF",
"#FECB52"
],
"font": {
"color": "#2a3f5f"
},
"geo": {
"bgcolor": "white",
"lakecolor": "white",
"landcolor": "#E5ECF6",
"showlakes": true,
"showland": true,
"subunitcolor": "white"
},
"hoverlabel": {
"align": "left"
},
"hovermode": "closest",
"mapbox": {
"style": "light"
},
"paper_bgcolor": "white",
"plot_bgcolor": "#E5ECF6",
"polar": {
"angularaxis": {
"gridcolor": "white",
"linecolor": "white",
"ticks": ""
},
"bgcolor": "#E5ECF6",
"radialaxis": {
"gridcolor": "white",
"linecolor": "white",
"ticks": ""
}
},
"scene": {
"xaxis": {
"backgroundcolor": "#E5ECF6",
"gridcolor": "white",
"gridwidth": 2,
"linecolor": "white",
"showbackground": true,
"ticks": "",
"zerolinecolor": "white"
},
"yaxis": {
"backgroundcolor": "#E5ECF6",
"gridcolor": "white",
"gridwidth": 2,
"linecolor": "white",
"showbackground": true,
"ticks": "",
"zerolinecolor": "white"
},
"zaxis": {
"backgroundcolor": "#E5ECF6",
"gridcolor": "white",
"gridwidth": 2,
"linecolor": "white",
"showbackground": true,
"ticks": "",
"zerolinecolor": "white"
}
},
"shapedefaults": {
"line": {
"color": "#2a3f5f"
}
},
"ternary": {
"aaxis": {
"gridcolor": "white",
"linecolor": "white",
"ticks": ""
},
"baxis": {
"gridcolor": "white",
"linecolor": "white",
"ticks": ""
},
"bgcolor": "#E5ECF6",
"caxis": {
"gridcolor": "white",
"linecolor": "white",
"ticks": ""
}
},
"title": {
"x": 0.05
},
"xaxis": {
"automargin": true,
"gridcolor": "white",
"linecolor": "white",
"ticks": "",
"title": {
"standoff": 15
},
"zerolinecolor": "white",
"zerolinewidth": 2
},
"yaxis": {
"automargin": true,
"gridcolor": "white",
"linecolor": "white",
"ticks": "",
"title": {
"standoff": 15
},
"zerolinecolor": "white",
"zerolinewidth": 2
}
}
},
"title": {
"text": "Rupture intensity distribution (AUM repaired)"
}
}
},
"image/png": "iVBORw0KGgoAAAANSUhEUgAAAzkAAAFoCAYAAAB0XzViAAAQAElEQVR4AeydB5wkRb3H/zO7ezlyEY47OHJGkCw8kghHECUKiAkQHpIEJRwIB+IRJAgoTxSeoAQfSZLkcEqSAzklHRw5X863u7c74c2vz1p652ZmZ3YndPd8+VDX3RX+9f9/q6a6/l3VvfE0/0EAAhCAAAQgAAEIQAACEIgQgbjxHwQgkIMAURCAAAQgAAEIQAACYSWAkxPWlkNvCEAAArUgQJ0QgAAEIACBEBDAyQlBI6EiBCAAAQhAAALBJoB2EIBAsAjg5ASrPdAGAhCAAAQgAAEIQAACUSFQMztwcmqGnoohAAEIQAACEIAABCAAgUoQwMmpBFVklo8AkiAAAQhAAAIQgAAEIFAiAZycEoGRHQIQgEAQCKADBCAAAQhAAAL5CeDk5GdDCgQgAAEIQAAC4SKAthCAAAQ8Ajg5Hgb+gQAEIAABCEAAAhCAQFQJ1J9dODn11+ZYDAEIQAACEIAABCAAgUgTwMmJdPOWzzgkQQACEIAABCAAAQhAICwEcHLC0lLoCQEIBJEAOkEAAhCAAAQgEEACODkBbBRUggAEIAABCISbANpDAAIQqC0BnJza8qd2CEAAAhCAAAQgAIF6IYCdVSOAk1M11FQEAQhAAAIQgAAEIAABCFSDAE5ONSiXrw4kQQACEIAABCAAAQhAAAJdEMDJ6QIQyRCAQBgIoCMEIAABCEAAAhD4ggBOzhcsOIMABCAAAQhEiwDWQAACEKhTAjg5ddrwmA0BCEAAAhCAAATqlQB2R58ATk702xgLIQABCEAAAhCAAAQgUFcEcHK61dwUggAEIAABCEAAAhCAAASCSgAnJ6gtg14QCCMBdIYABCAAAQhAAAIBIICTE4BGQAUIQAACEIg2AayDAAQgAIHqEsDJqS5vaoMABCAAAQhAAAIQWEGAfyFQMQI4ORVDi2AIQAACEIAABCAAAQhAoBYEwu3k1IIYdUIAAhCAAAQgAAEIQAACgSaAkxPo5kE5CHSPAKUgAAEIQAACEIBAPRPAyann1sd2CEAAAvVFAGshAAEIQKBOCODk1ElDYyYEIAABCEAAAhDITYBYCESPAE5O9NoUiyAAAQhAAAIQgAAEIFDXBMri5NQ1QYyHAAQgAAEIQAACEIAABAJFACcnUM2BMhEjgDkQgAAEIAABCEAAAjUggJNTA+hUCQEIQKC+CWA9BCAAAQhAoLIEcHIqyxfpEIAABCAAAQhAoDgC5IIABMpGACenbCgRBAEIQAACEIAABCAAAQiUm0B35OHkdIcaZSAAAQhAAAIQgAAEIACBwBLAyQls06BY+Qj0TNLrb31g2+97vD313LSeCSpj6Rtue9DTSbqVUWzgRc2Zt9D2Ovz0urQ98I3TDQXVj9WealcVV3/Wb80fp3gCBCAAAQhAoFQCoXFydBPUjW/jXb5n2UE3ylIN705+dwOuVn3d0VG6iU93JuSO8cSLft+dqitSJtueoOgoRpqMqU8Ua7jyqoxsKraM8rlyqlPXlQqunlz6Ka67/aqc+l75uzs8cffdNNk2Xn9N79z/j+sf0jUfL9midpC9/rLuXOU01kiW4ppbWu0HP77EG3eUprjsoN+b6iwkN7tMIK6zlBAb2SF7spKqcqk2VduqsjMuvM7EXucECEAAAhCAQKkEQuPkOMP23/Mr9vqUGzvCryefbFdcd7vlm3y4chwhAIFwE9DE+95HnrUzTjjMRgwbktOY19563xYtXmqrrzrCXn71bXOOSs7M3Yh86tlplu0caSL+pzse7YY0iuQioLZVG78wbbop5MpDHAQqTQD5EIBA+AmEzsnJRr7rDluYHJ9ck4/svPVwfdRhe3sO4K4ZLlGwN6j2TD7rGHv+gWtzriaUm7uebqsu1Vlu2cXKq3U7OEdi2y02NIV8ej/2t5dsw3XXsBOPOsA+/my2yenJl7fU+I3WW9NWX22k3X
L3Y52KaiI+/e0PbfedtuwUH8aLWrezY6Y2VpDzqLZ38RwhAAEIQKCmBEJVeeidnHy0te3Cv+XE5dPT4O33Pb7T01itAinvGzM+8Pb7a7uGgraouBusnt4efdqltnhps7dypHQFlbXMf92p74OPZ3Zsg/HrpDpVt+S7IPmZarr8X/n8slRAOu51+OlWyD498T7yxMnexFBPy1290kP6SI6C5Ls0HbPTxUn1K5+C8rgg9pLhD9LNpesoPaWLyyMZkie5is+n40efzPTaTvJcWXdUOcnNleby+I/KJ11cOHXSb/zJ3rnySKZkexGZf6SjdHXldJT+mSSvv3XVf1RWMiRbZRXETHFKc7Ikz4VlzV9spVL+bJ2kn+Kyy6pN1XaqS7JUR7H6Kb8LKifdVLeCzhXn0nVU3YqfOu3Njv6uvNJL+ilPV+H9j2aaHImvbLOp9evbJ2d21auHHcqjCfLYjEMipydn5m5EDuzf13bM1K86VJcToTp2/coWtvlG67ioLo/iLvt7MgZIB3EVXwUxdUH9xq9EdrryqaxkZOfzx6t9pKf0VV6lqaz6jvqQyro8inchu/58+R568gUlrRTUxkce/DWvzdX2K2UgAgIQgAAEINAFgdA7ObrRzpw933t6O37c6C7MzZ+sp76nTrrW/nTNRG8l5MWHfutlPmHiVd6+cD1Nv/7y023QgH526rGHeHm0ba67T9dV32H/fYGddtyhniw9qVcdmkjsfvCpNnrkKl686phy16/sjvun9GhLnurz2yeZn82aZxf+6k+endoiIts1MdTKmOpV+N8rz/AmleKsiY30UFmlKUjPA446d6VtQdpCKMHKoyBmEyf/3pvsK15BEydtKXLyxHy1UcMs3178QjqOW320HbzfLpY9AVU9epqvLUxHHLCHLvMGZ6NfJ+l+5omHe85t3oKZBLWbnISjj9i3o93cVkpN+NS2XfUfOdCHHDvJ9th5qw4ZhVbklP/kn13d0Yek65abrmtyBDXxzKhV9P/F6JctTHZJ38kTj+nQV/YrTmn+/Nm6dtXW/rI6/8fLb+hg2225kXfM9Y8/j/qKWKgtS2WRS7aL029j8KAB5upSu//zlRnWVd9y5f1H/SbLMQZk/9bU7zRuZbeBfoPqIy6ordRnZYNfr1znevBx+W//z5644wqvrd24oDp2OfAU77fn5N5+3SSbmPmty7FyslTH17870dQmLp+O6621usuy0nHksKFenGPtXfAPBCAAAQhAoEgCoXdybrvnSW/ftp766elfkXavlE2Te03yNTlSomRJpraiKCiunEH16QVbTS79crUVZsN117BzTjmyI1o65ZvAd2Tq4kT1+e1zMoudBIqBnqRfft7xnd6H0ORODsR9jz7XSQNNqLT1xUV+/Ws7mH9yqImn6takR7oon5hr8rTnLlvrsuSQXYcEyHHRlhcx7coJzmejJltybiUvX3ATMf8kXA6KJnz9+/bNV6xTvOpQfpXrlJDnQvnlOPn7UL72yCOi29Fqv0t+fZu3VdSvr9pcjoDSlMdVkK2r2lq/L/WpYp7Uv/vBp17/GTl8iBPZ6ah2fnbqq50edlSCherXb1HOvuzT7/XLm63XrW2L+k26McBvjGSqvxY7BmT/1rSKpaB+Ly6SrXZR0LkLuX4vLi37qDbVb1Pt5tIkW3UozS9b/XHXzMqWY6T8uWxS/NprjtEhZ9DvVRzU9jkzEAkBCEAAAhAoQCB0To6eKG7s+8KaboB6IuifaBWwt6QkN7l978PPSyrX3cyaNGnir+02/smE5LnJ8+x5C3RZtiAHZfbchV3K05YcTTg08fBn1rXi1Q7++HznLl//fn1MqzZqT/8TX5U7dP/dvNUjnZcS5CzJafJPrjSB1kRaE+psptmyZaMcMU1ks9O6ul5rjVW91Z7Tzr+206qWJnzbbLFBV8XLli7dZYPjXDbBWYLUZ9R3tOqUlWSauCpNebLTsq+1wlNsn1Z/Ub/JlqFr187+347rm3
J+NCFXvnIEOQeS8/Nf/dFbOczFQOndCeUYA9TPtcKqlVptZ3R6aDXFbTfTGKoVGK0mufRSj455LvsV5/qAs0k6Sbd
},
"metadata": {},
"output_type": "display_data"
2026-02-02 00:52:00 +01:00
}
],
"source": [
"# ============================================================\n",
"# 2. PREPARE FLOWS\n",
"# ============================================================\n",
"# Collapse flows to a single net quantity per (account, ISIN, date) so the\n",
"# left merge below matches at most one flow row per AUM row.\n",
"\n",
"flows_clean = (\n",
"    flows\n",
"    .groupby(\n",
"        [\"Registrar Account - ID\",\"Product - Isin\",\"Centralisation Date\"],\n",
"        as_index=False\n",
"    )[\"Quantity - NetFlows\"]\n",
"    .sum()\n",
")\n",
"\n",
"# ============================================================\n",
"# 3. MERGE\n",
"# ============================================================\n",
"\n",
"df = aum.merge(\n",
"    flows_clean,\n",
"    on=[\"Registrar Account - ID\",\"Product - Isin\",\"Centralisation Date\"],\n",
"    how=\"left\"\n",
")\n",
"\n",
"# AUM rows without a matching flow row are treated as zero net flow.\n",
"df[\"Quantity - NetFlows\"] = df[\"Quantity - NetFlows\"].fillna(0)\n",
"\n",
"# ============================================================\n",
"# 4. SORT\n",
"# ============================================================\n",
"# The shift(1) calls below take the previous row of each (account, ISIN),\n",
"# so rows must be ordered by Centralisation Date within each group.\n",
"\n",
"df = df.sort_values(\n",
"    [\"Registrar Account - ID\",\"Product - Isin\",\"Centralisation Date\"]\n",
")\n",
"\n",
"# ============================================================\n",
"# 5. REBUILD ACCOUNTING IDENTITY WITH REPAIRED AUM\n",
"# ============================================================\n",
"# expected_aum = AUM of the previous row + net flow of the previous row,\n",
"# both taken within the same (account, ISIN).\n",
"\n",
"df[\"prev_aum\"] = df.groupby(\n",
"    [\"Registrar Account - ID\",\"Product - Isin\"]\n",
")[\"Quantity - AUM\"].shift(1)\n",
"\n",
"df[\"prev_flow\"] = df.groupby(\n",
"    [\"Registrar Account - ID\",\"Product - Isin\"]\n",
")[\"Quantity - NetFlows\"].shift(1).fillna(0)\n",
"\n",
"df[\"expected_aum\"] = df[\"prev_aum\"] + df[\"prev_flow\"]\n",
"\n",
"# ============================================================\n",
"# 6. COMPUTE GAP\n",
"# ============================================================\n",
"\n",
"df[\"gap\"] = df[\"Quantity - AUM\"] - df[\"expected_aum\"]\n",
"df[\"gap_abs\"] = df[\"gap\"].abs()\n",
"\n",
"# Absolute tolerance, in the units of \"Quantity - AUM\": smaller gaps are not flagged.\n",
"EPS = 10\n",
"\n",
"# The first row of each (account, ISIN) has no previous AUM and is never flagged.\n",
"df[\"rupture_flag\"] = (\n",
"    df[\"prev_aum\"].notna()\n",
"    & (df[\"gap_abs\"] > EPS)\n",
")\n",
"\n",
"# ============================================================\n",
"# 7. BUILD RUPTURE SUMMARY\n",
"# ============================================================\n",
"# One row per (account, ISIN): number of flagged rows, number of rows, and their ratio.\n",
"\n",
"rupture_summary = (\n",
"    df.groupby([\"Registrar Account - ID\",\"Product - Isin\"])\n",
"    .agg(\n",
"        n_ruptures=(\"rupture_flag\",\"sum\"),\n",
"        total_obs=(\"rupture_flag\",\"count\"),\n",
"        rupture_ratio=(\"rupture_flag\",\"mean\")\n",
"    )\n",
"    .reset_index()\n",
")\n",
"\n",
"# ============================================================\n",
"# 8. CLASSIFY RUPTURE RATIO INTO INTENSITY BANDS\n",
"# ============================================================\n",
"\n",
"rs = rupture_summary.copy()\n",
"\n",
"# Upper edge is 1.01 so that a ratio of exactly 1.0 falls inside the last band.\n",
"bins = [0, 0.01, 0.10, 0.30, 1.01]\n",
"\n",
"labels = [\n",
"    \"Clean / quasi-clean (≤1%)\",\n",
"    \"Moderate (1– 10%)\",\n",
"    \"High (10– 30%)\",\n",
"    \"Severe (>30%)\"\n",
"]\n",
"\n",
"# include_lowest=True keeps a ratio of exactly 0 in the first band.\n",
"rs[\"rupture_class\"] = pd.cut(\n",
"    rs[\"rupture_ratio\"],\n",
"    bins=bins,\n",
"    labels=labels,\n",
"    include_lowest=True\n",
")\n",
"\n",
"# ============================================================\n",
"# 9. DISTRIBUTION\n",
"# ============================================================\n",
"# Share of (account, ISIN) pairs in each band, in percent, in band order.\n",
"\n",
"dist = (\n",
"    rs[\"rupture_class\"]\n",
"    .value_counts(normalize=True)\n",
"    .sort_index()\n",
"    * 100\n",
").round(1)\n",
"\n",
"# ============================================================\n",
"# 10. DONUT CHART\n",
"# ============================================================\n",
"\n",
"fig = go.Figure(\n",
"    data=[go.Pie(\n",
"        labels=dist.index,\n",
"        values=dist.values,\n",
"        hole=0.45,\n",
"        textinfo=\"percent\",\n",
"        hoverinfo=\"label+percent\"\n",
"    )]\n",
")\n",
"\n",
"# Horizontal legend centred below the chart.\n",
"fig.update_layout(\n",
"    title=\"Rupture intensity distribution (AUM repaired)\",\n",
"    legend=dict(\n",
"        orientation=\"h\",\n",
"        yanchor=\"top\",\n",
"        y=-0.15,\n",
"        xanchor=\"center\",\n",
"        x=0.5\n",
"    ),\n",
"    legend_title_text=\"Rupture ratio\"\n",
")\n",
"\n",
"fig.show()"
2026-02-02 00:52:00 +01:00
]
},
{
"cell_type": "code",
2026-03-10 18:45:51 +01:00
"execution_count": null,
2026-02-02 00:52:00 +01:00
"id": "990898ea-ceca-46bb-bfb3-c87bf289d272",
"metadata": {},
2026-03-10 18:45:51 +01:00
"outputs": [],
2026-02-02 00:52:00 +01:00
"source": [
"import seaborn as sns\n",
"import matplotlib.pyplot as plt\n",
"\n",
"# Monthly rupture share, computed on a working copy of merged_isin\n",
"df = merged_isin.copy()\n",
"\n",
"# Calendar year / month of each observation\n",
"df[\"year\"] = df[\"Centralisation Date\"].dt.year\n",
"df[\"month\"] = df[\"Centralisation Date\"].dt.month\n",
"\n",
"# 1. Number of rows per (year, month)\n",
"total = (\n",
"    df.groupby([\"year\", \"month\"])\n",
"    .size()\n",
"    .reset_index(name=\"total_lines\")\n",
")\n",
"\n",
"# 2. Number of flagged rows per (year, month)\n",
"ruptures = (\n",
"    df.loc[df[\"rupture_flag\"]]\n",
"    .groupby([\"year\", \"month\"])\n",
"    .size()\n",
"    .reset_index(name=\"n_ruptures\")\n",
")\n",
"\n",
"# 3. Join totals and ruptures (months without any rupture get 0),\n",
"# 4. then derive the rupture share as a fraction of the rows\n",
"ratio = (\n",
"    total.merge(ruptures, on=[\"year\",\"month\"], how=\"left\")\n",
"    .assign(n_ruptures=lambda d: d[\"n_ruptures\"].fillna(0))\n",
"    .assign(rupture_ratio=lambda d: d[\"n_ruptures\"] / d[\"total_lines\"])\n",
")\n",
"\n",
"# 5. Year x month grid for the heatmap; (year, month) cells absent from the data are filled with 0\n",
"heatmap_ratio = (\n",
"    ratio.pivot(index=\"year\", columns=\"month\", values=\"rupture_ratio\")\n",
"    .fillna(0)\n",
")\n",
"\n",
"# 6. Plot: cells are annotated as percentages with two decimals\n",
"plt.figure(figsize=(14, 7))\n",
"sns.heatmap(\n",
"    heatmap_ratio,\n",
"    annot=True,\n",
"    fmt=\".2%\",\n",
"    cmap=\"Reds\",\n",
"    linewidths=.3,\n",
"    linecolor=\"grey\",\n",
"    cbar_kws={'label': 'Proportion de ruptures'}\n",
")\n",
"\n",
"plt.title(\"Heatmap de la proportion de ruptures (par année et mois)\", fontsize=16)\n",
"plt.xlabel(\"Mois\")\n",
"plt.ylabel(\"Année\")\n",
"plt.show()\n"
]
},
{
"cell_type": "code",
2026-03-10 18:45:51 +01:00
"execution_count": null,
2026-02-02 00:52:00 +01:00
"id": "4d335589-c519-458d-857d-a051813b950b",
"metadata": {},
2026-03-10 18:45:51 +01:00
"outputs": [],
2026-02-02 00:52:00 +01:00
"source": [
"# Rupture share per country, computed on a working copy of merged_isin\n",
"df = merged_isin.copy()\n",
"\n",
"# Year / month columns, added in case they are missing (not used further in this cell)\n",
"df[\"year\"] = df[\"Centralisation Date\"].dt.year\n",
"df[\"month\"] = df[\"Centralisation Date\"].dt.month\n",
"\n",
"# Attach the country of each account; accounts absent from geo are labelled \"UNKNOWN\"\n",
"df = (\n",
"    df.merge(\n",
"        geo[[\"Registrar Account - ID\", \"country\"]],\n",
"        on=\"Registrar Account - ID\",\n",
"        how=\"left\"\n",
"    )\n",
"    .assign(country=lambda d: d[\"country\"].fillna(\"UNKNOWN\"))\n",
")\n",
"\n",
"# Number of rows per country\n",
"total_country = (\n",
"    df.groupby(\"country\")\n",
"    .size()\n",
"    .reset_index(name=\"total_obs\")\n",
")\n",
"\n",
"# Number of flagged rows per country\n",
"rupt_country = (\n",
"    df.loc[df[\"rupture_flag\"]]\n",
"    .groupby(\"country\")\n",
"    .size()\n",
"    .reset_index(name=\"ruptures\")\n",
")\n",
"\n",
"# Join, derive the ratio (countries without any rupture get 0), sort by descending ratio\n",
"country_stats = (\n",
"    total_country.merge(rupt_country, on=\"country\", how=\"left\")\n",
"    .assign(ruptures=lambda d: d[\"ruptures\"].fillna(0))\n",
"    .assign(rupture_ratio=lambda d: d[\"ruptures\"] / d[\"total_obs\"])\n",
"    .sort_values(\"rupture_ratio\", ascending=False)\n",
")"
]
},
{
"cell_type": "code",
2026-03-10 18:45:51 +01:00
"execution_count": null,
2026-02-02 00:52:00 +01:00
"id": "8a45a111-25da-4f5c-9723-c3efd25c906d",
"metadata": {},
2026-03-10 18:45:51 +01:00
"outputs": [],
2026-02-02 00:52:00 +01:00
"source": [
"# Bar chart of the rupture share per country, with volumes shown on hover\n",
"import plotly.express as px\n",
"\n",
"# Percentage column used for hover display; rows sorted by descending rupture share\n",
"country_stats_plot = (\n",
"    country_stats\n",
"    .assign(rupture_pct=lambda d: d[\"rupture_ratio\"] * 100)\n",
"    .sort_values(\"rupture_ratio\", ascending=False)\n",
")\n",
"\n",
"fig = px.bar(\n",
"    country_stats_plot,\n",
"    x=\"country\",\n",
"    y=\"rupture_ratio\",\n",
"    title=\"Proportion de ruptures par pays (avec volumes au survol)\",\n",
"    labels={\n",
"        \"country\": \"Pays\",\n",
"        \"rupture_ratio\": \"Proportion de ruptures\",\n",
"        \"rupture_pct\": \"% de ruptures\",\n",
"        \"ruptures\": \"Nb de ruptures\",\n",
"        \"total_obs\": \"Nb d'observations\"\n",
"    },\n",
"    hover_data={\n",
"        \"rupture_pct\": ':.2f',\n",
"        \"ruptures\": True,\n",
"        \"total_obs\": True,\n",
"        \"rupture_ratio\": False,  # the decimal version is hidden from the hover box\n",
"    }\n",
")\n",
"\n",
"fig.update_layout(\n",
"    xaxis_tickangle=-45,\n",
"    bargap=0.2\n",
")\n",
"\n",
"# Show the y axis as percentages\n",
"fig.update_yaxes(tickformat=\".1%\")\n",
"\n",
"fig.show()\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "a4af9841-6cf9-4d27-8096-ac878e866bc6",
"metadata": {},
"outputs": [],
"source": [
"rs = rupture_summary.copy()\n",
"\n",
"# 1. Summary statistics of the rupture ratio\n",
"print(\"\\n=== BASIC NUMERIC STATS ===\")\n",
"print(rs[\"rupture_ratio\"].describe(percentiles=[0.01, 0.05, 0.10, 0.25, 0.5, 0.75, 0.90, 0.95, 0.99]))\n",
"\n",
"\n",
"# 2. Distribution over fixed buckets\n",
"\n",
"bucket_labels = [\n",
"    \"0– 0.1%\",\n",
"    \"0.1– 1%\",\n",
"    \"1– 5%\",\n",
"    \"5– 10%\",\n",
"    \"10– 25%\",\n",
"    \"25– 50%\",\n",
"    \"50– 100%\"\n",
"]\n",
"\n",
"# The extra \"0%\" category is placed first so that sort_index() below lists the\n",
"# buckets in ascending order instead of printing \"0%\" after \"50– 100%\".\n",
"rs[\"rupture_bucket\"] = (\n",
"    pd.cut(\n",
"        rs[\"rupture_ratio\"],\n",
"        bins=[0, 0.001, 0.01, 0.05, 0.10, 0.25, 0.50, 1.01],\n",
"        labels=bucket_labels,\n",
"        include_lowest=True\n",
"    )\n",
"    .cat.add_categories(\"0%\")\n",
"    .cat.reorder_categories([\"0%\"] + bucket_labels, ordered=True)\n",
")\n",
"\n",
"# include_lowest=True puts exact zeros in the first bucket; move them to the dedicated \"0%\" bucket\n",
"rs.loc[rs[\"rupture_ratio\"] == 0, \"rupture_bucket\"] = \"0%\"\n",
"\n",
"bucket_counts = rs[\"rupture_bucket\"].value_counts().sort_index()\n",
"print(bucket_counts)\n",
"\n",
"\n",
"# 3. Percentages of all rows of rs\n",
"bucket_percent = (bucket_counts / len(rs) * 100).round(2)\n",
"\n",
"print(\"\\n=== DISTRIBUTION (PERCENT) ===\")\n",
"print(bucket_percent)\n",
"\n",
"\n",
"# 4. Rows without any rupture\n",
"no_rupture = (rs[\"n_ruptures\"] == 0).sum()\n",
"print(f\"\\nComptes avec 0 rupture = {no_rupture} ({no_rupture/len(rs)*100:.2f}%)\")\n",
"\n",
"# 5. Rows above the 75% and 10% rupture-ratio thresholds\n",
"severe = (rs[\"rupture_ratio\"] > 0.75).sum()\n",
"print(f\"Comptes avec rupture_ratio > 75% = {severe} ({severe/len(rs)*100:.2f}%)\")\n",
"\n",
"medium = (rs[\"rupture_ratio\"] > 0.10).sum()\n",
"print(f\"Comptes avec rupture_ratio > 10% = {medium} ({medium/len(rs)*100:.2f}%)\")\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "f39a9a5a-5f4e-4cac-9f63-e6952582b6ff",
"metadata": {},
"outputs": [],
"source": [
"import plotly.express as px\n",
"\n",
"# Histogram of rupture_ratio over the rows of rs (nbins=50), with a small gap between bars\n",
"fig = px.histogram(\n",
"    rs,\n",
"    x=\"rupture_ratio\",\n",
"    labels={\"rupture_ratio\": \"Rupture Ratio\"},\n",
"    title=\"Distribution du rupture_ratio\",\n",
"    nbins=50,\n",
").update_layout(bargap=0.05)\n",
"\n",
"fig.show()\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "70132995-8379-44b6-8ff6-f09524c4e4d0",
"metadata": {},
"outputs": [],
"source": [
"import seaborn as sns\n",
"import matplotlib.pyplot as plt\n",
"\n",
"# --- 1. Base filters ---\n",
"merged[\"year\"] = merged[\"Centralisation Date\"].dt.year\n",
"\n",
"# Keep only the flagged rows of calendar year 2021\n",
"ruptures_2021 = merged.loc[\n",
"    (merged[\"year\"] == 2021) & (merged[\"rupture_flag\"] == True)\n",
"].copy()\n",
"\n",
"print(\"Nombre total de ruptures en 2021 :\", len(ruptures_2021))\n",
"\n",
"# --- 2. Gap sign: strictly positive gaps are \"positive\", everything else \"negative\" ---\n",
"ruptures_2021[\"gap_type\"] = np.where(ruptures_2021[\"gap\"] > 0, \"positive\", \"negative\")\n",
"\n",
"# --- 3. Overall counts and shares (in %) ---\n",
"gap_counts = ruptures_2021[\"gap_type\"].value_counts()\n",
"gap_percent = ruptures_2021[\"gap_type\"].value_counts(normalize=True).mul(100)\n",
"\n",
"print(\"\\n=== RUPTURES 2021 — POSITIVES vs NEGATIVES ===\")\n",
"print(gap_counts)\n",
"print(\"\\n(%)\")\n",
"print(gap_percent.map(\"{:.2f}%\".format))\n",
"\n",
"# --- 4. Size of the gaps, per sign ---\n",
"intensity_stats = ruptures_2021.groupby(\"gap_type\")[\"gap\"].describe()\n",
"print(\"\\n=== STATISTIQUES DES GAPS ===\")\n",
"print(intensity_stats)\n",
"\n",
"# --- 5. Quick visualisation ---\n",
"plt.figure(figsize=(10,5))\n",
"sns.histplot(data=ruptures_2021, x=\"gap\", hue=\"gap_type\", bins=80, kde=True)\n",
"# Symmetric x range, bounded by the largest absolute gap of the whole merged table\n",
"plt.xlim(-merged[\"gap\"].abs().max(), merged[\"gap\"].abs().max())\n",
"plt.title(\"Distribution des gaps de rupture en 2021\")\n",
"plt.xlabel(\"Gap (AUM_{t} − Expected AUM_{t})\")\n",
"plt.show()\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "1faf943a-4703-4b19-a867-2670ac3a5209",
"metadata": {},
"outputs": [],
"source": [
"# --- 1. ADD YEAR ---\n",
"merged[\"year\"] = merged[\"Centralisation Date\"].dt.year\n",
"\n",
"# --- 2. DEFINE PERIODS ---\n",
"conditions = [\n",
"    merged[\"year\"] < 2021,\n",
"    merged[\"year\"] == 2021,\n",
"    merged[\"year\"] > 2021\n",
"]\n",
"\n",
"period_labels = [\"before_2021\", \"during_2021\", \"after_2021\"]\n",
"\n",
"# Rows matching none of the three conditions (e.g. a missing year) fall into \"unknown\".\n",
"merged[\"period\"] = np.select(\n",
"    conditions,\n",
"    period_labels,\n",
"    default=\"unknown\"\n",
")\n",
"\n",
"# --- 3. CREATE GAP TYPE & FILTER ONLY RUPTURES ---\n",
"merged[\"gap_type\"] = np.where(\n",
"    merged[\"gap\"] > 0, \"positive\",\n",
"    np.where(merged[\"gap\"] < 0, \"negative\", \"zero\")\n",
")\n",
"\n",
"ruptures = merged[merged[\"rupture_flag\"] == True].copy()\n",
"\n",
"# --- 4. TOTAL OBS PER PERIOD ---\n",
"total_obs = merged.groupby(\"period\").size().rename(\"total_obs\")\n",
"\n",
"# --- 5. TOTAL RUPTURES PER PERIOD ---\n",
"# Aligned on every observed period so that a period without ruptures shows 0 instead of NaN.\n",
"rupture_counts = (\n",
"    ruptures.groupby(\"period\")\n",
"    .size()\n",
"    .reindex(total_obs.index, fill_value=0)\n",
"    .rename(\"rupture_count\")\n",
")\n",
"\n",
"# --- 6. PROPORTION OF RUPTURES ---\n",
"rupture_ratio = (rupture_counts / total_obs).rename(\"rupture_ratio\")\n",
"\n",
"# --- 7. POSITIVE / NEGATIVE GAPS (% among ruptures) ---\n",
"# Each (period, gap_type) count is divided by the total of its period. transform(\"sum\")\n",
"# keeps the (period, gap_type) index unchanged, whereas groupby(...).apply(...) can\n",
"# prepend the group key a second time depending on the pandas version.\n",
"gap_dist = ruptures.groupby([\"period\", \"gap_type\"]).size()\n",
"gap_dist = gap_dist / gap_dist.groupby(level=0).transform(\"sum\") * 100\n",
"\n",
"\n",
"# --- 8. MERGE AND DISPLAY ---\n",
"summary = pd.concat([total_obs, rupture_counts, rupture_ratio], axis=1)\n",
"summary[\"rupture_ratio\"] = (summary[\"rupture_ratio\"] * 100).round(2)\n",
"\n",
"print(\"\\n=== RUPTURE SUMMARY (in %) ===\")\n",
"print(summary)\n",
"\n",
"print(\"\\n=== GAP POSITIVE / NEGATIVE DISTRIBUTION (in %) ===\")\n",
"print(gap_dist)\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "5abee764-b890-4ea1-8f98-5a0ff1512611",
"metadata": {},
"outputs": [],
"source": [
"from plotly.subplots import make_subplots\n",
"import plotly.graph_objects as go\n",
"\n",
"# --- 1. DEFINE PERIODS ---\n",
"# Rows dated strictly before 1 Sep 2021 are \"Before Sep 2021\"; all others are \"After Sep 2021\".\n",
"merged[\"period2\"] = np.where(\n",
"    merged[\"Centralisation Date\"] < pd.Timestamp(\"2021-09-01\"),\n",
"    \"Before Sep 2021\",\n",
"    \"After Sep 2021\"\n",
")\n",
"\n",
"ruptures = merged[merged[\"rupture_flag\"] == True].copy()\n",
"\n",
"# --- 2. Classify each rupture by the sign of its gap ---\n",
"# Computed from \"gap\" in this cell so that it does not rely on a gap_type column\n",
"# created elsewhere. Any gap that is not strictly negative counts as \"positive\".\n",
"ruptures[\"gap_type\"] = np.where(ruptures[\"gap\"] < 0, \"negative\", \"positive\")\n",
"\n",
"# --- 3. Compute gap counts ---\n",
"# Rows and columns are forced to a fixed order; a period or a sign without any\n",
"# rupture gets 0 instead of raising a KeyError in step 4.\n",
"gap_counts = (\n",
"    ruptures.groupby([\"period2\", \"gap_type\"])\n",
"    .size()\n",
"    .unstack(fill_value=0)\n",
"    .reindex(\n",
"        index=[\"Before Sep 2021\", \"After Sep 2021\"],\n",
"        columns=[\"negative\", \"positive\"],\n",
"        fill_value=0\n",
"    )\n",
")\n",
"\n",
"# --- 4. Extract values ---\n",
"# Ordered (negative, positive), i.e. the same order as the pie labels and colours below.\n",
"before_vals = gap_counts.loc[\"Before Sep 2021\"].values\n",
"after_vals = gap_counts.loc[\"After Sep 2021\"].values\n",
"\n",
"# --- 5. MAKE TWO DONUT CHARTS ---\n",
"fig = make_subplots(\n",
"    rows=1, cols=2,\n",
"    specs=[[{\"type\": \"pie\"}, {\"type\": \"pie\"}]],\n",
"    subplot_titles=(\"Before Sep 2021\", \"After Sep 2021\")\n",
")\n",
"\n",
"fig.add_trace(\n",
"    go.Pie(\n",
"        labels=[\"Negative gaps\", \"Positive gaps\"],\n",
"        values=before_vals,\n",
"        marker_colors=[\"#E67E22\", \"#3498DB\"],\n",
"        hole=0.45,\n",
"        textinfo=\"label+percent\"\n",
"    ),\n",
"    row=1, col=1\n",
")\n",
"\n",
"fig.add_trace(\n",
"    go.Pie(\n",
"        labels=[\"Negative gaps\", \"Positive gaps\"],\n",
"        values=after_vals,\n",
"        marker_colors=[\"#E67E22\", \"#3498DB\"],\n",
"        hole=0.45,\n",
"        textinfo=\"label+percent\"\n",
"    ),\n",
"    row=1, col=2\n",
")\n",
"\n",
"# Plotly breaks title lines on <br>, not on a newline character.\n",
"fig.update_layout(\n",
"    title=\"Nature des ruptures (positive / negative)<br>Avant vs Après Septembre 2021\",\n",
"    showlegend=True\n",
")\n",
"\n",
"fig.show()\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "3aa3b8a0-f499-495a-9171-2e09d0bb1e5f",
"metadata": {},
"outputs": [],
"source": [
"import plotly.graph_objects as go\n",
"from plotly.subplots import make_subplots\n",
"\n",
"# NOTE: this cell reads `ruptures` and expects it to carry the period2 and\n",
"# gap_type columns; it does not build them itself.\n",
"\n",
"# --- 1. Compute gap counts by period ---\n",
"# Rows and columns are forced to a fixed order; a period or a sign without any\n",
"# rupture gets 0 instead of raising a KeyError in step 2.\n",
"gap_counts = (\n",
"    ruptures.groupby([\"period2\", \"gap_type\"])\n",
"    .size()\n",
"    .unstack(fill_value=0)\n",
"    .reindex(\n",
"        index=[\"Before Sep 2021\", \"After Sep 2021\"],\n",
"        columns=[\"negative\", \"positive\"],\n",
"        fill_value=0\n",
"    )\n",
")\n",
"\n",
"# --- 2. Extract values ---\n",
"# Ordered (negative, positive), i.e. the same order as the pie labels and colours below.\n",
"before_vals = gap_counts.loc[\"Before Sep 2021\"].values\n",
"after_vals = gap_counts.loc[\"After Sep 2021\"].values\n",
"\n",
"# --- 3. Plot : TWO PIE CHARTS side by side ---\n",
"# Subplot titles name the same periods as the period2 values used to split the data.\n",
"fig = make_subplots(\n",
"    rows=1, cols=2,\n",
"    specs=[[{\"type\": \"pie\"}, {\"type\": \"pie\"}]],\n",
"    subplot_titles=(\"Before Sep 2021\", \"After Sep 2021\")\n",
")\n",
"\n",
"fig.add_trace(\n",
"    go.Pie(\n",
"        labels=[\"Negative gaps\", \"Positive gaps\"],\n",
"        values=before_vals,\n",
"        marker_colors=[\"#E67E22\", \"#3498DB\"],\n",
"        hole=0.35\n",
"    ),\n",
"    row=1, col=1\n",
")\n",
"\n",
"fig.add_trace(\n",
"    go.Pie(\n",
"        labels=[\"Negative gaps\", \"Positive gaps\"],\n",
"        values=after_vals,\n",
"        marker_colors=[\"#E67E22\", \"#3498DB\"],\n",
"        hole=0.35\n",
"    ),\n",
"    row=1, col=2\n",
")\n",
"\n",
"# Plotly breaks title lines on <br>, not on a newline character.\n",
"fig.update_layout(\n",
"    title=\"Répartition des ruptures (positive / negative)<br>Avant vs Après Septembre 2021\"\n",
")\n",
"\n",
"fig.show()\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "d4f0dc74-649d-4105-9a1a-44a18d126a3c",
"metadata": {},
"outputs": [],
"source": [
"import plotly.graph_objects as go\n",
"\n",
"# --- 1. Tag every row with its period relative to 1 Sep 2021 ---\n",
"merged[\"period2\"] = np.where(\n",
"    merged[\"Centralisation Date\"] < pd.Timestamp(\"2021-09-01\"),\n",
"    \"Before Sep 2021\",\n",
"    \"After Sep 2021\"\n",
")\n",
"\n",
"# --- 2. Restrict to flagged rows ---\n",
"ruptures = merged.loc[merged[\"rupture_flag\"] == True].copy()\n",
"\n",
"# --- 3. Number of ruptures in each period, in a fixed order (a missing period becomes 0) ---\n",
"rupture_counts = (\n",
"    ruptures[\"period2\"]\n",
"    .value_counts()\n",
"    .reindex([\"Before Sep 2021\", \"After Sep 2021\"])\n",
"    .fillna(0)\n",
")\n",
"\n",
"# --- 4. Donut chart showing both the share and the raw count of each period ---\n",
"fig = go.Figure(\n",
"    data=[\n",
"        go.Pie(\n",
"            labels=rupture_counts.index,\n",
"            values=rupture_counts.values,\n",
"            textinfo=\"percent+value\",\n",
"            marker_colors=[\"#2ECC71\", \"#E74C3C\"],\n",
"            hole=0.45,\n",
"        )\n",
"    ]\n",
")\n",
"\n",
"fig.update_layout(title=\"Répartition des ruptures\")\n",
"\n",
"fig.show()\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "ecccd73c-00a6-4ff3-b213-e85b98ec5a55",
"metadata": {},
"outputs": [],
"source": [
"import pandas as pd\n",
"import numpy as np\n",
"\n",
"# 1. Filtre sur la période post-Sept 2021\n",
"cutoff = pd.Timestamp(\"2021-09-01\")\n",
"post = merged[merged[\"Centralisation Date\"] >= cutoff].copy()\n",
"\n",
"# 2. On ne garde que les ruptures\n",
"post_rupt = post[post[\"rupture_flag\"] == True].copy()\n",
"\n",
"# 3. Gap absolu + gap relatif (% du stock)\n",
"post_rupt[\"gap_abs\"] = post_rupt[\"gap\"].abs()\n",
"post_rupt[\"gap_rel\"] = post_rupt[\"gap_abs\"] / post_rupt[\"Quantity - AUM\"].replace(0, np.nan)\n",
"\n",
"# 4. Percentiles globaux\n",
"p90 = post_rupt[\"gap_abs\"].quantile(0.90)\n",
"p95 = post_rupt[\"gap_abs\"].quantile(0.95)\n",
"p99 = post_rupt[\"gap_abs\"].quantile(0.99)\n",
"\n",
"# 5. Classification automatique\n",
"def classify_gap(gap, gap_rel, acct):\n",
" # RESET → énorme choc (technique)\n",
" if gap_abs >= p99 or gap_rel >= 0.90:\n",
" return \"reset\"\n",
"\n",
" # SPIKE → très gros gap mais isolé\n",
" if gap_abs >= p95:\n",
" return \"spike\"\n",
"\n",
" # SHIFT → décalage permanent\n",
" # Test : moyenne des gaps du compte\n",
" return None\n",
"\n",
"# Calcul du shift (décalage directionnel)\n",
"shift_info = post_rupt.groupby(\"Registrar Account - ID\")[\"gap\"].mean().rename(\"avg_gap\")\n",
"\n",
"post_rupt = post_rupt.merge(shift_info, on=\"Registrar Account - ID\", how=\"left\")\n",
"\n",
"post_rupt[\"gap_type2\"] = np.where(\n",
" post_rupt[\"gap_abs\"] >= p99, \"reset\",\n",
" np.where(post_rupt[\"gap_abs\"] >= p95, \"spike\",\n",
" np.where(post_rupt[\"avg_gap\"].abs() > post_rupt[\"gap_abs\"].median(), \"shift\", \"micro\")))\n",
" \n",
"# 6. Statistiques globales\n",
"stats = post_rupt[\"gap_type2\"].value_counts(normalize=True).round(3) * 100\n",
"print(\"\\n=== DISTRIBUTION DES TYPES DE GAPS POST-2021 ===\")\n",
"print(stats)\n",
"\n",
"# 7. Stats par client\n",
"client_stats = (\n",
" post_rupt.groupby(\"Registrar Account - ID\")[\"gap_type2\"]\n",
" .value_counts(normalize=True)\n",
" .rename(\"ratio\")\n",
" .mul(100)\n",
" .reset_index()\n",
")\n",
"\n",
"# 8. Stats par ISIN\n",
"isin_stats = (\n",
" post_rupt.groupby(\"Product - Isin\")[\"gap_type2\"]\n",
" .value_counts(normalize=True)\n",
" .rename(\"ratio\")\n",
" .mul(100)\n",
" .reset_index()\n",
")\n",
"\n",
"print(\"\\n=== TOP ISIN PAR RESET ===\")\n",
"print(isin_stats[isin_stats[\"gap_type2\"]==\"reset\"].sort_values(\"ratio\", ascending=False).head(10))\n",
"\n",
"print(\"\\n=== TOP CLIENTS PAR RESET ===\")\n",
"print(client_stats[client_stats[\"gap_type2\"]==\"reset\"].sort_values(\"ratio\", ascending=False).head(10))\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "c2efc5e0-bc35-4fa7-ab5d-6be616964446",
"metadata": {},
"outputs": [],
"source": [
"import plotly.graph_objects as go\n",
"\n",
"# --- Data from your output ---\n",
"labels = [\"Micro-ruptures\", \"Décalage\", \"Anomalies ponctuelles\", \"Remise à zéro\"]\n",
"values = [50.4, 44.6, 4.0, 1.0]\n",
"\n",
"# --- Pie chart ---\n",
"fig = go.Figure(\n",
" data=[go.Pie(\n",
" labels=labels,\n",
" values=values,\n",
" hole=0.35, # donut style (plus lisible)\n",
" textinfo='percent',\n",
" marker=dict(colors=[\"#3498DB\", \"#E67E22\", \"#9B59B6\", \"#E74C3C\"])\n",
" )]\n",
")\n",
"\n",
"fig.update_layout(\n",
" title=\"Typologie des ruptures depuis Septembre 2021\",\n",
" legend_title=\"Type de gap\",\n",
")\n",
"\n",
"fig.show()\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "744e04b6-3f34-40c9-95fe-a5605e7c7f02",
"metadata": {},
"outputs": [],
"source": [
"merged[\"gap_abs\"] = merged[\"gap\"].abs()\n",
"\n",
"merged[\"gap_rel\"] = (\n",
" merged[\"gap_abs\"] /\n",
" merged[\"Quantity - AUM\"].replace(0, np.nan)\n",
")\n",
"\n",
"merged.loc[merged[\"rupture_flag\"], \"gap_rel\"].describe(\n",
" percentiles=[0.5, 0.75, 0.9, 0.95, 0.99]\n",
")\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "3d20625e-1045-4b7a-ab64-3381997e4131",
"metadata": {},
"outputs": [],
"source": [
"# uniquement sur les ruptures\n",
"df_r = merged[merged[\"rupture_flag\"]].copy()\n",
"\n",
"# seuils globaux (descriptifs, pas \"optimisés\")\n",
"q90 = df_r[\"gap_abs\"].quantile(0.90)\n",
"q99 = df_r[\"gap_abs\"].quantile(0.99)\n",
"\n",
"# moyenne directionnelle par compte\n",
"avg_gap_by_account = (\n",
" df_r.groupby(\"Registrar Account - ID\")[\"gap\"]\n",
" .mean()\n",
" .rename(\"avg_gap\")\n",
")\n",
"\n",
"df_r = df_r.merge(avg_gap_by_account, on=\"Registrar Account - ID\", how=\"left\")\n",
"\n",
"def classify_gap(row):\n",
" if row[\"gap_abs\"] >= q99:\n",
" return \"reset\"\n",
" if row[\"gap_abs\"] >= q90:\n",
" return \"spike\"\n",
" if abs(row[\"avg_gap\"]) > row[\"gap_abs\"]:\n",
" return \"shift\"\n",
" return \"micro\"\n",
"\n",
"df_r[\"discontinuity_type\"] = df_r.apply(classify_gap, axis=1)\n",
"df_r[\"discontinuity_type\"].value_counts(normalize=True) * 100\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "02806629-e454-4e10-82be-6e2239091088",
"metadata": {},
"outputs": [],
"source": [
"merged[\"year\"] = merged[\"Centralisation Date\"].dt.year\n",
"\n",
"yearly_stats = merged.groupby(\"year\").agg(\n",
" total_obs=(\"gap\", \"count\"),\n",
" ruptures=(\"rupture_flag\", \"sum\")\n",
").reset_index()\n",
"\n",
"yearly_stats[\"rupture_rate\"] = (\n",
" yearly_stats[\"ruptures\"] / yearly_stats[\"total_obs\"]\n",
")\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "2edf2c55-45e7-4aad-b4f9-5c35178abad6",
"metadata": {},
"outputs": [],
"source": [
"import matplotlib.pyplot as plt\n",
"import numpy as np\n",
"\n",
"df_r = merged[merged[\"rupture_flag\"]].copy()\n",
"\n",
"plt.figure(figsize=(12,4))\n",
"plt.hist(df_r[\"gap_abs\"], bins=100, log=True)\n",
"plt.title(\"Distribution of absolute gaps (log scale)\")\n",
"plt.xlabel(\"Absolute gap\")\n",
"plt.ylabel(\"Frequency (log)\")\n",
"plt.show()\n",
"\n",
"plt.figure(figsize=(12,4))\n",
"plt.hist(df_r[\"gap_rel\"].dropna(), bins=100, log=True)\n",
"plt.title(\"Distribution of relative gaps (|gap| / AUM)\")\n",
"plt.xlabel(\"Relative gap\")\n",
"plt.ylabel(\"Frequency (log)\")\n",
"plt.show()\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "981f2ec6-574b-41ea-b4bf-45be54aeda1f",
"metadata": {},
"outputs": [],
"source": [
"plt.figure(figsize=(10,4))\n",
"plt.plot(yearly_stats[\"year\"], yearly_stats[\"rupture_rate\"], marker=\"o\")\n",
"plt.title(\"Evolution of AUM– Flow inconsistency rate over time\")\n",
"plt.xlabel(\"Year\")\n",
"plt.ylabel(\"Rupture rate\")\n",
"plt.grid(True)\n",
"plt.show()\n"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.13.11"
}
},
"nbformat": 4,
"nbformat_minor": 5
}