Project_Carmignac/brouillon/analyse_rupture.ipynb

4473 lines
317 KiB
Plaintext
Raw Normal View History

2026-04-05 17:52:42 +02:00
{
"cells": [
{
"cell_type": "code",
"execution_count": 51,
"id": "338730e2-a6de-4d4f-b438-efe3feb139ab",
"metadata": {},
"outputs": [],
"source": [
"import pandas as pd\n",
"import numpy as np\n",
"import plotly.graph_objects as go\n",
"import matplotlib.pyplot as plt"
]
},
{
"cell_type": "code",
"execution_count": 52,
"id": "cfd11919-0941-400e-a516-72871881f733",
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"/tmp/ipykernel_1311/1940519970.py:1: DtypeWarning:\n",
"\n",
"Columns (1,2,3,4) have mixed types. Specify dtype option on import or set low_memory=False.\n",
"\n",
"/tmp/ipykernel_1311/1940519970.py:2: DtypeWarning:\n",
"\n",
"Columns (1,2,3,4) have mixed types. Specify dtype option on import or set low_memory=False.\n",
"\n"
]
}
],
"source": [
"stocks=pd.read_csv('stocks.csv')\n",
"flows = pd.read_csv('flows.csv')"
]
},
{
"cell_type": "code",
"execution_count": 53,
"id": "b99e3402-fe26-4f4e-8c1c-5f07847bce94",
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"/tmp/ipykernel_1311/3613746644.py:1: DtypeWarning:\n",
"\n",
"Columns (1) have mixed types. Specify dtype option on import or set low_memory=False.\n",
"\n"
]
}
],
"source": [
"merged = pd.read_csv('merged.csv')"
]
},
{
"cell_type": "code",
"execution_count": 54,
"id": "34e5a815-7269-4312-bfe6-e2cd12595e57",
"metadata": {},
"outputs": [],
"source": [
"# 1. Prepare stock dataset ISIN-by-ISIN\n",
"stocks_isin = stocks[[\n",
" \"Registrar Account - ID\",\n",
" \"Product - Isin\",\n",
" \"Centralisation Date\",\n",
" \"Quantity - AUM\"\n",
"]].copy()\n",
"\n",
"stocks_isin[\"Centralisation Date\"] = pd.to_datetime(stocks_isin[\"Centralisation Date\"])\n",
"\n",
"stocks_isin = stocks_isin.sort_values(\n",
" [\"Registrar Account - ID\", \"Product - Isin\", \"Centralisation Date\"]\n",
")\n",
"\n",
"# 2. Prepare flows dataset ISIN-by-ISIN\n",
"flows_isin = flows[[\n",
" \"Registrar Account - ID\",\n",
" \"Product - Isin\",\n",
" \"Centralisation Date\",\n",
" \"Quantity - NetFlows\"\n",
"]].copy()\n",
"\n",
"flows_isin[\"Centralisation Date\"] = pd.to_datetime(flows_isin[\"Centralisation Date\"])\n",
"\n",
"flows_isin = (\n",
" flows_isin\n",
" .groupby(\n",
" [\"Registrar Account - ID\", \"Product - Isin\", \"Centralisation Date\"]\n",
" )[\"Quantity - NetFlows\"]\n",
" .sum()\n",
" .reset_index()\n",
")\n",
"\n",
"# 3. Merge stocks & flows ISIN-by-ISIN\n",
"merged_isin = stocks_isin.merge(\n",
" flows_isin,\n",
" on=[\"Registrar Account - ID\", \"Product - Isin\", \"Centralisation Date\"],\n",
" how=\"left\"\n",
")\n",
"\n",
"merged_isin[\"Quantity - NetFlows\"] = merged_isin[\"Quantity - NetFlows\"].fillna(0)\n",
"\n",
"# 4. Compute expected stock per ISIN for each account\n",
"merged_isin[\"prev_stock\"] = (\n",
" merged_isin\n",
" .groupby([\"Registrar Account - ID\", \"Product - Isin\"])[\"Quantity - AUM\"]\n",
" .shift(1)\n",
")\n",
"\n",
"merged_isin[\"prev_netflows\"] = (\n",
" merged_isin\n",
" .groupby([\"Registrar Account - ID\", \"Product - Isin\"])[\"Quantity - NetFlows\"]\n",
" .shift(1)\n",
" .fillna(0)\n",
")\n",
"\n",
"merged_isin[\"expected_stock\"] = (\n",
" merged_isin[\"prev_stock\"] + merged_isin[\"prev_netflows\"]\n",
")\n",
"\n",
"# 5. Detect ruptures ISIN-by-ISIN (no aggregation)\n",
"TOL = 1e-6\n",
"\n",
"merged_isin[\"gap\"] = (\n",
" merged_isin[\"Quantity - AUM\"] - merged_isin[\"expected_stock\"]\n",
")\n",
"\n",
"merged_isin[\"rupture_flag\"] = (\n",
" merged_isin[\"prev_stock\"].notna()\n",
" & (merged_isin[\"gap\"].abs() > TOL)\n",
")\n",
"\n",
"# 6. Summarize ruptures per (Account, ISIN)\n",
"rupture_isin_summary = (\n",
" merged_isin\n",
" .groupby([\"Registrar Account - ID\", \"Product - Isin\"])\n",
" .agg(\n",
" n_ruptures=(\"rupture_flag\", \"sum\"),\n",
" obs=(\"rupture_flag\", \"count\"),\n",
" rupture_ratio=(\"rupture_flag\", \"mean\"),\n",
" max_gap=(\"gap\", lambda x: x.abs().max())\n",
" )\n",
" .reset_index()\n",
")\n",
"\n",
"# Sort by worst ISIN trajectories\n",
"rupture_isin_summary = rupture_isin_summary.sort_values(\n",
" \"rupture_ratio\",\n",
" ascending=False\n",
")"
]
},
{
"cell_type": "markdown",
"id": "16213cb2-07d8-4e82-b9bb-252554ec47b9",
"metadata": {},
"source": [
"# Détection des ruptures"
]
},
{
"cell_type": "code",
"execution_count": 55,
"id": "78c3db70-e0b6-4de2-92ca-e29cf5bf6bd1",
"metadata": {},
"outputs": [],
"source": [
"# ============================================================\n",
"# AUMFLOW CONSISTENCY & RUPTURE DETECTION (FINAL VERSION)\n",
"# ============================================================\n",
"# ------------------------------------------------------------\n",
"# 1. Keep relevant columns\n",
"# ------------------------------------------------------------\n",
"stocks_clean = stocks[[\n",
" \"Registrar Account - ID\",\n",
" \"Product - Isin\",\n",
" \"Centralisation Date\",\n",
" \"Quantity - AUM\"\n",
"]].copy()\n",
"\n",
"flows_clean = flows[[\n",
" \"Registrar Account - ID\",\n",
" \"Product - Isin\",\n",
" \"Centralisation Date\",\n",
" \"Quantity - NetFlows\"\n",
"]].copy()\n",
"\n",
"\n",
"# ------------------------------------------------------------\n",
"# 2. Date formatting\n",
"# ------------------------------------------------------------\n",
"stocks_clean[\"Centralisation Date\"] = pd.to_datetime(stocks_clean[\"Centralisation Date\"])\n",
"flows_clean[\"Centralisation Date\"] = pd.to_datetime(flows_clean[\"Centralisation Date\"])\n",
"\n",
"\n",
"# ------------------------------------------------------------\n",
"# 3. Aggregate flows per day\n",
"# ------------------------------------------------------------\n",
"flows_clean = (\n",
" flows_clean\n",
" .groupby(\n",
" [\"Registrar Account - ID\", \"Product - Isin\", \"Centralisation Date\"],\n",
" as_index=False\n",
" )[\"Quantity - NetFlows\"]\n",
" .sum()\n",
")\n",
"\n",
"# ------------------------------------------------------------\n",
"# 4. Merge stocks and flows\n",
"# ------------------------------------------------------------\n",
"df = stocks_clean.merge(\n",
" flows_clean,\n",
" on=[\"Registrar Account - ID\", \"Product - Isin\", \"Centralisation Date\"],\n",
" how=\"left\"\n",
")\n",
"\n",
"df[\"Quantity - NetFlows\"] = df[\"Quantity - NetFlows\"].fillna(0)\n",
"\n",
"\n",
"# ------------------------------------------------------------\n",
"# 5. Sort and compute expected stock\n",
"# ------------------------------------------------------------\n",
"df = df.sort_values(\n",
" [\"Registrar Account - ID\", \"Product - Isin\", \"Centralisation Date\"]\n",
")\n",
"\n",
"df[\"prev_stock\"] = df.groupby(\n",
" [\"Registrar Account - ID\", \"Product - Isin\"]\n",
")[\"Quantity - AUM\"].shift(1)\n",
"\n",
"df[\"prev_flows\"] = df.groupby(\n",
" [\"Registrar Account - ID\", \"Product - Isin\"]\n",
")[\"Quantity - NetFlows\"].shift(1).fillna(0)\n",
"\n",
"df[\"expected_stock\"] = df[\"prev_stock\"] + df[\"prev_flows\"]\n",
"\n",
"\n",
"# ------------------------------------------------------------\n",
"# 6. Compute gaps\n",
"# ------------------------------------------------------------\n",
"df[\"gap\"] = df[\"Quantity - AUM\"] - df[\"expected_stock\"]\n",
"df[\"gap_abs\"] = df[\"gap\"].abs()\n",
"df[\"gap_rel\"] = df[\"gap_abs\"] / df[\"expected_stock\"].abs().clip(lower=1)\n",
"\n",
"\n",
"# ------------------------------------------------------------\n",
"# 7. Detect ruptures (economic rule)\n",
"# ------------------------------------------------------------\n",
"TAU_ABS = 10.0 # minimum absolute gap (shares)\n",
"TAU_REL = 0.005 # minimum relative gap (0.5%)\n",
"\n",
"df[\"rupture_flag\"] = (\n",
" df[\"prev_stock\"].notna()\n",
" & (df[\"gap_abs\"] > TAU_ABS)\n",
" & (df[\"gap_rel\"] > TAU_REL)\n",
")\n",
"\n",
"\n",
"# ------------------------------------------------------------\n",
"# 8. Remove end-of-sample false positives (edge effects)\n",
"# ------------------------------------------------------------\n",
"last_date = df[\"Centralisation Date\"].max()\n",
"\n",
"df[\"rupture_flag\"] = np.where(\n",
" (df[\"rupture_flag\"]) & (df[\"Centralisation Date\"] == last_date),\n",
" False,\n",
" df[\"rupture_flag\"]\n",
")\n"
]
},
{
"cell_type": "code",
"execution_count": 56,
"id": "a9783dc1-e225-4142-8b6f-6f9e620b4b3d",
"metadata": {},
"outputs": [],
"source": [
"# ------------------------------------------------------------\n",
"# 9. ISIN-level summary (AFTER CLEANING)\n",
"# ------------------------------------------------------------\n",
"rupture_isin_summary = (\n",
" df.groupby([\"Registrar Account - ID\", \"Product - Isin\"])\n",
" .agg(\n",
" n_ruptures=(\"rupture_flag\", \"sum\"),\n",
" total_obs=(\"rupture_flag\", \"count\"),\n",
" rupture_ratio=(\"rupture_flag\", \"mean\"),\n",
" max_gap=(\"gap_abs\", \"max\")\n",
" )\n",
" .reset_index()\n",
")\n",
"\n",
"\n",
"# ------------------------------------------------------------\n",
"# 10. Account-level summary (AFTER CLEANING)\n",
"# ------------------------------------------------------------\n",
"rupture_summary = (\n",
" df.groupby(\"Registrar Account - ID\")\n",
" .agg(\n",
" n_ruptures=(\"rupture_flag\", \"sum\"),\n",
" total_obs=(\"rupture_flag\", \"count\"),\n",
" rupture_ratio=(\"rupture_flag\", \"mean\"),\n",
" max_gap=(\"gap_abs\", \"max\")\n",
" )\n",
" .reset_index()\n",
")\n",
"\n",
"\n",
"# ------------------------------------------------------------\n",
"# 11. Outputs\n",
"# ------------------------------------------------------------\n",
"df.to_csv(\"aum_flow_gaps.csv\", index=False)\n",
"rupture_isin_summary.to_csv(\"rupture_isin_summary.csv\", index=False)\n",
"rupture_summary.to_csv(\"rupture_summary.csv\", index=False)"
]
},
{
"cell_type": "code",
"execution_count": 57,
"id": "f5b62558-c27a-4428-a193-8b97e0ce6b6a",
"metadata": {},
"outputs": [
{
"data": {
"application/vnd.plotly.v1+json": {
"config": {
"plotlyServerURL": "https://plot.ly"
},
"data": [
{
"hole": 0.45,
"hoverinfo": "label+percent",
"labels": [
"Clean / quasi-clean (≤1%)",
"Moderate (110%)",
"High (1030%)",
"Severe (>30%)"
],
"textinfo": "percent",
"type": "pie",
"values": {
"bdata": "AAAAAACASEAAAAAAAIBBQAAAAAAAAChAZmZmZmZmEEA=",
"dtype": "f8"
}
}
],
"layout": {
"legend": {
"orientation": "h",
"title": {
"text": "Rupture ratio"
},
"x": 0.5,
"xanchor": "center",
"y": -0.15,
"yanchor": "top"
},
"template": {
"data": {
"bar": [
{
"error_x": {
"color": "#2a3f5f"
},
"error_y": {
"color": "#2a3f5f"
},
"marker": {
"line": {
"color": "#E5ECF6",
"width": 0.5
},
"pattern": {
"fillmode": "overlay",
"size": 10,
"solidity": 0.2
}
},
"type": "bar"
}
],
"barpolar": [
{
"marker": {
"line": {
"color": "#E5ECF6",
"width": 0.5
},
"pattern": {
"fillmode": "overlay",
"size": 10,
"solidity": 0.2
}
},
"type": "barpolar"
}
],
"carpet": [
{
"aaxis": {
"endlinecolor": "#2a3f5f",
"gridcolor": "white",
"linecolor": "white",
"minorgridcolor": "white",
"startlinecolor": "#2a3f5f"
},
"baxis": {
"endlinecolor": "#2a3f5f",
"gridcolor": "white",
"linecolor": "white",
"minorgridcolor": "white",
"startlinecolor": "#2a3f5f"
},
"type": "carpet"
}
],
"choropleth": [
{
"colorbar": {
"outlinewidth": 0,
"ticks": ""
},
"type": "choropleth"
}
],
"contour": [
{
"colorbar": {
"outlinewidth": 0,
"ticks": ""
},
"colorscale": [
[
0,
"#0d0887"
],
[
0.1111111111111111,
"#46039f"
],
[
0.2222222222222222,
"#7201a8"
],
[
0.3333333333333333,
"#9c179e"
],
[
0.4444444444444444,
"#bd3786"
],
[
0.5555555555555556,
"#d8576b"
],
[
0.6666666666666666,
"#ed7953"
],
[
0.7777777777777778,
"#fb9f3a"
],
[
0.8888888888888888,
"#fdca26"
],
[
1,
"#f0f921"
]
],
"type": "contour"
}
],
"contourcarpet": [
{
"colorbar": {
"outlinewidth": 0,
"ticks": ""
},
"type": "contourcarpet"
}
],
"heatmap": [
{
"colorbar": {
"outlinewidth": 0,
"ticks": ""
},
"colorscale": [
[
0,
"#0d0887"
],
[
0.1111111111111111,
"#46039f"
],
[
0.2222222222222222,
"#7201a8"
],
[
0.3333333333333333,
"#9c179e"
],
[
0.4444444444444444,
"#bd3786"
],
[
0.5555555555555556,
"#d8576b"
],
[
0.6666666666666666,
"#ed7953"
],
[
0.7777777777777778,
"#fb9f3a"
],
[
0.8888888888888888,
"#fdca26"
],
[
1,
"#f0f921"
]
],
"type": "heatmap"
}
],
"histogram": [
{
"marker": {
"pattern": {
"fillmode": "overlay",
"size": 10,
"solidity": 0.2
}
},
"type": "histogram"
}
],
"histogram2d": [
{
"colorbar": {
"outlinewidth": 0,
"ticks": ""
},
"colorscale": [
[
0,
"#0d0887"
],
[
0.1111111111111111,
"#46039f"
],
[
0.2222222222222222,
"#7201a8"
],
[
0.3333333333333333,
"#9c179e"
],
[
0.4444444444444444,
"#bd3786"
],
[
0.5555555555555556,
"#d8576b"
],
[
0.6666666666666666,
"#ed7953"
],
[
0.7777777777777778,
"#fb9f3a"
],
[
0.8888888888888888,
"#fdca26"
],
[
1,
"#f0f921"
]
],
"type": "histogram2d"
}
],
"histogram2dcontour": [
{
"colorbar": {
"outlinewidth": 0,
"ticks": ""
},
"colorscale": [
[
0,
"#0d0887"
],
[
0.1111111111111111,
"#46039f"
],
[
0.2222222222222222,
"#7201a8"
],
[
0.3333333333333333,
"#9c179e"
],
[
0.4444444444444444,
"#bd3786"
],
[
0.5555555555555556,
"#d8576b"
],
[
0.6666666666666666,
"#ed7953"
],
[
0.7777777777777778,
"#fb9f3a"
],
[
0.8888888888888888,
"#fdca26"
],
[
1,
"#f0f921"
]
],
"type": "histogram2dcontour"
}
],
"mesh3d": [
{
"colorbar": {
"outlinewidth": 0,
"ticks": ""
},
"type": "mesh3d"
}
],
"parcoords": [
{
"line": {
"colorbar": {
"outlinewidth": 0,
"ticks": ""
}
},
"type": "parcoords"
}
],
"pie": [
{
"automargin": true,
"type": "pie"
}
],
"scatter": [
{
"fillpattern": {
"fillmode": "overlay",
"size": 10,
"solidity": 0.2
},
"type": "scatter"
}
],
"scatter3d": [
{
"line": {
"colorbar": {
"outlinewidth": 0,
"ticks": ""
}
},
"marker": {
"colorbar": {
"outlinewidth": 0,
"ticks": ""
}
},
"type": "scatter3d"
}
],
"scattercarpet": [
{
"marker": {
"colorbar": {
"outlinewidth": 0,
"ticks": ""
}
},
"type": "scattercarpet"
}
],
"scattergeo": [
{
"marker": {
"colorbar": {
"outlinewidth": 0,
"ticks": ""
}
},
"type": "scattergeo"
}
],
"scattergl": [
{
"marker": {
"colorbar": {
"outlinewidth": 0,
"ticks": ""
}
},
"type": "scattergl"
}
],
"scattermap": [
{
"marker": {
"colorbar": {
"outlinewidth": 0,
"ticks": ""
}
},
"type": "scattermap"
}
],
"scattermapbox": [
{
"marker": {
"colorbar": {
"outlinewidth": 0,
"ticks": ""
}
},
"type": "scattermapbox"
}
],
"scatterpolar": [
{
"marker": {
"colorbar": {
"outlinewidth": 0,
"ticks": ""
}
},
"type": "scatterpolar"
}
],
"scatterpolargl": [
{
"marker": {
"colorbar": {
"outlinewidth": 0,
"ticks": ""
}
},
"type": "scatterpolargl"
}
],
"scatterternary": [
{
"marker": {
"colorbar": {
"outlinewidth": 0,
"ticks": ""
}
},
"type": "scatterternary"
}
],
"surface": [
{
"colorbar": {
"outlinewidth": 0,
"ticks": ""
},
"colorscale": [
[
0,
"#0d0887"
],
[
0.1111111111111111,
"#46039f"
],
[
0.2222222222222222,
"#7201a8"
],
[
0.3333333333333333,
"#9c179e"
],
[
0.4444444444444444,
"#bd3786"
],
[
0.5555555555555556,
"#d8576b"
],
[
0.6666666666666666,
"#ed7953"
],
[
0.7777777777777778,
"#fb9f3a"
],
[
0.8888888888888888,
"#fdca26"
],
[
1,
"#f0f921"
]
],
"type": "surface"
}
],
"table": [
{
"cells": {
"fill": {
"color": "#EBF0F8"
},
"line": {
"color": "white"
}
},
"header": {
"fill": {
"color": "#C8D4E3"
},
"line": {
"color": "white"
}
},
"type": "table"
}
]
},
"layout": {
"annotationdefaults": {
"arrowcolor": "#2a3f5f",
"arrowhead": 0,
"arrowwidth": 1
},
"autotypenumbers": "strict",
"coloraxis": {
"colorbar": {
"outlinewidth": 0,
"ticks": ""
}
},
"colorscale": {
"diverging": [
[
0,
"#8e0152"
],
[
0.1,
"#c51b7d"
],
[
0.2,
"#de77ae"
],
[
0.3,
"#f1b6da"
],
[
0.4,
"#fde0ef"
],
[
0.5,
"#f7f7f7"
],
[
0.6,
"#e6f5d0"
],
[
0.7,
"#b8e186"
],
[
0.8,
"#7fbc41"
],
[
0.9,
"#4d9221"
],
[
1,
"#276419"
]
],
"sequential": [
[
0,
"#0d0887"
],
[
0.1111111111111111,
"#46039f"
],
[
0.2222222222222222,
"#7201a8"
],
[
0.3333333333333333,
"#9c179e"
],
[
0.4444444444444444,
"#bd3786"
],
[
0.5555555555555556,
"#d8576b"
],
[
0.6666666666666666,
"#ed7953"
],
[
0.7777777777777778,
"#fb9f3a"
],
[
0.8888888888888888,
"#fdca26"
],
[
1,
"#f0f921"
]
],
"sequentialminus": [
[
0,
"#0d0887"
],
[
0.1111111111111111,
"#46039f"
],
[
0.2222222222222222,
"#7201a8"
],
[
0.3333333333333333,
"#9c179e"
],
[
0.4444444444444444,
"#bd3786"
],
[
0.5555555555555556,
"#d8576b"
],
[
0.6666666666666666,
"#ed7953"
],
[
0.7777777777777778,
"#fb9f3a"
],
[
0.8888888888888888,
"#fdca26"
],
[
1,
"#f0f921"
]
]
},
"colorway": [
"#636efa",
"#EF553B",
"#00cc96",
"#ab63fa",
"#FFA15A",
"#19d3f3",
"#FF6692",
"#B6E880",
"#FF97FF",
"#FECB52"
],
"font": {
"color": "#2a3f5f"
},
"geo": {
"bgcolor": "white",
"lakecolor": "white",
"landcolor": "#E5ECF6",
"showlakes": true,
"showland": true,
"subunitcolor": "white"
},
"hoverlabel": {
"align": "left"
},
"hovermode": "closest",
"mapbox": {
"style": "light"
},
"paper_bgcolor": "white",
"plot_bgcolor": "#E5ECF6",
"polar": {
"angularaxis": {
"gridcolor": "white",
"linecolor": "white",
"ticks": ""
},
"bgcolor": "#E5ECF6",
"radialaxis": {
"gridcolor": "white",
"linecolor": "white",
"ticks": ""
}
},
"scene": {
"xaxis": {
"backgroundcolor": "#E5ECF6",
"gridcolor": "white",
"gridwidth": 2,
"linecolor": "white",
"showbackground": true,
"ticks": "",
"zerolinecolor": "white"
},
"yaxis": {
"backgroundcolor": "#E5ECF6",
"gridcolor": "white",
"gridwidth": 2,
"linecolor": "white",
"showbackground": true,
"ticks": "",
"zerolinecolor": "white"
},
"zaxis": {
"backgroundcolor": "#E5ECF6",
"gridcolor": "white",
"gridwidth": 2,
"linecolor": "white",
"showbackground": true,
"ticks": "",
"zerolinecolor": "white"
}
},
"shapedefaults": {
"line": {
"color": "#2a3f5f"
}
},
"ternary": {
"aaxis": {
"gridcolor": "white",
"linecolor": "white",
"ticks": ""
},
"baxis": {
"gridcolor": "white",
"linecolor": "white",
"ticks": ""
},
"bgcolor": "#E5ECF6",
"caxis": {
"gridcolor": "white",
"linecolor": "white",
"ticks": ""
}
},
"title": {
"x": 0.05
},
"xaxis": {
"automargin": true,
"gridcolor": "white",
"linecolor": "white",
"ticks": "",
"title": {
"standoff": 15
},
"zerolinecolor": "white",
"zerolinewidth": 2
},
"yaxis": {
"automargin": true,
"gridcolor": "white",
"linecolor": "white",
"ticks": "",
"title": {
"standoff": 15
},
"zerolinecolor": "white",
"zerolinewidth": 2
}
}
}
}
},
"image/png": "iVBORw0KGgoAAAANSUhEUgAAAxoAAAFoCAYAAADQNY2xAAAQAElEQVR4AezdB5xU1d3/8d+UBZalCCigYAF779gVe3lsUSGxRRProz722BOJGiyJJhpji0nMo8a/GhM1Ro2aaB5jNxgL9oZSBAFBlt1ld2fmP98Ld7077OzO7LRbPrw4O3fuPefcc97nzsz93TITz/APAQQQQAABBBBAAAEEECizQNz4hwACPhOgOQgggAACCCCAQPAFCDSCP4b0AAEEEECg0gLUjwACCCBQtACBRtFkFEAAAQQQQAABBBCotQDr978AgYb/x4gWIoAAAggggAACCCAQOAECjcANWakNpjwCCCCAAAIIIIAAApUXINCovDFrQAABBLoXYCkCCCCAAAIhFCDQCOGg0iUEEEAAAQQQKE2A0gggULoAgUbphtSAAAIIIIAAAggggAACOQJlDjRyaucpAggggAACCCCAAAIIRFKAQCOSw06nIyVAZxFAAAEEEEAAgRoIEGjUAJ1VIoAAAghEW4DeI4AAAlEQINCIwijTRwQQQAABBBBAAIHuBFhWAQECjQqgUiUCCCCAAAIIIIAAAlEXINCI+hZQav8pjwACCCCAAAIIIIBAFwIEGl2gMAsBBBAIsgBtRwABBBBAwA8CBBp+GAXagAACCCCAAAJhFqBvCERSgEAjksNOpxFAAAEEEEAAAQQQqKyAvwONyvad2hFAAAEEEEAAAQQQQKBCAgQaFYKlWgTCKkC/EEAAAQQQQACBQgQINApRIg8CCCCAAAL+FaBlCCCAgC8FCDR8OSw0CgEEEEAAAQQQQCC4ArRcAgQaUiAhgAACCCCAAAIIIIBAWQUINMrKSWWlClAeAQQQQAABBBBAIBwCBBrhGEd6gQACCFRKgHoRQAABBBDolQCBRq/YKIQAAggggAACCNRKgPUiEAwBAo1gjBOtRAABBBBAAAEEEEAgUAKRCjQCNTI0FgEEEEAAAQQQQACBAAsQaAR48Gg6AiEQoAsIIIAAAgggEFIBAo2QDizdQgABBBBAoHcClEIAAQTKI0CgUR5HakEAAQQQQAABBBBAoDICAa2VQCOgA0ezEUAAAQQQQAABBBDwswCBhp9Hh7aVKkB5BBBAAAEEEEAAgRoJEGjUCJ7VIoAAAtEUoNcIIIAAAlERINCIykjTTwQQQAABBBBAoCsB5iFQIQECjQrBUi0CCCCAAAIIIIAAAlEWINDo/ehTEgEEEEAAAQQQQAABBPIIEGjkgWE2AggEUYA2I4AAAggggIBfBAg0/DIStAMBBBBAAIEwCtAnBBCIrACBRmSHno4jgAACCCCAAAIIRFGgWn0m0KiWNOtBAAEEEEAAAQQQQCBCAgQaERpsulqqAOURQAABBBBAAAEEChUg0ChUinwIIIAAAv4ToEUIIIAAAr4VINDw7dDQMAQQQAABBBBAIHgCtBgBV4BAw5XgEQEEEEAAAQQQQAABBMomQKBRNspSK6I8AggggAACCCCAAALhESDQCM9Y0hMEECi3APUhgAACCCCAQK8FCDR6TUdBBBBAAAEEEKi2AOtDAIHgCBBoBGesaCkCCCCAAAIIIIAAAn4TyNseAo28NCxAAAEEEEAAAQQQQACB3goQaPRWjnIIlCpAeQQQQAABBBBAIMQCBBohHly6hgACCCBQnAC5EUAAAQTKJ0CgUT5LakIAAQQQQAABBBAorwC1BViAQCPAg0fTEUAAAQQQQAABBBDwqwCBhl9HptR2UR4BBBBAAAEEEEAAgRoKEGjUEJ9VI4BAtAToLQIIIIAAAlESINCI0mjTVwQQQAABBBDwCjCNAAIVFCDQqCAuVSOAAAIIIIAAAgggEFWB3gUaUdWi3wgggAACCCCAAAIIIFCQAIFGQUxkQsD/ArQQAQQQQAABBBDwkwCBhp9Gg7YggAACCIRJgL4ggAACkRYg0Ij08NN5BBBAAAEEEEAgSgL0tZoCBBrV1GZdCCCAAAIIIIAAAghERIBAIyIDXWo3KY8AAggggAACCCCAQDECBBrFaJEXAQQQ8I8ALUEAAQQQQMDXAgQavh4eGocAAggggAACwRGgpQgg4BUg0PBqMI0AAggggAACCCCAAAJlEfBFoFGWnlAJAggggAACCCCAAAII+EaAQMM3Q0FDEPCVAI1BAAEEEEAAAQRKEiDQKImPwggggAACCFRLgPUggAACwRIg0AjWeNFaBBBAAAEEEEAAAb8I0I5uBQg0uuVhIQIIIIAAAggggAACCPRGgECjN2qUKVWA8ggggAACCCCAAAIhFyDQCPkA0z0EEECgMAFyIYAAAgggUF4BAo3yelIbAggggAACCCBQHgFqQSDgAgQaAR9Amo8AAggggAACCCCAgB8Fwhho+NGZNiGAAAIIIIAAAgggECkBAo1IDTedRaBWAqwXAQQQQAABBKImQKARtRGnvwgggAACCEiAhAACCFRYgECjwsBUjwACCCCAAAIIIIBAIQJhy0OgEbYRpT8IIIAAAggggAACCPhAgEDDB4NAE0oVoDwCCCCAAAIIIICA3wQINPw2IrQHAQQQCIMAfUAAAQQQiLwAgUbkNwEAEEAAAQQQQCAKAvQRgWoLEGhUW5z1IYAAAggggAACCCAQAQECjR4HmQwIIIAAAggggAACCCBQrACBRrFi5EcAgdoL0AIEEEAAAQQQ8L0AgYbvh4gGIoAAAggg4H8BWogAAgjkChBo5IrwHAEEEEAAAQQQQACB4AvUvAcEGjUfAhqAAAIIIIAAAggggED4BAg0wjem9KhUAcojgAACCCCAAAIIlCxAoFEyIRUggAACCFRagPoRQAABBIInQKARvDGjxQgggAACCCCAQK0FWD8CPQoQaPRIRAYEEEAAAQQQQAABBBAoVoBAo1ixUvNTHgEEEEAAAQQQQACBCAgQaERgkOkiAgh0L8BSBBBAAAEEECi/AIFG+U2pEQEEEEAAAQRKE6A0AgiEQIBAIwSDSBcQQAABBBBAAAEEEKisQPG1E2gUb0YJBBBAAAEEEEAAAQQQ6EGAQKMHIBYjUKoA5RFAAAEEEEAAgSgKEGhEcdTpMwIIIJAVSM+Zae2vv2xtz//dWp940FoevNOa777Zmm77qS3+xY+s8Sdn26KLT7KvzzzCFp5wgC2YuKMdf0abnXVxm110RZtd/rN2u/ZX7XbTb9vtjj+k7N4HU/bXJ9L2witpe/+jjM2bn12JP//TKgQQQACBKggQaFQBmVUggAACNRdobrL2N16xlgfusMVXnWcLj9/fvj5tohNMLP7FpdZ0+8+s5Q+32pKH7rbWpx7KBh//cIKQ1AdTLT3zM8ss/KqjC4sazeZ8aTbt84y9837GpryesX+9lLYnn07bn/+ast/clbJrbmi383/c5gQm501qs6uvb7fb70zZQ4+l7M23M9ba2lEdEwgggICZgRBGAQKNMI4qfUIAgWgLpFKW+uhdW/K3P1nTjVc4ZyQWHLOXNV5xlrXce7u1TXneMl8vqJrR/GyM8sHHGXvx1bT95fG0XX9ru516XptdcW273f9Qyt6YmrGWJVVrDitCAAEEEKiSAIFGlaArtRrqRQABBCSgwGHJo/dZ46Wn2oLDd7FFFx5vzb+5zlr/73HnjITy+CllMmaffpaxv/0jbTfc1m6nZQOPy65ZGngoKPFTW2kLAggggEDvBAg0eudGKQQQQCCfQPXmtzRnA4m/WePkc2zhiQdZ8x03WPs7r1dv/WVe02czlgYeuszq/Elt9seHUzZ9ZjYiKfN6qA4BBBBAoDoCBBrVcWYtCCCAQHkEUinn0qfF10+yBcfvb003Xm7t/3nJLJ0qT/0+qWXeV2aP/z1tk65utx9ObrNH/pa2ufN80jiaEUABmowAArUQINCohTrrRAABBIoUSL33pjXffm32zMWBzs3cbc89ZdYajRsbZs02e/DRlF1wWZv95Np251utiuQjOwIIIIBADQS6DTRq0B5WiQACCCDgEWh75VlbdN6xtuiH/21LnvizZRYt9CyN3uQnn2Wcb7W64Mdt9o9n09baFj0DeowAAggERYBAIygjRTsRWCrA34gItE9
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"# Base\n",
"rs = rupture_summary.copy()\n",
"\n",
"# Classes simplifiées\n",
"bins = [0, 0.01, 0.10, 0.30, 1.01]\n",
"labels = [\n",
" \"Clean / quasi-clean (≤1%)\",\n",
" \"Moderate (110%)\",\n",
" \"High (1030%)\",\n",
" \"Severe (>30%)\"\n",
"]\n",
"\n",
"rs[\"rupture_class\"] = pd.cut(\n",
" rs[\"rupture_ratio\"],\n",
" bins=bins,\n",
" labels=labels,\n",
" include_lowest=True\n",
")\n",
"\n",
"# Distribution en %\n",
"dist = (\n",
" rs[\"rupture_class\"]\n",
" .value_counts(normalize=True)\n",
" .sort_index()\n",
" * 100\n",
").round(1)\n",
"\n",
"# Donut chart\n",
"fig = go.Figure(\n",
" data=[go.Pie(\n",
" labels=dist.index,\n",
" values=dist.values,\n",
" hole=0.45,\n",
" textinfo=\"percent\",\n",
" hoverinfo=\"label+percent\"\n",
" )]\n",
")\n",
"\n",
"fig.update_layout(\n",
" legend=dict(\n",
" orientation=\"h\", # horizontale\n",
" yanchor=\"top\",\n",
" y=-0.15, # en dessous du graphe\n",
" xanchor=\"center\",\n",
" x=0.5\n",
" ),\n",
" legend_title_text=\"Rupture ratio\"\n",
")\n",
"\n",
"fig.show()\n"
]
},
{
"cell_type": "markdown",
"id": "e52cd650-df05-490d-af59-e66c058f955d",
"metadata": {},
"source": [
"## AUMFLOW CONSISTENCY & DISCONTINUITY DETECTION"
]
},
{
"cell_type": "code",
"execution_count": 58,
"id": "a7efe494-f5fa-43f8-8446-942fc2d3bd4c",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Detection threshold epsilon (trimmed 99th percentile): 40.03%\n"
]
}
],
"source": [
"# ------------------------------------------------------------\n",
"# 1. Keep relevant columns\n",
"# ------------------------------------------------------------\n",
"stocks_clean = stocks[\n",
" [\"Registrar Account - ID\", \"Product - Isin\", \"Centralisation Date\", \"Quantity - AUM\"]\n",
"].copy()\n",
"\n",
"flows_clean = flows[\n",
" [\"Registrar Account - ID\", \"Product - Isin\", \"Centralisation Date\", \"Quantity - NetFlows\"]\n",
"].copy()\n",
"\n",
"# ------------------------------------------------------------\n",
"# 2. Date formatting\n",
"# ------------------------------------------------------------\n",
"stocks_clean[\"Centralisation Date\"] = pd.to_datetime(stocks_clean[\"Centralisation Date\"])\n",
"flows_clean[\"Centralisation Date\"] = pd.to_datetime(flows_clean[\"Centralisation Date\"])\n",
"\n",
"# ------------------------------------------------------------\n",
"# 3. Aggregate flows per day\n",
"# ------------------------------------------------------------\n",
"flows_clean = (\n",
" flows_clean\n",
" .groupby(\n",
" [\"Registrar Account - ID\", \"Product - Isin\", \"Centralisation Date\"],\n",
" as_index=False\n",
" )[\"Quantity - NetFlows\"]\n",
" .sum()\n",
")\n",
"\n",
"# ------------------------------------------------------------\n",
"# 4. Merge stocks and flows\n",
"# ------------------------------------------------------------\n",
"df = stocks_clean.merge(\n",
" flows_clean,\n",
" on=[\"Registrar Account - ID\", \"Product - Isin\", \"Centralisation Date\"],\n",
" how=\"left\"\n",
")\n",
"\n",
"df[\"Quantity - NetFlows\"] = df[\"Quantity - NetFlows\"].fillna(0)\n",
"\n",
"# ------------------------------------------------------------\n",
"# 5. Sort and reconstruct expected stock\n",
"# ------------------------------------------------------------\n",
"df = df.sort_values(\n",
" [\"Registrar Account - ID\", \"Product - Isin\", \"Centralisation Date\"]\n",
")\n",
"\n",
"df[\"prev_stock\"] = (\n",
" df.groupby([\"Registrar Account - ID\", \"Product - Isin\"])\n",
" [\"Quantity - AUM\"]\n",
" .shift(1)\n",
")\n",
"\n",
"df[\"prev_flows\"] = (\n",
" df.groupby([\"Registrar Account - ID\", \"Product - Isin\"])\n",
" [\"Quantity - NetFlows\"]\n",
" .shift(1)\n",
" .fillna(0)\n",
")\n",
"\n",
"df[\"expected_stock\"] = df[\"prev_stock\"] + df[\"prev_flows\"]\n",
"\n",
"# ------------------------------------------------------------\n",
"# 6. Compute accounting gaps\n",
"# ------------------------------------------------------------\n",
"df[\"gap\"] = df[\"Quantity - AUM\"] - df[\"expected_stock\"]\n",
"df[\"gap_abs\"] = df[\"gap\"].abs()\n",
"\n",
"# Relative gap normalised by previous stock\n",
"df[\"gap_rel\"] = (\n",
" df[\"gap_abs\"] /\n",
" df[\"prev_stock\"].abs().replace(0, np.nan)\n",
")\n",
"\n",
"# ------------------------------------------------------------\n",
"# 7. Calibration sample (valid regime)\n",
"# ------------------------------------------------------------\n",
"valid_gaps = df.loc[\n",
" df[\"gap_rel\"].notna() & (df[\"prev_stock\"] > 0),\n",
" \"gap_rel\"\n",
"]\n",
"\n",
"# ------------------------------------------------------------\n",
"# 8. Robust, data-driven threshold (epsilon)\n",
"# ------------------------------------------------------------\n",
"# Step 1 — trim extreme breaks to avoid calibrating on resets\n",
"gap_rel_trimmed = valid_gaps[\n",
" valid_gaps <= valid_gaps.quantile(0.90)\n",
"]\n",
"\n",
"# Step 2 — define epsilon on the upper tail of the trimmed distribution\n",
"EPSILON = gap_rel_trimmed.quantile(0.99)\n",
"\n",
"# ------------------------------------------------------------\n",
"# 9. Detect discontinuities (diagnostic rule)\n",
"# ------------------------------------------------------------\n",
"df[\"rupture_flag\"] = (\n",
" df[\"prev_stock\"].notna()\n",
" & (df[\"prev_stock\"] > 0)\n",
" & (df[\"gap_rel\"] > EPSILON)\n",
")\n",
"\n",
"# ------------------------------------------------------------\n",
"# 10. Remove end-of-sample edge effects\n",
"# ------------------------------------------------------------\n",
"last_date = df[\"Centralisation Date\"].max()\n",
"\n",
"df.loc[\n",
" (df[\"rupture_flag\"]) &\n",
" (df[\"Centralisation Date\"] == last_date),\n",
" \"rupture_flag\"\n",
"] = False\n",
"\n",
"# ------------------------------------------------------------\n",
"# 11. ISIN-level summary\n",
"# ------------------------------------------------------------\n",
"rupture_isin_summary = (\n",
" df.groupby([\"Registrar Account - ID\", \"Product - Isin\"])\n",
" .agg(\n",
" n_ruptures=(\"rupture_flag\", \"sum\"),\n",
" total_obs=(\"rupture_flag\", \"count\"),\n",
" rupture_ratio=(\"rupture_flag\", \"mean\"),\n",
" max_gap_abs=(\"gap_abs\", \"max\"),\n",
" max_gap_rel=(\"gap_rel\", \"max\")\n",
" )\n",
" .reset_index()\n",
")\n",
"\n",
"# ------------------------------------------------------------\n",
"# 12. Account-level summary\n",
"# ------------------------------------------------------------\n",
"rupture_summary = (\n",
" df.groupby(\"Registrar Account - ID\")\n",
" .agg(\n",
" n_ruptures=(\"rupture_flag\", \"sum\"),\n",
" total_obs=(\"rupture_flag\", \"count\"),\n",
" rupture_ratio=(\"rupture_flag\", \"mean\"),\n",
" max_gap_abs=(\"gap_abs\", \"max\"),\n",
" max_gap_rel=(\"gap_rel\", \"max\")\n",
" )\n",
" .reset_index()\n",
")\n",
"\n",
"# ------------------------------------------------------------\n",
"# 13. Outputs\n",
"# ------------------------------------------------------------\n",
"df.to_csv(\"aum_flow_gaps.csv\", index=False)\n",
"rupture_isin_summary.to_csv(\"rupture_isin_summary.csv\", index=False)\n",
"rupture_summary.to_csv(\"rupture_summary.csv\", index=False)\n",
"\n",
"print(f\"Detection threshold epsilon (trimmed 99th percentile): {EPSILON:.2%}\")\n"
]
},
{
"cell_type": "code",
"execution_count": 59,
"id": "d7454212-1493-4715-a436-c331931f92fa",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>Registrar Account - ID</th>\n",
" <th>Product - Isin</th>\n",
" <th>n_ruptures</th>\n",
" <th>total_obs</th>\n",
" <th>rupture_ratio</th>\n",
" <th>max_gap_abs</th>\n",
" <th>max_gap_rel</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>59545</th>\n",
" <td>200127410</td>\n",
" <td>FR0010135103</td>\n",
" <td>384</td>\n",
" <td>436</td>\n",
" <td>0.880734</td>\n",
" <td>295985.42</td>\n",
" <td>3371.158214</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" Registrar Account - ID Product - Isin n_ruptures total_obs \\\n",
"59545 200127410 FR0010135103 384 436 \n",
"\n",
" rupture_ratio max_gap_abs max_gap_rel \n",
"59545 0.880734 295985.42 3371.158214 "
]
},
"execution_count": 59,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"rupture_isin_summary.sort_values(\"rupture_ratio\").head(1)\n",
"rupture_isin_summary.sort_values(\"rupture_ratio\", ascending=False).head(1)"
]
},
{
"cell_type": "code",
"execution_count": 60,
"id": "b4040847-e0cf-4aa5-966c-d1fbf3935b7d",
"metadata": {},
"outputs": [],
"source": [
"def plot_isin_evolution(df, account_id, isin, title_suffix=\"\"):\n",
" sub = df[\n",
" (df[\"Registrar Account - ID\"] == account_id) &\n",
" (df[\"Product - Isin\"] == isin)\n",
" ].copy()\n",
"\n",
" if sub.empty:\n",
" print(\"No data for this (account, ISIN).\")\n",
" return\n",
"\n",
" plt.figure(figsize=(10,4))\n",
"\n",
" # Stock observé\n",
" plt.plot(\n",
" sub[\"Centralisation Date\"],\n",
" sub[\"Quantity - AUM\"],\n",
" label=\"Observed stock\",\n",
" linewidth=2\n",
" )\n",
"\n",
" # Stock attendu\n",
" plt.plot(\n",
" sub[\"Centralisation Date\"],\n",
" sub[\"expected_stock\"],\n",
" label=\"Expected stock\",\n",
" linestyle=\"--\"\n",
" )\n",
"\n",
" # Ruptures\n",
" rupt = sub[sub[\"rupture_flag\"]]\n",
" plt.scatter(\n",
" rupt[\"Centralisation Date\"],\n",
" rupt[\"Quantity - AUM\"],\n",
" color=\"red\",\n",
" label=\"Rupture\",\n",
" zorder=5\n",
" )\n",
"\n",
" plt.title(f\"ISIN {isin} — Account {account_id} {title_suffix}\")\n",
" plt.xlabel(\"Date\")\n",
" plt.ylabel(\"AUM (shares)\")\n",
" plt.legend()\n",
" plt.grid(True)\n",
" plt.tight_layout()\n",
" plt.show()"
]
},
{
"cell_type": "code",
"execution_count": 61,
"id": "e5d7a5ab-40bd-452d-a6ae-d56e220c592f",
"metadata": {},
"outputs": [
{
"ename": "NameError",
"evalue": "name 'plot_isin_dynamics' is not defined",
"output_type": "error",
"traceback": [
"\u001b[31m---------------------------------------------------------------------------\u001b[39m",
"\u001b[31mNameError\u001b[39m Traceback (most recent call last)",
"\u001b[36mCell\u001b[39m\u001b[36m \u001b[39m\u001b[32mIn[61]\u001b[39m\u001b[32m, line 63\u001b[39m\n\u001b[32m 58\u001b[39m plt.show()\n\u001b[32m 62\u001b[39m \u001b[38;5;28;01mfor\u001b[39;00m _, row \u001b[38;5;129;01min\u001b[39;00m sample_isin.iterrows():\n\u001b[32m---> \u001b[39m\u001b[32m63\u001b[39m \u001b[43mplot_isin_dynamics\u001b[49m(\n\u001b[32m 64\u001b[39m df,\n\u001b[32m 65\u001b[39m row[\u001b[33m\"\u001b[39m\u001b[33mRegistrar Account - ID\u001b[39m\u001b[33m\"\u001b[39m],\n\u001b[32m 66\u001b[39m row[\u001b[33m\"\u001b[39m\u001b[33mProduct - Isin\u001b[39m\u001b[33m\"\u001b[39m]\n\u001b[32m 67\u001b[39m )\n",
"\u001b[31mNameError\u001b[39m: name 'plot_isin_dynamics' is not defined"
]
}
],
"source": [
"# Option B (alternative) : les plus sévères\n",
"# sample_isin = problematic_isin.sort_values(\n",
"# \"rupture_ratio\", ascending=False\n",
"# ).head(10)\n",
"\n",
"sample_isin = rupture_isin_summary.sort_values(\n",
" \"rupture_ratio\",\n",
" ascending=False\n",
").head(10)\n",
"\n",
"def plot_isin_dynamics_clean(df, account_id, isin):\n",
" sub = df[\n",
" (df[\"Registrar Account - ID\"] == account_id) &\n",
" (df[\"Product - Isin\"] == isin)\n",
" ].sort_values(\"Centralisation Date\")\n",
"\n",
" if sub.empty:\n",
" return\n",
"\n",
" fig, ax = plt.subplots(figsize=(7.5, 3))\n",
"\n",
" # AUM observé\n",
" ax.plot(\n",
" sub[\"Centralisation Date\"],\n",
" sub[\"Quantity - AUM\"],\n",
" label=\"Observed AUM\",\n",
" linewidth=2,\n",
" color=\"black\"\n",
" )\n",
"\n",
" # AUM attendu\n",
" ax.plot(\n",
" sub[\"Centralisation Date\"],\n",
" sub[\"expected_stock\"],\n",
" label=\"Flow-implied AUM\",\n",
" linestyle=\"--\",\n",
" linewidth=2,\n",
" color=\"grey\"\n",
" )\n",
"\n",
" # Ruptures\n",
" rupt = sub[sub[\"rupture_flag\"]]\n",
" ax.scatter(\n",
" rupt[\"Centralisation Date\"],\n",
" rupt[\"Quantity - AUM\"],\n",
" color=\"red\",\n",
" s=25,\n",
" zorder=5,\n",
" label=\"Discontinuity\"\n",
" )\n",
"\n",
" ax.set_title(f\"Account {account_id} — ISIN {isin}\", fontsize=11)\n",
" ax.set_xlabel(\"\")\n",
" ax.set_ylabel(\"AUM (shares)\")\n",
" ax.legend(loc=\"best\")\n",
"\n",
" plt.tight_layout()\n",
" plt.show()\n",
"\n",
"\n",
"\n",
"for _, row in sample_isin.iterrows():\n",
" plot_isin_dynamics(\n",
" df,\n",
" row[\"Registrar Account - ID\"],\n",
" row[\"Product - Isin\"]\n",
" )"
]
},
{
"cell_type": "code",
"execution_count": 28,
"id": "aef8ceb9-28a6-4908-ae24-a88d85b64309",
"metadata": {},
"outputs": [
{
"ename": "KeyError",
"evalue": "\"Column(s) ['rupture_flag'] do not exist\"",
"output_type": "error",
"traceback": [
"\u001b[31m---------------------------------------------------------------------------\u001b[39m",
"\u001b[31mKeyError\u001b[39m Traceback (most recent call last)",
"\u001b[36mCell\u001b[39m\u001b[36m \u001b[39m\u001b[32mIn[28]\u001b[39m\u001b[32m, line 6\u001b[39m\n\u001b[32m 1\u001b[39m \u001b[38;5;66;03m# ------------------------------------------------------------\u001b[39;00m\n\u001b[32m 2\u001b[39m \u001b[38;5;66;03m# 1. Aggregate rupture rate over time\u001b[39;00m\n\u001b[32m 3\u001b[39m \u001b[38;5;66;03m# ------------------------------------------------------------\u001b[39;00m\n\u001b[32m 4\u001b[39m time_stats = (\n\u001b[32m 5\u001b[39m \u001b[43mdf\u001b[49m\u001b[43m.\u001b[49m\u001b[43mgroupby\u001b[49m\u001b[43m(\u001b[49m\u001b[33;43m\"\u001b[39;49m\u001b[33;43mCentralisation Date\u001b[39;49m\u001b[33;43m\"\u001b[39;49m\u001b[43m)\u001b[49m\n\u001b[32m----> \u001b[39m\u001b[32m6\u001b[39m \u001b[43m \u001b[49m\u001b[43m.\u001b[49m\u001b[43magg\u001b[49m\u001b[43m(\u001b[49m\n\u001b[32m 7\u001b[39m \u001b[43m \u001b[49m\u001b[43mtotal_obs\u001b[49m\u001b[43m=\u001b[49m\u001b[43m(\u001b[49m\u001b[33;43m\"\u001b[39;49m\u001b[33;43mrupture_flag\u001b[39;49m\u001b[33;43m\"\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[33;43m\"\u001b[39;49m\u001b[33;43mcount\u001b[39;49m\u001b[33;43m\"\u001b[39;49m\u001b[43m)\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 8\u001b[39m \u001b[43m \u001b[49m\u001b[43mn_ruptures\u001b[49m\u001b[43m=\u001b[49m\u001b[43m(\u001b[49m\u001b[33;43m\"\u001b[39;49m\u001b[33;43mrupture_flag\u001b[39;49m\u001b[33;43m\"\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[33;43m\"\u001b[39;49m\u001b[33;43msum\u001b[39;49m\u001b[33;43m\"\u001b[39;49m\u001b[43m)\u001b[49m\n\u001b[32m 9\u001b[39m \u001b[43m \u001b[49m\u001b[43m)\u001b[49m\n\u001b[32m 10\u001b[39m .reset_index()\n\u001b[32m 11\u001b[39m )\n\u001b[32m 13\u001b[39m time_stats[\u001b[33m\"\u001b[39m\u001b[33mrupture_rate\u001b[39m\u001b[33m\"\u001b[39m] = (\n\u001b[32m 14\u001b[39m time_stats[\u001b[33m\"\u001b[39m\u001b[33mn_ruptures\u001b[39m\u001b[33m\"\u001b[39m] / time_stats[\u001b[33m\"\u001b[39m\u001b[33mtotal_obs\u001b[39m\u001b[33m\"\u001b[39m]\n\u001b[32m 15\u001b[39m )\n\u001b[32m 17\u001b[39m \u001b[38;5;66;03m# ------------------------------------------------------------\u001b[39;00m\n\u001b[32m 18\u001b[39m \u001b[38;5;66;03m# 2. Smooth (optional but recommended for readability)\u001b[39;00m\n\u001b[32m 19\u001b[39m \u001b[38;5;66;03m# ------------------------------------------------------------\u001b[39;00m\n",
"\u001b[36mFile \u001b[39m\u001b[32m/opt/python/lib/python3.13/site-packages/pandas/core/groupby/generic.py:1432\u001b[39m, in \u001b[36mDataFrameGroupBy.aggregate\u001b[39m\u001b[34m(self, func, engine, engine_kwargs, *args, **kwargs)\u001b[39m\n\u001b[32m 1429\u001b[39m kwargs[\u001b[33m\"\u001b[39m\u001b[33mengine_kwargs\u001b[39m\u001b[33m\"\u001b[39m] = engine_kwargs\n\u001b[32m 1431\u001b[39m op = GroupByApply(\u001b[38;5;28mself\u001b[39m, func, args=args, kwargs=kwargs)\n\u001b[32m-> \u001b[39m\u001b[32m1432\u001b[39m result = \u001b[43mop\u001b[49m\u001b[43m.\u001b[49m\u001b[43magg\u001b[49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m\n\u001b[32m 1433\u001b[39m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m is_dict_like(func) \u001b[38;5;129;01mand\u001b[39;00m result \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m:\n\u001b[32m 1434\u001b[39m \u001b[38;5;66;03m# GH #52849\u001b[39;00m\n\u001b[32m 1435\u001b[39m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;28mself\u001b[39m.as_index \u001b[38;5;129;01mand\u001b[39;00m is_list_like(func):\n",
"\u001b[36mFile \u001b[39m\u001b[32m/opt/python/lib/python3.13/site-packages/pandas/core/apply.py:190\u001b[39m, in \u001b[36mApply.agg\u001b[39m\u001b[34m(self)\u001b[39m\n\u001b[32m 187\u001b[39m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28mself\u001b[39m.apply_str()\n\u001b[32m 189\u001b[39m \u001b[38;5;28;01mif\u001b[39;00m is_dict_like(func):\n\u001b[32m--> \u001b[39m\u001b[32m190\u001b[39m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28;43mself\u001b[39;49m\u001b[43m.\u001b[49m\u001b[43magg_dict_like\u001b[49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m\n\u001b[32m 191\u001b[39m \u001b[38;5;28;01melif\u001b[39;00m is_list_like(func):\n\u001b[32m 192\u001b[39m \u001b[38;5;66;03m# we require a list, but not a 'str'\u001b[39;00m\n\u001b[32m 193\u001b[39m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28mself\u001b[39m.agg_list_like()\n",
"\u001b[36mFile \u001b[39m\u001b[32m/opt/python/lib/python3.13/site-packages/pandas/core/apply.py:423\u001b[39m, in \u001b[36mApply.agg_dict_like\u001b[39m\u001b[34m(self)\u001b[39m\n\u001b[32m 415\u001b[39m \u001b[38;5;28;01mdef\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[34magg_dict_like\u001b[39m(\u001b[38;5;28mself\u001b[39m) -> DataFrame | Series:\n\u001b[32m 416\u001b[39m \u001b[38;5;250m \u001b[39m\u001b[33;03m\"\"\"\u001b[39;00m\n\u001b[32m 417\u001b[39m \u001b[33;03m Compute aggregation in the case of a dict-like argument.\u001b[39;00m\n\u001b[32m 418\u001b[39m \n\u001b[32m (...)\u001b[39m\u001b[32m 421\u001b[39m \u001b[33;03m Result of aggregation.\u001b[39;00m\n\u001b[32m 422\u001b[39m \u001b[33;03m \"\"\"\u001b[39;00m\n\u001b[32m--> \u001b[39m\u001b[32m423\u001b[39m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28;43mself\u001b[39;49m\u001b[43m.\u001b[49m\u001b[43magg_or_apply_dict_like\u001b[49m\u001b[43m(\u001b[49m\u001b[43mop_name\u001b[49m\u001b[43m=\u001b[49m\u001b[33;43m\"\u001b[39;49m\u001b[33;43magg\u001b[39;49m\u001b[33;43m\"\u001b[39;49m\u001b[43m)\u001b[49m\n",
"\u001b[36mFile \u001b[39m\u001b[32m/opt/python/lib/python3.13/site-packages/pandas/core/apply.py:1603\u001b[39m, in \u001b[36mGroupByApply.agg_or_apply_dict_like\u001b[39m\u001b[34m(self, op_name)\u001b[39m\n\u001b[32m 1598\u001b[39m kwargs.update({\u001b[33m\"\u001b[39m\u001b[33mengine\u001b[39m\u001b[33m\"\u001b[39m: engine, \u001b[33m\"\u001b[39m\u001b[33mengine_kwargs\u001b[39m\u001b[33m\"\u001b[39m: engine_kwargs})\n\u001b[32m 1600\u001b[39m \u001b[38;5;28;01mwith\u001b[39;00m com.temp_setattr(\n\u001b[32m 1601\u001b[39m obj, \u001b[33m\"\u001b[39m\u001b[33mas_index\u001b[39m\u001b[33m\"\u001b[39m, \u001b[38;5;28;01mTrue\u001b[39;00m, condition=\u001b[38;5;28mhasattr\u001b[39m(obj, \u001b[33m\"\u001b[39m\u001b[33mas_index\u001b[39m\u001b[33m\"\u001b[39m)\n\u001b[32m 1602\u001b[39m ):\n\u001b[32m-> \u001b[39m\u001b[32m1603\u001b[39m result_index, result_data = \u001b[38;5;28;43mself\u001b[39;49m\u001b[43m.\u001b[49m\u001b[43mcompute_dict_like\u001b[49m\u001b[43m(\u001b[49m\n\u001b[32m 1604\u001b[39m \u001b[43m \u001b[49m\u001b[43mop_name\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mselected_obj\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mselection\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mkwargs\u001b[49m\n\u001b[32m 1605\u001b[39m \u001b[43m \u001b[49m\u001b[43m)\u001b[49m\n\u001b[32m 1606\u001b[39m result = \u001b[38;5;28mself\u001b[39m.wrap_results_dict_like(selected_obj, result_index, result_data)\n\u001b[32m 1607\u001b[39m \u001b[38;5;28;01mreturn\u001b[39;00m result\n",
"\u001b[36mFile \u001b[39m\u001b[32m/opt/python/lib/python3.13/site-packages/pandas/core/apply.py:462\u001b[39m, in \u001b[36mApply.compute_dict_like\u001b[39m\u001b[34m(self, op_name, selected_obj, selection, kwargs)\u001b[39m\n\u001b[32m 460\u001b[39m is_groupby = \u001b[38;5;28misinstance\u001b[39m(obj, (DataFrameGroupBy, SeriesGroupBy))\n\u001b[32m 461\u001b[39m func = cast(AggFuncTypeDict, \u001b[38;5;28mself\u001b[39m.func)\n\u001b[32m--> \u001b[39m\u001b[32m462\u001b[39m func = \u001b[38;5;28;43mself\u001b[39;49m\u001b[43m.\u001b[49m\u001b[43mnormalize_dictlike_arg\u001b[49m\u001b[43m(\u001b[49m\u001b[43mop_name\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mselected_obj\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mfunc\u001b[49m\u001b[43m)\u001b[49m\n\u001b[32m 464\u001b[39m is_non_unique_col = (\n\u001b[32m 465\u001b[39m selected_obj.ndim == \u001b[32m2\u001b[39m\n\u001b[32m 466\u001b[39m \u001b[38;5;129;01mand\u001b[39;00m selected_obj.columns.nunique() < \u001b[38;5;28mlen\u001b[39m(selected_obj.columns)\n\u001b[32m 467\u001b[39m )\n\u001b[32m 469\u001b[39m \u001b[38;5;28;01mif\u001b[39;00m selected_obj.ndim == \u001b[32m1\u001b[39m:\n\u001b[32m 470\u001b[39m \u001b[38;5;66;03m# key only used for output\u001b[39;00m\n",
"\u001b[36mFile \u001b[39m\u001b[32m/opt/python/lib/python3.13/site-packages/pandas/core/apply.py:663\u001b[39m, in \u001b[36mApply.normalize_dictlike_arg\u001b[39m\u001b[34m(self, how, obj, func)\u001b[39m\n\u001b[32m 661\u001b[39m cols = Index(\u001b[38;5;28mlist\u001b[39m(func.keys())).difference(obj.columns, sort=\u001b[38;5;28;01mTrue\u001b[39;00m)\n\u001b[32m 662\u001b[39m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28mlen\u001b[39m(cols) > \u001b[32m0\u001b[39m:\n\u001b[32m--> \u001b[39m\u001b[32m663\u001b[39m \u001b[38;5;28;01mraise\u001b[39;00m \u001b[38;5;167;01mKeyError\u001b[39;00m(\u001b[33mf\u001b[39m\u001b[33m\"\u001b[39m\u001b[33mColumn(s) \u001b[39m\u001b[38;5;132;01m{\u001b[39;00m\u001b[38;5;28mlist\u001b[39m(cols)\u001b[38;5;132;01m}\u001b[39;00m\u001b[33m do not exist\u001b[39m\u001b[33m\"\u001b[39m)\n\u001b[32m 665\u001b[39m aggregator_types = (\u001b[38;5;28mlist\u001b[39m, \u001b[38;5;28mtuple\u001b[39m, \u001b[38;5;28mdict\u001b[39m)\n\u001b[32m 667\u001b[39m \u001b[38;5;66;03m# if we have a dict of any non-scalars\u001b[39;00m\n\u001b[32m 668\u001b[39m \u001b[38;5;66;03m# eg. {'A' : ['mean']}, normalize all to\u001b[39;00m\n\u001b[32m 669\u001b[39m \u001b[38;5;66;03m# be list-likes\u001b[39;00m\n\u001b[32m 670\u001b[39m \u001b[38;5;66;03m# Cannot use func.values() because arg may be a Series\u001b[39;00m\n",
"\u001b[31mKeyError\u001b[39m: \"Column(s) ['rupture_flag'] do not exist\""
]
}
],
"source": [
"# ------------------------------------------------------------\n",
"# 1. Aggregate rupture rate over time\n",
"# ------------------------------------------------------------\n",
"time_stats = (\n",
" df.groupby(\"Centralisation Date\")\n",
" .agg(\n",
" total_obs=(\"rupture_flag\", \"count\"),\n",
" n_ruptures=(\"rupture_flag\", \"sum\")\n",
" )\n",
" .reset_index()\n",
")\n",
"\n",
"time_stats[\"rupture_rate\"] = (\n",
" time_stats[\"n_ruptures\"] / time_stats[\"total_obs\"]\n",
")\n",
"\n",
"# ------------------------------------------------------------\n",
"# 2. Smooth (optional but recommended for readability)\n",
"# ------------------------------------------------------------\n",
"time_stats[\"rupture_rate_ma\"] = (\n",
" time_stats[\"rupture_rate\"]\n",
" .rolling(window=6, center=True) # 6 periods ≈ half-year\n",
" .mean()\n",
")\n",
"\n",
"# ------------------------------------------------------------\n",
"# 3. Professional plot\n",
"# ------------------------------------------------------------\n",
"plt.figure(figsize=(12, 5))\n",
"\n",
"plt.plot(\n",
" time_stats[\"Centralisation Date\"],\n",
" time_stats[\"rupture_rate\"] * 100,\n",
" color=\"lightgray\",\n",
" linewidth=1,\n",
" alpha=0.6,\n",
" label=\"Monthly rupture rate\"\n",
")\n",
"\n",
"plt.plot(\n",
" time_stats[\"Centralisation Date\"],\n",
" time_stats[\"rupture_rate_ma\"] * 100,\n",
" color=\"#1f77b4\",\n",
" linewidth=2.5,\n",
" label=\"6-month moving average\"\n",
")\n",
"\n",
"plt.ylabel(\"Rupture rate (%)\")\n",
"plt.xlabel(\"Date\")\n",
"\n",
"plt.grid(True, linestyle=\"--\", alpha=0.4)\n",
"plt.legend(frameon=False)\n",
"\n",
"plt.tight_layout()\n",
"plt.show()\n"
]
},
{
"cell_type": "code",
"execution_count": 63,
"id": "6624939b-f079-4e02-9989-60462e9f5356",
"metadata": {},
"outputs": [
{
"data": {
"image/png": "iVBORw0KGgoAAAANSUhEUgAABKUAAAHqCAYAAADVi/1VAAAAOnRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjEwLjgsIGh0dHBzOi8vbWF0cGxvdGxpYi5vcmcvwVt1zgAAAAlwSFlzAAAPYQAAD2EBqD+naQABAABJREFUeJzsnXmcE+X9xz8zuZNNspvsAQvLIbcIiFwCKuAFHtT7pFXUetUDL7yq9WhrW+vtz1pbK1iLeFSxHlWrKKiIB8gth3LDwh45N3cy8/z+WGdIdpPdZDfZZJLv+/XixWYymTzP5PnOPM9nvgfHGGMgCIIgCIIgCIIgCIIgiB6Ez3cDCIIgCIIgCIIgCIIgiNKDRCmCIAiCIAiCIAiCIAiixyFRiiAIgiAIgiAIgiAIguhxSJQiCIIgCIIgCIIgCIIgehwSpQiCIAiCIAiCIAiCIIgeh0QpgiAIgiAIgiAIgiAIoschUYogCIIgCIIgCIIgCILocUiUIgiCIAiCIAiCIAiCIHocdb4bkGtEUUR9fT3MZjM4jst3cwiCIAiCIAiCIAiCIIoaxhhaWlpQW1sLnk/tD1X0olR9fT3q6ury3QyCIAiCIAiCIAiCIIiSYu/evejbt2/K94telDKbzQBaT4TFYslza4hcI4oi9u7di7q6ug7VWIIoVsgGiFKHbIAodcgGiFKGxj9R6hSSDXi9XtTV1cmaTCqKXpSSQvYsFguJUiWAKIowm82wWCx5N0KCyAdkA0SpQzZAlDpkA0QpQ+OfKHUK0QY6S6NUGK0kiCzBcRzKy8spfxhRspANEKUO2QBR6pANEKUMjX+i1FGiDRS9pxRRWkhGSBClCtkAUeqQDRClDtkAUcrQ+CdKHSXaAHlKEUWFKIpoaGiAKIr5bgpB5AWyAaLUIRsgSh2yAaKUofFPlDpKtAESpYiiIxgM5rsJBJFXyAaIUodsgCh1yAaIUobGP1HqKM0GSJQiCIIgCIIgCIIgCIIgehwSpQiCIAiCIAiCIAiCIIgeh0QpoqjgOA52u11R1QYIIpuQDRClDtkAUeqQDRClDI1/otRRog1Q9T2iqOA4DmazOd/NIIi8QTZAlDpkA0SpQzZAlDI0/olSR4k2QJ5SRFEhiiL279+vqGoDBJFNyAaIUodsgCh1yAaIUobGP1HqKNEGSJQiio5oNJrvJhBEXiEbIEodsgGi1CEbIEoZGv9EqaM0GyBRiiAIgiAIgiAIgsg5HMfhrbfeSvn+smXLwHEc3G53j7WJ6BpLly7FiBEjIAhCTr9n+vTpuOmmm3L6HYwxXHXVVbDZbOA4DmvXru2R7+1JMrWt5uZmVFdXY9++fbltGEiUIgiCIAiCIAiCKFrmzp0LjuNwzTXXtHvvuuuuA8dxmDt3bla/8/7778eRRx6Z1WMWAgsXLkR5eXm+m9Elst3222+/Hffccw9UKlVWjpdKNHnzzTfx29/+NivfkYoPPvgACxcuxLvvvosDBw7giCOOyOn3KYHKykpccskluO+++3L+XSRKEUUFx3GoqalRVLUBgsgmZANEqUM2QJQ6ZANEMurq6vDKK68gGAzK20KhEF5++WX069cvjy3LLsnGfyQSyWOLUpOtEKt89O+LL77A9u3bcc455+T8u2w2W84Td2/fvh29e/fGlClT0KtXL6jVyq0Hl817wGWXXYZFixbB6XRmoWWpIVGKKCo4joPBYKCJGFGykA0QpQ7ZAFHqkA0QyTjqqKNQV1eHN998U9725ptvol+/fhg7dmzCvuFwGDfeeCOqq6uh1+txzDHH4Ntvv5Xflzxali5divHjx8NoNGLKlCnYunUrgFaPnAceeADr1q0Dx3HgOA4LFy6UP9/c3IyzzjoLRqMRQ4YMwdtvv520zX6/HxaLBf/+978Ttr/11lswmUxoaWlp9xmO43DKKafghhtuwE033YTKykrMnDkTu3btksOyJNxuNziOw7JlyxL69d5772H06NHQ6/U4+uijsXHjRvn9yy67DB6PR+7X/fffL39v27DE8vJyud/S97/66quYNm0a9Ho9Fi1aBAB4/vnnMWLECOj1egwfPhx/+ctfkp4PienTp+P6669P6B8APPbYYxg1ahRMJhPq6urwq1/9Cj6fr9O2h8Nh3HbbbejTpw9MJhMmTZokn5NUvPLKKzjppJOg1+vlbdu3b8cZZ5yBmpoalJWVYcKECfj4448TPhcOh3HHHXegrq4OOp0OgwcPxj/+8Q/s2rULM2bMAABUVFQkeO/Fh9HdfffdmDRpUrv2jBkzBg8++KD8OpNzOnfuXNxwww3Ys2cPOI7DgAEDku7ncrlwySWXoKKiAkajEaeccgp++OEHAK3hf1VVVQlj9cgjj0Tv3r3l11988QV0Oh0CgQAYY7j//vvRr18/6HQ61NbW4sYbb0zZRgB49tlnMWjQIGi1WgwbNgwvvfRSwvscx+H555/H2WefDbvdjqFDh3bbtkaOHIna2losWbKkw7Z1FxKliKJCFEXs3r1bUdUGCCKbkA0QpQ7ZAFHqkA0Qqbj88suxYMEC+fULL7yAyy67rN1+t99+O9544w28+OKL+O677zB48GDMnDmznbfEr3/9azz66KNYtWoV1Go1Lr/8cgDABRdcgFtvvRUjR47EgQMHcODAAVxwwQXy5x544AGcf/75WL9+PU499VTMmTMnqSeGyWTChRdemNBmAFiwYAHOPffcpN4zoigiFArhxRdfhFarxYoVK/DXv/41o/M0f/58PProo/j2229RVVWF2bNnIxqNYsqUKXjiiSdgsVjkft12220ZHfvOO+/EvHnzsHnzZsycOROLFi3Cb37zG/z+97/H5s2b8dBDD+Hee+/Fiy++2OFxkvWP53k89dRT2LRpE1588UV88sknuP322wGgw7Zff/31WLlyJV555RWsX78e5513HmbNmiULLsn4/PPPMX78+IRtPp8Pp556KpYuXYo1a9Zg1qxZmD17Nvbs2SPvc8kll2Dx4sV46qmnsHnzZjz33HMoKytDXV0d3njjDQDA1q1bceDAATz55JPtvnfOnDn45ptvsH37dnnbpk2bsH79elx88cUAkPE5ffLJJ/Hggw+ib9++OHDgQIIAG8/cuXOxatUqvP3221i5ciUYYzj11FMRjUbBcRyOO+44WcxzuVzYvHkzgsEgtmzZAgBYvnw5JkyYAKPRiDfeeAOPP/44nnvuOfzwww946623MGrUqJTne8mSJZg3bx5uvfVWbNy4EVdffTUuu+wyfPrppwn7PfDAAzj33HPx/vvv45RTTsmKbU2cOBGff/55yrZlBVbkeDweBoB5PJ58N6XbhMNhFo1G892MgkYQBLZz504mCEK+m0IQeYFsgCh1yAaIUodsoOcRBIGFw+Ee/ZfJ73vppZeyM844gzU2NjKdTsd27drFdu3axfR6PWtqamJnnHEGu/TSSxljjPl8PqbRaNiiRYvkz0ciEVZbW8sefvhhxhhjn376KQPAPv74Y3mf9957jwFgwWCQMcbYfffdx8aMGdOuLQDYPffcI7/2+XwMAHv//fcTju1yuRhjjH399ddMpVKx+vp6xhhjDQ0NTK1Ws2XLlqX8LSZNmsTGjh2bsH3nzp0MAFuzZo28zeVyMQDs008/TfjuV155Rd7H4XAwg8HAXn31VcYYYwsWLGBWqzVpv5YsWZKwzWq1sgULFiR8/xNPPJGwz6BBg9jLL7+csO23v/0tmzx5ctL+McbYtGnT2vUvGa+//jqz2+3y62Rt3717N1OpVGz//v0J20844QR21113pTy21Wpl//znPzttw8iRI9nTTz/NGGNs69atDAD76KOPku7b9reXmDZtGps3b578esyYMezBBx+UX991111s0qRJ8uuunNPHH3+c9e/fP+X3btu2jQFgK1askN9vbm5mBoOBvfbaa4wxxp566ik2cuRIxhhjb731Fps0aRI744wz2LPPPssYY+zEE09kd999N2OMsUcffZQNHTqURSKRlG2KZ8qUKezKK69M2HbeeeexU089VX4t2ZZ0D/B6vVmxrZtvvplNnz49rXa2JV0tRrnBkiU
"text/plain": [
"<Figure size 1200x500 with 1 Axes>"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"# ------------------------------------------------------------\n",
"# 1. Aggregate rupture rate over time\n",
"# CORRECTED: denominator = only observations with active flows\n",
"# ------------------------------------------------------------\n",
"\n",
"# Active observations = at least one non-zero flow at this date\n",
"active_obs = df[df[\"Quantity - NetFlows\"] != 0]\n",
"\n",
"time_stats = (\n",
" active_obs\n",
" .groupby(\"Centralisation Date\")\n",
" .agg(\n",
" total_obs=(\"rupture_flag\", \"count\"), # only active flow observations\n",
" n_ruptures=(\"rupture_flag\", \"sum\")\n",
" )\n",
" .reset_index()\n",
")\n",
"\n",
"time_stats[\"rupture_rate\"] = (\n",
" time_stats[\"n_ruptures\"] / time_stats[\"total_obs\"]\n",
")\n",
"\n",
"# ------------------------------------------------------------\n",
"# 2. Smooth (6-month moving average)\n",
"# ------------------------------------------------------------\n",
"time_stats[\"rupture_rate_ma\"] = (\n",
" time_stats[\"rupture_rate\"]\n",
" .rolling(window=6, center=True)\n",
" .mean()\n",
")\n",
"\n",
"# ------------------------------------------------------------\n",
"# 3. Plot\n",
"# ------------------------------------------------------------\n",
"plt.figure(figsize=(12, 5))\n",
"\n",
"plt.plot(\n",
" time_stats[\"Centralisation Date\"],\n",
" time_stats[\"rupture_rate\"] * 100,\n",
" color=\"lightgray\",\n",
" linewidth=1,\n",
" alpha=0.6,\n",
" label=\"Monthly rupture rate (active flows only)\"\n",
")\n",
"\n",
"plt.plot(\n",
" time_stats[\"Centralisation Date\"],\n",
" time_stats[\"rupture_rate_ma\"] * 100,\n",
" color=\"#1f77b4\",\n",
" linewidth=2.5,\n",
" label=\"6-month moving average\"\n",
")\n",
"\n",
"plt.ylabel(\"Rupture rate (%)\")\n",
"plt.xlabel(\"Date\")\n",
"plt.grid(True, linestyle=\"--\", alpha=0.4)\n",
"plt.legend(frameon=False)\n",
"plt.tight_layout()\n",
"plt.show()"
]
},
{
"cell_type": "code",
"execution_count": 14,
"id": "d6ee0c24-e14e-4c40-97d4-49879229790c",
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"/tmp/ipykernel_1311/1047489516.py:6: FutureWarning:\n",
"\n",
"DataFrameGroupBy.apply operated on the grouping columns. This behavior is deprecated, and in a future version of pandas the grouping columns will be excluded from the operation. Either pass `include_groups=False` to exclude the groupings or explicitly select the grouping columns after groupby to silence this warning.\n",
"\n"
]
},
{
"data": {
"text/plain": [
"has_reset\n",
"True 64192\n",
"False 15545\n",
"Name: count, dtype: int64"
]
},
"execution_count": 14,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"EPS = 1e-6 # seuil numérique\n",
"\n",
"reset_candidates = (\n",
" df\n",
" .groupby([\"Registrar Account - ID\", \"Product - Isin\"])\n",
" .apply(\n",
" lambda g: (\n",
" (g[\"Quantity - AUM\"].abs() < EPS) &\n",
" (g[\"expected_stock\"].abs() < EPS)\n",
" ).any()\n",
" )\n",
" .reset_index(name=\"has_reset\")\n",
")\n",
"\n",
"reset_candidates[\"has_reset\"].value_counts()\n"
]
},
{
"cell_type": "code",
"execution_count": 47,
"id": "601f61b8-0115-431d-97de-6ec5a0f1d4f4",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
" Before repair After repair Repaired points\n",
"0 756392 22357 18440\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"/tmp/ipykernel_1311/3061846510.py:66: FutureWarning:\n",
"\n",
"DataFrameGroupBy.apply operated on the grouping columns. This behavior is deprecated, and in a future version of pandas the grouping columns will be excluded from the operation. Either pass `include_groups=False` to exclude the groupings or explicitly select the grouping columns after groupby to silence this warning.\n",
"\n"
]
}
],
"source": [
"GAP_TOL = 1e-6\n",
"REL_GAP_THR = 0.05\n",
"MIN_PERSISTENCE = 3\n",
"\n",
"df = merged_isin.copy().sort_values(\n",
" [\"Registrar Account - ID\", \"Product - Isin\", \"Centralisation Date\"]\n",
")\n",
"\n",
"df[\"corrected_aum\"] = df[\"Quantity - AUM\"]\n",
"df[\"repair_flag\"] = False\n",
"\n",
"def repair_group(g):\n",
" g = g.copy()\n",
"\n",
" obs = g[\"Quantity - AUM\"].values\n",
" flows = g[\"Quantity - NetFlows\"].values\n",
"\n",
" corrected = obs.copy()\n",
"\n",
" # Initial expected path\n",
" expected = np.empty_like(obs)\n",
" expected[0] = np.nan\n",
"\n",
" for t in range(1, len(obs)):\n",
" expected[t] = corrected[t-1] + flows[t-1]\n",
"\n",
" gap = obs - expected\n",
" rel_gap = np.abs(gap) / np.maximum(np.abs(expected), 1.0)\n",
"\n",
" idx = None\n",
"\n",
" for i in range(1, len(obs) - MIN_PERSISTENCE):\n",
" if (\n",
" rel_gap[i] > REL_GAP_THR\n",
" and np.all(np.abs(gap[i:i+MIN_PERSISTENCE] - gap[i]) < GAP_TOL)\n",
" and np.all(np.abs(np.diff(flows[i:i+MIN_PERSISTENCE])) < GAP_TOL)\n",
" ):\n",
" idx = i\n",
" break\n",
"\n",
" if idx is None:\n",
" return g\n",
"\n",
" # Apply correction\n",
" shift = gap[idx]\n",
" corrected[idx:] = obs[idx:] - shift\n",
"\n",
" g.loc[g.index[idx]:, \"repair_flag\"] = True\n",
"\n",
" # Rebuild expected stock AFTER correction\n",
" expected_corr = np.empty_like(obs)\n",
" expected_corr[0] = np.nan\n",
"\n",
" for t in range(1, len(obs)):\n",
" expected_corr[t] = corrected[t-1] + flows[t-1]\n",
"\n",
" g[\"corrected_aum\"] = corrected\n",
" g[\"expected_stock_corr\"] = expected_corr\n",
"\n",
" return g\n",
"\n",
"\n",
"df = (\n",
" df\n",
" .groupby([\"Registrar Account - ID\", \"Product - Isin\"], group_keys=False)\n",
" .apply(repair_group)\n",
")\n",
"\n",
"# Recompute gaps & ruptures\n",
"df[\"gap_before\"] = df[\"Quantity - AUM\"] - df[\"expected_stock\"]\n",
"df[\"gap_after\"] = df[\"corrected_aum\"] - df[\"expected_stock_corr\"]\n",
"\n",
"df[\"rupture_before\"] = df[\"gap_before\"].abs() > GAP_TOL\n",
"df[\"rupture_after\"] = df[\"gap_after\"].abs() > GAP_TOL\n",
"\n",
"summary = pd.DataFrame({\n",
" \"Before repair\": [df[\"rupture_before\"].sum()],\n",
" \"After repair\": [df[\"rupture_after\"].sum()],\n",
" \"Repaired points\": [df[\"repair_flag\"].sum()]\n",
"})\n",
"\n",
"print(summary)"
]
},
{
"cell_type": "code",
"execution_count": 30,
"id": "62583cfe-a6e7-4931-a63e-4273dca97ff7",
"metadata": {},
"outputs": [
{
"ename": "NameError",
"evalue": "name 'df_final' is not defined",
"output_type": "error",
"traceback": [
"\u001b[31m---------------------------------------------------------------------------\u001b[39m",
"\u001b[31mNameError\u001b[39m Traceback (most recent call last)",
"\u001b[36mCell\u001b[39m\u001b[36m \u001b[39m\u001b[32mIn[30]\u001b[39m\u001b[32m, line 4\u001b[39m\n\u001b[32m 1\u001b[39m \u001b[38;5;28;01mimport\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[34;01mplotly\u001b[39;00m\u001b[34;01m.\u001b[39;00m\u001b[34;01mgraph_objects\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[38;5;28;01mas\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[34;01mgo\u001b[39;00m\n\u001b[32m 2\u001b[39m \u001b[38;5;28;01mimport\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[34;01mpandas\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[38;5;28;01mas\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[34;01mpd\u001b[39;00m\n\u001b[32m----> \u001b[39m\u001b[32m4\u001b[39m df_final = \u001b[43mdf_final\u001b[49m.rename(columns={\n\u001b[32m 5\u001b[39m \u001b[33m\"\u001b[39m\u001b[33mQuantity - AUM\u001b[39m\u001b[33m\"\u001b[39m: \u001b[33m\"\u001b[39m\u001b[33maum_raw\u001b[39m\u001b[33m\"\u001b[39m,\n\u001b[32m 6\u001b[39m \u001b[33m\"\u001b[39m\u001b[33mcorrected_aum\u001b[39m\u001b[33m\"\u001b[39m: \u001b[33m\"\u001b[39m\u001b[33maum_repaired\u001b[39m\u001b[33m\"\u001b[39m,\n\u001b[32m 7\u001b[39m \u001b[33m\"\u001b[39m\u001b[33mQuantity - NetFlows\u001b[39m\u001b[33m\"\u001b[39m: \u001b[33m\"\u001b[39m\u001b[33mflows\u001b[39m\u001b[33m\"\u001b[39m,\n\u001b[32m 8\u001b[39m \u001b[33m\"\u001b[39m\u001b[33mexpected_stock\u001b[39m\u001b[33m\"\u001b[39m: \u001b[33m\"\u001b[39m\u001b[33mexpected_aum_raw\u001b[39m\u001b[33m\"\u001b[39m,\n\u001b[32m 9\u001b[39m \u001b[33m\"\u001b[39m\u001b[33mexpected_stock_corr\u001b[39m\u001b[33m\"\u001b[39m: \u001b[33m\"\u001b[39m\u001b[33mexpected_aum_repaired\u001b[39m\u001b[33m\"\u001b[39m\n\u001b[32m 10\u001b[39m })\n\u001b[32m 12\u001b[39m df[\u001b[33m\"\u001b[39m\u001b[33mgap_before\u001b[39m\u001b[33m\"\u001b[39m] = df[\u001b[33m\"\u001b[39m\u001b[33mQuantity - AUM\u001b[39m\u001b[33m\"\u001b[39m] - df[\u001b[33m\"\u001b[39m\u001b[33mexpected_stock\u001b[39m\u001b[33m\"\u001b[39m]\n\u001b[32m 13\u001b[39m df[\u001b[33m\"\u001b[39m\u001b[33mgap_after\u001b[39m\u001b[33m\"\u001b[39m] = df[\u001b[33m\"\u001b[39m\u001b[33mcorrected_aum\u001b[39m\u001b[33m\"\u001b[39m] - df[\u001b[33m\"\u001b[39m\u001b[33mexpected_stock_corr\u001b[39m\u001b[33m\"\u001b[39m]\n",
"\u001b[31mNameError\u001b[39m: name 'df_final' is not defined"
]
}
],
"source": [
"import plotly.graph_objects as go\n",
"import pandas as pd\n",
"\n",
"# ============================================================\n",
"# Parameters (fixed epsilon)\n",
"# ============================================================\n",
"GAP_EPS = 100 # fixed tolerance for accounting identity\n",
"\n",
"# ============================================================\n",
"# 1. Define ruptures using a FIXED epsilon\n",
"# ============================================================\n",
"df = df.copy()\n",
"\n",
"df[\"rupture_before\"] = df[\"gap_before\"].abs() > GAP_EPS\n",
"df[\"rupture_after\"] = df[\"gap_after\"].abs() > GAP_EPS\n",
"\n",
"# ============================================================\n",
"# 2. Rupture ratios BEFORE repair\n",
"# ============================================================\n",
"rupture_summary_before = (\n",
" df\n",
" .groupby([\"Registrar Account - ID\", \"Product - Isin\"])\n",
" .agg(\n",
" n_obs=(\"rupture_before\", \"count\"),\n",
" n_ruptures=(\"rupture_before\", \"sum\")\n",
" )\n",
" .reset_index()\n",
")\n",
"\n",
"rupture_summary_before[\"rupture_ratio\"] = (\n",
" rupture_summary_before[\"n_ruptures\"] /\n",
" rupture_summary_before[\"n_obs\"]\n",
")\n",
"\n",
"# ============================================================\n",
"# 3. Rupture ratios AFTER repair\n",
"# ============================================================\n",
"rupture_summary_after = (\n",
" df\n",
" .groupby([\"Registrar Account - ID\", \"Product - Isin\"])\n",
" .agg(\n",
" n_obs=(\"rupture_after\", \"count\"),\n",
" n_ruptures=(\"rupture_after\", \"sum\")\n",
" )\n",
" .reset_index()\n",
")\n",
"\n",
"rupture_summary_after[\"rupture_ratio\"] = (\n",
" rupture_summary_after[\"n_ruptures\"] /\n",
" rupture_summary_after[\"n_obs\"]\n",
")\n",
"\n",
"# ============================================================\n",
"# 4. Rupture intensity classes (fixed bins)\n",
"# ============================================================\n",
"bins = [0.0, 0.01, 0.10, 0.30, 1.0]\n",
"labels = [\n",
" \"Clean / quasi-clean (≤1%)\",\n",
" \"Moderate (110%)\",\n",
" \"High (1030%)\",\n",
" \"Severe (>30%)\"\n",
"]\n",
"\n",
"rupture_summary_before[\"rupture_class\"] = pd.cut(\n",
" rupture_summary_before[\"rupture_ratio\"],\n",
" bins=bins,\n",
" labels=labels,\n",
" include_lowest=True\n",
")\n",
"\n",
"rupture_summary_after[\"rupture_class\"] = pd.cut(\n",
" rupture_summary_after[\"rupture_ratio\"],\n",
" bins=bins,\n",
" labels=labels,\n",
" include_lowest=True\n",
")\n",
"\n",
"# ============================================================\n",
"# 5. Distribution (%)\n",
"# ============================================================\n",
"dist_before = (\n",
" rupture_summary_before[\"rupture_class\"]\n",
" .value_counts(normalize=True)\n",
" .sort_index()\n",
" * 100\n",
").round(1)\n",
"\n",
"dist_after = (\n",
" rupture_summary_after[\"rupture_class\"]\n",
" .value_counts(normalize=True)\n",
" .sort_index()\n",
" * 100\n",
").round(1)\n",
"\n",
"# ============================================================\n",
"# 6. Donut chart: BEFORE vs AFTER (fixed epsilon)\n",
"# ============================================================\n",
"fig = go.Figure()\n",
"\n",
"fig.add_trace(go.Pie(\n",
" labels=dist_before.index,\n",
" values=dist_before.values,\n",
" hole=0.45,\n",
" name=\"Before repair\",\n",
" domain=dict(x=[0.0, 0.48]),\n",
" textinfo=\"percent\",\n",
" hoverinfo=\"label+percent\"\n",
"))\n",
"\n",
"fig.add_trace(go.Pie(\n",
" labels=dist_after.index,\n",
" values=dist_after.values,\n",
" hole=0.45,\n",
" name=\"After repair\",\n",
" domain=dict(x=[0.52, 1.0]),\n",
" textinfo=\"percent\",\n",
" hoverinfo=\"label+percent\"\n",
"))\n",
"\n",
"fig.update_layout(\n",
" title=\"Distribution of AUMflow rupture intensity before vs after repair (fixed ε)\",\n",
" annotations=[\n",
" dict(text=\"Before repair\", x=0.24, y=0.5, showarrow=False),\n",
" dict(text=\"After repair\", x=0.76, y=0.5, showarrow=False),\n",
" ],\n",
" legend=dict(\n",
" orientation=\"h\",\n",
" yanchor=\"top\",\n",
" y=-0.15,\n",
" xanchor=\"center\",\n",
" x=0.5\n",
" ),\n",
" legend_title_text=\"Rupture ratio\"\n",
")\n",
"\n",
"fig.show()\n"
]
},
{
"cell_type": "code",
"execution_count": 31,
"id": "70cf0a99-bd19-41a9-9574-88647fde09ca",
"metadata": {},
"outputs": [
{
"ename": "KeyError",
"evalue": "\"['Quantity - AUM', 'corrected_aum', 'Quantity - NetFlows', 'expected_stock', 'expected_stock_corr', 'gap_before', 'gap_after'] not in index\"",
"output_type": "error",
"traceback": [
"\u001b[31m---------------------------------------------------------------------------\u001b[39m",
"\u001b[31mKeyError\u001b[39m Traceback (most recent call last)",
"\u001b[36mCell\u001b[39m\u001b[36m \u001b[39m\u001b[32mIn[31]\u001b[39m\u001b[32m, line 10\u001b[39m\n\u001b[32m 5\u001b[39m df_final = df.copy()\n\u001b[32m 7\u001b[39m \u001b[38;5;66;03m# ------------------------------------------------------------\u001b[39;00m\n\u001b[32m 8\u001b[39m \u001b[38;5;66;03m# Core variables (before / after)\u001b[39;00m\n\u001b[32m 9\u001b[39m \u001b[38;5;66;03m# ------------------------------------------------------------\u001b[39;00m\n\u001b[32m---> \u001b[39m\u001b[32m10\u001b[39m df_final = \u001b[43mdf_final\u001b[49m\u001b[43m[\u001b[49m\u001b[43m[\u001b[49m\n\u001b[32m 11\u001b[39m \u001b[43m \u001b[49m\u001b[33;43m\"\u001b[39;49m\u001b[33;43mRegistrar Account - ID\u001b[39;49m\u001b[33;43m\"\u001b[39;49m\u001b[43m,\u001b[49m\n\u001b[32m 12\u001b[39m \u001b[43m \u001b[49m\u001b[33;43m\"\u001b[39;49m\u001b[33;43mProduct - Isin\u001b[39;49m\u001b[33;43m\"\u001b[39;49m\u001b[43m,\u001b[49m\n\u001b[32m 13\u001b[39m \u001b[43m \u001b[49m\u001b[33;43m\"\u001b[39;49m\u001b[33;43mCentralisation Date\u001b[39;49m\u001b[33;43m\"\u001b[39;49m\u001b[43m,\u001b[49m\n\u001b[32m 14\u001b[39m \u001b[43m \u001b[49m\u001b[33;43m\"\u001b[39;49m\u001b[33;43mQuantity - AUM\u001b[39;49m\u001b[33;43m\"\u001b[39;49m\u001b[43m,\u001b[49m\n\u001b[32m 15\u001b[39m \u001b[43m \u001b[49m\u001b[33;43m\"\u001b[39;49m\u001b[33;43mcorrected_aum\u001b[39;49m\u001b[33;43m\"\u001b[39;49m\u001b[43m,\u001b[49m\n\u001b[32m 16\u001b[39m \u001b[43m \u001b[49m\u001b[33;43m\"\u001b[39;49m\u001b[33;43mQuantity - NetFlows\u001b[39;49m\u001b[33;43m\"\u001b[39;49m\u001b[43m,\u001b[49m\n\u001b[32m 17\u001b[39m \u001b[43m \u001b[49m\u001b[33;43m\"\u001b[39;49m\u001b[33;43mexpected_stock\u001b[39;49m\u001b[33;43m\"\u001b[39;49m\u001b[43m,\u001b[49m\n\u001b[32m 18\u001b[39m \u001b[43m \u001b[49m\u001b[33;43m\"\u001b[39;49m\u001b[33;43mexpected_stock_corr\u001b[39;49m\u001b[33;43m\"\u001b[39;49m\u001b[43m,\u001b[49m\n\u001b[32m 19\u001b[39m \u001b[43m \u001b[49m\u001b[33;43m\"\u001b[39;49m\u001b[33;43mgap_before\u001b[39;49m\u001b[33;43m\"\u001b[39;49m\u001b[43m,\u001b[49m\n\u001b[32m 20\u001b[39m \u001b[43m \u001b[49m\u001b[33;43m\"\u001b[39;49m\u001b[33;43mgap_after\u001b[39;49m\u001b[33;43m\"\u001b[39;49m\u001b[43m,\u001b[49m\n\u001b[32m 21\u001b[39m \u001b[43m \u001b[49m\u001b[33;43m\"\u001b[39;49m\u001b[33;43mrepair_flag\u001b[39;49m\u001b[33;43m\"\u001b[39;49m\n\u001b[32m 22\u001b[39m \u001b[43m]\u001b[49m\u001b[43m]\u001b[49m.rename(columns={\n\u001b[32m 23\u001b[39m \u001b[33m\"\u001b[39m\u001b[33mQuantity - AUM\u001b[39m\u001b[33m\"\u001b[39m: \u001b[33m\"\u001b[39m\u001b[33maum_raw\u001b[39m\u001b[33m\"\u001b[39m,\n\u001b[32m 24\u001b[39m \u001b[33m\"\u001b[39m\u001b[33mcorrected_aum\u001b[39m\u001b[33m\"\u001b[39m: \u001b[33m\"\u001b[39m\u001b[33maum_repaired\u001b[39m\u001b[33m\"\u001b[39m,\n\u001b[32m 25\u001b[39m \u001b[33m\"\u001b[39m\u001b[33mQuantity - NetFlows\u001b[39m\u001b[33m\"\u001b[39m: \u001b[33m\"\u001b[39m\u001b[33mflows\u001b[39m\u001b[33m\"\u001b[39m,\n\u001b[32m 26\u001b[39m \u001b[33m\"\u001b[39m\u001b[33mexpected_stock\u001b[39m\u001b[33m\"\u001b[39m: \u001b[33m\"\u001b[39m\u001b[33mexpected_aum_raw\u001b[39m\u001b[33m\"\u001b[39m,\n\u001b[32m 27\u001b[39m \u001b[33m\"\u001b[39m\u001b[33mexpected_stock_corr\u001b[39m\u001b[33m\"\u001b[39m: \u001b[33m\"\u001b[39m\u001b[33mexpected_aum_repaired\u001b[39m\u001b[33m\"\u001b[39m\n\u001b[32m 28\u001b[39m })\n\u001b[32m 30\u001b[39m \u001b[38;5;66;03m# ------------------------------------------------------------\u001b[39;00m\n\u001b[32m 31\u001b[39m \u001b[38;5;66;03m# Relative gaps\u001b[39;00m\n\u001b[32m 32\u001b[39m \u001b[38;5;66;03m# ------------------------------------------------------------\u001b[39;00m\n\u001b[32m 33\u001b[39m df_final[\u001b[33m\"\u001b[39m\u001b[33mgap_rel_before\u001b[39m\u001b[33m\"\u001b[39m] = (\n\u001b[32m 34\u001b[39m df_final[\u001b[33m\"\u001b[39m\u0
"\u001b[36mFile \u001b[39m\u001b[32m/opt/python/lib/python3.13/site-packages/pandas/core/frame.py:4119\u001b[39m, in \u001b[36mDataFrame.__getitem__\u001b[39m\u001b[34m(self, key)\u001b[39m\n\u001b[32m 4117\u001b[39m \u001b[38;5;28;01mif\u001b[39;00m is_iterator(key):\n\u001b[32m 4118\u001b[39m key = \u001b[38;5;28mlist\u001b[39m(key)\n\u001b[32m-> \u001b[39m\u001b[32m4119\u001b[39m indexer = \u001b[38;5;28;43mself\u001b[39;49m\u001b[43m.\u001b[49m\u001b[43mcolumns\u001b[49m\u001b[43m.\u001b[49m\u001b[43m_get_indexer_strict\u001b[49m\u001b[43m(\u001b[49m\u001b[43mkey\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[33;43m\"\u001b[39;49m\u001b[33;43mcolumns\u001b[39;49m\u001b[33;43m\"\u001b[39;49m\u001b[43m)\u001b[49m[\u001b[32m1\u001b[39m]\n\u001b[32m 4121\u001b[39m \u001b[38;5;66;03m# take() does not accept boolean indexers\u001b[39;00m\n\u001b[32m 4122\u001b[39m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28mgetattr\u001b[39m(indexer, \u001b[33m\"\u001b[39m\u001b[33mdtype\u001b[39m\u001b[33m\"\u001b[39m, \u001b[38;5;28;01mNone\u001b[39;00m) == \u001b[38;5;28mbool\u001b[39m:\n",
"\u001b[36mFile \u001b[39m\u001b[32m/opt/python/lib/python3.13/site-packages/pandas/core/indexes/base.py:6212\u001b[39m, in \u001b[36mIndex._get_indexer_strict\u001b[39m\u001b[34m(self, key, axis_name)\u001b[39m\n\u001b[32m 6209\u001b[39m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[32m 6210\u001b[39m keyarr, indexer, new_indexer = \u001b[38;5;28mself\u001b[39m._reindex_non_unique(keyarr)\n\u001b[32m-> \u001b[39m\u001b[32m6212\u001b[39m \u001b[38;5;28;43mself\u001b[39;49m\u001b[43m.\u001b[49m\u001b[43m_raise_if_missing\u001b[49m\u001b[43m(\u001b[49m\u001b[43mkeyarr\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mindexer\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43maxis_name\u001b[49m\u001b[43m)\u001b[49m\n\u001b[32m 6214\u001b[39m keyarr = \u001b[38;5;28mself\u001b[39m.take(indexer)\n\u001b[32m 6215\u001b[39m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28misinstance\u001b[39m(key, Index):\n\u001b[32m 6216\u001b[39m \u001b[38;5;66;03m# GH 42790 - Preserve name from an Index\u001b[39;00m\n",
"\u001b[36mFile \u001b[39m\u001b[32m/opt/python/lib/python3.13/site-packages/pandas/core/indexes/base.py:6264\u001b[39m, in \u001b[36mIndex._raise_if_missing\u001b[39m\u001b[34m(self, key, indexer, axis_name)\u001b[39m\n\u001b[32m 6261\u001b[39m \u001b[38;5;28;01mraise\u001b[39;00m \u001b[38;5;167;01mKeyError\u001b[39;00m(\u001b[33mf\u001b[39m\u001b[33m\"\u001b[39m\u001b[33mNone of [\u001b[39m\u001b[38;5;132;01m{\u001b[39;00mkey\u001b[38;5;132;01m}\u001b[39;00m\u001b[33m] are in the [\u001b[39m\u001b[38;5;132;01m{\u001b[39;00maxis_name\u001b[38;5;132;01m}\u001b[39;00m\u001b[33m]\u001b[39m\u001b[33m\"\u001b[39m)\n\u001b[32m 6263\u001b[39m not_found = \u001b[38;5;28mlist\u001b[39m(ensure_index(key)[missing_mask.nonzero()[\u001b[32m0\u001b[39m]].unique())\n\u001b[32m-> \u001b[39m\u001b[32m6264\u001b[39m \u001b[38;5;28;01mraise\u001b[39;00m \u001b[38;5;167;01mKeyError\u001b[39;00m(\u001b[33mf\u001b[39m\u001b[33m\"\u001b[39m\u001b[38;5;132;01m{\u001b[39;00mnot_found\u001b[38;5;132;01m}\u001b[39;00m\u001b[33m not in index\u001b[39m\u001b[33m\"\u001b[39m)\n",
"\u001b[31mKeyError\u001b[39m: \"['Quantity - AUM', 'corrected_aum', 'Quantity - NetFlows', 'expected_stock', 'expected_stock_corr', 'gap_before', 'gap_after'] not in index\""
]
}
],
"source": [
"# ============================================================\n",
"# FINAL DATASETS AFTER REPAIR\n",
"# ============================================================\n",
"\n",
"df_final = df.copy()\n",
"\n",
"# ------------------------------------------------------------\n",
"# Core variables (before / after)\n",
"# ------------------------------------------------------------\n",
"df_final = df_final[[\n",
" \"Registrar Account - ID\",\n",
" \"Product - Isin\",\n",
" \"Centralisation Date\",\n",
" \"Quantity - AUM\",\n",
" \"corrected_aum\",\n",
" \"Quantity - NetFlows\",\n",
" \"expected_stock\",\n",
" \"expected_stock_corr\",\n",
" \"gap_before\",\n",
" \"gap_after\",\n",
" \"repair_flag\"\n",
"]].rename(columns={\n",
" \"Quantity - AUM\": \"aum_raw\",\n",
" \"corrected_aum\": \"aum_repaired\",\n",
" \"Quantity - NetFlows\": \"flows\",\n",
" \"expected_stock\": \"expected_aum_raw\",\n",
" \"expected_stock_corr\": \"expected_aum_repaired\"\n",
"})\n",
"\n",
"# ------------------------------------------------------------\n",
"# Relative gaps\n",
"# ------------------------------------------------------------\n",
"df_final[\"gap_rel_before\"] = (\n",
" df_final[\"gap_before\"].abs() /\n",
" df_final[\"expected_aum_raw\"].abs().clip(lower=1)\n",
")\n",
"\n",
"df_final[\"gap_rel_after\"] = (\n",
" df_final[\"gap_after\"].abs() /\n",
" df_final[\"expected_aum_repaired\"].abs().clip(lower=1)\n",
")\n",
"df_final.to_csv('df_repaired.csv')"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "befb2962-73fb-4cb8-b86e-3218ec103204",
"metadata": {},
"outputs": [],
"source": [
"# ============================================================\n",
"# TYPE 3 REPAIR — TEMPORARY RESET TO ZERO (ONE BLOCK)\n",
"# ============================================================\n",
"\n",
"df_type3 = df_repaired.copy()\n",
"df_type3 = df_type3.sort_values(\n",
" [\"Registrar Account - ID\", \"Product - Isin\", \"Centralisation Date\"]\n",
")\n",
"\n",
"# Create lead/lag variables\n",
"df_type3[\"aum_prev\"] = df_type3.groupby(\n",
" [\"Registrar Account - ID\", \"Product - Isin\"]\n",
")[\"Quantity - AUM\"].shift(1)\n",
"\n",
"df_type3[\"aum_next\"] = df_type3.groupby(\n",
" [\"Registrar Account - ID\", \"Product - Isin\"]\n",
")[\"Quantity - AUM\"].shift(-1)\n",
"\n",
"df_type3[\"flow_prev\"] = df_type3.groupby(\n",
" [\"Registrar Account - ID\", \"Product - Isin\"]\n",
")[\"Quantity - NetFlows\"].shift(1)\n",
"\n",
"df_type3[\"flow_next\"] = df_type3.groupby(\n",
" [\"Registrar Account - ID\", \"Product - Isin\"]\n",
")[\"Quantity - NetFlows\"].shift(-1)\n",
"\n",
"# ------------------------------------------------------------\n",
"# Detection of temporary reset\n",
"# ------------------------------------------------------------\n",
"df_type3[\"type3_flag\"] = (\n",
" (df_type3[\"Quantity - AUM\"] == 0)\n",
" & (df_type3[\"aum_prev\"] > 0)\n",
" & (df_type3[\"aum_next\"] == df_type3[\"aum_prev\"])\n",
" & (df_type3[\"flow_prev\"].fillna(0) == 0)\n",
" & (df_type3[\"Quantity - NetFlows\"] == 0)\n",
" & (df_type3[\"flow_next\"].fillna(0) == 0)\n",
")\n",
"\n",
"# ------------------------------------------------------------\n",
"# Repair: smooth the glitch (replace 0 by previous stock)\n",
"# ------------------------------------------------------------\n",
"df_type3.loc[df_type3[\"type3_flag\"], \"Quantity - AUM\"] = (\n",
" df_type3.loc[df_type3[\"type3_flag\"], \"aum_prev\"]\n",
")\n",
"\n",
"# ------------------------------------------------------------\n",
"# Recompute temporal chain AFTER repair\n",
"# ------------------------------------------------------------\n",
"df_type3[\"prev_stock\"] = df_type3.groupby(\n",
" [\"Registrar Account - ID\", \"Product - Isin\"]\n",
")[\"Quantity - AUM\"].shift(1)\n",
"\n",
"df_type3[\"prev_flows\"] = df_type3.groupby(\n",
" [\"Registrar Account - ID\", \"Product - Isin\"]\n",
")[\"Quantity - NetFlows\"].shift(1).fillna(0)\n",
"\n",
"df_type3[\"expected_stock\"] = (\n",
" df_type3[\"prev_stock\"] + df_type3[\"prev_flows\"]\n",
")\n",
"\n",
"df_type3[\"gap\"] = df_type3[\"Quantity - AUM\"] - df_type3[\"expected_stock\"]\n",
"df_type3[\"gap_abs\"] = df_type3[\"gap\"].abs()\n",
"df_type3[\"gap_rel\"] = (\n",
" df_type3[\"gap_abs\"] /\n",
" df_type3[\"expected_stock\"].abs().clip(lower=1)\n",
")\n",
"\n",
"df_type3[\"rupture_flag\"] = (\n",
" df_type3[\"prev_stock\"].notna()\n",
" & (df_type3[\"gap_abs\"] > TAU_ABS)\n",
" & (df_type3[\"gap_rel\"] > TAU_REL)\n",
")\n",
"\n",
"# ------------------------------------------------------------\n",
"# Diagnostic output\n",
"# ------------------------------------------------------------\n",
"n_type3 = df_type3[\"type3_flag\"].sum()\n",
"print(f\"Temporary reset glitches repaired (Type 3): {n_type3}\")\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "1fc44ed4-829f-4a8a-985a-31350bdbdf6d",
"metadata": {},
"outputs": [],
"source": [
"import numpy as np\n",
"import pandas as pd\n",
"\n",
"# ------------------------------------------------------------\n",
"# 1. Sélection des ISIN avec exactement 1 rupture\n",
"# ------------------------------------------------------------\n",
"one_rupture_isin = rupture_isin_summary[\n",
" rupture_isin_summary[\"n_ruptures\"] == 1\n",
"][[\"Registrar Account - ID\", \"Product - Isin\"]].head(100)\n",
"\n",
"results = []\n",
"\n",
"# ------------------------------------------------------------\n",
"# 2. Boucle de correction test\n",
"# ------------------------------------------------------------\n",
"for _, row in one_rupture_isin.iterrows():\n",
" acc = row[\"Registrar Account - ID\"]\n",
" isin = row[\"Product - Isin\"]\n",
"\n",
" sub = df[\n",
" (df[\"Registrar Account - ID\"] == acc) &\n",
" (df[\"Product - Isin\"] == isin)\n",
" ].sort_values(\"Centralisation Date\").copy()\n",
"\n",
" # Localiser la rupture\n",
" rupture_idx = sub.index[sub[\"rupture_flag\"]]\n",
"\n",
" if sub.index.get_loc(rupture_idx[0]) > 1:\n",
" #print(sub[[\"Centralisation Date\", \"Quantity - AUM\", \"expected_stock\", \"gap\", \"rupture_flag\"]].head(100))\n",
" continue\n",
"\n",
" # Vérifier si la rupture est à la première date\n",
" first_idx = sub.index[0]\n",
" if rupture_idx[0] != first_idx:\n",
" continue\n",
"\n",
" # ----- Réparation : décaler expected_stock -----\n",
" sub[\"expected_stock_fixed\"] = sub[\"expected_stock\"].shift(-1)\n",
"\n",
" # Recalcul des gaps\n",
" sub[\"gap_fixed\"] = sub[\"Quantity - AUM\"] - sub[\"expected_stock_fixed\"]\n",
" sub[\"gap_abs_fixed\"] = sub[\"gap_fixed\"].abs()\n",
" sub[\"gap_rel_fixed\"] = sub[\"gap_abs_fixed\"] / sub[\"expected_stock_fixed\"].abs().clip(lower=1)\n",
"\n",
" # Recalcul rupture\n",
" sub[\"rupture_fixed\"] = (\n",
" sub[\"expected_stock_fixed\"].notna()\n",
" & (sub[\"gap_abs_fixed\"] > TAU_ABS)\n",
" & (sub[\"gap_rel_fixed\"] > TAU_REL)\n",
" )\n",
"\n",
" results.append({\n",
" \"Registrar Account - ID\": acc,\n",
" \"Product - Isin\": isin,\n",
" \"ruptures_before\": sub[\"rupture_flag\"].sum(),\n",
" \"ruptures_after\": sub[\"rupture_fixed\"].sum()\n",
" })\n",
"\n",
"# ------------------------------------------------------------\n",
"# 3. Résultats agrégés\n",
"# ------------------------------------------------------------\n",
"repair_test = pd.DataFrame(results)\n",
"\n",
"summary = repair_test.groupby(\n",
" [\"ruptures_before\", \"ruptures_after\"]\n",
").size().reset_index(name=\"count\")\n",
"\n",
"repair_test, summary\n"
]
},
{
"cell_type": "code",
"execution_count": 50,
"id": "d85728ca-55ba-4266-b881-23536eee4ba3",
"metadata": {},
"outputs": [
{
"ename": "KeyError",
"evalue": "\"['corrected_aum'] not in index\"",
"output_type": "error",
"traceback": [
"\u001b[31m---------------------------------------------------------------------------\u001b[39m",
"\u001b[31mKeyError\u001b[39m Traceback (most recent call last)",
"\u001b[36mCell\u001b[39m\u001b[36m \u001b[39m\u001b[32mIn[50]\u001b[39m\u001b[32m, line 16\u001b[39m\n\u001b[32m 10\u001b[39m stocks_repaired[\u001b[33m\"\u001b[39m\u001b[33mCentralisation Date\u001b[39m\u001b[33m\"\u001b[39m] = pd.to_datetime(\n\u001b[32m 11\u001b[39m stocks_repaired[\u001b[33m\"\u001b[39m\u001b[33mCentralisation Date\u001b[39m\u001b[33m\"\u001b[39m]\n\u001b[32m 12\u001b[39m )\n\u001b[32m 14\u001b[39m \u001b[38;5;66;03m# 2. Build repair map\u001b[39;00m\n\u001b[32m 15\u001b[39m repair_map = (\n\u001b[32m---> \u001b[39m\u001b[32m16\u001b[39m \u001b[43mdf\u001b[49m\u001b[43m[\u001b[49m\u001b[43m[\u001b[49m\n\u001b[32m 17\u001b[39m \u001b[43m \u001b[49m\u001b[33;43m\"\u001b[39;49m\u001b[33;43mRegistrar Account - ID\u001b[39;49m\u001b[33;43m\"\u001b[39;49m\u001b[43m,\u001b[49m\n\u001b[32m 18\u001b[39m \u001b[43m \u001b[49m\u001b[33;43m\"\u001b[39;49m\u001b[33;43mProduct - Isin\u001b[39;49m\u001b[33;43m\"\u001b[39;49m\u001b[43m,\u001b[49m\n\u001b[32m 19\u001b[39m \u001b[43m \u001b[49m\u001b[33;43m\"\u001b[39;49m\u001b[33;43mCentralisation Date\u001b[39;49m\u001b[33;43m\"\u001b[39;49m\u001b[43m,\u001b[49m\n\u001b[32m 20\u001b[39m \u001b[43m \u001b[49m\u001b[33;43m\"\u001b[39;49m\u001b[33;43mcorrected_aum\u001b[39;49m\u001b[33;43m\"\u001b[39;49m\u001b[43m,\u001b[49m\n\u001b[32m 21\u001b[39m \u001b[43m \u001b[49m\u001b[33;43m\"\u001b[39;49m\u001b[33;43mrepair_flag\u001b[39;49m\u001b[33;43m\"\u001b[39;49m\n\u001b[32m 22\u001b[39m \u001b[43m \u001b[49m\u001b[43m]\u001b[49m\u001b[43m]\u001b[49m\n\u001b[32m 23\u001b[39m .rename(columns={\u001b[33m\"\u001b[39m\u001b[33mcorrected_aum\u001b[39m\u001b[33m\"\u001b[39m: \u001b[33m\"\u001b[39m\u001b[33mQuantity - AUM repaired\u001b[39m\u001b[33m\"\u001b[39m})\n\u001b[32m 24\u001b[39m )\n\u001b[32m 26\u001b[39m \u001b[38;5;66;03m# 3. Merge repaired quantities\u001b[39;00m\n\u001b[32m 27\u001b[39m stocks_repaired = stocks_repaired.merge(\n\u001b[32m 28\u001b[39m repair_map,\n\u001b[32m 29\u001b[39m on=[\u001b[33m\"\u001b[39m\u001b[33mRegistrar Account - ID\u001b[39m\u001b[33m\"\u001b[39m, \u001b[33m\"\u001b[39m\u001b[33mProduct - Isin\u001b[39m\u001b[33m\"\u001b[39m, \u001b[33m\"\u001b[39m\u001b[33mCentralisation Date\u001b[39m\u001b[33m\"\u001b[39m],\n\u001b[32m 30\u001b[39m how=\u001b[33m\"\u001b[39m\u001b[33mleft\u001b[39m\u001b[33m\"\u001b[39m\n\u001b[32m 31\u001b[39m )\n",
"\u001b[36mFile \u001b[39m\u001b[32m/opt/python/lib/python3.13/site-packages/pandas/core/frame.py:4119\u001b[39m, in \u001b[36mDataFrame.__getitem__\u001b[39m\u001b[34m(self, key)\u001b[39m\n\u001b[32m 4117\u001b[39m \u001b[38;5;28;01mif\u001b[39;00m is_iterator(key):\n\u001b[32m 4118\u001b[39m key = \u001b[38;5;28mlist\u001b[39m(key)\n\u001b[32m-> \u001b[39m\u001b[32m4119\u001b[39m indexer = \u001b[38;5;28;43mself\u001b[39;49m\u001b[43m.\u001b[49m\u001b[43mcolumns\u001b[49m\u001b[43m.\u001b[49m\u001b[43m_get_indexer_strict\u001b[49m\u001b[43m(\u001b[49m\u001b[43mkey\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[33;43m\"\u001b[39;49m\u001b[33;43mcolumns\u001b[39;49m\u001b[33;43m\"\u001b[39;49m\u001b[43m)\u001b[49m[\u001b[32m1\u001b[39m]\n\u001b[32m 4121\u001b[39m \u001b[38;5;66;03m# take() does not accept boolean indexers\u001b[39;00m\n\u001b[32m 4122\u001b[39m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28mgetattr\u001b[39m(indexer, \u001b[33m\"\u001b[39m\u001b[33mdtype\u001b[39m\u001b[33m\"\u001b[39m, \u001b[38;5;28;01mNone\u001b[39;00m) == \u001b[38;5;28mbool\u001b[39m:\n",
"\u001b[36mFile \u001b[39m\u001b[32m/opt/python/lib/python3.13/site-packages/pandas/core/indexes/base.py:6212\u001b[39m, in \u001b[36mIndex._get_indexer_strict\u001b[39m\u001b[34m(self, key, axis_name)\u001b[39m\n\u001b[32m 6209\u001b[39m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[32m 6210\u001b[39m keyarr, indexer, new_indexer = \u001b[38;5;28mself\u001b[39m._reindex_non_unique(keyarr)\n\u001b[32m-> \u001b[39m\u001b[32m6212\u001b[39m \u001b[38;5;28;43mself\u001b[39;49m\u001b[43m.\u001b[49m\u001b[43m_raise_if_missing\u001b[49m\u001b[43m(\u001b[49m\u001b[43mkeyarr\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mindexer\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43maxis_name\u001b[49m\u001b[43m)\u001b[49m\n\u001b[32m 6214\u001b[39m keyarr = \u001b[38;5;28mself\u001b[39m.take(indexer)\n\u001b[32m 6215\u001b[39m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28misinstance\u001b[39m(key, Index):\n\u001b[32m 6216\u001b[39m \u001b[38;5;66;03m# GH 42790 - Preserve name from an Index\u001b[39;00m\n",
"\u001b[36mFile \u001b[39m\u001b[32m/opt/python/lib/python3.13/site-packages/pandas/core/indexes/base.py:6264\u001b[39m, in \u001b[36mIndex._raise_if_missing\u001b[39m\u001b[34m(self, key, indexer, axis_name)\u001b[39m\n\u001b[32m 6261\u001b[39m \u001b[38;5;28;01mraise\u001b[39;00m \u001b[38;5;167;01mKeyError\u001b[39;00m(\u001b[33mf\u001b[39m\u001b[33m\"\u001b[39m\u001b[33mNone of [\u001b[39m\u001b[38;5;132;01m{\u001b[39;00mkey\u001b[38;5;132;01m}\u001b[39;00m\u001b[33m] are in the [\u001b[39m\u001b[38;5;132;01m{\u001b[39;00maxis_name\u001b[38;5;132;01m}\u001b[39;00m\u001b[33m]\u001b[39m\u001b[33m\"\u001b[39m)\n\u001b[32m 6263\u001b[39m not_found = \u001b[38;5;28mlist\u001b[39m(ensure_index(key)[missing_mask.nonzero()[\u001b[32m0\u001b[39m]].unique())\n\u001b[32m-> \u001b[39m\u001b[32m6264\u001b[39m \u001b[38;5;28;01mraise\u001b[39;00m \u001b[38;5;167;01mKeyError\u001b[39;00m(\u001b[33mf\u001b[39m\u001b[33m\"\u001b[39m\u001b[38;5;132;01m{\u001b[39;00mnot_found\u001b[38;5;132;01m}\u001b[39;00m\u001b[33m not in index\u001b[39m\u001b[33m\"\u001b[39m)\n",
"\u001b[31mKeyError\u001b[39m: \"['corrected_aum'] not in index\""
]
}
],
"source": [
"import pandas as pd\n",
"import numpy as np\n",
"\n",
"# ============================================================\n",
"# Rebuild STOCKS dataset using repaired AUM quantities\n",
"# ============================================================\n",
"\n",
"# 1. Copy original stocks\n",
"stocks_repaired = stocks.copy()\n",
"stocks_repaired[\"Centralisation Date\"] = pd.to_datetime(\n",
" stocks_repaired[\"Centralisation Date\"]\n",
")\n",
"\n",
"# 2. Build repair map\n",
"repair_map = (\n",
" df[[\n",
" \"Registrar Account - ID\",\n",
" \"Product - Isin\",\n",
" \"Centralisation Date\",\n",
" \"corrected_aum\",\n",
" \"repair_flag\"\n",
" ]]\n",
" .rename(columns={\"corrected_aum\": \"Quantity - AUM repaired\"})\n",
")\n",
"\n",
"# 3. Merge repaired quantities\n",
"stocks_repaired = stocks_repaired.merge(\n",
" repair_map,\n",
" on=[\"Registrar Account - ID\", \"Product - Isin\", \"Centralisation Date\"],\n",
" how=\"left\"\n",
")\n",
"\n",
"# 4. Store original quantity\n",
"stocks_repaired[\"Quantity - AUM original\"] = stocks_repaired[\"Quantity - AUM\"]\n",
"\n",
"# 5. Replace Quantity - AUM where repaired\n",
"stocks_repaired[\"Quantity - AUM\"] = np.where(\n",
" stocks_repaired[\"repair_flag\"] == True,\n",
" stocks_repaired[\"Quantity - AUM repaired\"],\n",
" stocks_repaired[\"Quantity - AUM\"]\n",
")\n",
"\n",
"# 6. Recompute monetary values (unit value unchanged)\n",
"stocks_repaired[\"nav_ccy\"] = (\n",
" stocks_repaired[\"Value - AUM CCY\"] /\n",
" stocks_repaired[\"Quantity - AUM original\"]\n",
")\n",
"\n",
"stocks_repaired[\"nav_eur\"] = (\n",
" stocks_repaired[\"Value - AUM €\"] /\n",
" stocks_repaired[\"Quantity - AUM original\"]\n",
")\n",
"\n",
"stocks_repaired[\"Value - AUM CCY\"] = (\n",
" stocks_repaired[\"Quantity - AUM\"] *\n",
" stocks_repaired[\"nav_ccy\"]\n",
")\n",
"\n",
"stocks_repaired[\"Value - AUM €\"] = (\n",
" stocks_repaired[\"Quantity - AUM\"] *\n",
" stocks_repaired[\"nav_eur\"]\n",
")\n",
"\n",
"# 7. Cleanup helper columns\n",
"stocks_repaired = stocks_repaired.drop(\n",
" columns=[\n",
" \"Quantity - AUM repaired\",\n",
" \"Quantity - AUM original\",\n",
" \"nav_ccy\",\n",
" \"nav_eur\"\n",
" ]\n",
")\n",
"\n",
"# ============================================================\n",
"# Sanity checks (CORRECT WAY)\n",
"# ============================================================\n",
"\n",
"# Share of observations repaired\n",
"repair_share = stocks_repaired[\"repair_flag\"].mean()\n",
"\n",
"# Ensure only repaired points were modified\n",
"n_modified = stocks_repaired[\"repair_flag\"].sum()\n",
"\n",
"print(f\"Share of repaired observations: {repair_share:.4%}\")\n",
"print(f\"Number of repaired rows: {n_modified:,}\")\n",
"\n",
"stocks_repaired.to_csv('AUM_repaired.csv')"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "5f262605-49e8-4304-b11e-38c8bcfc6e3f",
"metadata": {},
"outputs": [],
"source": [
"print(stocks[\"Registrar Account - ID\"].nunique())\n",
"print(df[\"Registrar Account - ID\"].nunique())"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "37e9b599-aa51-4e03-b23c-2dd24e77fe38",
"metadata": {},
"outputs": [],
"source": [
"import pandas as pd\n",
"\n",
"df = pd.read_csv(\"AUM_repaired.csv\")\n",
"\n",
"print(df.columns)"
]
},
{
"cell_type": "code",
"execution_count": 20,
"id": "5cfb4526-7435-4e4a-ae48-0a8d40e39d81",
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"/tmp/ipykernel_1311/55327206.py:8: DtypeWarning:\n",
"\n",
"Columns (1,2,3,4) have mixed types. Specify dtype option on import or set low_memory=False.\n",
"\n",
"/tmp/ipykernel_1311/55327206.py:9: DtypeWarning:\n",
"\n",
"Columns (2,3,4,5) have mixed types. Specify dtype option on import or set low_memory=False.\n",
"\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"Merged dataset size: (9033269, 6)\n",
"\n",
"NUMBER OF MODIFIED OBSERVATIONS: 2263602\n",
"Share modified: 25.06 %\n",
"\n",
"NEGATIVE AUM\n",
"Before repair: 34374\n",
"After repair : 36320\n",
"\n",
"RAW AUM DISTRIBUTION\n",
"count 9.033269e+06\n",
"mean 9.106935e+03\n",
"std 1.915018e+05\n",
"min -9.918641e+06\n",
"25% 0.000000e+00\n",
"50% 0.000000e+00\n",
"75% 3.091340e+02\n",
"max 4.256300e+07\n",
"Name: Quantity - AUM_raw, dtype: float64\n",
"\n",
"REPAIRED AUM DISTRIBUTION\n",
"count 9.033269e+06\n",
"mean 9.104329e+03\n",
"std 1.914988e+05\n",
"min -9.918641e+06\n",
"25% 0.000000e+00\n",
"50% 0.000000e+00\n",
"75% 3.088430e+02\n",
"max 4.256300e+07\n",
"Name: Quantity - AUM_repaired, dtype: float64\n",
"\n",
"TOTAL AUM\n",
"Raw total : 82265397351.45718\n",
"Repaired total : 82241848877.5126\n",
"\n",
"TOP 20 AUM CHANGES\n",
" Registrar Account - ID Product - Isin Centralisation Date \\\n",
"8532368 OFF DISTRIBUTION LU0992627611 2021-12-31 \n",
"8532369 OFF DISTRIBUTION LU0992627611 2021-12-31 \n",
"8532370 OFF DISTRIBUTION LU0992627611 2021-12-31 \n",
"8477988 OFF DISTRIBUTION LU0992627611 2021-12-31 \n",
"8477987 OFF DISTRIBUTION LU0992627611 2021-12-31 \n",
"8477986 OFF DISTRIBUTION LU0992627611 2021-12-31 \n",
"8477989 OFF DISTRIBUTION LU0992627611 2021-12-31 \n",
"8532371 OFF DISTRIBUTION LU0992627611 2021-12-31 \n",
"8477994 OFF DISTRIBUTION LU0992627611 2021-12-31 \n",
"8477996 OFF DISTRIBUTION LU0992627611 2021-12-31 \n",
"8477997 OFF DISTRIBUTION LU0992627611 2021-12-31 \n",
"8928641 OFF DISTRIBUTION LU0992627611 2021-12-31 \n",
"8928642 OFF DISTRIBUTION LU0992627611 2021-12-31 \n",
"8928643 OFF DISTRIBUTION LU0992627611 2021-12-31 \n",
"8928644 OFF DISTRIBUTION LU0992627611 2021-12-31 \n",
"8477995 OFF DISTRIBUTION LU0992627611 2021-12-31 \n",
"8532359 OFF DISTRIBUTION LU0992627611 2021-11-30 \n",
"8713983 OFF DISTRIBUTION LU0992627611 2021-11-30 \n",
"8713984 OFF DISTRIBUTION LU0992627611 2021-11-30 \n",
"8532357 OFF DISTRIBUTION LU0992627611 2021-11-30 \n",
"\n",
" Quantity - AUM_raw Quantity - AUM_repaired aum_diff \n",
"8532368 41251.971 5298781.613 5257529.642 \n",
"8532369 41251.971 5298781.613 5257529.642 \n",
"8532370 41251.971 5298781.613 5257529.642 \n",
"8477988 5298781.613 41251.971 -5257529.642 \n",
"8477987 5298781.613 41251.971 -5257529.642 \n",
"8477986 5298781.613 41251.971 -5257529.642 \n",
"8477989 5298781.613 41251.971 -5257529.642 \n",
"8532371 41251.971 5298781.613 5257529.642 \n",
"8477994 5298781.613 128141.894 -5170639.719 \n",
"8477996 5298781.613 128141.894 -5170639.719 \n",
"8477997 5298781.613 128141.894 -5170639.719 \n",
"8928641 128141.894 5298781.613 5170639.719 \n",
"8928642 128141.894 5298781.613 5170639.719 \n",
"8928643 128141.894 5298781.613 5170639.719 \n",
"8928644 128141.894 5298781.613 5170639.719 \n",
"8477995 5298781.613 128141.894 -5170639.719 \n",
"8532359 41251.971 5059704.980 5018453.009 \n",
"8713983 5059704.980 41251.971 -5018453.009 \n",
"8713984 5059704.980 41251.971 -5018453.009 \n",
"8532357 41251.971 5059704.980 5018453.009 \n",
"\n",
"ISIN WITH MOST MODIFICATIONS\n",
"Product - Isin\n",
"LU1623762769 0.535539\n",
"LU2490324410 0.525588\n",
"FR0013516044 0.524862\n",
"LU2931971050 0.500000\n",
"LU2931971217 0.500000\n",
"FR001400TVB3 0.500000\n",
"FR001400TU23 0.500000\n",
"FR00140139F6 0.500000\n",
"FR001400TVD9 0.500000\n",
"LU2931971134 0.500000\n",
"Name: aum_diff, dtype: float64\n",
"\n",
"REPAIR FLAG ERRORS: 2260454\n",
"\n",
"==============================\n",
"COMPARISON COMPLETED\n",
"==============================\n"
]
}
],
"source": [
"import pandas as pd\n",
"import numpy as np\n",
"\n",
"# ============================================================\n",
"# LOAD DATA\n",
"# ============================================================\n",
"\n",
"aum_raw = pd.read_csv(\"stocks.csv\") # fichier original\n",
"aum_rep = pd.read_csv(\"AUM_repaired.csv\") # fichier réparé\n",
"\n",
"aum_raw[\"Centralisation Date\"] = pd.to_datetime(aum_raw[\"Centralisation Date\"])\n",
"aum_rep[\"Centralisation Date\"] = pd.to_datetime(aum_rep[\"Centralisation Date\"])\n",
"\n",
"\n",
"# ============================================================\n",
"# KEEP SAME KEYS\n",
"# ============================================================\n",
"\n",
"keys = [\n",
" \"Registrar Account - ID\",\n",
" \"Product - Isin\",\n",
" \"Centralisation Date\"\n",
"]\n",
"\n",
"aum_raw = aum_raw[keys + [\"Quantity - AUM\"]]\n",
"aum_rep = aum_rep[keys + [\"Quantity - AUM\", \"repair_flag\"]]\n",
"\n",
"\n",
"# ============================================================\n",
"# MERGE DATASETS\n",
"# ============================================================\n",
"\n",
"df = aum_raw.merge(\n",
" aum_rep,\n",
" on=keys,\n",
" how=\"inner\",\n",
" suffixes=(\"_raw\", \"_repaired\")\n",
")\n",
"\n",
"print(\"Merged dataset size:\", df.shape)\n",
"\n",
"\n",
"# ============================================================\n",
"# 1. HOW MANY VALUES CHANGED\n",
"# ============================================================\n",
"\n",
"df[\"aum_diff\"] = df[\"Quantity - AUM_repaired\"] - df[\"Quantity - AUM_raw\"]\n",
"\n",
"n_changed = (df[\"aum_diff\"] != 0).sum()\n",
"\n",
"print(\"\\nNUMBER OF MODIFIED OBSERVATIONS:\", n_changed)\n",
"print(\"Share modified:\", round(n_changed / len(df) * 100, 2), \"%\")\n",
"\n",
"\n",
"# ============================================================\n",
"# 2. NEGATIVE AUM BEFORE / AFTER\n",
"# ============================================================\n",
"\n",
"neg_before = (df[\"Quantity - AUM_raw\"] < 0).sum()\n",
"neg_after = (df[\"Quantity - AUM_repaired\"] < 0).sum()\n",
"\n",
"print(\"\\nNEGATIVE AUM\")\n",
"print(\"Before repair:\", neg_before)\n",
"print(\"After repair :\", neg_after)\n",
"\n",
"\n",
"# ============================================================\n",
"# 3. DISTRIBUTION COMPARISON\n",
"# ============================================================\n",
"\n",
"print(\"\\nRAW AUM DISTRIBUTION\")\n",
"print(df[\"Quantity - AUM_raw\"].describe())\n",
"\n",
"print(\"\\nREPAIRED AUM DISTRIBUTION\")\n",
"print(df[\"Quantity - AUM_repaired\"].describe())\n",
"\n",
"\n",
"# ============================================================\n",
"# 4. TOTAL AUM COMPARISON\n",
"# ============================================================\n",
"\n",
"print(\"\\nTOTAL AUM\")\n",
"\n",
"print(\"Raw total :\", df[\"Quantity - AUM_raw\"].sum())\n",
"print(\"Repaired total :\", df[\"Quantity - AUM_repaired\"].sum())\n",
"\n",
"\n",
"# ============================================================\n",
"# 5. LARGEST MODIFICATIONS\n",
"# ============================================================\n",
"\n",
"largest_changes = df.sort_values(\n",
" \"aum_diff\",\n",
" key=lambda x: x.abs(),\n",
" ascending=False\n",
").head(20)\n",
"\n",
"print(\"\\nTOP 20 AUM CHANGES\")\n",
"\n",
"print(\n",
" largest_changes[\n",
" [\n",
" \"Registrar Account - ID\",\n",
" \"Product - Isin\",\n",
" \"Centralisation Date\",\n",
" \"Quantity - AUM_raw\",\n",
" \"Quantity - AUM_repaired\",\n",
" \"aum_diff\"\n",
" ]\n",
" ]\n",
")\n",
"\n",
"\n",
"# ============================================================\n",
"# 6. WHICH ISIN WERE MOST MODIFIED\n",
"# ============================================================\n",
"\n",
"isin_changes = (\n",
" df.groupby(\"Product - Isin\")[\"aum_diff\"]\n",
" .apply(lambda x: (x != 0).mean())\n",
" .sort_values(ascending=False)\n",
" .head(10)\n",
")\n",
"\n",
"print(\"\\nISIN WITH MOST MODIFICATIONS\")\n",
"print(isin_changes)\n",
"\n",
"\n",
"# ============================================================\n",
"# 7. CHECK REPAIR FLAG CONSISTENCY\n",
"# ============================================================\n",
"\n",
"if \"repair_flag\" in df.columns:\n",
"\n",
" repair_flag_errors = (\n",
" (df[\"repair_flag\"] == False) &\n",
" (df[\"Quantity - AUM_raw\"] != df[\"Quantity - AUM_repaired\"])\n",
" ).sum()\n",
"\n",
" print(\"\\nREPAIR FLAG ERRORS:\", repair_flag_errors)\n",
"\n",
"\n",
"print(\"\\n==============================\")\n",
"print(\"COMPARISON COMPLETED\")\n",
"print(\"==============================\")"
]
},
{
"cell_type": "code",
"execution_count": 48,
"id": "976dd82c-5c16-44e6-aa5d-65d085714b25",
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"/tmp/ipykernel_1311/1498669893.py:8: DtypeWarning:\n",
"\n",
"Columns (2,3,4,5) have mixed types. Specify dtype option on import or set low_memory=False.\n",
"\n"
]
}
],
"source": [
"import pandas as pd\n",
"import numpy as np\n",
"import plotly.graph_objects as go\n",
"\n",
"# ============================================================\n",
"# 1. LOAD DATA\n",
"# ============================================================\n",
"aum = pd.read_csv(\"AUM_repaired.csv\")\n",
"\n",
"flows[\"Centralisation Date\"] = pd.to_datetime(flows[\"Centralisation Date\"])\n",
"aum[\"Centralisation Date\"] = pd.to_datetime(aum[\"Centralisation Date\"])"
]
},
{
"cell_type": "code",
"execution_count": 49,
"id": "66c011b5-aed1-428e-bd18-44d8d814c283",
"metadata": {},
"outputs": [
{
"data": {
"application/vnd.plotly.v1+json": {
"config": {
"plotlyServerURL": "https://plot.ly"
},
"data": [
{
"hole": 0.45,
"hoverinfo": "label+percent",
"labels": [
"Clean / quasi-clean (≤1%)",
"Moderate (110%)",
"High (1030%)",
"Severe (>30%)"
],
"textinfo": "percent",
"type": "pie",
"values": {
"bdata": "mpmZmZlZR0BmZmZmZmY7QM3MzMzMzCpAAAAAAAAAKUA=",
"dtype": "f8"
}
}
],
"layout": {
"legend": {
"orientation": "h",
"title": {
"text": "Rupture ratio"
},
"x": 0.5,
"xanchor": "center",
"y": -0.15,
"yanchor": "top"
},
"template": {
"data": {
"bar": [
{
"error_x": {
"color": "#2a3f5f"
},
"error_y": {
"color": "#2a3f5f"
},
"marker": {
"line": {
"color": "#E5ECF6",
"width": 0.5
},
"pattern": {
"fillmode": "overlay",
"size": 10,
"solidity": 0.2
}
},
"type": "bar"
}
],
"barpolar": [
{
"marker": {
"line": {
"color": "#E5ECF6",
"width": 0.5
},
"pattern": {
"fillmode": "overlay",
"size": 10,
"solidity": 0.2
}
},
"type": "barpolar"
}
],
"carpet": [
{
"aaxis": {
"endlinecolor": "#2a3f5f",
"gridcolor": "white",
"linecolor": "white",
"minorgridcolor": "white",
"startlinecolor": "#2a3f5f"
},
"baxis": {
"endlinecolor": "#2a3f5f",
"gridcolor": "white",
"linecolor": "white",
"minorgridcolor": "white",
"startlinecolor": "#2a3f5f"
},
"type": "carpet"
}
],
"choropleth": [
{
"colorbar": {
"outlinewidth": 0,
"ticks": ""
},
"type": "choropleth"
}
],
"contour": [
{
"colorbar": {
"outlinewidth": 0,
"ticks": ""
},
"colorscale": [
[
0,
"#0d0887"
],
[
0.1111111111111111,
"#46039f"
],
[
0.2222222222222222,
"#7201a8"
],
[
0.3333333333333333,
"#9c179e"
],
[
0.4444444444444444,
"#bd3786"
],
[
0.5555555555555556,
"#d8576b"
],
[
0.6666666666666666,
"#ed7953"
],
[
0.7777777777777778,
"#fb9f3a"
],
[
0.8888888888888888,
"#fdca26"
],
[
1,
"#f0f921"
]
],
"type": "contour"
}
],
"contourcarpet": [
{
"colorbar": {
"outlinewidth": 0,
"ticks": ""
},
"type": "contourcarpet"
}
],
"heatmap": [
{
"colorbar": {
"outlinewidth": 0,
"ticks": ""
},
"colorscale": [
[
0,
"#0d0887"
],
[
0.1111111111111111,
"#46039f"
],
[
0.2222222222222222,
"#7201a8"
],
[
0.3333333333333333,
"#9c179e"
],
[
0.4444444444444444,
"#bd3786"
],
[
0.5555555555555556,
"#d8576b"
],
[
0.6666666666666666,
"#ed7953"
],
[
0.7777777777777778,
"#fb9f3a"
],
[
0.8888888888888888,
"#fdca26"
],
[
1,
"#f0f921"
]
],
"type": "heatmap"
}
],
"histogram": [
{
"marker": {
"pattern": {
"fillmode": "overlay",
"size": 10,
"solidity": 0.2
}
},
"type": "histogram"
}
],
"histogram2d": [
{
"colorbar": {
"outlinewidth": 0,
"ticks": ""
},
"colorscale": [
[
0,
"#0d0887"
],
[
0.1111111111111111,
"#46039f"
],
[
0.2222222222222222,
"#7201a8"
],
[
0.3333333333333333,
"#9c179e"
],
[
0.4444444444444444,
"#bd3786"
],
[
0.5555555555555556,
"#d8576b"
],
[
0.6666666666666666,
"#ed7953"
],
[
0.7777777777777778,
"#fb9f3a"
],
[
0.8888888888888888,
"#fdca26"
],
[
1,
"#f0f921"
]
],
"type": "histogram2d"
}
],
"histogram2dcontour": [
{
"colorbar": {
"outlinewidth": 0,
"ticks": ""
},
"colorscale": [
[
0,
"#0d0887"
],
[
0.1111111111111111,
"#46039f"
],
[
0.2222222222222222,
"#7201a8"
],
[
0.3333333333333333,
"#9c179e"
],
[
0.4444444444444444,
"#bd3786"
],
[
0.5555555555555556,
"#d8576b"
],
[
0.6666666666666666,
"#ed7953"
],
[
0.7777777777777778,
"#fb9f3a"
],
[
0.8888888888888888,
"#fdca26"
],
[
1,
"#f0f921"
]
],
"type": "histogram2dcontour"
}
],
"mesh3d": [
{
"colorbar": {
"outlinewidth": 0,
"ticks": ""
},
"type": "mesh3d"
}
],
"parcoords": [
{
"line": {
"colorbar": {
"outlinewidth": 0,
"ticks": ""
}
},
"type": "parcoords"
}
],
"pie": [
{
"automargin": true,
"type": "pie"
}
],
"scatter": [
{
"fillpattern": {
"fillmode": "overlay",
"size": 10,
"solidity": 0.2
},
"type": "scatter"
}
],
"scatter3d": [
{
"line": {
"colorbar": {
"outlinewidth": 0,
"ticks": ""
}
},
"marker": {
"colorbar": {
"outlinewidth": 0,
"ticks": ""
}
},
"type": "scatter3d"
}
],
"scattercarpet": [
{
"marker": {
"colorbar": {
"outlinewidth": 0,
"ticks": ""
}
},
"type": "scattercarpet"
}
],
"scattergeo": [
{
"marker": {
"colorbar": {
"outlinewidth": 0,
"ticks": ""
}
},
"type": "scattergeo"
}
],
"scattergl": [
{
"marker": {
"colorbar": {
"outlinewidth": 0,
"ticks": ""
}
},
"type": "scattergl"
}
],
"scattermap": [
{
"marker": {
"colorbar": {
"outlinewidth": 0,
"ticks": ""
}
},
"type": "scattermap"
}
],
"scattermapbox": [
{
"marker": {
"colorbar": {
"outlinewidth": 0,
"ticks": ""
}
},
"type": "scattermapbox"
}
],
"scatterpolar": [
{
"marker": {
"colorbar": {
"outlinewidth": 0,
"ticks": ""
}
},
"type": "scatterpolar"
}
],
"scatterpolargl": [
{
"marker": {
"colorbar": {
"outlinewidth": 0,
"ticks": ""
}
},
"type": "scatterpolargl"
}
],
"scatterternary": [
{
"marker": {
"colorbar": {
"outlinewidth": 0,
"ticks": ""
}
},
"type": "scatterternary"
}
],
"surface": [
{
"colorbar": {
"outlinewidth": 0,
"ticks": ""
},
"colorscale": [
[
0,
"#0d0887"
],
[
0.1111111111111111,
"#46039f"
],
[
0.2222222222222222,
"#7201a8"
],
[
0.3333333333333333,
"#9c179e"
],
[
0.4444444444444444,
"#bd3786"
],
[
0.5555555555555556,
"#d8576b"
],
[
0.6666666666666666,
"#ed7953"
],
[
0.7777777777777778,
"#fb9f3a"
],
[
0.8888888888888888,
"#fdca26"
],
[
1,
"#f0f921"
]
],
"type": "surface"
}
],
"table": [
{
"cells": {
"fill": {
"color": "#EBF0F8"
},
"line": {
"color": "white"
}
},
"header": {
"fill": {
"color": "#C8D4E3"
},
"line": {
"color": "white"
}
},
"type": "table"
}
]
},
"layout": {
"annotationdefaults": {
"arrowcolor": "#2a3f5f",
"arrowhead": 0,
"arrowwidth": 1
},
"autotypenumbers": "strict",
"coloraxis": {
"colorbar": {
"outlinewidth": 0,
"ticks": ""
}
},
"colorscale": {
"diverging": [
[
0,
"#8e0152"
],
[
0.1,
"#c51b7d"
],
[
0.2,
"#de77ae"
],
[
0.3,
"#f1b6da"
],
[
0.4,
"#fde0ef"
],
[
0.5,
"#f7f7f7"
],
[
0.6,
"#e6f5d0"
],
[
0.7,
"#b8e186"
],
[
0.8,
"#7fbc41"
],
[
0.9,
"#4d9221"
],
[
1,
"#276419"
]
],
"sequential": [
[
0,
"#0d0887"
],
[
0.1111111111111111,
"#46039f"
],
[
0.2222222222222222,
"#7201a8"
],
[
0.3333333333333333,
"#9c179e"
],
[
0.4444444444444444,
"#bd3786"
],
[
0.5555555555555556,
"#d8576b"
],
[
0.6666666666666666,
"#ed7953"
],
[
0.7777777777777778,
"#fb9f3a"
],
[
0.8888888888888888,
"#fdca26"
],
[
1,
"#f0f921"
]
],
"sequentialminus": [
[
0,
"#0d0887"
],
[
0.1111111111111111,
"#46039f"
],
[
0.2222222222222222,
"#7201a8"
],
[
0.3333333333333333,
"#9c179e"
],
[
0.4444444444444444,
"#bd3786"
],
[
0.5555555555555556,
"#d8576b"
],
[
0.6666666666666666,
"#ed7953"
],
[
0.7777777777777778,
"#fb9f3a"
],
[
0.8888888888888888,
"#fdca26"
],
[
1,
"#f0f921"
]
]
},
"colorway": [
"#636efa",
"#EF553B",
"#00cc96",
"#ab63fa",
"#FFA15A",
"#19d3f3",
"#FF6692",
"#B6E880",
"#FF97FF",
"#FECB52"
],
"font": {
"color": "#2a3f5f"
},
"geo": {
"bgcolor": "white",
"lakecolor": "white",
"landcolor": "#E5ECF6",
"showlakes": true,
"showland": true,
"subunitcolor": "white"
},
"hoverlabel": {
"align": "left"
},
"hovermode": "closest",
"mapbox": {
"style": "light"
},
"paper_bgcolor": "white",
"plot_bgcolor": "#E5ECF6",
"polar": {
"angularaxis": {
"gridcolor": "white",
"linecolor": "white",
"ticks": ""
},
"bgcolor": "#E5ECF6",
"radialaxis": {
"gridcolor": "white",
"linecolor": "white",
"ticks": ""
}
},
"scene": {
"xaxis": {
"backgroundcolor": "#E5ECF6",
"gridcolor": "white",
"gridwidth": 2,
"linecolor": "white",
"showbackground": true,
"ticks": "",
"zerolinecolor": "white"
},
"yaxis": {
"backgroundcolor": "#E5ECF6",
"gridcolor": "white",
"gridwidth": 2,
"linecolor": "white",
"showbackground": true,
"ticks": "",
"zerolinecolor": "white"
},
"zaxis": {
"backgroundcolor": "#E5ECF6",
"gridcolor": "white",
"gridwidth": 2,
"linecolor": "white",
"showbackground": true,
"ticks": "",
"zerolinecolor": "white"
}
},
"shapedefaults": {
"line": {
"color": "#2a3f5f"
}
},
"ternary": {
"aaxis": {
"gridcolor": "white",
"linecolor": "white",
"ticks": ""
},
"baxis": {
"gridcolor": "white",
"linecolor": "white",
"ticks": ""
},
"bgcolor": "#E5ECF6",
"caxis": {
"gridcolor": "white",
"linecolor": "white",
"ticks": ""
}
},
"title": {
"x": 0.05
},
"xaxis": {
"automargin": true,
"gridcolor": "white",
"linecolor": "white",
"ticks": "",
"title": {
"standoff": 15
},
"zerolinecolor": "white",
"zerolinewidth": 2
},
"yaxis": {
"automargin": true,
"gridcolor": "white",
"linecolor": "white",
"ticks": "",
"title": {
"standoff": 15
},
"zerolinecolor": "white",
"zerolinewidth": 2
}
}
},
"title": {
"text": "Rupture intensity distribution (AUM repaired)"
}
}
},
"image/png": "iVBORw0KGgoAAAANSUhEUgAAAwsAAAFoCAYAAAAYWWdnAAAQAElEQVR4AeydB5wcZfnHn929S7lUkpAEAoHQOwYB6SR0kF5FRPhrKCIdaRE1IoYiXURAUBQRpagg0qUpIEV6b6ElJCEJqXe53O3uf3+TvJe5vd273bstM7PffPLezLzleZ/n+74z8z7zvjMbT/MPAhCAAAQgAAEIQAACEIBADgJx4x8EIBAhApgCAQhAAAIQgAAESkcAZ6F0LJEEAQhAAAIQKC0BpEEAAhCoMgGchSo3ANVDAAIQgAAEIAABCNQGgTBaibMQxlZDZwhAAAIQgAAEIAABCFSAAM5CBSBTRVgJoDcEIAABCEAAAhCobQI4C7Xd/lgPAQhAoHYIYCkEIAABCBRNAGehaGQUgAAEIAABCEAAAhCoNgHqrwwBnIXKcKYWCEAAAhCAAAQgAAEIhI4AzkLomiysCqM3BCAAAQhAAAIQgEDYCOAshK3F0BcCEIBAEAigAwQgAAEI1AQBnIWaaGaMhAAEIAABCEAAAvkJkAKBfARwFvKRIR4CEIAABCAAAQhAAAI1TgBnIZQdAKUhAAEIQAACEIAABCBQfgI4C+VnTA0QgAAEOidAKgQgAAEIQCCgBHAWAtowqAUBCEAAAhCAQDgJoDUEokQAZyFKrYktEIAABCAAAQhAAAIQKCEBnAUrIU1EQQACEIAABCAAAQhAIEIEcBYi1JiYAgEImBkQIAABCEAAAhAoGQGchZKhRBAEIAABCEAAAqUmgDwIQKC6BHAWqsuf2iEAAQhAAAIQgAAEIBBYAiV2FgJrJ4pBAAIQgAAEIAABCEAAAkUSwFkoEhjZIVBTBDAWAhCAAAQgAIGaJoCzUNPNj/EQgAAEIFBLBLAVAhCAQLEEcBaKJUZ+CEAAAhCAAAQgAAEIVJ9ARTTAWagIZiqBAAQgAAEIQAACEIBA+AjgLISvzdA4rATQGwIQgAAEIAABCISMAM5CyBoMdSEAAQhAIBgE0AICEIBALRDAWaiFVsZGCEAAAhCAAAQgAIHOCJCWhwDOQh4wREMAAhCAAAQgAAEIQKDWCeAs1HoPCKv96A0BCEAAAhCAAAQgUHYCOAtlR0wFEIAABCDQFQHSIQABCEAgmARwFoLZLmgFAQhAAAIQgAAEwkoAvSNEAGchQo2JKRCAAAQgAAEIQAACECglAZyFUtIMqyz0hgAEIAABCEAAAhCAQA4COAs5oBAFAQhAIMwE0B0CEIAABCBQKgI4C6UiiRwIQAACEIAABCBQegJIhEBVCeAsVBU/lQeZwE233Wd7fPMs+2L23MCoOfHC3wROp0rAeezpl2zDcUfbd0672BqbFleiSuooIwH1Y39b0r5lhI1oCEAAAj0kEHhnwd1ENFDwh633PsHeeOcjq8Q/DRoLrq8SCuWoQzff7uoojiorO3OIrniUBucapPsHE0HQMZdehcARV/GVDYXkd3lUTn1e54CLK8dW9eTTryf9qlS6ivvF19xm++2+rf32irOtoW+fDqLFVjaIl+zpkCETIVvUryQvc9juvxwQ9TcF7StR+ZS/M5mqS+nKp/wqF7YgvaW/3/ZK2zB+m7F2+/WT7K33PrYLrryl0tVTHwQgAAEIdEIg8M6C0/2ayafYG4/f3BbGbzvWDj1ukpV7IOXqZwsBCFSHwBU33OFVfNqxh3jbXH/+++KbXvQqK61oTz33WklnHyT4jn883mGGSYNsxSud0HMCG667uk04Ym977KmXKvYgqOdaIwECEIBA9AmExlnIbgoNHFZdebjdcsdDJR8YZNcVhuPJ5x5jz9x7remGGwZ9O9NxxaGD7YE/XZL3KXJnZcuZVmm9vnv4Xp5zPD7z1LWcdnUmu9r9SjMGGjwess84E/9cumomQA6C9wBh3/He0+kpn0zPlbVbcV8bu75X7p6Hnva27o87dukuPmxbcQ3K+bbvbtvYoIH97da/Phw2jOgbHgJoCgEIFEkgtM5CLjv1pE/T6Voa4E/XYEJT7FqG4OI1CNGyBeVV0FICF/yzFUq7/Prbbf7CRm8mIzuPZEq26nCytVU56SKddKytjpXf1S1Z/rIuj+Jd8OsiOfmC5Eq+ZCiPtjpWvGQ4edpKN+VRkC4TzrjEs092Kl1B5ZSuINukp+Jd8MtQHh2L53Mvve2tK3f59six5j+XPMlXvGT5dddxZzo627RVXn9QnPTQ1h+fa9/Vqfwu3P/os+2yujx+Nsog+a6M2ypOaeIirp31HzH66NPpbdzEUTarrNuXLH9wurj6/PyUT+VV1umhOAVXTrJ1rG0h+qmc8rugcq5ubbPrVz5xkm1vvvuR956F8inkyqv8uYKbMdhqsw1yJXtxz770linsuuPm5vK5cl6GHv4ZOXyIbbbx2qZZBMdBfVUOipwYpRdShcqKh7i49snm4fIo3oXsNlR5yemKq3QUayfHbRWnNKezq1NyXZzTT+2s+l1Zfx5/vNKlk2Q5GW7rZCmPgvrl/1591yW328pxEdMXX3uvw0xOu4wcQAACEIBAxQgE21noBMPMWXNt3vyFtu2WG+dcw9xJ0XZJGigpwi1x0nKnEyde1ba8SU93Tz/uUBvYv8FbU+vydfdp790PPmWXXfcX+9cdl3tPjd0abN14xx10qulG6erQGt6Jk39jumFLx+4E1ffwEy94dUmu7JPNqk/yNBNx42VnefbJTuVR0BNlpetGv/Mhp5sGRIpXePyuK72Bk3/goLwaEJ/yo6vtjOMP8+p7/v7rbOURQ+3sC65vm/3RIEV8lV/pTt60GbPzrlXuTEc91VXINcMku5WmoPryBdm471ETvQGh9HFhnTVWyVekLV4cZY+4unLiqHaT3EL6z6fTZtrh3zu/jVtXM0TKf/qka+2WX05s4yyFpIf4ar/QUIh+2bLU7ho4qx/IZrWj8hz43R93GOBl66oynbW15LggWzQgX3/t1WzM6JEuusNW7axZxo3WHePlU36VU/kOmbsZccSBu3rXm9ffmeJJkHMiO/Qk3Iso4o/OyZ5eA7K5ujbI7gP9+/dtd93Kly+f+rpWiK/aWcFdF9QH1Md1jVK8gq5dOo/U7508nR9aLqrlRcqjoGvfqJHDXJYO2zVWW8lkn2PdIQMREIAABCBQUQKhdBY0CNDNVtPV3blZ+wlrYKcBk4vTwFIh1+DT5enJNtdLmrJH9SnNr4sGyeO3HWsamOV6YleIHpLpbvDKL9sUNADQcVdBywE0+Drv1CPbsrqnf1oe4h8YyKGS4yG9lVkvoh55yG7tloVoeYheYvQ7eZL315vOt003WFPFigq56pAA6SX9/PUoPlfIZaPyrbn6KG06DeLoBqouo9rwqp+d7A673Kr8Pb+fXPASMuWXoyBuEu4YaACroLhyBQ3+NNg9+8TD25YFqf6LzzvOq9K9X+AdZP5k6yqdNags5MnxosbFpgG5HFXVkRHX4b/OC8nSk3/JVj71OXFQ6FCgmxHq0+Mz56LOU9WprexQncWK1DnpHhK4ssVeA7K55rJbcVdn+qF0d/UoTnx0DupcdPH5tro++q8fyufOrckTj2nXZ3Ut1jVZ55Py5bNJOqhNlSdXkNMn+z78+PNcycRBAAIQgECFCYTGWdATsw3HHW0KW+x5vOmGp3W23blZd8bY3cg0SNFgpbO8pUrTTVs3by2jyJapOM2gaCYlO60nx9Nnzml72p9PjgZFGojlGnC75R4zZ3+Zr3hbvGYcXL7hwwZ7a5L1xFIDT5dJ3A/bbyd3WNRWgwv/IEWFtQxFcRrA6DhfcDZq8CId8uXLFy+HQk9B/bMnyrvl2PXaDaQUV84wfOgK3uxQuQdYkq+BnJj77enX0MebRSqkX6lcMX1ajFUmV9DTZ/HXeeLSpZt0lCPn4kqx1eyCzlM5RNq6c6AUsktxDcjXBzQzqeumC7qW9kRnd26Js1+OrsVy2lwfcDZ11n7+8uyHkgBKQwACNUAgNM6CW+ahZQwaCOhTihrolaONKn1z00BaA2rdxN0N3W0VVw4bC5EpB0WDOg3snT5uq6UF0rkQOf48GlDoqbjaULY5efnWO/vL5tuXTD3llWO
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"# ============================================================\n",
"# 2. PREPARE FLOWS\n",
"# ============================================================\n",
"\n",
"flows_clean = (\n",
" flows\n",
" .groupby(\n",
" [\"Registrar Account - ID\",\"Product - Isin\",\"Centralisation Date\"],\n",
" as_index=False\n",
" )[\"Quantity - NetFlows\"]\n",
" .sum()\n",
")\n",
"\n",
"# ============================================================\n",
"# 3. MERGE\n",
"# ============================================================\n",
"\n",
"df = aum.merge(\n",
" flows_clean,\n",
" on=[\"Registrar Account - ID\",\"Product - Isin\",\"Centralisation Date\"],\n",
" how=\"left\"\n",
")\n",
"\n",
"df[\"Quantity - NetFlows\"] = df[\"Quantity - NetFlows\"].fillna(0)\n",
"\n",
"# ============================================================\n",
"# 4. SORT\n",
"# ============================================================\n",
"\n",
"df = df.sort_values(\n",
" [\"Registrar Account - ID\",\"Product - Isin\",\"Centralisation Date\"]\n",
")\n",
"\n",
"# ============================================================\n",
"# REBUILD ACCOUNTING IDENTITY WITH REPAIRED AUM\n",
"# ============================================================\n",
"\n",
"df[\"prev_aum\"] = df.groupby(\n",
" [\"Registrar Account - ID\",\"Product - Isin\"]\n",
")[\"Quantity - AUM\"].shift(1)\n",
"\n",
"df[\"prev_flow\"] = df.groupby(\n",
" [\"Registrar Account - ID\",\"Product - Isin\"]\n",
")[\"Quantity - NetFlows\"].shift(1).fillna(0)\n",
"\n",
"df[\"expected_aum\"] = df[\"prev_aum\"] + df[\"prev_flow\"]\n",
"\n",
"# ============================================================\n",
"# COMPUTE GAP\n",
"# ============================================================\n",
"\n",
"df[\"gap\"] = df[\"Quantity - AUM\"] - df[\"expected_aum\"]\n",
"df[\"gap_abs\"] = df[\"gap\"].abs()\n",
"\n",
"EPS = 10\n",
"\n",
"df[\"rupture_flag\"] = (\n",
" df[\"prev_aum\"].notna()\n",
" & (df[\"gap_abs\"] > EPS)\n",
")\n",
"# ============================================================\n",
"# 6. COMPUTE GAP\n",
"# ============================================================\n",
"\n",
"df[\"gap\"] = df[\"Quantity - AUM\"] - df[\"expected_aum\"]\n",
"df[\"gap_abs\"] = df[\"gap\"].abs()\n",
"\n",
"EPS = 10\n",
"\n",
"df[\"rupture_flag\"] = (\n",
" df[\"prev_aum\"].notna()\n",
" & (df[\"gap_abs\"] > EPS)\n",
")\n",
"\n",
"# ============================================================\n",
"# 7. BUILD RUPTURE SUMMARY\n",
"# ============================================================\n",
"\n",
"rupture_summary = (\n",
" df.groupby([\"Registrar Account - ID\",\"Product - Isin\"])\n",
" .agg(\n",
" n_ruptures=(\"rupture_flag\",\"sum\"),\n",
" total_obs=(\"rupture_flag\",\"count\"),\n",
" rupture_ratio=(\"rupture_flag\",\"mean\")\n",
" )\n",
" .reset_index()\n",
")\n",
"\n",
"# ============================================================\n",
"# 8. SAME CLASSIFICATION AS YOUR CODE\n",
"# ============================================================\n",
"\n",
"rs = rupture_summary.copy()\n",
"\n",
"bins = [0, 0.01, 0.10, 0.30, 1.01]\n",
"\n",
"labels = [\n",
" \"Clean / quasi-clean (≤1%)\",\n",
" \"Moderate (110%)\",\n",
" \"High (1030%)\",\n",
" \"Severe (>30%)\"\n",
"]\n",
"\n",
"rs[\"rupture_class\"] = pd.cut(\n",
" rs[\"rupture_ratio\"],\n",
" bins=bins,\n",
" labels=labels,\n",
" include_lowest=True\n",
")\n",
"\n",
"# ============================================================\n",
"# 9. DISTRIBUTION\n",
"# ============================================================\n",
"\n",
"dist = (\n",
" rs[\"rupture_class\"]\n",
" .value_counts(normalize=True)\n",
" .sort_index()\n",
" * 100\n",
").round(1)\n",
"\n",
"# ============================================================\n",
"# 10. DONUT CHART\n",
"# ============================================================\n",
"\n",
"fig = go.Figure(\n",
" data=[go.Pie(\n",
" labels=dist.index,\n",
" values=dist.values,\n",
" hole=0.45,\n",
" textinfo=\"percent\",\n",
" hoverinfo=\"label+percent\"\n",
" )]\n",
")\n",
"\n",
"fig.update_layout(\n",
" title=\"Rupture intensity distribution (AUM repaired)\",\n",
" legend=dict(\n",
" orientation=\"h\",\n",
" yanchor=\"top\",\n",
" y=-0.15,\n",
" xanchor=\"center\",\n",
" x=0.5\n",
" ),\n",
" legend_title_text=\"Rupture ratio\"\n",
")\n",
"\n",
"fig.show()"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "990898ea-ceca-46bb-bfb3-c87bf289d272",
"metadata": {},
"outputs": [],
"source": [
"import seaborn as sns\n",
"import matplotlib.pyplot as plt\n",
"\n",
"df = merged_isin.copy()\n",
"\n",
"# Ajouter année / mois\n",
"df[\"year\"] = df[\"Centralisation Date\"].dt.year\n",
"df[\"month\"] = df[\"Centralisation Date\"].dt.month\n",
"\n",
"# 1. Nombre total de lignes par mois\n",
"total = df.groupby([\"year\", \"month\"]).size().reset_index(name=\"total_lines\")\n",
"\n",
"# 2. Nombre de ruptures par mois\n",
"ruptures = df[df[\"rupture_flag\"]].groupby([\"year\", \"month\"]).size().reset_index(name=\"n_ruptures\")\n",
"\n",
"# 3. Merge pour obtenir total + ruptures\n",
"ratio = total.merge(ruptures, on=[\"year\",\"month\"], how=\"left\")\n",
"ratio[\"n_ruptures\"] = ratio[\"n_ruptures\"].fillna(0)\n",
"\n",
"# 4. Proportion (en %)\n",
"ratio[\"rupture_ratio\"] = ratio[\"n_ruptures\"] / ratio[\"total_lines\"]\n",
"\n",
"# 5. Pivot pour heatmap\n",
"heatmap_ratio = ratio.pivot(index=\"year\", columns=\"month\", values=\"rupture_ratio\").fillna(0)\n",
"\n",
"# 6. Plot\n",
"plt.figure(figsize=(14, 7))\n",
"sns.heatmap(\n",
" heatmap_ratio, \n",
" cmap=\"Reds\",\n",
" linewidths=.3,\n",
" linecolor=\"grey\",\n",
" annot=True,\n",
" fmt=\".2%\",\n",
" cbar_kws={'label': 'Proportion de ruptures'}\n",
")\n",
"\n",
"plt.title(\"Heatmap de la proportion de ruptures (par année et mois)\", fontsize=16)\n",
"plt.xlabel(\"Mois\")\n",
"plt.ylabel(\"Année\")\n",
"plt.show()\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "4d335589-c519-458d-857d-a051813b950b",
"metadata": {},
"outputs": [],
"source": [
"df = merged_isin.copy()\n",
"\n",
"# Ajouter year / month au cas où\n",
"df[\"year\"] = df[\"Centralisation Date\"].dt.year\n",
"df[\"month\"] = df[\"Centralisation Date\"].dt.month\n",
"\n",
"# Merge géographique\n",
"df = df.merge(\n",
" geo[[\"Registrar Account - ID\", \"country\"]],\n",
" on=\"Registrar Account - ID\",\n",
" how=\"left\"\n",
")\n",
"\n",
"df[\"country\"] = df[\"country\"].fillna(\"UNKNOWN\")\n",
"\n",
"# Total des lignes par pays\n",
"total_country = df.groupby(\"country\").size().reset_index(name=\"total_obs\")\n",
"\n",
"# Nombre de ruptures\n",
"rupt_country = (\n",
" df[df[\"rupture_flag\"]]\n",
" .groupby(\"country\")\n",
" .size()\n",
" .reset_index(name=\"ruptures\")\n",
")\n",
"\n",
"# Merge + ratios\n",
"country_stats = total_country.merge(rupt_country, on=\"country\", how=\"left\")\n",
"country_stats[\"ruptures\"] = country_stats[\"ruptures\"].fillna(0)\n",
"country_stats[\"rupture_ratio\"] = country_stats[\"ruptures\"] / country_stats[\"total_obs\"]\n",
"\n",
"# Tri (rupture ratio décroissant)\n",
"country_stats = country_stats.sort_values(\"rupture_ratio\", ascending=False)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "8a45a111-25da-4f5c-9723-c3efd25c906d",
"metadata": {},
"outputs": [],
"source": [
"# On ajoute une colonne en % pour laffichage\n",
"import plotly.express as px\n",
"\n",
"country_stats_plot = country_stats.copy()\n",
"country_stats_plot[\"rupture_pct\"] = country_stats_plot[\"rupture_ratio\"] * 100\n",
"\n",
"# Tri décroissant par proportion de ruptures\n",
"country_stats_plot = country_stats_plot.sort_values(\"rupture_ratio\", ascending=False)\n",
"\n",
"fig = px.bar(\n",
" country_stats_plot,\n",
" x=\"country\",\n",
" y=\"rupture_ratio\",\n",
" hover_data={\n",
" \"rupture_pct\": ':.2f',\n",
" \"ruptures\": True,\n",
" \"total_obs\": True,\n",
" \"rupture_ratio\": False, # on cache la version décimale\n",
" },\n",
" labels={\n",
" \"country\": \"Pays\",\n",
" \"rupture_ratio\": \"Proportion de ruptures\",\n",
" \"rupture_pct\": \"% de ruptures\",\n",
" \"ruptures\": \"Nb de ruptures\",\n",
" \"total_obs\": \"Nb d'observations\"\n",
" },\n",
" title=\"Proportion de ruptures par pays (avec volumes au survol)\"\n",
")\n",
"\n",
"# Format en %\n",
"fig.update_yaxes(tickformat=\".1%\")\n",
"\n",
"fig.update_layout(\n",
" xaxis_tickangle=-45,\n",
" bargap=0.2\n",
")\n",
"\n",
"fig.show()\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "a4af9841-6cf9-4d27-8096-ac878e866bc6",
"metadata": {},
"outputs": [],
"source": [
"rs = rupture_summary.copy()\n",
"\n",
"# 1. Stats numériques classiques\n",
"print(\"\\n=== BASIC NUMERIC STATS ===\")\n",
"print(rs[\"rupture_ratio\"].describe(percentiles=[0.01, 0.05, 0.10, 0.25, 0.5, 0.75, 0.90, 0.95, 0.99]))\n",
"\n",
"\n",
"# 2. Distribution par classes (bins)\n",
"\n",
"rs[\"rupture_bucket\"] = pd.cut(\n",
" rs[\"rupture_ratio\"],\n",
" bins=[0, 0.001, 0.01, 0.05, 0.10, 0.25, 0.50, 1.01],\n",
" labels=[\n",
" \"00.1%\",\n",
" \"0.11%\",\n",
" \"15%\",\n",
" \"510%\",\n",
" \"1025%\",\n",
" \"2550%\",\n",
" \"50100%\"\n",
" ],\n",
" include_lowest=True\n",
")\n",
"\n",
"# Ajouter la catégorie \"0%\"\n",
"rs[\"rupture_bucket\"] = rs[\"rupture_bucket\"].cat.add_categories(\"0%\")\n",
"\n",
"# Remplacer les 0% exacts\n",
"rs.loc[rs[\"rupture_ratio\"] == 0, \"rupture_bucket\"] = \"0%\"\n",
"\n",
"bucket_counts = rs[\"rupture_bucket\"].value_counts().sort_index()\n",
"print(bucket_counts)\n",
"\n",
"\n",
"# 3. Pourcentages\n",
"bucket_percent = (bucket_counts / len(rs) * 100).round(2)\n",
"\n",
"print(\"\\n=== DISTRIBUTION (PERCENT) ===\")\n",
"print(bucket_percent)\n",
"\n",
"\n",
"# 4. Nombre de comptes totalement propres\n",
"no_rupture = (rs[\"n_ruptures\"] == 0).sum()\n",
"print(f\"\\nComptes avec 0 rupture = {no_rupture} ({no_rupture/len(rs)*100:.2f}%)\")\n",
"\n",
"# 5. Comptes extrêmement problématiques\n",
"severe = (rs[\"rupture_ratio\"] > 0.75).sum()\n",
"print(f\"Comptes avec rupture_ratio > 75% = {severe} ({severe/len(rs)*100:.2f}%)\")\n",
"\n",
"medium = (rs[\"rupture_ratio\"] > 0.10).sum()\n",
"print(f\"Comptes avec rupture_ratio > 10% = {medium} ({medium/len(rs)*100:.2f}%)\")\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "f39a9a5a-5f4e-4cac-9f63-e6952582b6ff",
"metadata": {},
"outputs": [],
"source": [
"import plotly.express as px\n",
"\n",
"fig = px.histogram(\n",
" rs,\n",
" x=\"rupture_ratio\",\n",
" nbins=50,\n",
" title=\"Distribution du rupture_ratio\",\n",
" labels={\"rupture_ratio\": \"Rupture Ratio\"},\n",
")\n",
"fig.update_layout(bargap=0.05)\n",
"fig.show()\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "70132995-8379-44b6-8ff6-f09524c4e4d0",
"metadata": {},
"outputs": [],
"source": [
"# --- 1. Filtres de base ---\n",
"merged[\"year\"] = merged[\"Centralisation Date\"].dt.year\n",
"\n",
"# Filtrer uniquement l'année 2021\n",
"ruptures_2021 = merged[(merged[\"year\"] == 2021) & (merged[\"rupture_flag\"] == True)].copy()\n",
"\n",
"print(\"Nombre total de ruptures en 2021 :\", len(ruptures_2021))\n",
"\n",
"# --- 2. Classification du type de gap ---\n",
"ruptures_2021[\"gap_type\"] = np.where(ruptures_2021[\"gap\"] > 0, \"positive\", \"negative\")\n",
"\n",
"# --- 3. Statistiques globales ---\n",
"gap_counts = ruptures_2021[\"gap_type\"].value_counts()\n",
"gap_percent = ruptures_2021[\"gap_type\"].value_counts(normalize=True) * 100\n",
"\n",
"print(\"\\n=== RUPTURES 2021 — POSITIVES vs NEGATIVES ===\")\n",
"print(gap_counts)\n",
"print(\"\\n(%)\")\n",
"print(gap_percent.map(lambda x: f\"{x:.2f}%\"))\n",
"\n",
"# --- 4. Intensité des écarts ---\n",
"intensity_stats = ruptures_2021.groupby(\"gap_type\")[\"gap\"].describe()\n",
"print(\"\\n=== STATISTIQUES DES GAPS ===\")\n",
"print(intensity_stats)\n",
"\n",
"# --- 5. Visualisation rapide ---\n",
"import seaborn as sns\n",
"import matplotlib.pyplot as plt\n",
"\n",
"plt.figure(figsize=(10,5))\n",
"sns.histplot(data=ruptures_2021, x=\"gap\", hue=\"gap_type\", bins=80, kde=True)\n",
"plt.xlim(-merged[\"gap\"].abs().max(), merged[\"gap\"].abs().max())\n",
"plt.title(\"Distribution des gaps de rupture en 2021\")\n",
"plt.xlabel(\"Gap (AUM_{t} Expected AUM_{t})\")\n",
"plt.show()\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "1faf943a-4703-4b19-a867-2670ac3a5209",
"metadata": {},
"outputs": [],
"source": [
"# --- 1. ADD YEAR ---\n",
"merged[\"year\"] = merged[\"Centralisation Date\"].dt.year\n",
"\n",
"# --- 2. DEFINE PERIODS ---\n",
"conditions = [\n",
" merged[\"year\"] < 2021,\n",
" merged[\"year\"] == 2021,\n",
" merged[\"year\"] > 2021\n",
"]\n",
"\n",
"period_labels = [\"before_2021\", \"during_2021\", \"after_2021\"]\n",
"\n",
"merged[\"period\"] = np.select(\n",
" conditions,\n",
" period_labels,\n",
" default=\"unknown\"\n",
")\n",
"\n",
"# --- 3. CREATE GAP TYPE & FILTER ONLY RUPTURES ---\n",
"merged[\"gap_type\"] = np.where(\n",
" merged[\"gap\"] > 0, \"positive\",\n",
" np.where(merged[\"gap\"] < 0, \"negative\", \"zero\")\n",
")\n",
"\n",
"ruptures = merged[merged[\"rupture_flag\"] == True].copy()\n",
"\n",
"# --- 4. TOTAL OBS PER PERIOD ---\n",
"total_obs = merged.groupby(\"period\").size().rename(\"total_obs\")\n",
"\n",
"# --- 5. TOTAL RUPTURES PER PERIOD ---\n",
"rupture_counts = ruptures.groupby(\"period\").size().rename(\"rupture_count\")\n",
"\n",
"# --- 6. PROPORTION OF RUPTURES ---\n",
"rupture_ratio = (rupture_counts / total_obs).rename(\"rupture_ratio\")\n",
"\n",
"# --- 7. POSITIVE / NEGATIVE GAPS (% among ruptures) ---\n",
"gap_dist = (\n",
" ruptures.groupby([\"period\", \"gap_type\"])\n",
" .size()\n",
" .groupby(level=0)\n",
" .apply(lambda x: (x / x.sum()) * 100) # % par période\n",
")\n",
"\n",
"\n",
"# --- 8. MERGE AND DISPLAY ---\n",
"summary = pd.concat([total_obs, rupture_counts, rupture_ratio], axis=1)\n",
"summary[\"rupture_ratio\"] = (summary[\"rupture_ratio\"] * 100).round(2)\n",
"\n",
"print(\"\\n=== RUPTURE SUMMARY (in %) ===\")\n",
"print(summary)\n",
"\n",
"print(\"\\n=== GAP POSITIVE / NEGATIVE DISTRIBUTION (in %) ===\")\n",
"print(gap_dist)\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "5abee764-b890-4ea1-8f98-5a0ff1512611",
"metadata": {},
"outputs": [],
"source": [
"from plotly.subplots import make_subplots\n",
"import plotly.graph_objects as go\n",
"\n",
"# --- 1. DEFINE PERIODS ---\n",
"merged[\"period2\"] = np.where(\n",
" merged[\"Centralisation Date\"] < pd.Timestamp(\"2021-09-01\"),\n",
" \"Before Sep 2021\",\n",
" \"After Sep 2021\"\n",
")\n",
"\n",
"ruptures = merged[merged[\"rupture_flag\"] == True].copy()\n",
"\n",
"# --- 2. Ensure gap_type exists + no missing categories ---\n",
"ruptures[\"gap_type\"] = ruptures[\"gap_type\"].replace({\"zero\": \"positive\"}) # zero is equivalent to no-flow change\n",
"\n",
"# --- 3. Compute gap counts ---\n",
"gap_counts = (\n",
" ruptures.groupby([\"period2\", \"gap_type\"])\n",
" .size()\n",
" .unstack(fill_value=0)\n",
")\n",
"\n",
"# Ensure both columns exist\n",
"for col in [\"positive\", \"negative\"]:\n",
" if col not in gap_counts.columns:\n",
" gap_counts[col] = 0\n",
"\n",
"gap_counts = gap_counts[[\"positive\", \"negative\"]]\n",
"\n",
"# --- 4. Extract values ---\n",
"before_vals = gap_counts.loc[\"Before Sep 2021\"].values\n",
"after_vals = gap_counts.loc[\"After Sep 2021\"].values\n",
"\n",
"# --- 5. MAKE TWO DONUT CHARTS ---\n",
"fig = make_subplots(\n",
" rows=1, cols=2,\n",
" specs=[[{\"type\": \"pie\"}, {\"type\": \"pie\"}]],\n",
" subplot_titles=(\"Before Sep 2021\", \"After Sep 2021\")\n",
")\n",
"\n",
"fig.add_trace(\n",
" go.Pie(\n",
" labels=[\"Negative gaps\", \"Positive gaps\"],\n",
" values=before_vals,\n",
" marker_colors=[\"#E67E22\", \"#3498DB\"],\n",
" hole=0.45,\n",
" textinfo=\"label+percent\"\n",
" ),\n",
" row=1, col=1\n",
")\n",
"\n",
"fig.add_trace(\n",
" go.Pie(\n",
" labels=[\"Negative gaps\", \"Positive gaps\"],\n",
" values=after_vals,\n",
" marker_colors=[\"#E67E22\", \"#3498DB\"],\n",
" hole=0.45,\n",
" textinfo=\"label+percent\"\n",
" ),\n",
" row=1, col=2\n",
")\n",
"\n",
"fig.update_layout(\n",
" title=\"Nature des ruptures (positive / negative)\\nAvant vs Après Septembre 2021\",\n",
" showlegend=True\n",
")\n",
"\n",
"fig.show()\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "3aa3b8a0-f499-495a-9171-2e09d0bb1e5f",
"metadata": {},
"outputs": [],
"source": [
"import plotly.graph_objects as go\n",
"\n",
"# --- 1. Compute gap counts by period ---\n",
"gap_counts = (\n",
" ruptures.groupby([\"period2\", \"gap_type\"])\n",
" .size()\n",
" .unstack(fill_value=0)\n",
")\n",
"\n",
"# Ensure both columns exist\n",
"for col in [\"positive\", \"negative\"]:\n",
" if col not in gap_counts.columns:\n",
" gap_counts[col] = 0\n",
"\n",
"gap_counts = gap_counts[[\"positive\", \"negative\"]]\n",
"\n",
"# --- 2. Extract values ---\n",
"before_vals = gap_counts.loc[\"Before Sep 2021\"].values\n",
"after_vals = gap_counts.loc[\"After Sep 2021\"].values\n",
"\n",
"# --- 3. Plot : TWO PIE CHARTS side by side ---\n",
"fig = make_subplots(\n",
" rows=1, cols=2,\n",
" specs=[[{\"type\": \"pie\"}, {\"type\": \"pie\"}]],\n",
" subplot_titles=(\"Before 2021\", \"After 2021\")\n",
")\n",
"\n",
"fig.add_trace(\n",
" go.Pie(\n",
" labels=[\"Negative gaps\", \"Positive gaps\"],\n",
" values=before_vals,\n",
" marker_colors=[\"#E67E22\", \"#3498DB\"],\n",
" hole=0.35\n",
" ),\n",
" row=1, col=1\n",
")\n",
"\n",
"fig.add_trace(\n",
" go.Pie(\n",
" labels=[\"Negative gaps\", \"Positive gaps\"],\n",
" values=after_vals,\n",
" marker_colors=[\"#E67E22\", \"#3498DB\"],\n",
" hole=0.35\n",
" ),\n",
" row=1, col=2\n",
")\n",
"\n",
"fig.update_layout(\n",
" title=\"Répartition des ruptures (positive / negative)\\nAvant vs Après 2021\"\n",
")\n",
"\n",
"fig.show()\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "d4f0dc74-649d-4105-9a1a-44a18d126a3c",
"metadata": {},
"outputs": [],
"source": [
"import plotly.graph_objects as go\n",
"\n",
"# --- 1. Define periods ---\n",
"merged[\"period2\"] = np.where(\n",
" merged[\"Centralisation Date\"] < pd.Timestamp(\"2021-09-01\"),\n",
" \"Before Sep 2021\",\n",
" \"After Sep 2021\"\n",
")\n",
"\n",
"# --- 2. Keep only ruptures ---\n",
"ruptures = merged[merged[\"rupture_flag\"] == True].copy()\n",
"\n",
"# --- 3. Count ruptures per period ---\n",
"rupture_counts = ruptures[\"period2\"].value_counts().reindex(\n",
" [\"Before Sep 2021\", \"After Sep 2021\"]\n",
").fillna(0)\n",
"\n",
"# --- 4. Pie chart ---\n",
"fig = go.Figure(data=[\n",
" go.Pie(\n",
" labels=rupture_counts.index,\n",
" values=rupture_counts.values,\n",
" hole=0.45,\n",
" marker_colors=[\"#2ECC71\", \"#E74C3C\"],\n",
" textinfo=\"percent+value\",\n",
" )\n",
"])\n",
"\n",
"fig.update_layout(\n",
" title=\"Répartition des ruptures\"\n",
")\n",
"\n",
"fig.show()\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "ecccd73c-00a6-4ff3-b213-e85b98ec5a55",
"metadata": {},
"outputs": [],
"source": [
"import pandas as pd\n",
"import numpy as np\n",
"\n",
"# 1. Filtre sur la période post-Sept 2021\n",
"cutoff = pd.Timestamp(\"2021-09-01\")\n",
"post = merged[merged[\"Centralisation Date\"] >= cutoff].copy()\n",
"\n",
"# 2. On ne garde que les ruptures\n",
"post_rupt = post[post[\"rupture_flag\"] == True].copy()\n",
"\n",
"# 3. Gap absolu + gap relatif (% du stock)\n",
"post_rupt[\"gap_abs\"] = post_rupt[\"gap\"].abs()\n",
"post_rupt[\"gap_rel\"] = post_rupt[\"gap_abs\"] / post_rupt[\"Quantity - AUM\"].replace(0, np.nan)\n",
"\n",
"# 4. Percentiles globaux\n",
"p90 = post_rupt[\"gap_abs\"].quantile(0.90)\n",
"p95 = post_rupt[\"gap_abs\"].quantile(0.95)\n",
"p99 = post_rupt[\"gap_abs\"].quantile(0.99)\n",
"\n",
"# 5. Classification automatique\n",
"def classify_gap(gap, gap_rel, acct):\n",
" # RESET → énorme choc (technique)\n",
" if gap_abs >= p99 or gap_rel >= 0.90:\n",
" return \"reset\"\n",
"\n",
" # SPIKE → très gros gap mais isolé\n",
" if gap_abs >= p95:\n",
" return \"spike\"\n",
"\n",
" # SHIFT → décalage permanent\n",
" # Test : moyenne des gaps du compte\n",
" return None\n",
"\n",
"# Calcul du shift (décalage directionnel)\n",
"shift_info = post_rupt.groupby(\"Registrar Account - ID\")[\"gap\"].mean().rename(\"avg_gap\")\n",
"\n",
"post_rupt = post_rupt.merge(shift_info, on=\"Registrar Account - ID\", how=\"left\")\n",
"\n",
"post_rupt[\"gap_type2\"] = np.where(\n",
" post_rupt[\"gap_abs\"] >= p99, \"reset\",\n",
" np.where(post_rupt[\"gap_abs\"] >= p95, \"spike\",\n",
" np.where(post_rupt[\"avg_gap\"].abs() > post_rupt[\"gap_abs\"].median(), \"shift\", \"micro\")))\n",
" \n",
"# 6. Statistiques globales\n",
"stats = post_rupt[\"gap_type2\"].value_counts(normalize=True).round(3) * 100\n",
"print(\"\\n=== DISTRIBUTION DES TYPES DE GAPS POST-2021 ===\")\n",
"print(stats)\n",
"\n",
"# 7. Stats par client\n",
"client_stats = (\n",
" post_rupt.groupby(\"Registrar Account - ID\")[\"gap_type2\"]\n",
" .value_counts(normalize=True)\n",
" .rename(\"ratio\")\n",
" .mul(100)\n",
" .reset_index()\n",
")\n",
"\n",
"# 8. Stats par ISIN\n",
"isin_stats = (\n",
" post_rupt.groupby(\"Product - Isin\")[\"gap_type2\"]\n",
" .value_counts(normalize=True)\n",
" .rename(\"ratio\")\n",
" .mul(100)\n",
" .reset_index()\n",
")\n",
"\n",
"print(\"\\n=== TOP ISIN PAR RESET ===\")\n",
"print(isin_stats[isin_stats[\"gap_type2\"]==\"reset\"].sort_values(\"ratio\", ascending=False).head(10))\n",
"\n",
"print(\"\\n=== TOP CLIENTS PAR RESET ===\")\n",
"print(client_stats[client_stats[\"gap_type2\"]==\"reset\"].sort_values(\"ratio\", ascending=False).head(10))\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "c2efc5e0-bc35-4fa7-ab5d-6be616964446",
"metadata": {},
"outputs": [],
"source": [
"import plotly.graph_objects as go\n",
"\n",
"# --- Data from your output ---\n",
"labels = [\"Micro-ruptures\", \"Décalage\", \"Anomalies ponctuelles\", \"Remise à zéro\"]\n",
"values = [50.4, 44.6, 4.0, 1.0]\n",
"\n",
"# --- Pie chart ---\n",
"fig = go.Figure(\n",
" data=[go.Pie(\n",
" labels=labels,\n",
" values=values,\n",
" hole=0.35, # donut style (plus lisible)\n",
" textinfo='percent',\n",
" marker=dict(colors=[\"#3498DB\", \"#E67E22\", \"#9B59B6\", \"#E74C3C\"])\n",
" )]\n",
")\n",
"\n",
"fig.update_layout(\n",
" title=\"Typologie des ruptures depuis Septembre 2021\",\n",
" legend_title=\"Type de gap\",\n",
")\n",
"\n",
"fig.show()\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "744e04b6-3f34-40c9-95fe-a5605e7c7f02",
"metadata": {},
"outputs": [],
"source": [
"merged[\"gap_abs\"] = merged[\"gap\"].abs()\n",
"\n",
"merged[\"gap_rel\"] = (\n",
" merged[\"gap_abs\"] /\n",
" merged[\"Quantity - AUM\"].replace(0, np.nan)\n",
")\n",
"\n",
"merged.loc[merged[\"rupture_flag\"], \"gap_rel\"].describe(\n",
" percentiles=[0.5, 0.75, 0.9, 0.95, 0.99]\n",
")\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "3d20625e-1045-4b7a-ab64-3381997e4131",
"metadata": {},
"outputs": [],
"source": [
"# uniquement sur les ruptures\n",
"df_r = merged[merged[\"rupture_flag\"]].copy()\n",
"\n",
"# seuils globaux (descriptifs, pas \"optimisés\")\n",
"q90 = df_r[\"gap_abs\"].quantile(0.90)\n",
"q99 = df_r[\"gap_abs\"].quantile(0.99)\n",
"\n",
"# moyenne directionnelle par compte\n",
"avg_gap_by_account = (\n",
" df_r.groupby(\"Registrar Account - ID\")[\"gap\"]\n",
" .mean()\n",
" .rename(\"avg_gap\")\n",
")\n",
"\n",
"df_r = df_r.merge(avg_gap_by_account, on=\"Registrar Account - ID\", how=\"left\")\n",
"\n",
"def classify_gap(row):\n",
" if row[\"gap_abs\"] >= q99:\n",
" return \"reset\"\n",
" if row[\"gap_abs\"] >= q90:\n",
" return \"spike\"\n",
" if abs(row[\"avg_gap\"]) > row[\"gap_abs\"]:\n",
" return \"shift\"\n",
" return \"micro\"\n",
"\n",
"df_r[\"discontinuity_type\"] = df_r.apply(classify_gap, axis=1)\n",
"df_r[\"discontinuity_type\"].value_counts(normalize=True) * 100\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "02806629-e454-4e10-82be-6e2239091088",
"metadata": {},
"outputs": [],
"source": [
"merged[\"year\"] = merged[\"Centralisation Date\"].dt.year\n",
"\n",
"yearly_stats = merged.groupby(\"year\").agg(\n",
" total_obs=(\"gap\", \"count\"),\n",
" ruptures=(\"rupture_flag\", \"sum\")\n",
").reset_index()\n",
"\n",
"yearly_stats[\"rupture_rate\"] = (\n",
" yearly_stats[\"ruptures\"] / yearly_stats[\"total_obs\"]\n",
")\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "2edf2c55-45e7-4aad-b4f9-5c35178abad6",
"metadata": {},
"outputs": [],
"source": [
"import matplotlib.pyplot as plt\n",
"import numpy as np\n",
"\n",
"df_r = merged[merged[\"rupture_flag\"]].copy()\n",
"\n",
"plt.figure(figsize=(12,4))\n",
"plt.hist(df_r[\"gap_abs\"], bins=100, log=True)\n",
"plt.title(\"Distribution of absolute gaps (log scale)\")\n",
"plt.xlabel(\"Absolute gap\")\n",
"plt.ylabel(\"Frequency (log)\")\n",
"plt.show()\n",
"\n",
"plt.figure(figsize=(12,4))\n",
"plt.hist(df_r[\"gap_rel\"].dropna(), bins=100, log=True)\n",
"plt.title(\"Distribution of relative gaps (|gap| / AUM)\")\n",
"plt.xlabel(\"Relative gap\")\n",
"plt.ylabel(\"Frequency (log)\")\n",
"plt.show()\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "981f2ec6-574b-41ea-b4bf-45be54aeda1f",
"metadata": {},
"outputs": [],
"source": [
"plt.figure(figsize=(10,4))\n",
"plt.plot(yearly_stats[\"year\"], yearly_stats[\"rupture_rate\"], marker=\"o\")\n",
"plt.title(\"Evolution of AUMFlow inconsistency rate over time\")\n",
"plt.xlabel(\"Year\")\n",
"plt.ylabel(\"Rupture rate\")\n",
"plt.grid(True)\n",
"plt.show()\n"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.13.11"
}
},
"nbformat": 4,
"nbformat_minor": 5
}