Project_Carmignac/analyse_rupture.ipynb

{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 23,
   "id": "338730e2-a6de-4d4f-b438-efe3feb139ab",
   "metadata": {},
   "outputs": [],
   "source": [
    "import pandas as pd\n",
    "import numpy as np\n",
    "import plotly.graph_objects as go\n",
    "import matplotlib.pyplot as plt"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "id": "cfd11919-0941-400e-a516-72871881f733",
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/tmp/ipykernel_1311/1940519970.py:1: DtypeWarning: Columns (1,2,3,4) have mixed types. Specify dtype option on import or set low_memory=False.\n",
      "  stocks=pd.read_csv('stocks.csv')\n",
      "/tmp/ipykernel_1311/1940519970.py:2: DtypeWarning: Columns (1,2,3,4) have mixed types. Specify dtype option on import or set low_memory=False.\n",
      "  flows = pd.read_csv('flows.csv')\n"
     ]
    }
   ],
   "source": [
    "stocks=pd.read_csv('stocks.csv')\n",
    "flows = pd.read_csv('flows.csv')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "id": "b99e3402-fe26-4f4e-8c1c-5f07847bce94",
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/tmp/ipykernel_1311/3613746644.py:1: DtypeWarning: Columns (1) have mixed types. Specify dtype option on import or set low_memory=False.\n",
      "  merged = pd.read_csv('merged.csv')\n"
     ]
    }
   ],
   "source": [
    "merged = pd.read_csv('merged.csv')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "id": "34e5a815-7269-4312-bfe6-e2cd12595e57",
   "metadata": {},
   "outputs": [],
   "source": [
    "# 1. Prepare stock dataset ISIN-by-ISIN\n",
    "stocks_isin = stocks[[\n",
    "    \"Registrar Account - ID\",\n",
    "    \"Product - Isin\",\n",
    "    \"Centralisation Date\",\n",
    "    \"Quantity - AUM\"\n",
    "]].copy()\n",
    "\n",
    "stocks_isin[\"Centralisation Date\"] = pd.to_datetime(stocks_isin[\"Centralisation Date\"])\n",
    "\n",
    "stocks_isin = stocks_isin.sort_values(\n",
    "    [\"Registrar Account - ID\", \"Product - Isin\", \"Centralisation Date\"]\n",
    ")\n",
    "\n",
    "# 2. Prepare flows dataset ISIN-by-ISIN\n",
    "flows_isin = flows[[\n",
    "    \"Registrar Account - ID\",\n",
    "    \"Product - Isin\",\n",
    "    \"Centralisation Date\",\n",
    "    \"Quantity - NetFlows\"\n",
    "]].copy()\n",
    "\n",
    "flows_isin[\"Centralisation Date\"] = pd.to_datetime(flows_isin[\"Centralisation Date\"])\n",
    "\n",
    "flows_isin = (\n",
    "    flows_isin\n",
    "    .groupby(\n",
    "        [\"Registrar Account - ID\", \"Product - Isin\", \"Centralisation Date\"]\n",
    "    )[\"Quantity - NetFlows\"]\n",
    "    .sum()\n",
    "    .reset_index()\n",
    ")\n",
    "\n",
    "# 3. Merge stocks & flows ISIN-by-ISIN\n",
    "merged_isin = stocks_isin.merge(\n",
    "    flows_isin,\n",
    "    on=[\"Registrar Account - ID\", \"Product - Isin\", \"Centralisation Date\"],\n",
    "    how=\"left\"\n",
    ")\n",
    "\n",
    "merged_isin[\"Quantity - NetFlows\"] = merged_isin[\"Quantity - NetFlows\"].fillna(0)\n",
    "\n",
    "# 4. Compute expected stock per ISIN for each account\n",
    "merged_isin[\"prev_stock\"] = (\n",
    "    merged_isin\n",
    "    .groupby([\"Registrar Account - ID\", \"Product - Isin\"])[\"Quantity - AUM\"]\n",
    "    .shift(1)\n",
    ")\n",
    "\n",
    "merged_isin[\"prev_netflows\"] = (\n",
    "    merged_isin\n",
    "    .groupby([\"Registrar Account - ID\", \"Product - Isin\"])[\"Quantity - NetFlows\"]\n",
    "    .shift(1)\n",
    "    .fillna(0)\n",
    ")\n",
    "\n",
    "merged_isin[\"expected_stock\"] = (\n",
    "    merged_isin[\"prev_stock\"] + merged_isin[\"prev_netflows\"]\n",
    ")\n",
    "\n",
    "# 5. Detect ruptures ISIN-by-ISIN (no aggregation)\n",
    "TOL = 1e-6\n",
    "\n",
    "merged_isin[\"gap\"] = (\n",
    "    merged_isin[\"Quantity - AUM\"] - merged_isin[\"expected_stock\"]\n",
    ")\n",
    "\n",
    "merged_isin[\"rupture_flag\"] = (\n",
    "    merged_isin[\"prev_stock\"].notna()\n",
    "    & (merged_isin[\"gap\"].abs() > TOL)\n",
    ")\n",
    "\n",
    "# 6. Summarize ruptures per (Account, ISIN)\n",
    "rupture_isin_summary = (\n",
    "    merged_isin\n",
    "    .groupby([\"Registrar Account - ID\", \"Product - Isin\"])\n",
    "    .agg(\n",
    "        n_ruptures=(\"rupture_flag\", \"sum\"),\n",
    "        obs=(\"rupture_flag\", \"count\"),\n",
    "        rupture_ratio=(\"rupture_flag\", \"mean\"),\n",
    "        max_gap=(\"gap\", lambda x: x.abs().max())\n",
    "    )\n",
    "    .reset_index()\n",
    ")\n",
    "\n",
    "# Sort by worst ISIN trajectories\n",
    "rupture_isin_summary = rupture_isin_summary.sort_values(\n",
    "    \"rupture_ratio\",\n",
    "    ascending=False\n",
    ")"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "16213cb2-07d8-4e82-b9bb-252554ec47b9",
   "metadata": {},
   "source": [
    "# Détection des ruptures"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "id": "78c3db70-e0b6-4de2-92ca-e29cf5bf6bd1",
   "metadata": {},
   "outputs": [],
   "source": [
    "# ============================================================\n",
    "# AUM–FLOW CONSISTENCY & RUPTURE DETECTION (FINAL VERSION)\n",
    "# ============================================================\n",
    "# ------------------------------------------------------------\n",
    "# 1. Keep relevant columns\n",
    "# ------------------------------------------------------------\n",
    "stocks_clean = stocks[[\n",
    "    \"Registrar Account - ID\",\n",
    "    \"Product - Isin\",\n",
    "    \"Centralisation Date\",\n",
    "    \"Quantity - AUM\"\n",
    "]].copy()\n",
    "\n",
    "flows_clean = flows[[\n",
    "    \"Registrar Account - ID\",\n",
    "    \"Product - Isin\",\n",
    "    \"Centralisation Date\",\n",
    "    \"Quantity - NetFlows\"\n",
    "]].copy()\n",
    "\n",
    "\n",
    "# ------------------------------------------------------------\n",
    "# 2. Date formatting\n",
    "# ------------------------------------------------------------\n",
    "stocks_clean[\"Centralisation Date\"] = pd.to_datetime(stocks_clean[\"Centralisation Date\"])\n",
    "flows_clean[\"Centralisation Date\"]  = pd.to_datetime(flows_clean[\"Centralisation Date\"])\n",
    "\n",
    "\n",
    "# ------------------------------------------------------------\n",
    "# 3. Aggregate flows per day\n",
    "# ------------------------------------------------------------\n",
    "flows_clean = (\n",
    "    flows_clean\n",
    "    .groupby(\n",
    "        [\"Registrar Account - ID\", \"Product - Isin\", \"Centralisation Date\"],\n",
    "        as_index=False\n",
    "    )[\"Quantity - NetFlows\"]\n",
    "    .sum()\n",
    ")\n",
    "\n",
    "# ------------------------------------------------------------\n",
    "# 4. Merge stocks and flows\n",
    "# ------------------------------------------------------------\n",
    "df = stocks_clean.merge(\n",
    "    flows_clean,\n",
    "    on=[\"Registrar Account - ID\", \"Product - Isin\", \"Centralisation Date\"],\n",
    "    how=\"left\"\n",
    ")\n",
    "\n",
    "df[\"Quantity - NetFlows\"] = df[\"Quantity - NetFlows\"].fillna(0)\n",
    "\n",
    "\n",
    "# ------------------------------------------------------------\n",
    "# 5. Sort and compute expected stock\n",
    "# ------------------------------------------------------------\n",
    "df = df.sort_values(\n",
    "    [\"Registrar Account - ID\", \"Product - Isin\", \"Centralisation Date\"]\n",
    ")\n",
    "\n",
    "df[\"prev_stock\"] = df.groupby(\n",
    "    [\"Registrar Account - ID\", \"Product - Isin\"]\n",
    ")[\"Quantity - AUM\"].shift(1)\n",
    "\n",
    "df[\"prev_flows\"] = df.groupby(\n",
    "    [\"Registrar Account - ID\", \"Product - Isin\"]\n",
    ")[\"Quantity - NetFlows\"].shift(1).fillna(0)\n",
    "\n",
    "df[\"expected_stock\"] = df[\"prev_stock\"] + df[\"prev_flows\"]\n",
    "\n",
    "\n",
    "# ------------------------------------------------------------\n",
    "# 6. Compute gaps\n",
    "# ------------------------------------------------------------\n",
    "df[\"gap\"] = df[\"Quantity - AUM\"] - df[\"expected_stock\"]\n",
    "df[\"gap_abs\"] = df[\"gap\"].abs()\n",
    "df[\"gap_rel\"] = df[\"gap_abs\"] / df[\"expected_stock\"].abs().clip(lower=1)\n",
    "\n",
    "\n",
    "# ------------------------------------------------------------\n",
    "# 7. Detect ruptures (economic rule)\n",
    "# ------------------------------------------------------------\n",
    "TAU_ABS = 10.0     # minimum absolute gap (shares)\n",
    "TAU_REL = 0.005    # minimum relative gap (0.5%)\n",
    "\n",
    "df[\"rupture_flag\"] = (\n",
    "    df[\"prev_stock\"].notna()\n",
    "    & (df[\"gap_abs\"] > TAU_ABS)\n",
    "    & (df[\"gap_rel\"] > TAU_REL)\n",
    ")\n",
    "\n",
    "\n",
    "# ------------------------------------------------------------\n",
    "# 8. Remove end-of-sample false positives (edge effects)\n",
    "# ------------------------------------------------------------\n",
    "last_date = df[\"Centralisation Date\"].max()\n",
    "\n",
    "df[\"rupture_flag\"] = np.where(\n",
    "    (df[\"rupture_flag\"]) & (df[\"Centralisation Date\"] == last_date),\n",
    "    False,\n",
    "    df[\"rupture_flag\"]\n",
    ")\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "id": "a9783dc1-e225-4142-8b6f-6f9e620b4b3d",
   "metadata": {},
   "outputs": [],
   "source": [
    "# ------------------------------------------------------------\n",
    "# 9. ISIN-level summary (AFTER CLEANING)\n",
    "# ------------------------------------------------------------\n",
    "rupture_isin_summary = (\n",
    "    df.groupby([\"Registrar Account - ID\", \"Product - Isin\"])\n",
    "    .agg(\n",
    "        n_ruptures=(\"rupture_flag\", \"sum\"),\n",
    "        total_obs=(\"rupture_flag\", \"count\"),\n",
    "        rupture_ratio=(\"rupture_flag\", \"mean\"),\n",
    "        max_gap=(\"gap_abs\", \"max\")\n",
    "    )\n",
    "    .reset_index()\n",
    ")\n",
    "\n",
    "\n",
    "# ------------------------------------------------------------\n",
    "# 10. Account-level summary (AFTER CLEANING)\n",
    "# ------------------------------------------------------------\n",
    "rupture_summary = (\n",
    "    df.groupby(\"Registrar Account - ID\")\n",
    "    .agg(\n",
    "        n_ruptures=(\"rupture_flag\", \"sum\"),\n",
    "        total_obs=(\"rupture_flag\", \"count\"),\n",
    "        rupture_ratio=(\"rupture_flag\", \"mean\"),\n",
    "        max_gap=(\"gap_abs\", \"max\")\n",
    "    )\n",
    "    .reset_index()\n",
    ")\n",
    "\n",
    "\n",
    "# ------------------------------------------------------------\n",
    "# 11. Outputs\n",
    "# ------------------------------------------------------------\n",
    "df.to_csv(\"aum_flow_gaps.csv\", index=False)\n",
    "rupture_isin_summary.to_csv(\"rupture_isin_summary.csv\", index=False)\n",
    "rupture_summary.to_csv(\"rupture_summary.csv\", index=False)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "id": "f5b62558-c27a-4428-a193-8b97e0ce6b6a",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "application/vnd.plotly.v1+json": {
       "config": {
        "plotlyServerURL": "https://plot.ly"
       },
       "data": [
        {
         "hole": 0.45,
         "hoverinfo": "label+percent",
         "labels": [
          "Clean / quasi-clean (≤1%)",
          "Moderate (1–10%)",
          "High (10–30%)",
          "Severe (>30%)"
         ],
         "textinfo": "percent",
         "type": "pie",
         "values": {
          "bdata": "AAAAAACASEAAAAAAAIBBQAAAAAAAAChAZmZmZmZmEEA=",
          "dtype": "f8"
         }
        }
       ],
       "layout": {
        "legend": {
         "orientation": "h",
         "title": {
          "text": "Rupture ratio"
         },
         "x": 0.5,
         "xanchor": "center",
         "y": -0.15,
         "yanchor": "top"
        },
        "template": {
         "data": {
          "bar": [
           {
            "error_x": {
             "color": "#2a3f5f"
            },
            "error_y": {
             "color": "#2a3f5f"
            },
            "marker": {
             "line": {
              "color": "#E5ECF6",
              "width": 0.5
             },
             "pattern": {
              "fillmode": "overlay",
              "size": 10,
              "solidity": 0.2
             }
            },
            "type": "bar"
           }
          ],
          "barpolar": [
           {
            "marker": {
             "line": {
              "color": "#E5ECF6",
              "width": 0.5
             },
             "pattern": {
              "fillmode": "overlay",
              "size": 10,
              "solidity": 0.2
             }
            },
            "type": "barpolar"
           }
          ],
          "carpet": [
           {
            "aaxis": {
             "endlinecolor": "#2a3f5f",
             "gridcolor": "white",
             "linecolor": "white",
             "minorgridcolor": "white",
             "startlinecolor": "#2a3f5f"
            },
            "baxis": {
             "endlinecolor": "#2a3f5f",
             "gridcolor": "white",
             "linecolor": "white",
             "minorgridcolor": "white",
             "startlinecolor": "#2a3f5f"
            },
            "type": "carpet"
           }
          ],
          "choropleth": [
           {
            "colorbar": {
             "outlinewidth": 0,
             "ticks": ""
            },
            "type": "choropleth"
           }
          ],
          "contour": [
           {
            "colorbar": {
             "outlinewidth": 0,
             "ticks": ""
            },
            "colorscale": [
             [
              0,
              "#0d0887"
             ],
             [
              0.1111111111111111,
              "#46039f"
             ],
             [
              0.2222222222222222,
              "#7201a8"
             ],
             [
              0.3333333333333333,
              "#9c179e"
             ],
             [
              0.4444444444444444,
              "#bd3786"
             ],
             [
              0.5555555555555556,
              "#d8576b"
             ],
             [
              0.6666666666666666,
              "#ed7953"
             ],
             [
              0.7777777777777778,
              "#fb9f3a"
             ],
             [
              0.8888888888888888,
              "#fdca26"
             ],
             [
              1,
              "#f0f921"
             ]
            ],
            "type": "contour"
           }
          ],
          "contourcarpet": [
           {
            "colorbar": {
             "outlinewidth": 0,
             "ticks": ""
            },
            "type": "contourcarpet"
           }
          ],
          "heatmap": [
           {
            "colorbar": {
             "outlinewidth": 0,
             "ticks": ""
            },
            "colorscale": [
             [
              0,
              "#0d0887"
             ],
             [
              0.1111111111111111,
              "#46039f"
             ],
             [
              0.2222222222222222,
              "#7201a8"
             ],
             [
              0.3333333333333333,
              "#9c179e"
             ],
             [
              0.4444444444444444,
              "#bd3786"
             ],
             [
              0.5555555555555556,
              "#d8576b"
             ],
             [
              0.6666666666666666,
              "#ed7953"
             ],
             [
              0.7777777777777778,
              "#fb9f3a"
             ],
             [
              0.8888888888888888,
              "#fdca26"
             ],
             [
              1,
              "#f0f921"
             ]
            ],
            "type": "heatmap"
           }
          ],
          "histogram": [
           {
            "marker": {
             "pattern": {
              "fillmode": "overlay",
              "size": 10,
              "solidity": 0.2
             }
            },
            "type": "histogram"
           }
          ],
          "histogram2d": [
           {
            "colorbar": {
             "outlinewidth": 0,
             "ticks": ""
            },
            "colorscale": [
             [
              0,
              "#0d0887"
             ],
             [
              0.1111111111111111,
              "#46039f"
             ],
             [
              0.2222222222222222,
              "#7201a8"
             ],
             [
              0.3333333333333333,
              "#9c179e"
             ],
             [
              0.4444444444444444,
              "#bd3786"
             ],
             [
              0.5555555555555556,
              "#d8576b"
             ],
             [
              0.6666666666666666,
              "#ed7953"
             ],
             [
              0.7777777777777778,
              "#fb9f3a"
             ],
             [
              0.8888888888888888,
              "#fdca26"
             ],
             [
              1,
              "#f0f921"
             ]
            ],
            "type": "histogram2d"
           }
          ],
          "histogram2dcontour": [
           {
            "colorbar": {
             "outlinewidth": 0,
             "ticks": ""
            },
            "colorscale": [
             [
              0,
              "#0d0887"
             ],
             [
              0.1111111111111111,
              "#46039f"
             ],
             [
              0.2222222222222222,
              "#7201a8"
             ],
             [
              0.3333333333333333,
              "#9c179e"
             ],
             [
              0.4444444444444444,
              "#bd3786"
             ],
             [
              0.5555555555555556,
              "#d8576b"
             ],
             [
              0.6666666666666666,
              "#ed7953"
             ],
             [
              0.7777777777777778,
              "#fb9f3a"
             ],
             [
              0.8888888888888888,
              "#fdca26"
             ],
             [
              1,
              "#f0f921"
             ]
            ],
            "type": "histogram2dcontour"
           }
          ],
          "mesh3d": [
           {
            "colorbar": {
             "outlinewidth": 0,
             "ticks": ""
            },
            "type": "mesh3d"
           }
          ],
          "parcoords": [
           {
            "line": {
             "colorbar": {
              "outlinewidth": 0,
              "ticks": ""
             }
            },
            "type": "parcoords"
           }
          ],
          "pie": [
           {
            "automargin": true,
            "type": "pie"
           }
          ],
          "scatter": [
           {
            "fillpattern": {
             "fillmode": "overlay",
             "size": 10,
             "solidity": 0.2
            },
            "type": "scatter"
           }
          ],
          "scatter3d": [
           {
            "line": {
             "colorbar": {
              "outlinewidth": 0,
              "ticks": ""
             }
            },
            "marker": {
             "colorbar": {
              "outlinewidth": 0,
              "ticks": ""
             }
            },
            "type": "scatter3d"
           }
          ],
          "scattercarpet": [
           {
            "marker": {
             "colorbar": {
              "outlinewidth": 0,
              "ticks": ""
             }
            },
            "type": "scattercarpet"
           }
          ],
          "scattergeo": [
           {
            "marker": {
             "colorbar": {
              "outlinewidth": 0,
              "ticks": ""
             }
            },
            "type": "scattergeo"
           }
          ],
          "scattergl": [
           {
            "marker": {
             "colorbar": {
              "outlinewidth": 0,
              "ticks": ""
             }
            },
            "type": "scattergl"
           }
          ],
          "scattermap": [
           {
            "marker": {
             "colorbar": {
              "outlinewidth": 0,
              "ticks": ""
             }
            },
            "type": "scattermap"
           }
          ],
          "scattermapbox": [
           {
            "marker": {
             "colorbar": {
              "outlinewidth": 0,
              "ticks": ""
             }
            },
            "type": "scattermapbox"
           }
          ],
          "scatterpolar": [
           {
            "marker": {
             "colorbar": {
              "outlinewidth": 0,
              "ticks": ""
             }
            },
            "type": "scatterpolar"
           }
          ],
          "scatterpolargl": [
           {
            "marker": {
             "colorbar": {
              "outlinewidth": 0,
              "ticks": ""
             }
            },
            "type": "scatterpolargl"
           }
          ],
          "scatterternary": [
           {
            "marker": {
             "colorbar": {
              "outlinewidth": 0,
              "ticks": ""
             }
            },
            "type": "scatterternary"
           }
          ],
          "surface": [
           {
            "colorbar": {
             "outlinewidth": 0,
             "ticks": ""
            },
            "colorscale": [
             [
              0,
              "#0d0887"
             ],
             [
              0.1111111111111111,
              "#46039f"
             ],
             [
              0.2222222222222222,
              "#7201a8"
             ],
             [
              0.3333333333333333,
              "#9c179e"
             ],
             [
              0.4444444444444444,
              "#bd3786"
             ],
             [
              0.5555555555555556,
              "#d8576b"
             ],
             [
              0.6666666666666666,
              "#ed7953"
             ],
             [
              0.7777777777777778,
              "#fb9f3a"
             ],
             [
              0.8888888888888888,
              "#fdca26"
             ],
             [
              1,
              "#f0f921"
             ]
            ],
            "type": "surface"
           }
          ],
          "table": [
           {
            "cells": {
             "fill": {
              "color": "#EBF0F8"
             },
             "line": {
              "color": "white"
             }
            },
            "header": {
             "fill": {
              "color": "#C8D4E3"
             },
             "line": {
              "color": "white"
             }
            },
            "type": "table"
           }
          ]
         },
         "layout": {
          "annotationdefaults": {
           "arrowcolor": "#2a3f5f",
           "arrowhead": 0,
           "arrowwidth": 1
          },
          "autotypenumbers": "strict",
          "coloraxis": {
           "colorbar": {
            "outlinewidth": 0,
            "ticks": ""
           }
          },
          "colorscale": {
           "diverging": [
            [
             0,
             "#8e0152"
            ],
            [
             0.1,
             "#c51b7d"
            ],
            [
             0.2,
             "#de77ae"
            ],
            [
             0.3,
             "#f1b6da"
            ],
            [
             0.4,
             "#fde0ef"
            ],
            [
             0.5,
             "#f7f7f7"
            ],
            [
             0.6,
             "#e6f5d0"
            ],
            [
             0.7,
             "#b8e186"
            ],
            [
             0.8,
             "#7fbc41"
            ],
            [
             0.9,
             "#4d9221"
            ],
            [
             1,
             "#276419"
            ]
           ],
           "sequential": [
            [
             0,
             "#0d0887"
            ],
            [
             0.1111111111111111,
             "#46039f"
            ],
            [
             0.2222222222222222,
             "#7201a8"
            ],
            [
             0.3333333333333333,
             "#9c179e"
            ],
            [
             0.4444444444444444,
             "#bd3786"
            ],
            [
             0.5555555555555556,
             "#d8576b"
            ],
            [
             0.6666666666666666,
             "#ed7953"
            ],
            [
             0.7777777777777778,
             "#fb9f3a"
            ],
            [
             0.8888888888888888,
             "#fdca26"
            ],
            [
             1,
             "#f0f921"
            ]
           ],
           "sequentialminus": [
            [
             0,
             "#0d0887"
            ],
            [
             0.1111111111111111,
             "#46039f"
            ],
            [
             0.2222222222222222,
             "#7201a8"
            ],
            [
             0.3333333333333333,
             "#9c179e"
            ],
            [
             0.4444444444444444,
             "#bd3786"
            ],
            [
             0.5555555555555556,
             "#d8576b"
            ],
            [
             0.6666666666666666,
             "#ed7953"
            ],
            [
             0.7777777777777778,
             "#fb9f3a"
            ],
            [
             0.8888888888888888,
             "#fdca26"
            ],
            [
             1,
             "#f0f921"
            ]
           ]
          },
          "colorway": [
           "#636efa",
           "#EF553B",
           "#00cc96",
           "#ab63fa",
           "#FFA15A",
           "#19d3f3",
           "#FF6692",
           "#B6E880",
           "#FF97FF",
           "#FECB52"
          ],
          "font": {
           "color": "#2a3f5f"
          },
          "geo": {
           "bgcolor": "white",
           "lakecolor": "white",
           "landcolor": "#E5ECF6",
           "showlakes": true,
           "showland": true,
           "subunitcolor": "white"
          },
          "hoverlabel": {
           "align": "left"
          },
          "hovermode": "closest",
          "mapbox": {
           "style": "light"
          },
          "paper_bgcolor": "white",
          "plot_bgcolor": "#E5ECF6",
          "polar": {
           "angularaxis": {
            "gridcolor": "white",
            "linecolor": "white",
            "ticks": ""
           },
           "bgcolor": "#E5ECF6",
           "radialaxis": {
            "gridcolor": "white",
            "linecolor": "white",
            "ticks": ""
           }
          },
          "scene": {
           "xaxis": {
            "backgroundcolor": "#E5ECF6",
            "gridcolor": "white",
            "gridwidth": 2,
            "linecolor": "white",
            "showbackground": true,
            "ticks": "",
            "zerolinecolor": "white"
           },
           "yaxis": {
            "backgroundcolor": "#E5ECF6",
            "gridcolor": "white",
            "gridwidth": 2,
            "linecolor": "white",
            "showbackground": true,
            "ticks": "",
            "zerolinecolor": "white"
           },
           "zaxis": {
            "backgroundcolor": "#E5ECF6",
            "gridcolor": "white",
            "gridwidth": 2,
            "linecolor": "white",
            "showbackground": true,
            "ticks": "",
            "zerolinecolor": "white"
           }
          },
          "shapedefaults": {
           "line": {
            "color": "#2a3f5f"
           }
          },
          "ternary": {
           "aaxis": {
            "gridcolor": "white",
            "linecolor": "white",
            "ticks": ""
           },
           "baxis": {
            "gridcolor": "white",
            "linecolor": "white",
            "ticks": ""
           },
           "bgcolor": "#E5ECF6",
           "caxis": {
            "gridcolor": "white",
            "linecolor": "white",
            "ticks": ""
           }
          },
          "title": {
           "x": 0.05
          },
          "xaxis": {
           "automargin": true,
           "gridcolor": "white",
           "linecolor": "white",
           "ticks": "",
           "title": {
            "standoff": 15
           },
           "zerolinecolor": "white",
           "zerolinewidth": 2
          },
          "yaxis": {
           "automargin": true,
           "gridcolor": "white",
           "linecolor": "white",
           "ticks": "",
           "title": {
            "standoff": 15
           },
           "zerolinecolor": "white",
           "zerolinewidth": 2
          }
         }
        }
       }
      },
      "image/png": "iVBORw0KGgoAAAANSUhEUgAAAzkAAAFoCAYAAAB0XzViAAAQAElEQVR4AezdB5wcdf3/8c+Wy6WTTggtARJ670QhoFQTqqEjKCCISFVKUIyKoXdEEFD8I/ATREFKaAIWeu+B0CEhCQTSLneX2/Lf94Q5Jpu9u929LVNePPhmdme+853v9/md3Z3PfGfm4ln+QwABBBBAAAEEEEAAAQRCJBA3/kMAgQICzEIAAQQQQAABBBAIqgBBTlB7jnojgAAC9RBgmwgggAACCARAgCAnAJ1EFRFAAAEEEEDA3wLUDgEE/CVAkOOv/qA2CCCAAAIIIIAAAgiERaBu7SDIqRs9G0YAAQQQQAABBBBAAIFqCBDkVEOVMisnQEkIIIAAAggggAACCJQoQJBTIhjZEUAAAT8IUAcEEEAAAQQQ6FiAIKdjG5YggAACCCCAQLAEqC0CCCDgCBDkOAz8gwACCCCAAAIIIIBAWAWi1y6CnOj1OS1GAAEEEEAAAQQQQCDUAgQ5oe7eyjWOkhBAAAEEEEAAAQQQCIoAQU5Qeop6IoCAHwWoEwIIIIAAAgj4UIAgx4edQpUQQAABBBAItgC1RwABBOorQJBTX3+2jgACCCCAAAIIIBAVAdpZMwGCnJpRsyEEEEAAAQQQQAABBBCohQBBTi2UK7cNSkIAAQQQQAABBBBAAIEuBAhyugBiMQIIBEGAOiKAAAIIIIAAAl8LEOR8bcErBBBAAAEEwiVAaxBAAIGIChDkRLTjaTYCCCCAAAIIIBBVAdodfgGCnPD3MS1EAAEEEEAAAQQQQCBSAgQ5ZXU3KyGAAAIIIIAAAggggIBfBQhy/Noz1AuBIApQZwQQQAABBBBAwAcCBDk+6ASqgAACCCAQbgFahwACCCBQWwGCnNp6szUEEEAAAQQQQACBpQL8i0DVBAhyqkZLwQgggAACCCCAAAIIIFAPgWAHOfUQY5sIIIAAAggggAACCCDgawGCHF93D5VDoDwB1kIAAQQQQAABBKIsQJAT5d6n7QgggEC0BGgtAggggEBEBAhyItLRNBMBBBBAAAEEECgswFwEwidAkBO+PqVFCCCAAAIIIIAAAghEWqAiQU6kBWk8AggggAACCCCAAAII+EqAIMdX3UFlQiZAcxBAAAEEEEAAAQTqIECQUwd0NokAAghEW4DWI4AAAgggUF0Bgpzq+lI6AggggAACCCBQnAC5EECgYgIEORWjpCAEEEAAAQQQQAABBBCotEA55RHklKPGOggggAACCCCAAAIIIOBbAYIc33YNFaucACUhgAACCCCAAAIIREmAICdKvU1bEUAAAa8ArxFAAAEEEAipAEFOSDuWZiGAAAIIIIBAeQKshQACwRcgyAl+H9ICBBBAAAEEEEAAAQSqLRCo8glyAtVdVBYBBBBAAAEEEEAAAQS6EiDI6UqI5ZUToCQEEEAAAQQQQAABBGogQJBTA2Q2gQACCHQmwDIEEEAAAQQQqKwAQU5lPSkNAQQQQAABBCojQCkIIIBA2QIEOWXTsSICCCCAAAIIIIAAArUWYHvFCBDkFKNEHgQQQAABBBBAAAEEEAiMAEFOYLqqchWlJAQQQAABBBBAAAEEwixAkBPm3qVtCCBQigB5EUAAAQQQQCAkAgQ5IelImoEAAggggEB1BCgVAQQQCJ4AQU7w+owaI4AAAggggAACCNRbgO37WoAgx9fdQ+UQQAABBBBAAAEEEECgVAGCnFLFKpefkhBAAAEEEEAAAQQQQKAKAgQ5VUClSAQQ6I4A6yKAAAIIIIAAAt0TIMjpnh9rI4AAAgggUBsBtoIAAgggULQAQU7RVGREAAEEEEAAAQQQ8JsA9UGgkABBTiEV5iGAAAIIIIAAAggggEBgBQhyLLB9R8URQAABBBBAAAEEEECggABBTgEUZiGAgJmBgAACCCCAAAIIBFSAICegHUe1EUAAAQTqI8BWEUAAAQT8L0CQ4/8+ooYIIIAAAggggIDfBagfAr4SIMjxVXdQGQQQQAABBBBAAAEEEOiugH+CnO62hPURQAABBBBAAAEEEEAAgZwAQU4Ogf8R8LMAdUMAAQQQQAABBBAoTYAgpzQvciOAAAII+EOAWiCAAAIIINChAEFOhzQsQAABBBBAAAEEgiZAfRFAQAIEOVIgIYAAAggggAACCCCAQGgElgtyQtMyGoIAAggggAACCCCAAAKRFCDIiWS30+gyBFgFAQQQQAABBBBAICACBDkB6SiqiQACCPhTgFohgAACCCDgPwGCHP/1CTVCAAEEEEAAgaALUH8EEKirAEFOXfnZOAIIIIAAAggggAAC0RGoVUsJcmolzXYQQAABBBBAAAEEEECgJgIEOTVhZiOVE6AkBBBAAAEEEEAAAQQ6FyDI6dyHpQgggEAwBKglAggggAACCLQLEOS0U/ACAQQQQAABBMImQHsQQCCaAgQ50ex3Wo0AAggggAACCCAQXYHQt5wgJ/RdTAMRQAABBBBAAAEEEIiWAEFOtPq7cq2lJAQQQAABBBBAAAEEfCpAkOPTjqFaCCAQTAFqjQACCCCAAAL1FyDIqX8fUAMEEEAAAQTCLkD7EEAAgZoKEOTUlJuNIYAAAggggAACCCDgCjCtlgBBTrVkKRcBBBBAAAEEEEAAAQTqIkCQUxf2ym2UkhBAAAEEEEAAAQQQQGBZAYKcZT14hwAC4RCgFQgggAACCCAQYQGCnAh3Pk1HAAEEEIiaAO1FAAEEoiFAkBONfqaVCCCAAAIIIIAAAh0JMD90AgQ5oetSGoQAAggggAACCCCAQLQFCHIq0/+UggACCCCAAAIIIIAAAj4RIMjxSUdQDQTCKUCrEEAAAQQQQACB2gsQ5NTenC0igAACCERdgPYjgAACCFRVgCCnqrwUjgACCCCAAAIIIFCsAPkQqJQAQU6lJCkHAQQQQAABBBBAAAEEfCEQsiDHF6ZUAgEEEEAAAQQQQAABBOooQJBTR3w2jUDNBNgQAggggAACCCAQIQGCnAh1Nk1FAAEEEFhWgHcIIIAAAuEUIMgJZ7/SKgQQQAABBBBAoFwB1kMg8AIEOYHvQhqAAAIIIIAAAggggAACXoHqBDneLfAaAQQQQAABBBBAAAEEEKihAEFODbHZFAIIIIAAAggggAACCFRfgCCn+sZsAQEEEAilQPqtV63t2f/aksfus9Z7b7OW226w5hsvt8W/+601XXCGLZr8E1v4syNswXH72bwjdrWXzv2jHX9am/3sl232y/NSdt7lKbv82pRd9//SdvPtabvzvrQ98p+Mvfxa1j6ZmbXWJaFko1EIIIAAAjUQiNdgG2wCAQQQQCDIAtmsZT5535Y8eq8tvu4iW3ja923egTvYwl/8yJouPNMWXz3Fmv98hbX87U/Wet/ttuTfU63tuf9Z6o0XLf3hO5b5fLbZ4iaLZTPW0mr25TyzGZ9m7Z33svbqG1l7+vmMPfq/jN3zQMZuuSNtV16Xssnnp+zHP2uzn5ze5ry+8g8pu+VvaXvs8Yx9misuyJzUHYHiBciJAALlCsTLXZH1EEAAAQTCKZBdMM8ZoWm59Vpb9OsTnVGYBaccZot/f64teehOS38w3SyTrknjm1vMGdV5+fWsPfLfjP3ltrT9YkqbnXxWm13zp5Qzb2YuYKpJZdgIAggggIA/BIqoBUFOEUhkQQABBMIukHrtBVt8zXk2/0f72vyjxjsjNC3/uMlSrz1v1rzYd81fuMjsuZeyzujO2eel7MQz2+x3N6ScEaEm/1XXd35UCAEEEAi7AEFO2HuY9hUSYB4CCOQEdClZ81+utvnH7pMbsTnBljxyj2XnzsktCd7/CmxefCXr3NtzSm6UR/f6PPVcxlpbg9cWaowAAggg0H0BgpzuG1ICAgggEBiB7OezreXOm2zBqYc5DwVo/ectlv3is6/qH45JOmPOvT7X35S2k3IBzzU3pk0BUKo2V9iFA5FWIIAAAgEXIMgJeAdSfQQQQKArgeyihc69NIvOPs7mH7eftdxyrWU+fr+r1UKxvK3N7LkXM86lbCed2WY3/y1tc78IRdNoRC0F2BYCCAROgCAncF1GhRFAAIH
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "# Base\n",
    "rs = rupture_summary.copy()\n",
    "\n",
    "# Classes simplifiées\n",
    "bins = [0, 0.01, 0.10, 0.30, 1.01]\n",
    "labels = [\n",
    "    \"Clean / quasi-clean (≤1%)\",\n",
    "    \"Moderate (1–10%)\",\n",
    "    \"High (10–30%)\",\n",
    "    \"Severe (>30%)\"\n",
    "]\n",
    "\n",
    "rs[\"rupture_class\"] = pd.cut(\n",
    "    rs[\"rupture_ratio\"],\n",
    "    bins=bins,\n",
    "    labels=labels,\n",
    "    include_lowest=True\n",
    ")\n",
    "\n",
    "# Distribution en %\n",
    "dist = (\n",
    "    rs[\"rupture_class\"]\n",
    "    .value_counts(normalize=True)\n",
    "    .sort_index()\n",
    "    * 100\n",
    ").round(1)\n",
    "\n",
    "# Donut chart\n",
    "fig = go.Figure(\n",
    "    data=[go.Pie(\n",
    "        labels=dist.index,\n",
    "        values=dist.values,\n",
    "        hole=0.45,\n",
    "        textinfo=\"percent\",\n",
    "        hoverinfo=\"label+percent\"\n",
    "    )]\n",
    ")\n",
    "\n",
    "fig.update_layout(\n",
    "    legend=dict(\n",
    "        orientation=\"h\",      # horizontale\n",
    "        yanchor=\"top\",\n",
    "        y=-0.15,              # en dessous du graphe\n",
    "        xanchor=\"center\",\n",
    "        x=0.5\n",
    "    ),\n",
    "    legend_title_text=\"Rupture ratio\"\n",
    ")\n",
    "\n",
    "fig.show()\n"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "e52cd650-df05-490d-af59-e66c058f955d",
   "metadata": {},
   "source": [
    "## AUM–FLOW CONSISTENCY & DISCONTINUITY DETECTION"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "id": "a7efe494-f5fa-43f8-8446-942fc2d3bd4c",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Detection threshold epsilon (trimmed 99th percentile): 40.03%\n"
     ]
    }
   ],
   "source": [
    "# ------------------------------------------------------------\n",
    "# 1. Keep relevant columns\n",
    "# ------------------------------------------------------------\n",
    "stocks_clean = stocks[\n",
    "    [\"Registrar Account - ID\", \"Product - Isin\", \"Centralisation Date\", \"Quantity - AUM\"]\n",
    "].copy()\n",
    "\n",
    "flows_clean = flows[\n",
    "    [\"Registrar Account - ID\", \"Product - Isin\", \"Centralisation Date\", \"Quantity - NetFlows\"]\n",
    "].copy()\n",
    "\n",
    "# ------------------------------------------------------------\n",
    "# 2. Date formatting\n",
    "# ------------------------------------------------------------\n",
    "stocks_clean[\"Centralisation Date\"] = pd.to_datetime(stocks_clean[\"Centralisation Date\"])\n",
    "flows_clean[\"Centralisation Date\"]  = pd.to_datetime(flows_clean[\"Centralisation Date\"])\n",
    "\n",
    "# ------------------------------------------------------------\n",
    "# 3. Aggregate flows per day\n",
    "# ------------------------------------------------------------\n",
    "flows_clean = (\n",
    "    flows_clean\n",
    "    .groupby(\n",
    "        [\"Registrar Account - ID\", \"Product - Isin\", \"Centralisation Date\"],\n",
    "        as_index=False\n",
    "    )[\"Quantity - NetFlows\"]\n",
    "    .sum()\n",
    ")\n",
    "\n",
    "# ------------------------------------------------------------\n",
    "# 4. Merge stocks and flows\n",
    "# ------------------------------------------------------------\n",
    "df = stocks_clean.merge(\n",
    "    flows_clean,\n",
    "    on=[\"Registrar Account - ID\", \"Product - Isin\", \"Centralisation Date\"],\n",
    "    how=\"left\"\n",
    ")\n",
    "\n",
    "df[\"Quantity - NetFlows\"] = df[\"Quantity - NetFlows\"].fillna(0)\n",
    "\n",
    "# ------------------------------------------------------------\n",
    "# 5. Sort and reconstruct expected stock\n",
    "# ------------------------------------------------------------\n",
    "df = df.sort_values(\n",
    "    [\"Registrar Account - ID\", \"Product - Isin\", \"Centralisation Date\"]\n",
    ")\n",
    "\n",
    "df[\"prev_stock\"] = (\n",
    "    df.groupby([\"Registrar Account - ID\", \"Product - Isin\"])\n",
    "      [\"Quantity - AUM\"]\n",
    "      .shift(1)\n",
    ")\n",
    "\n",
    "df[\"prev_flows\"] = (\n",
    "    df.groupby([\"Registrar Account - ID\", \"Product - Isin\"])\n",
    "      [\"Quantity - NetFlows\"]\n",
    "      .shift(1)\n",
    "      .fillna(0)\n",
    ")\n",
    "\n",
    "df[\"expected_stock\"] = df[\"prev_stock\"] + df[\"prev_flows\"]\n",
    "\n",
    "# ------------------------------------------------------------\n",
    "# 6. Compute accounting gaps\n",
    "# ------------------------------------------------------------\n",
    "df[\"gap\"] = df[\"Quantity - AUM\"] - df[\"expected_stock\"]\n",
    "df[\"gap_abs\"] = df[\"gap\"].abs()\n",
    "\n",
    "# Relative gap normalised by previous stock\n",
    "df[\"gap_rel\"] = (\n",
    "    df[\"gap_abs\"] /\n",
    "    df[\"prev_stock\"].abs().replace(0, np.nan)\n",
    ")\n",
    "\n",
    "# ------------------------------------------------------------\n",
    "# 7. Calibration sample (valid regime)\n",
    "# ------------------------------------------------------------\n",
    "valid_gaps = df.loc[\n",
    "    df[\"gap_rel\"].notna() & (df[\"prev_stock\"] > 0),\n",
    "    \"gap_rel\"\n",
    "]\n",
    "\n",
    "# ------------------------------------------------------------\n",
    "# 8. Robust, data-driven threshold (epsilon)\n",
    "# ------------------------------------------------------------\n",
    "# Step 1 — trim extreme breaks to avoid calibrating on resets\n",
    "gap_rel_trimmed = valid_gaps[\n",
    "    valid_gaps <= valid_gaps.quantile(0.90)\n",
    "]\n",
    "\n",
    "# Step 2 — define epsilon on the upper tail of the trimmed distribution\n",
    "EPSILON = gap_rel_trimmed.quantile(0.99)\n",
    "\n",
    "# ------------------------------------------------------------\n",
    "# 9. Detect discontinuities (diagnostic rule)\n",
    "# ------------------------------------------------------------\n",
    "df[\"rupture_flag\"] = (\n",
    "    df[\"prev_stock\"].notna()\n",
    "    & (df[\"prev_stock\"] > 0)\n",
    "    & (df[\"gap_rel\"] > EPSILON)\n",
    ")\n",
    "\n",
    "# ------------------------------------------------------------\n",
    "# 10. Remove end-of-sample edge effects\n",
    "# ------------------------------------------------------------\n",
    "last_date = df[\"Centralisation Date\"].max()\n",
    "\n",
    "df.loc[\n",
    "    (df[\"rupture_flag\"]) &\n",
    "    (df[\"Centralisation Date\"] == last_date),\n",
    "    \"rupture_flag\"\n",
    "] = False\n",
    "\n",
    "# ------------------------------------------------------------\n",
    "# 11. ISIN-level summary\n",
    "# ------------------------------------------------------------\n",
    "rupture_isin_summary = (\n",
    "    df.groupby([\"Registrar Account - ID\", \"Product - Isin\"])\n",
    "      .agg(\n",
    "          n_ruptures=(\"rupture_flag\", \"sum\"),\n",
    "          total_obs=(\"rupture_flag\", \"count\"),\n",
    "          rupture_ratio=(\"rupture_flag\", \"mean\"),\n",
    "          max_gap_abs=(\"gap_abs\", \"max\"),\n",
    "          max_gap_rel=(\"gap_rel\", \"max\")\n",
    "      )\n",
    "      .reset_index()\n",
    ")\n",
    "\n",
    "# ------------------------------------------------------------\n",
    "# 12. Account-level summary\n",
    "# ------------------------------------------------------------\n",
    "rupture_summary = (\n",
    "    df.groupby(\"Registrar Account - ID\")\n",
    "      .agg(\n",
    "          n_ruptures=(\"rupture_flag\", \"sum\"),\n",
    "          total_obs=(\"rupture_flag\", \"count\"),\n",
    "          rupture_ratio=(\"rupture_flag\", \"mean\"),\n",
    "          max_gap_abs=(\"gap_abs\", \"max\"),\n",
    "          max_gap_rel=(\"gap_rel\", \"max\")\n",
    "      )\n",
    "      .reset_index()\n",
    ")\n",
    "\n",
    "# ------------------------------------------------------------\n",
    "# 13. Outputs\n",
    "# ------------------------------------------------------------\n",
    "df.to_csv(\"aum_flow_gaps.csv\", index=False)\n",
    "rupture_isin_summary.to_csv(\"rupture_isin_summary.csv\", index=False)\n",
    "rupture_summary.to_csv(\"rupture_summary.csv\", index=False)\n",
    "\n",
    "print(f\"Detection threshold epsilon (trimmed 99th percentile): {EPSILON:.2%}\")\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "id": "d7454212-1493-4715-a436-c331931f92fa",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>Registrar Account - ID</th>\n",
       "      <th>Product - Isin</th>\n",
       "      <th>n_ruptures</th>\n",
       "      <th>total_obs</th>\n",
       "      <th>rupture_ratio</th>\n",
       "      <th>max_gap_abs</th>\n",
       "      <th>max_gap_rel</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>59545</th>\n",
       "      <td>200127410</td>\n",
       "      <td>FR0010135103</td>\n",
       "      <td>384</td>\n",
       "      <td>436</td>\n",
       "      <td>0.880734</td>\n",
       "      <td>295985.42</td>\n",
       "      <td>3371.158214</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "      Registrar Account - ID Product - Isin  n_ruptures  total_obs  \\\n",
       "59545              200127410   FR0010135103         384        436   \n",
       "\n",
       "       rupture_ratio  max_gap_abs  max_gap_rel  \n",
       "59545       0.880734    295985.42  3371.158214  "
      ]
     },
     "execution_count": 10,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "rupture_isin_summary.sort_values(\"rupture_ratio\").head(1)\n",
    "rupture_isin_summary.sort_values(\"rupture_ratio\", ascending=False).head(1)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 25,
   "id": "b4040847-e0cf-4aa5-966c-d1fbf3935b7d",
   "metadata": {},
   "outputs": [],
   "source": [
    "def plot_isin_evolution(df, account_id, isin, title_suffix=\"\"):\n",
    "    sub = df[\n",
    "        (df[\"Registrar Account - ID\"] == account_id) &\n",
    "        (df[\"Product - Isin\"] == isin)\n",
    "    ].copy()\n",
    "\n",
    "    if sub.empty:\n",
    "        print(\"No data for this (account, ISIN).\")\n",
    "        return\n",
    "\n",
    "    plt.figure(figsize=(10,4))\n",
    "\n",
    "    # Stock observé\n",
    "    plt.plot(\n",
    "        sub[\"Centralisation Date\"],\n",
    "        sub[\"Quantity - AUM\"],\n",
    "        label=\"Observed stock\",\n",
    "        linewidth=2\n",
    "    )\n",
    "\n",
    "    # Stock attendu\n",
    "    plt.plot(\n",
    "        sub[\"Centralisation Date\"],\n",
    "        sub[\"expected_stock\"],\n",
    "        label=\"Expected stock\",\n",
    "        linestyle=\"--\"\n",
    "    )\n",
    "\n",
    "    # Ruptures\n",
    "    rupt = sub[sub[\"rupture_flag\"]]\n",
    "    plt.scatter(\n",
    "        rupt[\"Centralisation Date\"],\n",
    "        rupt[\"Quantity - AUM\"],\n",
    "        color=\"red\",\n",
    "        label=\"Rupture\",\n",
    "        zorder=5\n",
    "    )\n",
    "\n",
    "    plt.title(f\"ISIN {isin} — Account {account_id} {title_suffix}\")\n",
    "    plt.xlabel(\"Date\")\n",
    "    plt.ylabel(\"AUM (shares)\")\n",
    "    plt.legend()\n",
    "    plt.grid(True)\n",
    "    plt.tight_layout()\n",
    "    plt.show()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 27,
   "id": "e5d7a5ab-40bd-452d-a6ae-d56e220c592f",
   "metadata": {},
   "outputs": [
    {
     "ename": "NameError",
     "evalue": "name 'plot_isin_dynamics' is not defined",
     "output_type": "error",
     "traceback": [
      "\u001b[31m---------------------------------------------------------------------------\u001b[39m",
      "\u001b[31mNameError\u001b[39m                                 Traceback (most recent call last)",
      "\u001b[36mCell\u001b[39m\u001b[36m \u001b[39m\u001b[32mIn[27]\u001b[39m\u001b[32m, line 63\u001b[39m\n\u001b[32m     58\u001b[39m     plt.show()\n\u001b[32m     62\u001b[39m \u001b[38;5;28;01mfor\u001b[39;00m _, row \u001b[38;5;129;01min\u001b[39;00m sample_isin.iterrows():\n\u001b[32m---> \u001b[39m\u001b[32m63\u001b[39m     \u001b[43mplot_isin_dynamics\u001b[49m(\n\u001b[32m     64\u001b[39m         df,\n\u001b[32m     65\u001b[39m         row[\u001b[33m\"\u001b[39m\u001b[33mRegistrar Account - ID\u001b[39m\u001b[33m\"\u001b[39m],\n\u001b[32m     66\u001b[39m         row[\u001b[33m\"\u001b[39m\u001b[33mProduct - Isin\u001b[39m\u001b[33m\"\u001b[39m]\n\u001b[32m     67\u001b[39m     )\n",
      "\u001b[31mNameError\u001b[39m: name 'plot_isin_dynamics' is not defined"
     ]
    }
   ],
   "source": [
    "# Option B (alternative) : les plus sévères\n",
    "# sample_isin = problematic_isin.sort_values(\n",
    "#     \"rupture_ratio\", ascending=False\n",
    "# ).head(10)\n",
    "\n",
    "sample_isin = rupture_isin_summary.sort_values(\n",
    "    \"rupture_ratio\",\n",
    "    ascending=False\n",
    ").head(10)\n",
    "\n",
    "def plot_isin_dynamics_clean(df, account_id, isin):\n",
    "    sub = df[\n",
    "        (df[\"Registrar Account - ID\"] == account_id) &\n",
    "        (df[\"Product - Isin\"] == isin)\n",
    "    ].sort_values(\"Centralisation Date\")\n",
    "\n",
    "    if sub.empty:\n",
    "        return\n",
    "\n",
    "    fig, ax = plt.subplots(figsize=(7.5, 3))\n",
    "\n",
    "    # AUM observé\n",
    "    ax.plot(\n",
    "        sub[\"Centralisation Date\"],\n",
    "        sub[\"Quantity - AUM\"],\n",
    "        label=\"Observed AUM\",\n",
    "        linewidth=2,\n",
    "        color=\"black\"\n",
    "    )\n",
    "\n",
    "    # AUM attendu\n",
    "    ax.plot(\n",
    "        sub[\"Centralisation Date\"],\n",
    "        sub[\"expected_stock\"],\n",
    "        label=\"Flow-implied AUM\",\n",
    "        linestyle=\"--\",\n",
    "        linewidth=2,\n",
    "        color=\"grey\"\n",
    "    )\n",
    "\n",
    "    # Ruptures\n",
    "    rupt = sub[sub[\"rupture_flag\"]]\n",
    "    ax.scatter(\n",
    "        rupt[\"Centralisation Date\"],\n",
    "        rupt[\"Quantity - AUM\"],\n",
    "        color=\"red\",\n",
    "        s=25,\n",
    "        zorder=5,\n",
    "        label=\"Discontinuity\"\n",
    "    )\n",
    "\n",
    "    ax.set_title(f\"Account {account_id} — ISIN {isin}\", fontsize=11)\n",
    "    ax.set_xlabel(\"\")\n",
    "    ax.set_ylabel(\"AUM (shares)\")\n",
    "    ax.legend(loc=\"best\")\n",
    "\n",
    "    plt.tight_layout()\n",
    "    plt.show()\n",
    "\n",
    "\n",
    "\n",
    "for _, row in sample_isin.iterrows():\n",
    "    plot_isin_dynamics(\n",
    "        df,\n",
    "        row[\"Registrar Account - ID\"],\n",
    "        row[\"Product - Isin\"]\n",
    "    )"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 28,
   "id": "aef8ceb9-28a6-4908-ae24-a88d85b64309",
   "metadata": {},
   "outputs": [
    {
     "ename": "KeyError",
     "evalue": "\"Column(s) ['rupture_flag'] do not exist\"",
     "output_type": "error",
     "traceback": [
      "\u001b[31m---------------------------------------------------------------------------\u001b[39m",
      "\u001b[31mKeyError\u001b[39m                                  Traceback (most recent call last)",
      "\u001b[36mCell\u001b[39m\u001b[36m \u001b[39m\u001b[32mIn[28]\u001b[39m\u001b[32m, line 6\u001b[39m\n\u001b[32m      1\u001b[39m \u001b[38;5;66;03m# ------------------------------------------------------------\u001b[39;00m\n\u001b[32m      2\u001b[39m \u001b[38;5;66;03m# 1. Aggregate rupture rate over time\u001b[39;00m\n\u001b[32m      3\u001b[39m \u001b[38;5;66;03m# ------------------------------------------------------------\u001b[39;00m\n\u001b[32m      4\u001b[39m time_stats = (\n\u001b[32m      5\u001b[39m     \u001b[43mdf\u001b[49m\u001b[43m.\u001b[49m\u001b[43mgroupby\u001b[49m\u001b[43m(\u001b[49m\u001b[33;43m\"\u001b[39;49m\u001b[33;43mCentralisation Date\u001b[39;49m\u001b[33;43m\"\u001b[39;49m\u001b[43m)\u001b[49m\n\u001b[32m----> \u001b[39m\u001b[32m6\u001b[39m \u001b[43m      \u001b[49m\u001b[43m.\u001b[49m\u001b[43magg\u001b[49m\u001b[43m(\u001b[49m\n\u001b[32m      7\u001b[39m \u001b[43m          \u001b[49m\u001b[43mtotal_obs\u001b[49m\u001b[43m=\u001b[49m\u001b[43m(\u001b[49m\u001b[33;43m\"\u001b[39;49m\u001b[33;43mrupture_flag\u001b[39;49m\u001b[33;43m\"\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[33;43m\"\u001b[39;49m\u001b[33;43mcount\u001b[39;49m\u001b[33;43m\"\u001b[39;49m\u001b[43m)\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m      8\u001b[39m \u001b[43m          \u001b[49m\u001b[43mn_ruptures\u001b[49m\u001b[43m=\u001b[49m\u001b[43m(\u001b[49m\u001b[33;43m\"\u001b[39;49m\u001b[33;43mrupture_flag\u001b[39;49m\u001b[33;43m\"\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[33;43m\"\u001b[39;49m\u001b[33;43msum\u001b[39;49m\u001b[33;43m\"\u001b[39;49m\u001b[43m)\u001b[49m\n\u001b[32m      9\u001b[39m \u001b[43m      \u001b[49m\u001b[43m)\u001b[49m\n\u001b[32m     10\u001b[39m       .reset_index()\n\u001b[32m     11\u001b[39m )\n\u001b[32m     13\u001b[39m time_stats[\u001b[33m\"\u001b[39m\u001b[33mrupture_rate\u001b[39m\u001b[33m\"\u001b[39m] = (\n\u001b[32m     14\u001b[39m     time_stats[\u001b[33m\"\u001b[39m\u001b[33mn_ruptures\u001b[39m\u001b[33m\"\u001b[39m] / time_stats[\u001b[33m\"\u001b[39m\u001b[33mtotal_obs\u001b[39m\u001b[33m\"\u001b[39m]\n\u001b[32m     15\u001b[39m )\n\u001b[32m     17\u001b[39m \u001b[38;5;66;03m# ------------------------------------------------------------\u001b[39;00m\n\u001b[32m     18\u001b[39m \u001b[38;5;66;03m# 2. Smooth (optional but recommended for readability)\u001b[39;00m\n\u001b[32m     19\u001b[39m \u001b[38;5;66;03m# ------------------------------------------------------------\u001b[39;00m\n",
      "\u001b[36mFile \u001b[39m\u001b[32m/opt/python/lib/python3.13/site-packages/pandas/core/groupby/generic.py:1432\u001b[39m, in \u001b[36mDataFrameGroupBy.aggregate\u001b[39m\u001b[34m(self, func, engine, engine_kwargs, *args, **kwargs)\u001b[39m\n\u001b[32m   1429\u001b[39m     kwargs[\u001b[33m\"\u001b[39m\u001b[33mengine_kwargs\u001b[39m\u001b[33m\"\u001b[39m] = engine_kwargs\n\u001b[32m   1431\u001b[39m op = GroupByApply(\u001b[38;5;28mself\u001b[39m, func, args=args, kwargs=kwargs)\n\u001b[32m-> \u001b[39m\u001b[32m1432\u001b[39m result = \u001b[43mop\u001b[49m\u001b[43m.\u001b[49m\u001b[43magg\u001b[49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m\n\u001b[32m   1433\u001b[39m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m is_dict_like(func) \u001b[38;5;129;01mand\u001b[39;00m result \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m:\n\u001b[32m   1434\u001b[39m     \u001b[38;5;66;03m# GH #52849\u001b[39;00m\n\u001b[32m   1435\u001b[39m     \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;28mself\u001b[39m.as_index \u001b[38;5;129;01mand\u001b[39;00m is_list_like(func):\n",
      "\u001b[36mFile \u001b[39m\u001b[32m/opt/python/lib/python3.13/site-packages/pandas/core/apply.py:190\u001b[39m, in \u001b[36mApply.agg\u001b[39m\u001b[34m(self)\u001b[39m\n\u001b[32m    187\u001b[39m     \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28mself\u001b[39m.apply_str()\n\u001b[32m    189\u001b[39m \u001b[38;5;28;01mif\u001b[39;00m is_dict_like(func):\n\u001b[32m--> \u001b[39m\u001b[32m190\u001b[39m     \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28;43mself\u001b[39;49m\u001b[43m.\u001b[49m\u001b[43magg_dict_like\u001b[49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m\n\u001b[32m    191\u001b[39m \u001b[38;5;28;01melif\u001b[39;00m is_list_like(func):\n\u001b[32m    192\u001b[39m     \u001b[38;5;66;03m# we require a list, but not a 'str'\u001b[39;00m\n\u001b[32m    193\u001b[39m     \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28mself\u001b[39m.agg_list_like()\n",
      "\u001b[36mFile \u001b[39m\u001b[32m/opt/python/lib/python3.13/site-packages/pandas/core/apply.py:423\u001b[39m, in \u001b[36mApply.agg_dict_like\u001b[39m\u001b[34m(self)\u001b[39m\n\u001b[32m    415\u001b[39m \u001b[38;5;28;01mdef\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[34magg_dict_like\u001b[39m(\u001b[38;5;28mself\u001b[39m) -> DataFrame | Series:\n\u001b[32m    416\u001b[39m \u001b[38;5;250m    \u001b[39m\u001b[33;03m\"\"\"\u001b[39;00m\n\u001b[32m    417\u001b[39m \u001b[33;03m    Compute aggregation in the case of a dict-like argument.\u001b[39;00m\n\u001b[32m    418\u001b[39m \n\u001b[32m   (...)\u001b[39m\u001b[32m    421\u001b[39m \u001b[33;03m    Result of aggregation.\u001b[39;00m\n\u001b[32m    422\u001b[39m \u001b[33;03m    \"\"\"\u001b[39;00m\n\u001b[32m--> \u001b[39m\u001b[32m423\u001b[39m     \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28;43mself\u001b[39;49m\u001b[43m.\u001b[49m\u001b[43magg_or_apply_dict_like\u001b[49m\u001b[43m(\u001b[49m\u001b[43mop_name\u001b[49m\u001b[43m=\u001b[49m\u001b[33;43m\"\u001b[39;49m\u001b[33;43magg\u001b[39;49m\u001b[33;43m\"\u001b[39;49m\u001b[43m)\u001b[49m\n",
      "\u001b[36mFile \u001b[39m\u001b[32m/opt/python/lib/python3.13/site-packages/pandas/core/apply.py:1603\u001b[39m, in \u001b[36mGroupByApply.agg_or_apply_dict_like\u001b[39m\u001b[34m(self, op_name)\u001b[39m\n\u001b[32m   1598\u001b[39m     kwargs.update({\u001b[33m\"\u001b[39m\u001b[33mengine\u001b[39m\u001b[33m\"\u001b[39m: engine, \u001b[33m\"\u001b[39m\u001b[33mengine_kwargs\u001b[39m\u001b[33m\"\u001b[39m: engine_kwargs})\n\u001b[32m   1600\u001b[39m \u001b[38;5;28;01mwith\u001b[39;00m com.temp_setattr(\n\u001b[32m   1601\u001b[39m     obj, \u001b[33m\"\u001b[39m\u001b[33mas_index\u001b[39m\u001b[33m\"\u001b[39m, \u001b[38;5;28;01mTrue\u001b[39;00m, condition=\u001b[38;5;28mhasattr\u001b[39m(obj, \u001b[33m\"\u001b[39m\u001b[33mas_index\u001b[39m\u001b[33m\"\u001b[39m)\n\u001b[32m   1602\u001b[39m ):\n\u001b[32m-> \u001b[39m\u001b[32m1603\u001b[39m     result_index, result_data = \u001b[38;5;28;43mself\u001b[39;49m\u001b[43m.\u001b[49m\u001b[43mcompute_dict_like\u001b[49m\u001b[43m(\u001b[49m\n\u001b[32m   1604\u001b[39m \u001b[43m        \u001b[49m\u001b[43mop_name\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mselected_obj\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mselection\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mkwargs\u001b[49m\n\u001b[32m   1605\u001b[39m \u001b[43m    \u001b[49m\u001b[43m)\u001b[49m\n\u001b[32m   1606\u001b[39m result = \u001b[38;5;28mself\u001b[39m.wrap_results_dict_like(selected_obj, result_index, result_data)\n\u001b[32m   1607\u001b[39m \u001b[38;5;28;01mreturn\u001b[39;00m result\n",
      "\u001b[36mFile \u001b[39m\u001b[32m/opt/python/lib/python3.13/site-packages/pandas/core/apply.py:462\u001b[39m, in \u001b[36mApply.compute_dict_like\u001b[39m\u001b[34m(self, op_name, selected_obj, selection, kwargs)\u001b[39m\n\u001b[32m    460\u001b[39m is_groupby = \u001b[38;5;28misinstance\u001b[39m(obj, (DataFrameGroupBy, SeriesGroupBy))\n\u001b[32m    461\u001b[39m func = cast(AggFuncTypeDict, \u001b[38;5;28mself\u001b[39m.func)\n\u001b[32m--> \u001b[39m\u001b[32m462\u001b[39m func = \u001b[38;5;28;43mself\u001b[39;49m\u001b[43m.\u001b[49m\u001b[43mnormalize_dictlike_arg\u001b[49m\u001b[43m(\u001b[49m\u001b[43mop_name\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mselected_obj\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mfunc\u001b[49m\u001b[43m)\u001b[49m\n\u001b[32m    464\u001b[39m is_non_unique_col = (\n\u001b[32m    465\u001b[39m     selected_obj.ndim == \u001b[32m2\u001b[39m\n\u001b[32m    466\u001b[39m     \u001b[38;5;129;01mand\u001b[39;00m selected_obj.columns.nunique() < \u001b[38;5;28mlen\u001b[39m(selected_obj.columns)\n\u001b[32m    467\u001b[39m )\n\u001b[32m    469\u001b[39m \u001b[38;5;28;01mif\u001b[39;00m selected_obj.ndim == \u001b[32m1\u001b[39m:\n\u001b[32m    470\u001b[39m     \u001b[38;5;66;03m# key only used for output\u001b[39;00m\n",
      "\u001b[36mFile \u001b[39m\u001b[32m/opt/python/lib/python3.13/site-packages/pandas/core/apply.py:663\u001b[39m, in \u001b[36mApply.normalize_dictlike_arg\u001b[39m\u001b[34m(self, how, obj, func)\u001b[39m\n\u001b[32m    661\u001b[39m     cols = Index(\u001b[38;5;28mlist\u001b[39m(func.keys())).difference(obj.columns, sort=\u001b[38;5;28;01mTrue\u001b[39;00m)\n\u001b[32m    662\u001b[39m     \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28mlen\u001b[39m(cols) > \u001b[32m0\u001b[39m:\n\u001b[32m--> \u001b[39m\u001b[32m663\u001b[39m         \u001b[38;5;28;01mraise\u001b[39;00m \u001b[38;5;167;01mKeyError\u001b[39;00m(\u001b[33mf\u001b[39m\u001b[33m\"\u001b[39m\u001b[33mColumn(s) \u001b[39m\u001b[38;5;132;01m{\u001b[39;00m\u001b[38;5;28mlist\u001b[39m(cols)\u001b[38;5;132;01m}\u001b[39;00m\u001b[33m do not exist\u001b[39m\u001b[33m\"\u001b[39m)\n\u001b[32m    665\u001b[39m aggregator_types = (\u001b[38;5;28mlist\u001b[39m, \u001b[38;5;28mtuple\u001b[39m, \u001b[38;5;28mdict\u001b[39m)\n\u001b[32m    667\u001b[39m \u001b[38;5;66;03m# if we have a dict of any non-scalars\u001b[39;00m\n\u001b[32m    668\u001b[39m \u001b[38;5;66;03m# eg. {'A' : ['mean']}, normalize all to\u001b[39;00m\n\u001b[32m    669\u001b[39m \u001b[38;5;66;03m# be list-likes\u001b[39;00m\n\u001b[32m    670\u001b[39m \u001b[38;5;66;03m# Cannot use func.values() because arg may be a Series\u001b[39;00m\n",
      "\u001b[31mKeyError\u001b[39m: \"Column(s) ['rupture_flag'] do not exist\""
     ]
    }
   ],
   "source": [
    "# ------------------------------------------------------------\n",
    "# 1. Aggregate rupture rate over time\n",
    "# ------------------------------------------------------------\n",
    "time_stats = (\n",
    "    df.groupby(\"Centralisation Date\")\n",
    "      .agg(\n",
    "          total_obs=(\"rupture_flag\", \"count\"),\n",
    "          n_ruptures=(\"rupture_flag\", \"sum\")\n",
    "      )\n",
    "      .reset_index()\n",
    ")\n",
    "\n",
    "time_stats[\"rupture_rate\"] = (\n",
    "    time_stats[\"n_ruptures\"] / time_stats[\"total_obs\"]\n",
    ")\n",
    "\n",
    "# ------------------------------------------------------------\n",
    "# 2. Smooth (optional but recommended for readability)\n",
    "# ------------------------------------------------------------\n",
    "time_stats[\"rupture_rate_ma\"] = (\n",
    "    time_stats[\"rupture_rate\"]\n",
    "    .rolling(window=6, center=True)   # 6 periods ≈ half-year\n",
    "    .mean()\n",
    ")\n",
    "\n",
    "# ------------------------------------------------------------\n",
    "# 3. Professional plot\n",
    "# ------------------------------------------------------------\n",
    "plt.figure(figsize=(12, 5))\n",
    "\n",
    "plt.plot(\n",
    "    time_stats[\"Centralisation Date\"],\n",
    "    time_stats[\"rupture_rate\"] * 100,\n",
    "    color=\"lightgray\",\n",
    "    linewidth=1,\n",
    "    alpha=0.6,\n",
    "    label=\"Monthly rupture rate\"\n",
    ")\n",
    "\n",
    "plt.plot(\n",
    "    time_stats[\"Centralisation Date\"],\n",
    "    time_stats[\"rupture_rate_ma\"] * 100,\n",
    "    color=\"#1f77b4\",\n",
    "    linewidth=2.5,\n",
    "    label=\"6-month moving average\"\n",
    ")\n",
    "\n",
    "plt.ylabel(\"Rupture rate (%)\")\n",
    "plt.xlabel(\"Date\")\n",
    "\n",
    "plt.grid(True, linestyle=\"--\", alpha=0.4)\n",
    "plt.legend(frameon=False)\n",
    "\n",
    "plt.tight_layout()\n",
    "plt.show()\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "id": "d6ee0c24-e14e-4c40-97d4-49879229790c",
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/tmp/ipykernel_1311/1047489516.py:6: FutureWarning:\n",
      "\n",
      "DataFrameGroupBy.apply operated on the grouping columns. This behavior is deprecated, and in a future version of pandas the grouping columns will be excluded from the operation. Either pass `include_groups=False` to exclude the groupings or explicitly select the grouping columns after groupby to silence this warning.\n",
      "\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "has_reset\n",
       "True     64192\n",
       "False    15545\n",
       "Name: count, dtype: int64"
      ]
     },
     "execution_count": 14,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "EPS = 1e-6  # seuil numérique\n",
    "\n",
    "reset_candidates = (\n",
    "    df\n",
    "    .groupby([\"Registrar Account - ID\", \"Product - Isin\"])\n",
    "    .apply(\n",
    "        lambda g: (\n",
    "            (g[\"Quantity - AUM\"].abs() < EPS) &\n",
    "            (g[\"expected_stock\"].abs() < EPS)\n",
    "        ).any()\n",
    "    )\n",
    "    .reset_index(name=\"has_reset\")\n",
    ")\n",
    "\n",
    "reset_candidates[\"has_reset\"].value_counts()\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 47,
   "id": "601f61b8-0115-431d-97de-6ec5a0f1d4f4",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "   Before repair  After repair  Repaired points\n",
      "0         756392         22357            18440\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/tmp/ipykernel_1311/3061846510.py:66: FutureWarning:\n",
      "\n",
      "DataFrameGroupBy.apply operated on the grouping columns. This behavior is deprecated, and in a future version of pandas the grouping columns will be excluded from the operation. Either pass `include_groups=False` to exclude the groupings or explicitly select the grouping columns after groupby to silence this warning.\n",
      "\n"
     ]
    }
   ],
   "source": [
    "GAP_TOL = 1e-6\n",
    "REL_GAP_THR = 0.05\n",
    "MIN_PERSISTENCE = 3\n",
    "\n",
    "df = merged_isin.copy().sort_values(\n",
    "    [\"Registrar Account - ID\", \"Product - Isin\", \"Centralisation Date\"]\n",
    ")\n",
    "\n",
    "df[\"corrected_aum\"] = df[\"Quantity - AUM\"]\n",
    "df[\"repair_flag\"] = False\n",
    "\n",
    "def repair_group(g):\n",
    "    g = g.copy()\n",
    "\n",
    "    obs = g[\"Quantity - AUM\"].values\n",
    "    flows = g[\"Quantity - NetFlows\"].values\n",
    "\n",
    "    corrected = obs.copy()\n",
    "\n",
    "    # Initial expected path\n",
    "    expected = np.empty_like(obs)\n",
    "    expected[0] = np.nan\n",
    "\n",
    "    for t in range(1, len(obs)):\n",
    "        expected[t] = corrected[t-1] + flows[t-1]\n",
    "\n",
    "    gap = obs - expected\n",
    "    rel_gap = np.abs(gap) / np.maximum(np.abs(expected), 1.0)\n",
    "\n",
    "    idx = None\n",
    "\n",
    "    for i in range(1, len(obs) - MIN_PERSISTENCE):\n",
    "        if (\n",
    "            rel_gap[i] > REL_GAP_THR\n",
    "            and np.all(np.abs(gap[i:i+MIN_PERSISTENCE] - gap[i]) < GAP_TOL)\n",
    "            and np.all(np.abs(np.diff(flows[i:i+MIN_PERSISTENCE])) < GAP_TOL)\n",
    "        ):\n",
    "            idx = i\n",
    "            break\n",
    "\n",
    "    if idx is None:\n",
    "        return g\n",
    "\n",
    "    # Apply correction\n",
    "    shift = gap[idx]\n",
    "    corrected[idx:] = obs[idx:] - shift\n",
    "\n",
    "    g.loc[g.index[idx]:, \"repair_flag\"] = True\n",
    "\n",
    "    # Rebuild expected stock AFTER correction\n",
    "    expected_corr = np.empty_like(obs)\n",
    "    expected_corr[0] = np.nan\n",
    "\n",
    "    for t in range(1, len(obs)):\n",
    "        expected_corr[t] = corrected[t-1] + flows[t-1]\n",
    "\n",
    "    g[\"corrected_aum\"] = corrected\n",
    "    g[\"expected_stock_corr\"] = expected_corr\n",
    "\n",
    "    return g\n",
    "\n",
    "\n",
    "df = (\n",
    "    df\n",
    "    .groupby([\"Registrar Account - ID\", \"Product - Isin\"], group_keys=False)\n",
    "    .apply(repair_group)\n",
    ")\n",
    "\n",
    "# Recompute gaps & ruptures\n",
    "df[\"gap_before\"] = df[\"Quantity - AUM\"] - df[\"expected_stock\"]\n",
    "df[\"gap_after\"] = df[\"corrected_aum\"] - df[\"expected_stock_corr\"]\n",
    "\n",
    "df[\"rupture_before\"] = df[\"gap_before\"].abs() > GAP_TOL\n",
    "df[\"rupture_after\"] = df[\"gap_after\"].abs() > GAP_TOL\n",
    "\n",
    "summary = pd.DataFrame({\n",
    "    \"Before repair\": [df[\"rupture_before\"].sum()],\n",
    "    \"After repair\": [df[\"rupture_after\"].sum()],\n",
    "    \"Repaired points\": [df[\"repair_flag\"].sum()]\n",
    "})\n",
    "\n",
    "print(summary)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 30,
   "id": "62583cfe-a6e7-4931-a63e-4273dca97ff7",
   "metadata": {},
   "outputs": [
    {
     "ename": "NameError",
     "evalue": "name 'df_final' is not defined",
     "output_type": "error",
     "traceback": [
      "\u001b[31m---------------------------------------------------------------------------\u001b[39m",
      "\u001b[31mNameError\u001b[39m                                 Traceback (most recent call last)",
      "\u001b[36mCell\u001b[39m\u001b[36m \u001b[39m\u001b[32mIn[30]\u001b[39m\u001b[32m, line 4\u001b[39m\n\u001b[32m      1\u001b[39m \u001b[38;5;28;01mimport\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[34;01mplotly\u001b[39;00m\u001b[34;01m.\u001b[39;00m\u001b[34;01mgraph_objects\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[38;5;28;01mas\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[34;01mgo\u001b[39;00m\n\u001b[32m      2\u001b[39m \u001b[38;5;28;01mimport\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[34;01mpandas\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[38;5;28;01mas\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[34;01mpd\u001b[39;00m\n\u001b[32m----> \u001b[39m\u001b[32m4\u001b[39m df_final = \u001b[43mdf_final\u001b[49m.rename(columns={\n\u001b[32m      5\u001b[39m     \u001b[33m\"\u001b[39m\u001b[33mQuantity - AUM\u001b[39m\u001b[33m\"\u001b[39m: \u001b[33m\"\u001b[39m\u001b[33maum_raw\u001b[39m\u001b[33m\"\u001b[39m,\n\u001b[32m      6\u001b[39m     \u001b[33m\"\u001b[39m\u001b[33mcorrected_aum\u001b[39m\u001b[33m\"\u001b[39m: \u001b[33m\"\u001b[39m\u001b[33maum_repaired\u001b[39m\u001b[33m\"\u001b[39m,\n\u001b[32m      7\u001b[39m     \u001b[33m\"\u001b[39m\u001b[33mQuantity - NetFlows\u001b[39m\u001b[33m\"\u001b[39m: \u001b[33m\"\u001b[39m\u001b[33mflows\u001b[39m\u001b[33m\"\u001b[39m,\n\u001b[32m      8\u001b[39m     \u001b[33m\"\u001b[39m\u001b[33mexpected_stock\u001b[39m\u001b[33m\"\u001b[39m: \u001b[33m\"\u001b[39m\u001b[33mexpected_aum_raw\u001b[39m\u001b[33m\"\u001b[39m,\n\u001b[32m      9\u001b[39m     \u001b[33m\"\u001b[39m\u001b[33mexpected_stock_corr\u001b[39m\u001b[33m\"\u001b[39m: \u001b[33m\"\u001b[39m\u001b[33mexpected_aum_repaired\u001b[39m\u001b[33m\"\u001b[39m\n\u001b[32m     10\u001b[39m })\n\u001b[32m     12\u001b[39m df[\u001b[33m\"\u001b[39m\u001b[33mgap_before\u001b[39m\u001b[33m\"\u001b[39m] = df[\u001b[33m\"\u001b[39m\u001b[33mQuantity - AUM\u001b[39m\u001b[33m\"\u001b[39m] - df[\u001b[33m\"\u001b[39m\u001b[33mexpected_stock\u001b[39m\u001b[33m\"\u001b[39m]\n\u001b[32m     13\u001b[39m df[\u001b[33m\"\u001b[39m\u001b[33mgap_after\u001b[39m\u001b[33m\"\u001b[39m] = df[\u001b[33m\"\u001b[39m\u001b[33mcorrected_aum\u001b[39m\u001b[33m\"\u001b[39m] - df[\u001b[33m\"\u001b[39m\u001b[33mexpected_stock_corr\u001b[39m\u001b[33m\"\u001b[39m]\n",
      "\u001b[31mNameError\u001b[39m: name 'df_final' is not defined"
     ]
    }
   ],
   "source": [
    "import plotly.graph_objects as go\n",
    "import pandas as pd\n",
    "\n",
    "# ============================================================\n",
    "# Parameters (fixed epsilon)\n",
    "# ============================================================\n",
    "GAP_EPS = 100   # fixed tolerance for accounting identity\n",
    "\n",
    "# ============================================================\n",
    "# 1. Define ruptures using a FIXED epsilon\n",
    "# ============================================================\n",
    "df = df.copy()\n",
    "\n",
    "df[\"rupture_before\"] = df[\"gap_before\"].abs() > GAP_EPS\n",
    "df[\"rupture_after\"]  = df[\"gap_after\"].abs()  > GAP_EPS\n",
    "\n",
    "# ============================================================\n",
    "# 2. Rupture ratios BEFORE repair\n",
    "# ============================================================\n",
    "rupture_summary_before = (\n",
    "    df\n",
    "    .groupby([\"Registrar Account - ID\", \"Product - Isin\"])\n",
    "    .agg(\n",
    "        n_obs=(\"rupture_before\", \"count\"),\n",
    "        n_ruptures=(\"rupture_before\", \"sum\")\n",
    "    )\n",
    "    .reset_index()\n",
    ")\n",
    "\n",
    "rupture_summary_before[\"rupture_ratio\"] = (\n",
    "    rupture_summary_before[\"n_ruptures\"] /\n",
    "    rupture_summary_before[\"n_obs\"]\n",
    ")\n",
    "\n",
    "# ============================================================\n",
    "# 3. Rupture ratios AFTER repair\n",
    "# ============================================================\n",
    "rupture_summary_after = (\n",
    "    df\n",
    "    .groupby([\"Registrar Account - ID\", \"Product - Isin\"])\n",
    "    .agg(\n",
    "        n_obs=(\"rupture_after\", \"count\"),\n",
    "        n_ruptures=(\"rupture_after\", \"sum\")\n",
    "    )\n",
    "    .reset_index()\n",
    ")\n",
    "\n",
    "rupture_summary_after[\"rupture_ratio\"] = (\n",
    "    rupture_summary_after[\"n_ruptures\"] /\n",
    "    rupture_summary_after[\"n_obs\"]\n",
    ")\n",
    "\n",
    "# ============================================================\n",
    "# 4. Rupture intensity classes (fixed bins)\n",
    "# ============================================================\n",
    "bins = [0.0, 0.01, 0.10, 0.30, 1.0]\n",
    "labels = [\n",
    "    \"Clean / quasi-clean (≤1%)\",\n",
    "    \"Moderate (1–10%)\",\n",
    "    \"High (10–30%)\",\n",
    "    \"Severe (>30%)\"\n",
    "]\n",
    "\n",
    "rupture_summary_before[\"rupture_class\"] = pd.cut(\n",
    "    rupture_summary_before[\"rupture_ratio\"],\n",
    "    bins=bins,\n",
    "    labels=labels,\n",
    "    include_lowest=True\n",
    ")\n",
    "\n",
    "rupture_summary_after[\"rupture_class\"] = pd.cut(\n",
    "    rupture_summary_after[\"rupture_ratio\"],\n",
    "    bins=bins,\n",
    "    labels=labels,\n",
    "    include_lowest=True\n",
    ")\n",
    "\n",
    "# ============================================================\n",
    "# 5. Distribution (%)\n",
    "# ============================================================\n",
    "dist_before = (\n",
    "    rupture_summary_before[\"rupture_class\"]\n",
    "    .value_counts(normalize=True)\n",
    "    .sort_index()\n",
    "    * 100\n",
    ").round(1)\n",
    "\n",
    "dist_after = (\n",
    "    rupture_summary_after[\"rupture_class\"]\n",
    "    .value_counts(normalize=True)\n",
    "    .sort_index()\n",
    "    * 100\n",
    ").round(1)\n",
    "\n",
    "# ============================================================\n",
    "# 6. Donut chart: BEFORE vs AFTER (fixed epsilon)\n",
    "# ============================================================\n",
    "fig = go.Figure()\n",
    "\n",
    "fig.add_trace(go.Pie(\n",
    "    labels=dist_before.index,\n",
    "    values=dist_before.values,\n",
    "    hole=0.45,\n",
    "    name=\"Before repair\",\n",
    "    domain=dict(x=[0.0, 0.48]),\n",
    "    textinfo=\"percent\",\n",
    "    hoverinfo=\"label+percent\"\n",
    "))\n",
    "\n",
    "fig.add_trace(go.Pie(\n",
    "    labels=dist_after.index,\n",
    "    values=dist_after.values,\n",
    "    hole=0.45,\n",
    "    name=\"After repair\",\n",
    "    domain=dict(x=[0.52, 1.0]),\n",
    "    textinfo=\"percent\",\n",
    "    hoverinfo=\"label+percent\"\n",
    "))\n",
    "\n",
    "fig.update_layout(\n",
    "    title=\"Distribution of AUM–flow rupture intensity before vs after repair (fixed ε)\",\n",
    "    annotations=[\n",
    "        dict(text=\"Before repair\", x=0.24, y=0.5, showarrow=False),\n",
    "        dict(text=\"After repair\",  x=0.76, y=0.5, showarrow=False),\n",
    "    ],\n",
    "    legend=dict(\n",
    "        orientation=\"h\",\n",
    "        yanchor=\"top\",\n",
    "        y=-0.15,\n",
    "        xanchor=\"center\",\n",
    "        x=0.5\n",
    "    ),\n",
    "    legend_title_text=\"Rupture ratio\"\n",
    ")\n",
    "\n",
    "fig.show()\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 31,
   "id": "70cf0a99-bd19-41a9-9574-88647fde09ca",
   "metadata": {},
   "outputs": [
    {
     "ename": "KeyError",
     "evalue": "\"['Quantity - AUM', 'corrected_aum', 'Quantity - NetFlows', 'expected_stock', 'expected_stock_corr', 'gap_before', 'gap_after'] not in index\"",
     "output_type": "error",
     "traceback": [
      "\u001b[31m---------------------------------------------------------------------------\u001b[39m",
      "\u001b[31mKeyError\u001b[39m                                  Traceback (most recent call last)",
      "\u001b[36mCell\u001b[39m\u001b[36m \u001b[39m\u001b[32mIn[31]\u001b[39m\u001b[32m, line 10\u001b[39m\n\u001b[32m      5\u001b[39m df_final = df.copy()\n\u001b[32m      7\u001b[39m \u001b[38;5;66;03m# ------------------------------------------------------------\u001b[39;00m\n\u001b[32m      8\u001b[39m \u001b[38;5;66;03m# Core variables (before / after)\u001b[39;00m\n\u001b[32m      9\u001b[39m \u001b[38;5;66;03m# ------------------------------------------------------------\u001b[39;00m\n\u001b[32m---> \u001b[39m\u001b[32m10\u001b[39m df_final = \u001b[43mdf_final\u001b[49m\u001b[43m[\u001b[49m\u001b[43m[\u001b[49m\n\u001b[32m     11\u001b[39m \u001b[43m    \u001b[49m\u001b[33;43m\"\u001b[39;49m\u001b[33;43mRegistrar Account - ID\u001b[39;49m\u001b[33;43m\"\u001b[39;49m\u001b[43m,\u001b[49m\n\u001b[32m     12\u001b[39m \u001b[43m    \u001b[49m\u001b[33;43m\"\u001b[39;49m\u001b[33;43mProduct - Isin\u001b[39;49m\u001b[33;43m\"\u001b[39;49m\u001b[43m,\u001b[49m\n\u001b[32m     13\u001b[39m \u001b[43m    \u001b[49m\u001b[33;43m\"\u001b[39;49m\u001b[33;43mCentralisation Date\u001b[39;49m\u001b[33;43m\"\u001b[39;49m\u001b[43m,\u001b[49m\n\u001b[32m     14\u001b[39m \u001b[43m    \u001b[49m\u001b[33;43m\"\u001b[39;49m\u001b[33;43mQuantity - AUM\u001b[39;49m\u001b[33;43m\"\u001b[39;49m\u001b[43m,\u001b[49m\n\u001b[32m     15\u001b[39m \u001b[43m    \u001b[49m\u001b[33;43m\"\u001b[39;49m\u001b[33;43mcorrected_aum\u001b[39;49m\u001b[33;43m\"\u001b[39;49m\u001b[43m,\u001b[49m\n\u001b[32m     16\u001b[39m \u001b[43m    \u001b[49m\u001b[33;43m\"\u001b[39;49m\u001b[33;43mQuantity - NetFlows\u001b[39;49m\u001b[33;43m\"\u001b[39;49m\u001b[43m,\u001b[49m\n\u001b[32m     17\u001b[39m \u001b[43m    \u001b[49m\u001b[33;43m\"\u001b[39;49m\u001b[33;43mexpected_stock\u001b[39;49m\u001b[33;43m\"\u001b[39;49m\u001b[43m,\u001b[49m\n\u001b[32m     18\u001b[39m \u001b[43m    \u001b[49m\u001b[33;43m\"\u001b[39;49m\u001b[33;43mexpected_stock_corr\u001b[39;49m\u001b[33;43m\"\u001b[39;49m\u001b[43m,\u001b[49m\n\u001b[32m     19\u001b[39m \u001b[43m    \u001b[49m\u001b[33;43m\"\u001b[39;49m\u001b[33;43mgap_before\u001b[39;49m\u001b[33;43m\"\u001b[39;49m\u001b[43m,\u001b[49m\n\u001b[32m     20\u001b[39m \u001b[43m    \u001b[49m\u001b[33;43m\"\u001b[39;49m\u001b[33;43mgap_after\u001b[39;49m\u001b[33;43m\"\u001b[39;49m\u001b[43m,\u001b[49m\n\u001b[32m     21\u001b[39m \u001b[43m    \u001b[49m\u001b[33;43m\"\u001b[39;49m\u001b[33;43mrepair_flag\u001b[39;49m\u001b[33;43m\"\u001b[39;49m\n\u001b[32m     22\u001b[39m \u001b[43m]\u001b[49m\u001b[43m]\u001b[49m.rename(columns={\n\u001b[32m     23\u001b[39m     \u001b[33m\"\u001b[39m\u001b[33mQuantity - AUM\u001b[39m\u001b[33m\"\u001b[39m: \u001b[33m\"\u001b[39m\u001b[33maum_raw\u001b[39m\u001b[33m\"\u001b[39m,\n\u001b[32m     24\u001b[39m     \u001b[33m\"\u001b[39m\u001b[33mcorrected_aum\u001b[39m\u001b[33m\"\u001b[39m: \u001b[33m\"\u001b[39m\u001b[33maum_repaired\u001b[39m\u001b[33m\"\u001b[39m,\n\u001b[32m     25\u001b[39m     \u001b[33m\"\u001b[39m\u001b[33mQuantity - NetFlows\u001b[39m\u001b[33m\"\u001b[39m: \u001b[33m\"\u001b[39m\u001b[33mflows\u001b[39m\u001b[33m\"\u001b[39m,\n\u001b[32m     26\u001b[39m     \u001b[33m\"\u001b[39m\u001b[33mexpected_stock\u001b[39m\u001b[33m\"\u001b[39m: \u001b[33m\"\u001b[39m\u001b[33mexpected_aum_raw\u001b[39m\u001b[33m\"\u001b[39m,\n\u001b[32m     27\u001b[39m     \u001b[33m\"\u001b[39m\u001b[33mexpected_stock_corr\u001b[39m\u001b[33m\"\u001b[39m: \u001b[33m\"\u001b[39m\u001b[33mexpected_aum_repaired\u001b[39m\u001b[33m\"\u001b[39m\n\u001b[32m     28\u001b[39m })\n\u001b[32m     30\u001b[39m \u001b[38;5;66;03m# ------------------------------------------------------------\u001b[39;00m\n\u001b[32m     31\u001b[39m \u001b[38;5;66;03m# Relative gaps\u001b[39;00m\n\u001b[32m     32\u001b[39m \u001b[38;5;66;03m# ------------------------------------------------------------\u001b[39;00m\n\u001b[32m     33\u001b[39m df_final[\u001b[33m\"\u001b[39m\u001b[33mgap_rel_before\u001b[39m\u001b[33m\"\u001b[39m] = (\n\u001b[32m     34\u001b[39m     df_final[\u001b[33m\"\u001b[39m\u0
      "\u001b[36mFile \u001b[39m\u001b[32m/opt/python/lib/python3.13/site-packages/pandas/core/frame.py:4119\u001b[39m, in \u001b[36mDataFrame.__getitem__\u001b[39m\u001b[34m(self, key)\u001b[39m\n\u001b[32m   4117\u001b[39m     \u001b[38;5;28;01mif\u001b[39;00m is_iterator(key):\n\u001b[32m   4118\u001b[39m         key = \u001b[38;5;28mlist\u001b[39m(key)\n\u001b[32m-> \u001b[39m\u001b[32m4119\u001b[39m     indexer = \u001b[38;5;28;43mself\u001b[39;49m\u001b[43m.\u001b[49m\u001b[43mcolumns\u001b[49m\u001b[43m.\u001b[49m\u001b[43m_get_indexer_strict\u001b[49m\u001b[43m(\u001b[49m\u001b[43mkey\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[33;43m\"\u001b[39;49m\u001b[33;43mcolumns\u001b[39;49m\u001b[33;43m\"\u001b[39;49m\u001b[43m)\u001b[49m[\u001b[32m1\u001b[39m]\n\u001b[32m   4121\u001b[39m \u001b[38;5;66;03m# take() does not accept boolean indexers\u001b[39;00m\n\u001b[32m   4122\u001b[39m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28mgetattr\u001b[39m(indexer, \u001b[33m\"\u001b[39m\u001b[33mdtype\u001b[39m\u001b[33m\"\u001b[39m, \u001b[38;5;28;01mNone\u001b[39;00m) == \u001b[38;5;28mbool\u001b[39m:\n",
      "\u001b[36mFile \u001b[39m\u001b[32m/opt/python/lib/python3.13/site-packages/pandas/core/indexes/base.py:6212\u001b[39m, in \u001b[36mIndex._get_indexer_strict\u001b[39m\u001b[34m(self, key, axis_name)\u001b[39m\n\u001b[32m   6209\u001b[39m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[32m   6210\u001b[39m     keyarr, indexer, new_indexer = \u001b[38;5;28mself\u001b[39m._reindex_non_unique(keyarr)\n\u001b[32m-> \u001b[39m\u001b[32m6212\u001b[39m \u001b[38;5;28;43mself\u001b[39;49m\u001b[43m.\u001b[49m\u001b[43m_raise_if_missing\u001b[49m\u001b[43m(\u001b[49m\u001b[43mkeyarr\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mindexer\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43maxis_name\u001b[49m\u001b[43m)\u001b[49m\n\u001b[32m   6214\u001b[39m keyarr = \u001b[38;5;28mself\u001b[39m.take(indexer)\n\u001b[32m   6215\u001b[39m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28misinstance\u001b[39m(key, Index):\n\u001b[32m   6216\u001b[39m     \u001b[38;5;66;03m# GH 42790 - Preserve name from an Index\u001b[39;00m\n",
      "\u001b[36mFile \u001b[39m\u001b[32m/opt/python/lib/python3.13/site-packages/pandas/core/indexes/base.py:6264\u001b[39m, in \u001b[36mIndex._raise_if_missing\u001b[39m\u001b[34m(self, key, indexer, axis_name)\u001b[39m\n\u001b[32m   6261\u001b[39m     \u001b[38;5;28;01mraise\u001b[39;00m \u001b[38;5;167;01mKeyError\u001b[39;00m(\u001b[33mf\u001b[39m\u001b[33m\"\u001b[39m\u001b[33mNone of [\u001b[39m\u001b[38;5;132;01m{\u001b[39;00mkey\u001b[38;5;132;01m}\u001b[39;00m\u001b[33m] are in the [\u001b[39m\u001b[38;5;132;01m{\u001b[39;00maxis_name\u001b[38;5;132;01m}\u001b[39;00m\u001b[33m]\u001b[39m\u001b[33m\"\u001b[39m)\n\u001b[32m   6263\u001b[39m not_found = \u001b[38;5;28mlist\u001b[39m(ensure_index(key)[missing_mask.nonzero()[\u001b[32m0\u001b[39m]].unique())\n\u001b[32m-> \u001b[39m\u001b[32m6264\u001b[39m \u001b[38;5;28;01mraise\u001b[39;00m \u001b[38;5;167;01mKeyError\u001b[39;00m(\u001b[33mf\u001b[39m\u001b[33m\"\u001b[39m\u001b[38;5;132;01m{\u001b[39;00mnot_found\u001b[38;5;132;01m}\u001b[39;00m\u001b[33m not in index\u001b[39m\u001b[33m\"\u001b[39m)\n",
      "\u001b[31mKeyError\u001b[39m: \"['Quantity - AUM', 'corrected_aum', 'Quantity - NetFlows', 'expected_stock', 'expected_stock_corr', 'gap_before', 'gap_after'] not in index\""
     ]
    }
   ],
   "source": [
    "# ============================================================\n",
    "# FINAL DATASETS AFTER REPAIR\n",
    "# ============================================================\n",
    "\n",
    "df_final = df.copy()\n",
    "\n",
    "# ------------------------------------------------------------\n",
    "# Core variables (before / after)\n",
    "# ------------------------------------------------------------\n",
    "df_final = df_final[[\n",
    "    \"Registrar Account - ID\",\n",
    "    \"Product - Isin\",\n",
    "    \"Centralisation Date\",\n",
    "    \"Quantity - AUM\",\n",
    "    \"corrected_aum\",\n",
    "    \"Quantity - NetFlows\",\n",
    "    \"expected_stock\",\n",
    "    \"expected_stock_corr\",\n",
    "    \"gap_before\",\n",
    "    \"gap_after\",\n",
    "    \"repair_flag\"\n",
    "]].rename(columns={\n",
    "    \"Quantity - AUM\": \"aum_raw\",\n",
    "    \"corrected_aum\": \"aum_repaired\",\n",
    "    \"Quantity - NetFlows\": \"flows\",\n",
    "    \"expected_stock\": \"expected_aum_raw\",\n",
    "    \"expected_stock_corr\": \"expected_aum_repaired\"\n",
    "})\n",
    "\n",
    "# ------------------------------------------------------------\n",
    "# Relative gaps\n",
    "# ------------------------------------------------------------\n",
    "df_final[\"gap_rel_before\"] = (\n",
    "    df_final[\"gap_before\"].abs() /\n",
    "    df_final[\"expected_aum_raw\"].abs().clip(lower=1)\n",
    ")\n",
    "\n",
    "df_final[\"gap_rel_after\"] = (\n",
    "    df_final[\"gap_after\"].abs() /\n",
    "    df_final[\"expected_aum_repaired\"].abs().clip(lower=1)\n",
    ")\n",
    "df_final.to_csv('df_repaired.csv')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "befb2962-73fb-4cb8-b86e-3218ec103204",
   "metadata": {},
   "outputs": [],
   "source": [
    "# ============================================================\n",
    "# TYPE 3 REPAIR — TEMPORARY RESET TO ZERO (ONE BLOCK)\n",
    "# ============================================================\n",
    "\n",
    "df_type3 = df_repaired.copy()\n",
    "df_type3 = df_type3.sort_values(\n",
    "    [\"Registrar Account - ID\", \"Product - Isin\", \"Centralisation Date\"]\n",
    ")\n",
    "\n",
    "# Create lead/lag variables\n",
    "df_type3[\"aum_prev\"] = df_type3.groupby(\n",
    "    [\"Registrar Account - ID\", \"Product - Isin\"]\n",
    ")[\"Quantity - AUM\"].shift(1)\n",
    "\n",
    "df_type3[\"aum_next\"] = df_type3.groupby(\n",
    "    [\"Registrar Account - ID\", \"Product - Isin\"]\n",
    ")[\"Quantity - AUM\"].shift(-1)\n",
    "\n",
    "df_type3[\"flow_prev\"] = df_type3.groupby(\n",
    "    [\"Registrar Account - ID\", \"Product - Isin\"]\n",
    ")[\"Quantity - NetFlows\"].shift(1)\n",
    "\n",
    "df_type3[\"flow_next\"] = df_type3.groupby(\n",
    "    [\"Registrar Account - ID\", \"Product - Isin\"]\n",
    ")[\"Quantity - NetFlows\"].shift(-1)\n",
    "\n",
    "# ------------------------------------------------------------\n",
    "# Detection of temporary reset\n",
    "# ------------------------------------------------------------\n",
    "df_type3[\"type3_flag\"] = (\n",
    "    (df_type3[\"Quantity - AUM\"] == 0)\n",
    "    & (df_type3[\"aum_prev\"] > 0)\n",
    "    & (df_type3[\"aum_next\"] == df_type3[\"aum_prev\"])\n",
    "    & (df_type3[\"flow_prev\"].fillna(0) == 0)\n",
    "    & (df_type3[\"Quantity - NetFlows\"] == 0)\n",
    "    & (df_type3[\"flow_next\"].fillna(0) == 0)\n",
    ")\n",
    "\n",
    "# ------------------------------------------------------------\n",
    "# Repair: smooth the glitch (replace 0 by previous stock)\n",
    "# ------------------------------------------------------------\n",
    "df_type3.loc[df_type3[\"type3_flag\"], \"Quantity - AUM\"] = (\n",
    "    df_type3.loc[df_type3[\"type3_flag\"], \"aum_prev\"]\n",
    ")\n",
    "\n",
    "# ------------------------------------------------------------\n",
    "# Recompute temporal chain AFTER repair\n",
    "# ------------------------------------------------------------\n",
    "df_type3[\"prev_stock\"] = df_type3.groupby(\n",
    "    [\"Registrar Account - ID\", \"Product - Isin\"]\n",
    ")[\"Quantity - AUM\"].shift(1)\n",
    "\n",
    "df_type3[\"prev_flows\"] = df_type3.groupby(\n",
    "    [\"Registrar Account - ID\", \"Product - Isin\"]\n",
    ")[\"Quantity - NetFlows\"].shift(1).fillna(0)\n",
    "\n",
    "df_type3[\"expected_stock\"] = (\n",
    "    df_type3[\"prev_stock\"] + df_type3[\"prev_flows\"]\n",
    ")\n",
    "\n",
    "df_type3[\"gap\"] = df_type3[\"Quantity - AUM\"] - df_type3[\"expected_stock\"]\n",
    "df_type3[\"gap_abs\"] = df_type3[\"gap\"].abs()\n",
    "df_type3[\"gap_rel\"] = (\n",
    "    df_type3[\"gap_abs\"] /\n",
    "    df_type3[\"expected_stock\"].abs().clip(lower=1)\n",
    ")\n",
    "\n",
    "df_type3[\"rupture_flag\"] = (\n",
    "    df_type3[\"prev_stock\"].notna()\n",
    "    & (df_type3[\"gap_abs\"] > TAU_ABS)\n",
    "    & (df_type3[\"gap_rel\"] > TAU_REL)\n",
    ")\n",
    "\n",
    "# ------------------------------------------------------------\n",
    "# Diagnostic output\n",
    "# ------------------------------------------------------------\n",
    "n_type3 = df_type3[\"type3_flag\"].sum()\n",
    "print(f\"Temporary reset glitches repaired (Type 3): {n_type3}\")\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "1fc44ed4-829f-4a8a-985a-31350bdbdf6d",
   "metadata": {},
   "outputs": [],
   "source": [
    "import numpy as np\n",
    "import pandas as pd\n",
    "\n",
    "# ------------------------------------------------------------\n",
    "# 1. Sélection des ISIN avec exactement 1 rupture\n",
    "# ------------------------------------------------------------\n",
    "one_rupture_isin = rupture_isin_summary[\n",
    "    rupture_isin_summary[\"n_ruptures\"] == 1\n",
    "][[\"Registrar Account - ID\", \"Product - Isin\"]].head(100)\n",
    "\n",
    "results = []\n",
    "\n",
    "# ------------------------------------------------------------\n",
    "# 2. Boucle de correction test\n",
    "# ------------------------------------------------------------\n",
    "for _, row in one_rupture_isin.iterrows():\n",
    "    acc = row[\"Registrar Account - ID\"]\n",
    "    isin = row[\"Product - Isin\"]\n",
    "\n",
    "    sub = df[\n",
    "        (df[\"Registrar Account - ID\"] == acc) &\n",
    "        (df[\"Product - Isin\"] == isin)\n",
    "    ].sort_values(\"Centralisation Date\").copy()\n",
    "\n",
    "    # Localiser la rupture\n",
    "    rupture_idx = sub.index[sub[\"rupture_flag\"]]\n",
    "\n",
    "    if sub.index.get_loc(rupture_idx[0]) > 1:\n",
    "        #print(sub[[\"Centralisation Date\", \"Quantity - AUM\", \"expected_stock\", \"gap\", \"rupture_flag\"]].head(100))\n",
    "        continue\n",
    "\n",
    "    # Vérifier si la rupture est à la première date\n",
    "    first_idx = sub.index[0]\n",
    "    if rupture_idx[0] != first_idx:\n",
    "        continue\n",
    "\n",
    "    # ----- Réparation : décaler expected_stock -----\n",
    "    sub[\"expected_stock_fixed\"] = sub[\"expected_stock\"].shift(-1)\n",
    "\n",
    "    # Recalcul des gaps\n",
    "    sub[\"gap_fixed\"] = sub[\"Quantity - AUM\"] - sub[\"expected_stock_fixed\"]\n",
    "    sub[\"gap_abs_fixed\"] = sub[\"gap_fixed\"].abs()\n",
    "    sub[\"gap_rel_fixed\"] = sub[\"gap_abs_fixed\"] / sub[\"expected_stock_fixed\"].abs().clip(lower=1)\n",
    "\n",
    "    # Recalcul rupture\n",
    "    sub[\"rupture_fixed\"] = (\n",
    "        sub[\"expected_stock_fixed\"].notna()\n",
    "        & (sub[\"gap_abs_fixed\"] > TAU_ABS)\n",
    "        & (sub[\"gap_rel_fixed\"] > TAU_REL)\n",
    "    )\n",
    "\n",
    "    results.append({\n",
    "        \"Registrar Account - ID\": acc,\n",
    "        \"Product - Isin\": isin,\n",
    "        \"ruptures_before\": sub[\"rupture_flag\"].sum(),\n",
    "        \"ruptures_after\": sub[\"rupture_fixed\"].sum()\n",
    "    })\n",
    "\n",
    "# ------------------------------------------------------------\n",
    "# 3. Résultats agrégés\n",
    "# ------------------------------------------------------------\n",
    "repair_test = pd.DataFrame(results)\n",
    "\n",
    "summary = repair_test.groupby(\n",
    "    [\"ruptures_before\", \"ruptures_after\"]\n",
    ").size().reset_index(name=\"count\")\n",
    "\n",
    "repair_test, summary\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 50,
   "id": "d85728ca-55ba-4266-b881-23536eee4ba3",
   "metadata": {},
   "outputs": [
    {
     "ename": "KeyError",
     "evalue": "\"['corrected_aum'] not in index\"",
     "output_type": "error",
     "traceback": [
      "\u001b[31m---------------------------------------------------------------------------\u001b[39m",
      "\u001b[31mKeyError\u001b[39m                                  Traceback (most recent call last)",
      "\u001b[36mCell\u001b[39m\u001b[36m \u001b[39m\u001b[32mIn[50]\u001b[39m\u001b[32m, line 16\u001b[39m\n\u001b[32m     10\u001b[39m stocks_repaired[\u001b[33m\"\u001b[39m\u001b[33mCentralisation Date\u001b[39m\u001b[33m\"\u001b[39m] = pd.to_datetime(\n\u001b[32m     11\u001b[39m     stocks_repaired[\u001b[33m\"\u001b[39m\u001b[33mCentralisation Date\u001b[39m\u001b[33m\"\u001b[39m]\n\u001b[32m     12\u001b[39m )\n\u001b[32m     14\u001b[39m \u001b[38;5;66;03m# 2. Build repair map\u001b[39;00m\n\u001b[32m     15\u001b[39m repair_map = (\n\u001b[32m---> \u001b[39m\u001b[32m16\u001b[39m     \u001b[43mdf\u001b[49m\u001b[43m[\u001b[49m\u001b[43m[\u001b[49m\n\u001b[32m     17\u001b[39m \u001b[43m        \u001b[49m\u001b[33;43m\"\u001b[39;49m\u001b[33;43mRegistrar Account - ID\u001b[39;49m\u001b[33;43m\"\u001b[39;49m\u001b[43m,\u001b[49m\n\u001b[32m     18\u001b[39m \u001b[43m        \u001b[49m\u001b[33;43m\"\u001b[39;49m\u001b[33;43mProduct - Isin\u001b[39;49m\u001b[33;43m\"\u001b[39;49m\u001b[43m,\u001b[49m\n\u001b[32m     19\u001b[39m \u001b[43m        \u001b[49m\u001b[33;43m\"\u001b[39;49m\u001b[33;43mCentralisation Date\u001b[39;49m\u001b[33;43m\"\u001b[39;49m\u001b[43m,\u001b[49m\n\u001b[32m     20\u001b[39m \u001b[43m        \u001b[49m\u001b[33;43m\"\u001b[39;49m\u001b[33;43mcorrected_aum\u001b[39;49m\u001b[33;43m\"\u001b[39;49m\u001b[43m,\u001b[49m\n\u001b[32m     21\u001b[39m \u001b[43m        \u001b[49m\u001b[33;43m\"\u001b[39;49m\u001b[33;43mrepair_flag\u001b[39;49m\u001b[33;43m\"\u001b[39;49m\n\u001b[32m     22\u001b[39m \u001b[43m    \u001b[49m\u001b[43m]\u001b[49m\u001b[43m]\u001b[49m\n\u001b[32m     23\u001b[39m     .rename(columns={\u001b[33m\"\u001b[39m\u001b[33mcorrected_aum\u001b[39m\u001b[33m\"\u001b[39m: \u001b[33m\"\u001b[39m\u001b[33mQuantity - AUM repaired\u001b[39m\u001b[33m\"\u001b[39m})\n\u001b[32m     24\u001b[39m )\n\u001b[32m     26\u001b[39m \u001b[38;5;66;03m# 3. Merge repaired quantities\u001b[39;00m\n\u001b[32m     27\u001b[39m stocks_repaired = stocks_repaired.merge(\n\u001b[32m     28\u001b[39m     repair_map,\n\u001b[32m     29\u001b[39m     on=[\u001b[33m\"\u001b[39m\u001b[33mRegistrar Account - ID\u001b[39m\u001b[33m\"\u001b[39m, \u001b[33m\"\u001b[39m\u001b[33mProduct - Isin\u001b[39m\u001b[33m\"\u001b[39m, \u001b[33m\"\u001b[39m\u001b[33mCentralisation Date\u001b[39m\u001b[33m\"\u001b[39m],\n\u001b[32m     30\u001b[39m     how=\u001b[33m\"\u001b[39m\u001b[33mleft\u001b[39m\u001b[33m\"\u001b[39m\n\u001b[32m     31\u001b[39m )\n",
      "\u001b[36mFile \u001b[39m\u001b[32m/opt/python/lib/python3.13/site-packages/pandas/core/frame.py:4119\u001b[39m, in \u001b[36mDataFrame.__getitem__\u001b[39m\u001b[34m(self, key)\u001b[39m\n\u001b[32m   4117\u001b[39m     \u001b[38;5;28;01mif\u001b[39;00m is_iterator(key):\n\u001b[32m   4118\u001b[39m         key = \u001b[38;5;28mlist\u001b[39m(key)\n\u001b[32m-> \u001b[39m\u001b[32m4119\u001b[39m     indexer = \u001b[38;5;28;43mself\u001b[39;49m\u001b[43m.\u001b[49m\u001b[43mcolumns\u001b[49m\u001b[43m.\u001b[49m\u001b[43m_get_indexer_strict\u001b[49m\u001b[43m(\u001b[49m\u001b[43mkey\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[33;43m\"\u001b[39;49m\u001b[33;43mcolumns\u001b[39;49m\u001b[33;43m\"\u001b[39;49m\u001b[43m)\u001b[49m[\u001b[32m1\u001b[39m]\n\u001b[32m   4121\u001b[39m \u001b[38;5;66;03m# take() does not accept boolean indexers\u001b[39;00m\n\u001b[32m   4122\u001b[39m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28mgetattr\u001b[39m(indexer, \u001b[33m\"\u001b[39m\u001b[33mdtype\u001b[39m\u001b[33m\"\u001b[39m, \u001b[38;5;28;01mNone\u001b[39;00m) == \u001b[38;5;28mbool\u001b[39m:\n",
      "\u001b[36mFile \u001b[39m\u001b[32m/opt/python/lib/python3.13/site-packages/pandas/core/indexes/base.py:6212\u001b[39m, in \u001b[36mIndex._get_indexer_strict\u001b[39m\u001b[34m(self, key, axis_name)\u001b[39m\n\u001b[32m   6209\u001b[39m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[32m   6210\u001b[39m     keyarr, indexer, new_indexer = \u001b[38;5;28mself\u001b[39m._reindex_non_unique(keyarr)\n\u001b[32m-> \u001b[39m\u001b[32m6212\u001b[39m \u001b[38;5;28;43mself\u001b[39;49m\u001b[43m.\u001b[49m\u001b[43m_raise_if_missing\u001b[49m\u001b[43m(\u001b[49m\u001b[43mkeyarr\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mindexer\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43maxis_name\u001b[49m\u001b[43m)\u001b[49m\n\u001b[32m   6214\u001b[39m keyarr = \u001b[38;5;28mself\u001b[39m.take(indexer)\n\u001b[32m   6215\u001b[39m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28misinstance\u001b[39m(key, Index):\n\u001b[32m   6216\u001b[39m     \u001b[38;5;66;03m# GH 42790 - Preserve name from an Index\u001b[39;00m\n",
      "\u001b[36mFile \u001b[39m\u001b[32m/opt/python/lib/python3.13/site-packages/pandas/core/indexes/base.py:6264\u001b[39m, in \u001b[36mIndex._raise_if_missing\u001b[39m\u001b[34m(self, key, indexer, axis_name)\u001b[39m\n\u001b[32m   6261\u001b[39m     \u001b[38;5;28;01mraise\u001b[39;00m \u001b[38;5;167;01mKeyError\u001b[39;00m(\u001b[33mf\u001b[39m\u001b[33m\"\u001b[39m\u001b[33mNone of [\u001b[39m\u001b[38;5;132;01m{\u001b[39;00mkey\u001b[38;5;132;01m}\u001b[39;00m\u001b[33m] are in the [\u001b[39m\u001b[38;5;132;01m{\u001b[39;00maxis_name\u001b[38;5;132;01m}\u001b[39;00m\u001b[33m]\u001b[39m\u001b[33m\"\u001b[39m)\n\u001b[32m   6263\u001b[39m not_found = \u001b[38;5;28mlist\u001b[39m(ensure_index(key)[missing_mask.nonzero()[\u001b[32m0\u001b[39m]].unique())\n\u001b[32m-> \u001b[39m\u001b[32m6264\u001b[39m \u001b[38;5;28;01mraise\u001b[39;00m \u001b[38;5;167;01mKeyError\u001b[39;00m(\u001b[33mf\u001b[39m\u001b[33m\"\u001b[39m\u001b[38;5;132;01m{\u001b[39;00mnot_found\u001b[38;5;132;01m}\u001b[39;00m\u001b[33m not in index\u001b[39m\u001b[33m\"\u001b[39m)\n",
      "\u001b[31mKeyError\u001b[39m: \"['corrected_aum'] not in index\""
     ]
    }
   ],
   "source": [
    "import pandas as pd\n",
    "import numpy as np\n",
    "\n",
    "# ============================================================\n",
    "# Rebuild STOCKS dataset using repaired AUM quantities\n",
    "# ============================================================\n",
    "\n",
    "# 1. Copy original stocks\n",
    "stocks_repaired = stocks.copy()\n",
    "stocks_repaired[\"Centralisation Date\"] = pd.to_datetime(\n",
    "    stocks_repaired[\"Centralisation Date\"]\n",
    ")\n",
    "\n",
    "# 2. Build repair map\n",
    "repair_map = (\n",
    "    df[[\n",
    "        \"Registrar Account - ID\",\n",
    "        \"Product - Isin\",\n",
    "        \"Centralisation Date\",\n",
    "        \"corrected_aum\",\n",
    "        \"repair_flag\"\n",
    "    ]]\n",
    "    .rename(columns={\"corrected_aum\": \"Quantity - AUM repaired\"})\n",
    ")\n",
    "\n",
    "# 3. Merge repaired quantities\n",
    "stocks_repaired = stocks_repaired.merge(\n",
    "    repair_map,\n",
    "    on=[\"Registrar Account - ID\", \"Product - Isin\", \"Centralisation Date\"],\n",
    "    how=\"left\"\n",
    ")\n",
    "\n",
    "# 4. Store original quantity\n",
    "stocks_repaired[\"Quantity - AUM original\"] = stocks_repaired[\"Quantity - AUM\"]\n",
    "\n",
    "# 5. Replace Quantity - AUM where repaired\n",
    "stocks_repaired[\"Quantity - AUM\"] = np.where(\n",
    "    stocks_repaired[\"repair_flag\"] == True,\n",
    "    stocks_repaired[\"Quantity - AUM repaired\"],\n",
    "    stocks_repaired[\"Quantity - AUM\"]\n",
    ")\n",
    "\n",
    "# 6. Recompute monetary values (unit value unchanged)\n",
    "stocks_repaired[\"nav_ccy\"] = (\n",
    "    stocks_repaired[\"Value - AUM CCY\"] /\n",
    "    stocks_repaired[\"Quantity - AUM original\"]\n",
    ")\n",
    "\n",
    "stocks_repaired[\"nav_eur\"] = (\n",
    "    stocks_repaired[\"Value - AUM €\"] /\n",
    "    stocks_repaired[\"Quantity - AUM original\"]\n",
    ")\n",
    "\n",
    "stocks_repaired[\"Value - AUM CCY\"] = (\n",
    "    stocks_repaired[\"Quantity - AUM\"] *\n",
    "    stocks_repaired[\"nav_ccy\"]\n",
    ")\n",
    "\n",
    "stocks_repaired[\"Value - AUM €\"] = (\n",
    "    stocks_repaired[\"Quantity - AUM\"] *\n",
    "    stocks_repaired[\"nav_eur\"]\n",
    ")\n",
    "\n",
    "# 7. Cleanup helper columns\n",
    "stocks_repaired = stocks_repaired.drop(\n",
    "    columns=[\n",
    "        \"Quantity - AUM repaired\",\n",
    "        \"Quantity - AUM original\",\n",
    "        \"nav_ccy\",\n",
    "        \"nav_eur\"\n",
    "    ]\n",
    ")\n",
    "\n",
    "# ============================================================\n",
    "# Sanity checks (CORRECT WAY)\n",
    "# ============================================================\n",
    "\n",
    "# Share of observations repaired\n",
    "repair_share = stocks_repaired[\"repair_flag\"].mean()\n",
    "\n",
    "# Ensure only repaired points were modified\n",
    "n_modified = stocks_repaired[\"repair_flag\"].sum()\n",
    "\n",
    "print(f\"Share of repaired observations: {repair_share:.4%}\")\n",
    "print(f\"Number of repaired rows: {n_modified:,}\")\n",
    "\n",
    "stocks_repaired.to_csv('AUM_repaired.csv')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "5f262605-49e8-4304-b11e-38c8bcfc6e3f",
   "metadata": {},
   "outputs": [],
   "source": [
    "print(stocks[\"Registrar Account - ID\"].nunique())\n",
    "print(df[\"Registrar Account - ID\"].nunique())"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "37e9b599-aa51-4e03-b23c-2dd24e77fe38",
   "metadata": {},
   "outputs": [],
   "source": [
    "import pandas as pd\n",
    "\n",
    "df = pd.read_csv(\"AUM_repaired.csv\")\n",
    "\n",
    "print(df.columns)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 20,
   "id": "5cfb4526-7435-4e4a-ae48-0a8d40e39d81",
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/tmp/ipykernel_1311/55327206.py:8: DtypeWarning:\n",
      "\n",
      "Columns (1,2,3,4) have mixed types. Specify dtype option on import or set low_memory=False.\n",
      "\n",
      "/tmp/ipykernel_1311/55327206.py:9: DtypeWarning:\n",
      "\n",
      "Columns (2,3,4,5) have mixed types. Specify dtype option on import or set low_memory=False.\n",
      "\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Merged dataset size: (9033269, 6)\n",
      "\n",
      "NUMBER OF MODIFIED OBSERVATIONS: 2263602\n",
      "Share modified: 25.06 %\n",
      "\n",
      "NEGATIVE AUM\n",
      "Before repair: 34374\n",
      "After repair : 36320\n",
      "\n",
      "RAW AUM DISTRIBUTION\n",
      "count    9.033269e+06\n",
      "mean     9.106935e+03\n",
      "std      1.915018e+05\n",
      "min     -9.918641e+06\n",
      "25%      0.000000e+00\n",
      "50%      0.000000e+00\n",
      "75%      3.091340e+02\n",
      "max      4.256300e+07\n",
      "Name: Quantity - AUM_raw, dtype: float64\n",
      "\n",
      "REPAIRED AUM DISTRIBUTION\n",
      "count    9.033269e+06\n",
      "mean     9.104329e+03\n",
      "std      1.914988e+05\n",
      "min     -9.918641e+06\n",
      "25%      0.000000e+00\n",
      "50%      0.000000e+00\n",
      "75%      3.088430e+02\n",
      "max      4.256300e+07\n",
      "Name: Quantity - AUM_repaired, dtype: float64\n",
      "\n",
      "TOTAL AUM\n",
      "Raw total : 82265397351.45718\n",
      "Repaired total : 82241848877.5126\n",
      "\n",
      "TOP 20 AUM CHANGES\n",
      "        Registrar Account - ID Product - Isin Centralisation Date  \\\n",
      "8532368       OFF DISTRIBUTION   LU0992627611          2021-12-31   \n",
      "8532369       OFF DISTRIBUTION   LU0992627611          2021-12-31   \n",
      "8532370       OFF DISTRIBUTION   LU0992627611          2021-12-31   \n",
      "8477988       OFF DISTRIBUTION   LU0992627611          2021-12-31   \n",
      "8477987       OFF DISTRIBUTION   LU0992627611          2021-12-31   \n",
      "8477986       OFF DISTRIBUTION   LU0992627611          2021-12-31   \n",
      "8477989       OFF DISTRIBUTION   LU0992627611          2021-12-31   \n",
      "8532371       OFF DISTRIBUTION   LU0992627611          2021-12-31   \n",
      "8477994       OFF DISTRIBUTION   LU0992627611          2021-12-31   \n",
      "8477996       OFF DISTRIBUTION   LU0992627611          2021-12-31   \n",
      "8477997       OFF DISTRIBUTION   LU0992627611          2021-12-31   \n",
      "8928641       OFF DISTRIBUTION   LU0992627611          2021-12-31   \n",
      "8928642       OFF DISTRIBUTION   LU0992627611          2021-12-31   \n",
      "8928643       OFF DISTRIBUTION   LU0992627611          2021-12-31   \n",
      "8928644       OFF DISTRIBUTION   LU0992627611          2021-12-31   \n",
      "8477995       OFF DISTRIBUTION   LU0992627611          2021-12-31   \n",
      "8532359       OFF DISTRIBUTION   LU0992627611          2021-11-30   \n",
      "8713983       OFF DISTRIBUTION   LU0992627611          2021-11-30   \n",
      "8713984       OFF DISTRIBUTION   LU0992627611          2021-11-30   \n",
      "8532357       OFF DISTRIBUTION   LU0992627611          2021-11-30   \n",
      "\n",
      "         Quantity - AUM_raw  Quantity - AUM_repaired     aum_diff  \n",
      "8532368           41251.971              5298781.613  5257529.642  \n",
      "8532369           41251.971              5298781.613  5257529.642  \n",
      "8532370           41251.971              5298781.613  5257529.642  \n",
      "8477988         5298781.613                41251.971 -5257529.642  \n",
      "8477987         5298781.613                41251.971 -5257529.642  \n",
      "8477986         5298781.613                41251.971 -5257529.642  \n",
      "8477989         5298781.613                41251.971 -5257529.642  \n",
      "8532371           41251.971              5298781.613  5257529.642  \n",
      "8477994         5298781.613               128141.894 -5170639.719  \n",
      "8477996         5298781.613               128141.894 -5170639.719  \n",
      "8477997         5298781.613               128141.894 -5170639.719  \n",
      "8928641          128141.894              5298781.613  5170639.719  \n",
      "8928642          128141.894              5298781.613  5170639.719  \n",
      "8928643          128141.894              5298781.613  5170639.719  \n",
      "8928644          128141.894              5298781.613  5170639.719  \n",
      "8477995         5298781.613               128141.894 -5170639.719  \n",
      "8532359           41251.971              5059704.980  5018453.009  \n",
      "8713983         5059704.980                41251.971 -5018453.009  \n",
      "8713984         5059704.980                41251.971 -5018453.009  \n",
      "8532357           41251.971              5059704.980  5018453.009  \n",
      "\n",
      "ISIN WITH MOST MODIFICATIONS\n",
      "Product - Isin\n",
      "LU1623762769    0.535539\n",
      "LU2490324410    0.525588\n",
      "FR0013516044    0.524862\n",
      "LU2931971050    0.500000\n",
      "LU2931971217    0.500000\n",
      "FR001400TVB3    0.500000\n",
      "FR001400TU23    0.500000\n",
      "FR00140139F6    0.500000\n",
      "FR001400TVD9    0.500000\n",
      "LU2931971134    0.500000\n",
      "Name: aum_diff, dtype: float64\n",
      "\n",
      "REPAIR FLAG ERRORS: 2260454\n",
      "\n",
      "==============================\n",
      "COMPARISON COMPLETED\n",
      "==============================\n"
     ]
    }
   ],
   "source": [
    "import pandas as pd\n",
    "import numpy as np\n",
    "\n",
    "# ============================================================\n",
    "# LOAD DATA\n",
    "# ============================================================\n",
    "\n",
    "aum_raw = pd.read_csv(\"stocks.csv\")          # fichier original\n",
    "aum_rep = pd.read_csv(\"AUM_repaired.csv\")    # fichier réparé\n",
    "\n",
    "aum_raw[\"Centralisation Date\"] = pd.to_datetime(aum_raw[\"Centralisation Date\"])\n",
    "aum_rep[\"Centralisation Date\"] = pd.to_datetime(aum_rep[\"Centralisation Date\"])\n",
    "\n",
    "\n",
    "# ============================================================\n",
    "# KEEP SAME KEYS\n",
    "# ============================================================\n",
    "\n",
    "keys = [\n",
    "    \"Registrar Account - ID\",\n",
    "    \"Product - Isin\",\n",
    "    \"Centralisation Date\"\n",
    "]\n",
    "\n",
    "aum_raw = aum_raw[keys + [\"Quantity - AUM\"]]\n",
    "aum_rep = aum_rep[keys + [\"Quantity - AUM\", \"repair_flag\"]]\n",
    "\n",
    "\n",
    "# ============================================================\n",
    "# MERGE DATASETS\n",
    "# ============================================================\n",
    "\n",
    "df = aum_raw.merge(\n",
    "    aum_rep,\n",
    "    on=keys,\n",
    "    how=\"inner\",\n",
    "    suffixes=(\"_raw\", \"_repaired\")\n",
    ")\n",
    "\n",
    "print(\"Merged dataset size:\", df.shape)\n",
    "\n",
    "\n",
    "# ============================================================\n",
    "# 1. HOW MANY VALUES CHANGED\n",
    "# ============================================================\n",
    "\n",
    "df[\"aum_diff\"] = df[\"Quantity - AUM_repaired\"] - df[\"Quantity - AUM_raw\"]\n",
    "\n",
    "n_changed = (df[\"aum_diff\"] != 0).sum()\n",
    "\n",
    "print(\"\\nNUMBER OF MODIFIED OBSERVATIONS:\", n_changed)\n",
    "print(\"Share modified:\", round(n_changed / len(df) * 100, 2), \"%\")\n",
    "\n",
    "\n",
    "# ============================================================\n",
    "# 2. NEGATIVE AUM BEFORE / AFTER\n",
    "# ============================================================\n",
    "\n",
    "neg_before = (df[\"Quantity - AUM_raw\"] < 0).sum()\n",
    "neg_after = (df[\"Quantity - AUM_repaired\"] < 0).sum()\n",
    "\n",
    "print(\"\\nNEGATIVE AUM\")\n",
    "print(\"Before repair:\", neg_before)\n",
    "print(\"After repair :\", neg_after)\n",
    "\n",
    "\n",
    "# ============================================================\n",
    "# 3. DISTRIBUTION COMPARISON\n",
    "# ============================================================\n",
    "\n",
    "print(\"\\nRAW AUM DISTRIBUTION\")\n",
    "print(df[\"Quantity - AUM_raw\"].describe())\n",
    "\n",
    "print(\"\\nREPAIRED AUM DISTRIBUTION\")\n",
    "print(df[\"Quantity - AUM_repaired\"].describe())\n",
    "\n",
    "\n",
    "# ============================================================\n",
    "# 4. TOTAL AUM COMPARISON\n",
    "# ============================================================\n",
    "\n",
    "print(\"\\nTOTAL AUM\")\n",
    "\n",
    "print(\"Raw total :\", df[\"Quantity - AUM_raw\"].sum())\n",
    "print(\"Repaired total :\", df[\"Quantity - AUM_repaired\"].sum())\n",
    "\n",
    "\n",
    "# ============================================================\n",
    "# 5. LARGEST MODIFICATIONS\n",
    "# ============================================================\n",
    "\n",
    "largest_changes = df.sort_values(\n",
    "    \"aum_diff\",\n",
    "    key=lambda x: x.abs(),\n",
    "    ascending=False\n",
    ").head(20)\n",
    "\n",
    "print(\"\\nTOP 20 AUM CHANGES\")\n",
    "\n",
    "print(\n",
    "    largest_changes[\n",
    "        [\n",
    "            \"Registrar Account - ID\",\n",
    "            \"Product - Isin\",\n",
    "            \"Centralisation Date\",\n",
    "            \"Quantity - AUM_raw\",\n",
    "            \"Quantity - AUM_repaired\",\n",
    "            \"aum_diff\"\n",
    "        ]\n",
    "    ]\n",
    ")\n",
    "\n",
    "\n",
    "# ============================================================\n",
    "# 6. WHICH ISIN WERE MOST MODIFIED\n",
    "# ============================================================\n",
    "\n",
    "isin_changes = (\n",
    "    df.groupby(\"Product - Isin\")[\"aum_diff\"]\n",
    "    .apply(lambda x: (x != 0).mean())\n",
    "    .sort_values(ascending=False)\n",
    "    .head(10)\n",
    ")\n",
    "\n",
    "print(\"\\nISIN WITH MOST MODIFICATIONS\")\n",
    "print(isin_changes)\n",
    "\n",
    "\n",
    "# ============================================================\n",
    "# 7. CHECK REPAIR FLAG CONSISTENCY\n",
    "# ============================================================\n",
    "\n",
    "if \"repair_flag\" in df.columns:\n",
    "\n",
    "    repair_flag_errors = (\n",
    "        (df[\"repair_flag\"] == False) &\n",
    "        (df[\"Quantity - AUM_raw\"] != df[\"Quantity - AUM_repaired\"])\n",
    "    ).sum()\n",
    "\n",
    "    print(\"\\nREPAIR FLAG ERRORS:\", repair_flag_errors)\n",
    "\n",
    "\n",
    "print(\"\\n==============================\")\n",
    "print(\"COMPARISON COMPLETED\")\n",
    "print(\"==============================\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 48,
   "id": "976dd82c-5c16-44e6-aa5d-65d085714b25",
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/tmp/ipykernel_1311/1498669893.py:8: DtypeWarning:\n",
      "\n",
      "Columns (2,3,4,5) have mixed types. Specify dtype option on import or set low_memory=False.\n",
      "\n"
     ]
    }
   ],
   "source": [
    "import pandas as pd\n",
    "import numpy as np\n",
    "import plotly.graph_objects as go\n",
    "\n",
    "# ============================================================\n",
    "# 1. LOAD DATA\n",
    "# ============================================================\n",
    "aum = pd.read_csv(\"AUM_repaired.csv\")\n",
    "\n",
    "flows[\"Centralisation Date\"] = pd.to_datetime(flows[\"Centralisation Date\"])\n",
    "aum[\"Centralisation Date\"] = pd.to_datetime(aum[\"Centralisation Date\"])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 49,
   "id": "66c011b5-aed1-428e-bd18-44d8d814c283",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "application/vnd.plotly.v1+json": {
       "config": {
        "plotlyServerURL": "https://plot.ly"
       },
       "data": [
        {
         "hole": 0.45,
         "hoverinfo": "label+percent",
         "labels": [
          "Clean / quasi-clean (≤1%)",
          "Moderate (1–10%)",
          "High (10–30%)",
          "Severe (>30%)"
         ],
         "textinfo": "percent",
         "type": "pie",
         "values": {
          "bdata": "mpmZmZlZR0BmZmZmZmY7QM3MzMzMzCpAAAAAAAAAKUA=",
          "dtype": "f8"
         }
        }
       ],
       "layout": {
        "legend": {
         "orientation": "h",
         "title": {
          "text": "Rupture ratio"
         },
         "x": 0.5,
         "xanchor": "center",
         "y": -0.15,
         "yanchor": "top"
        },
        "template": {
         "data": {
          "bar": [
           {
            "error_x": {
             "color": "#2a3f5f"
            },
            "error_y": {
             "color": "#2a3f5f"
            },
            "marker": {
             "line": {
              "color": "#E5ECF6",
              "width": 0.5
             },
             "pattern": {
              "fillmode": "overlay",
              "size": 10,
              "solidity": 0.2
             }
            },
            "type": "bar"
           }
          ],
          "barpolar": [
           {
            "marker": {
             "line": {
              "color": "#E5ECF6",
              "width": 0.5
             },
             "pattern": {
              "fillmode": "overlay",
              "size": 10,
              "solidity": 0.2
             }
            },
            "type": "barpolar"
           }
          ],
          "carpet": [
           {
            "aaxis": {
             "endlinecolor": "#2a3f5f",
             "gridcolor": "white",
             "linecolor": "white",
             "minorgridcolor": "white",
             "startlinecolor": "#2a3f5f"
            },
            "baxis": {
             "endlinecolor": "#2a3f5f",
             "gridcolor": "white",
             "linecolor": "white",
             "minorgridcolor": "white",
             "startlinecolor": "#2a3f5f"
            },
            "type": "carpet"
           }
          ],
          "choropleth": [
           {
            "colorbar": {
             "outlinewidth": 0,
             "ticks": ""
            },
            "type": "choropleth"
           }
          ],
          "contour": [
           {
            "colorbar": {
             "outlinewidth": 0,
             "ticks": ""
            },
            "colorscale": [
             [
              0,
              "#0d0887"
             ],
             [
              0.1111111111111111,
              "#46039f"
             ],
             [
              0.2222222222222222,
              "#7201a8"
             ],
             [
              0.3333333333333333,
              "#9c179e"
             ],
             [
              0.4444444444444444,
              "#bd3786"
             ],
             [
              0.5555555555555556,
              "#d8576b"
             ],
             [
              0.6666666666666666,
              "#ed7953"
             ],
             [
              0.7777777777777778,
              "#fb9f3a"
             ],
             [
              0.8888888888888888,
              "#fdca26"
             ],
             [
              1,
              "#f0f921"
             ]
            ],
            "type": "contour"
           }
          ],
          "contourcarpet": [
           {
            "colorbar": {
             "outlinewidth": 0,
             "ticks": ""
            },
            "type": "contourcarpet"
           }
          ],
          "heatmap": [
           {
            "colorbar": {
             "outlinewidth": 0,
             "ticks": ""
            },
            "colorscale": [
             [
              0,
              "#0d0887"
             ],
             [
              0.1111111111111111,
              "#46039f"
             ],
             [
              0.2222222222222222,
              "#7201a8"
             ],
             [
              0.3333333333333333,
              "#9c179e"
             ],
             [
              0.4444444444444444,
              "#bd3786"
             ],
             [
              0.5555555555555556,
              "#d8576b"
             ],
             [
              0.6666666666666666,
              "#ed7953"
             ],
             [
              0.7777777777777778,
              "#fb9f3a"
             ],
             [
              0.8888888888888888,
              "#fdca26"
             ],
             [
              1,
              "#f0f921"
             ]
            ],
            "type": "heatmap"
           }
          ],
          "histogram": [
           {
            "marker": {
             "pattern": {
              "fillmode": "overlay",
              "size": 10,
              "solidity": 0.2
             }
            },
            "type": "histogram"
           }
          ],
          "histogram2d": [
           {
            "colorbar": {
             "outlinewidth": 0,
             "ticks": ""
            },
            "colorscale": [
             [
              0,
              "#0d0887"
             ],
             [
              0.1111111111111111,
              "#46039f"
             ],
             [
              0.2222222222222222,
              "#7201a8"
             ],
             [
              0.3333333333333333,
              "#9c179e"
             ],
             [
              0.4444444444444444,
              "#bd3786"
             ],
             [
              0.5555555555555556,
              "#d8576b"
             ],
             [
              0.6666666666666666,
              "#ed7953"
             ],
             [
              0.7777777777777778,
              "#fb9f3a"
             ],
             [
              0.8888888888888888,
              "#fdca26"
             ],
             [
              1,
              "#f0f921"
             ]
            ],
            "type": "histogram2d"
           }
          ],
          "histogram2dcontour": [
           {
            "colorbar": {
             "outlinewidth": 0,
             "ticks": ""
            },
            "colorscale": [
             [
              0,
              "#0d0887"
             ],
             [
              0.1111111111111111,
              "#46039f"
             ],
             [
              0.2222222222222222,
              "#7201a8"
             ],
             [
              0.3333333333333333,
              "#9c179e"
             ],
             [
              0.4444444444444444,
              "#bd3786"
             ],
             [
              0.5555555555555556,
              "#d8576b"
             ],
             [
              0.6666666666666666,
              "#ed7953"
             ],
             [
              0.7777777777777778,
              "#fb9f3a"
             ],
             [
              0.8888888888888888,
              "#fdca26"
             ],
             [
              1,
              "#f0f921"
             ]
            ],
            "type": "histogram2dcontour"
           }
          ],
          "mesh3d": [
           {
            "colorbar": {
             "outlinewidth": 0,
             "ticks": ""
            },
            "type": "mesh3d"
           }
          ],
          "parcoords": [
           {
            "line": {
             "colorbar": {
              "outlinewidth": 0,
              "ticks": ""
             }
            },
            "type": "parcoords"
           }
          ],
          "pie": [
           {
            "automargin": true,
            "type": "pie"
           }
          ],
          "scatter": [
           {
            "fillpattern": {
             "fillmode": "overlay",
             "size": 10,
             "solidity": 0.2
            },
            "type": "scatter"
           }
          ],
          "scatter3d": [
           {
            "line": {
             "colorbar": {
              "outlinewidth": 0,
              "ticks": ""
             }
            },
            "marker": {
             "colorbar": {
              "outlinewidth": 0,
              "ticks": ""
             }
            },
            "type": "scatter3d"
           }
          ],
          "scattercarpet": [
           {
            "marker": {
             "colorbar": {
              "outlinewidth": 0,
              "ticks": ""
             }
            },
            "type": "scattercarpet"
           }
          ],
          "scattergeo": [
           {
            "marker": {
             "colorbar": {
              "outlinewidth": 0,
              "ticks": ""
             }
            },
            "type": "scattergeo"
           }
          ],
          "scattergl": [
           {
            "marker": {
             "colorbar": {
              "outlinewidth": 0,
              "ticks": ""
             }
            },
            "type": "scattergl"
           }
          ],
          "scattermap": [
           {
            "marker": {
             "colorbar": {
              "outlinewidth": 0,
              "ticks": ""
             }
            },
            "type": "scattermap"
           }
          ],
          "scattermapbox": [
           {
            "marker": {
             "colorbar": {
              "outlinewidth": 0,
              "ticks": ""
             }
            },
            "type": "scattermapbox"
           }
          ],
          "scatterpolar": [
           {
            "marker": {
             "colorbar": {
              "outlinewidth": 0,
              "ticks": ""
             }
            },
            "type": "scatterpolar"
           }
          ],
          "scatterpolargl": [
           {
            "marker": {
             "colorbar": {
              "outlinewidth": 0,
              "ticks": ""
             }
            },
            "type": "scatterpolargl"
           }
          ],
          "scatterternary": [
           {
            "marker": {
             "colorbar": {
              "outlinewidth": 0,
              "ticks": ""
             }
            },
            "type": "scatterternary"
           }
          ],
          "surface": [
           {
            "colorbar": {
             "outlinewidth": 0,
             "ticks": ""
            },
            "colorscale": [
             [
              0,
              "#0d0887"
             ],
             [
              0.1111111111111111,
              "#46039f"
             ],
             [
              0.2222222222222222,
              "#7201a8"
             ],
             [
              0.3333333333333333,
              "#9c179e"
             ],
             [
              0.4444444444444444,
              "#bd3786"
             ],
             [
              0.5555555555555556,
              "#d8576b"
             ],
             [
              0.6666666666666666,
              "#ed7953"
             ],
             [
              0.7777777777777778,
              "#fb9f3a"
             ],
             [
              0.8888888888888888,
              "#fdca26"
             ],
             [
              1,
              "#f0f921"
             ]
            ],
            "type": "surface"
           }
          ],
          "table": [
           {
            "cells": {
             "fill": {
              "color": "#EBF0F8"
             },
             "line": {
              "color": "white"
             }
            },
            "header": {
             "fill": {
              "color": "#C8D4E3"
             },
             "line": {
              "color": "white"
             }
            },
            "type": "table"
           }
          ]
         },
         "layout": {
          "annotationdefaults": {
           "arrowcolor": "#2a3f5f",
           "arrowhead": 0,
           "arrowwidth": 1
          },
          "autotypenumbers": "strict",
          "coloraxis": {
           "colorbar": {
            "outlinewidth": 0,
            "ticks": ""
           }
          },
          "colorscale": {
           "diverging": [
            [
             0,
             "#8e0152"
            ],
            [
             0.1,
             "#c51b7d"
            ],
            [
             0.2,
             "#de77ae"
            ],
            [
             0.3,
             "#f1b6da"
            ],
            [
             0.4,
             "#fde0ef"
            ],
            [
             0.5,
             "#f7f7f7"
            ],
            [
             0.6,
             "#e6f5d0"
            ],
            [
             0.7,
             "#b8e186"
            ],
            [
             0.8,
             "#7fbc41"
            ],
            [
             0.9,
             "#4d9221"
            ],
            [
             1,
             "#276419"
            ]
           ],
           "sequential": [
            [
             0,
             "#0d0887"
            ],
            [
             0.1111111111111111,
             "#46039f"
            ],
            [
             0.2222222222222222,
             "#7201a8"
            ],
            [
             0.3333333333333333,
             "#9c179e"
            ],
            [
             0.4444444444444444,
             "#bd3786"
            ],
            [
             0.5555555555555556,
             "#d8576b"
            ],
            [
             0.6666666666666666,
             "#ed7953"
            ],
            [
             0.7777777777777778,
             "#fb9f3a"
            ],
            [
             0.8888888888888888,
             "#fdca26"
            ],
            [
             1,
             "#f0f921"
            ]
           ],
           "sequentialminus": [
            [
             0,
             "#0d0887"
            ],
            [
             0.1111111111111111,
             "#46039f"
            ],
            [
             0.2222222222222222,
             "#7201a8"
            ],
            [
             0.3333333333333333,
             "#9c179e"
            ],
            [
             0.4444444444444444,
             "#bd3786"
            ],
            [
             0.5555555555555556,
             "#d8576b"
            ],
            [
             0.6666666666666666,
             "#ed7953"
            ],
            [
             0.7777777777777778,
             "#fb9f3a"
            ],
            [
             0.8888888888888888,
             "#fdca26"
            ],
            [
             1,
             "#f0f921"
            ]
           ]
          },
          "colorway": [
           "#636efa",
           "#EF553B",
           "#00cc96",
           "#ab63fa",
           "#FFA15A",
           "#19d3f3",
           "#FF6692",
           "#B6E880",
           "#FF97FF",
           "#FECB52"
          ],
          "font": {
           "color": "#2a3f5f"
          },
          "geo": {
           "bgcolor": "white",
           "lakecolor": "white",
           "landcolor": "#E5ECF6",
           "showlakes": true,
           "showland": true,
           "subunitcolor": "white"
          },
          "hoverlabel": {
           "align": "left"
          },
          "hovermode": "closest",
          "mapbox": {
           "style": "light"
          },
          "paper_bgcolor": "white",
          "plot_bgcolor": "#E5ECF6",
          "polar": {
           "angularaxis": {
            "gridcolor": "white",
            "linecolor": "white",
            "ticks": ""
           },
           "bgcolor": "#E5ECF6",
           "radialaxis": {
            "gridcolor": "white",
            "linecolor": "white",
            "ticks": ""
           }
          },
          "scene": {
           "xaxis": {
            "backgroundcolor": "#E5ECF6",
            "gridcolor": "white",
            "gridwidth": 2,
            "linecolor": "white",
            "showbackground": true,
            "ticks": "",
            "zerolinecolor": "white"
           },
           "yaxis": {
            "backgroundcolor": "#E5ECF6",
            "gridcolor": "white",
            "gridwidth": 2,
            "linecolor": "white",
            "showbackground": true,
            "ticks": "",
            "zerolinecolor": "white"
           },
           "zaxis": {
            "backgroundcolor": "#E5ECF6",
            "gridcolor": "white",
            "gridwidth": 2,
            "linecolor": "white",
            "showbackground": true,
            "ticks": "",
            "zerolinecolor": "white"
           }
          },
          "shapedefaults": {
           "line": {
            "color": "#2a3f5f"
           }
          },
          "ternary": {
           "aaxis": {
            "gridcolor": "white",
            "linecolor": "white",
            "ticks": ""
           },
           "baxis": {
            "gridcolor": "white",
            "linecolor": "white",
            "ticks": ""
           },
           "bgcolor": "#E5ECF6",
           "caxis": {
            "gridcolor": "white",
            "linecolor": "white",
            "ticks": ""
           }
          },
          "title": {
           "x": 0.05
          },
          "xaxis": {
           "automargin": true,
           "gridcolor": "white",
           "linecolor": "white",
           "ticks": "",
           "title": {
            "standoff": 15
           },
           "zerolinecolor": "white",
           "zerolinewidth": 2
          },
          "yaxis": {
           "automargin": true,
           "gridcolor": "white",
           "linecolor": "white",
           "ticks": "",
           "title": {
            "standoff": 15
           },
           "zerolinecolor": "white",
           "zerolinewidth": 2
          }
         }
        },
        "title": {
         "text": "Rupture intensity distribution (AUM repaired)"
        }
       }
      },
      "image/png": "iVBORw0KGgoAAAANSUhEUgAAAzkAAAFoCAYAAAB0XzViAAAQAElEQVR4AeydB5wkRb3H/zO7ezlyEY47OHJGkCw8kghHECUKiAkQHpIEJRwIB+IRJAgoTxSeoAQfSZLkcEqSAzklHRw5X863u7c74c2vz1p652ZmZ3YndPd8+VDX3RX+9f9/q6a6/l3VvfE0/0EAAhCAAAQgAAEIQAACEIgQgbjxHwQgkIMAURCAAAQgAAEIQAACYSWAkxPWlkNvCEAAArUgQJ0QgAAEIACBEBDAyQlBI6EiBCAAAQhAAALBJoB2EIBAsAjg5ASrPdAGAhCAAAQgAAEIQAACUSFQMztwcmqGnoohAAEIQAACEIAABCAAgUoQwMmpBFVklo8AkiAAAQhAAAIQgAAEIFAiAZycEoGRHQIQgEAQCKADBCAAAQhAAAL5CeDk5GdDCgQgAAEIQAAC4SKAthCAAAQ8Ajg5Hgb+gQAEIAABCEAAAhCAQFQJ1J9dODn11+ZYDAEIQAACEIAABCAAgUgTwMmJdPOWzzgkQQACEIAABCAAAQhAICwEcHLC0lLoCQEIBJEAOkEAAhCAAAQgEEACODkBbBRUggAEIAABCISbANpDAAIQqC0BnJza8qd2CEAAAhCAAAQgAIF6IYCdVSOAk1M11FQEAQhAAAIQgAAEIAABCFSDAE5ONSiXrw4kQQACEIAABCAAAQhAAAJdEMDJ6QIQyRCAQBgIoCMEIAABCEAAAhD4ggBOzhcsOIMABCAAAQhEiwDWQAACEKhTAjg5ddrwmA0BCEAAAhCAAATqlQB2R58ATk702xgLIQABCEAAAhCAAAQgUFcEcHK61dwUggAEIAABCEAAAhCAAASCSgAnJ6gtg14QCCMBdIYABCAAAQhAAAIBIICTE4BGQAUIQAACEIg2AayDAAQgAIHqEsDJqS5vaoMABCAAAQhAAAIQWEGAfyFQMQI4ORVDi2AIQAACEIAABCAAAQhAoBYEwu3k1IIYdUIAAhCAAAQgAAEIQAACgSaAkxPo5kE5CHSPAKUgAAEIQAACEIBAPRPAyann1sd2CEAAAvVFAGshAAEIQKBOCODk1ElDYyYEIAABCEAAAhDITYBYCESPAE5O9NoUiyAAAQhAAAIQgAAEIFDXBMri5NQ1QYyHAAQgAAEIQAACEIAABAJFACcnUM2BMhEjgDkQgAAEIAABCEAAAjUggJNTA+hUCQEIQKC+CWA9BCAAAQhAoLIEcHIqyxfpEIAABCAAAQhAoDgC5IIABMpGACenbCgRBAEIQAACEIAABCAAAQiUm0B35OHkdIcaZSAAAQhAAAIQgAAEIACBwBLAyQls06BY+Qj0TNLrb31g2+97vD313LSeCSpj6Rtue9DTSbqVUWzgRc2Zt9D2Ovz0urQ98I3TDQXVj9WealcVV3/Wb80fp3gCBCAAAQhAoFQCoXFydBPUjW/jXb5n2UE3ylIN705+dwOuVn3d0VG6iU93JuSO8cSLft+dqitSJtueoOgoRpqMqU8Ua7jyqoxsKraM8rlyqlPXlQqunlz6Ka67/aqc+l75uzs8cffdNNk2Xn9N79z/j+sf0jUfL9midpC9/rLuXOU01kiW4ppbWu0HP77EG3eUprjsoN+b6iwkN7tMIK6zlBAb2SF7spKqcqk2VduqsjMuvM7EXucECEAAAhCAQKkEQuPkOMP23/Mr9vqUGzvCryefbFdcd7vlm3y4chwhAIFwE9DE+95HnrUzTjjMRgwbktOY19563xYtXmqrrzrCXn71bXOOSs7M3Yh86tlplu0caSL+pzse7YY0iuQioLZVG78wbbop5MpDHAQqTQD5EIBA+AmEzsnJRr7rDluYHJ9ck4/svPVwfdRhe3sO4K4ZLlGwN6j2TD7rGHv+gWtzriaUm7uebqsu1Vlu2cXKq3U7OEdi2y02NIV8ej/2t5dsw3XXsBOPOsA+/my2yenJl7fU+I3WW9NWX22k3XL3Y52KaiI+/e0PbfedtuwUH8aLWrezY6Y2VpDzqLZ38RwhAAEIQKCmBEJVeeidnHy0te3Cv+XE5dPT4O33Pb7T01itAinvGzM+8Pb7a7uGgraouBusnt4efdqltnhps7dypHQFlbXMf92p74OPZ3Zsg/HrpDpVt+S7IPmZarr8X/n8slRAOu51+OlWyD498T7yxMnexFBPy1290kP6SI6C5Ls0HbPTxUn1K5+C8rgg9pLhD9LNpesoPaWLyyMZkie5is+n40efzPTaTvJcWXdUOcnNleby+I/KJ11cOHXSb/zJ3rnySKZkexGZf6SjdHXldJT+mSSvv3XVf1RWMiRbZRXETHFKc7Ikz4VlzV9spVL+bJ2kn+Kyy6pN1XaqS7JUR7H6Kb8LKifdVLeCzhXn0nVU3YqfOu3Njv6uvNJL+ilPV+H9j2aaHImvbLOp9evbJ2d21auHHcqjCfLYjEMipydn5m5EDuzf13bM1K86VJcToTp2/coWtvlG67ioLo/iLvt7MgZIB3EVXwUxdUH9xq9EdrryqaxkZOfzx6t9pKf0VV6lqaz6jvqQyro8inchu/58+R568gUlrRTUxkce/DWvzdX2K2UgAgIQgAAEINAFgdA7ObrRzpw933t6O37c6C7MzZ+sp76nTrrW/nTNRG8l5MWHfutlPmHiVd6+cD1Nv/7y023QgH526rGHeHm0ba67T9dV32H/fYGddtyhniw9qVcdmkjsfvCpNnrkKl686phy16/sjvun9GhLnurz2yeZn82aZxf+6k+endoiIts1MdTKmOpV+N8rz/AmleKsiY30UFmlKUjPA446d6VtQdpCKMHKoyBmEyf/3pvsK15BEydtKXLyxHy1UcMs3178QjqOW320HbzfLpY9AVU9epqvLUxHHLCHLvMGZ6NfJ+l+5omHe85t3oKZBLWbnISjj9i3o93cVkpN+NS2XfUfOdCHHDvJ9th5qw4ZhVbklP/kn13d0Yek65abrmtyBDXxzKhV9P/F6JctTHZJ38kTj+nQV/YrTmn+/Nm6dtXW/rI6/8fLb+hg2225kXfM9Y8/j/qKWKgtS2WRS7aL029j8KAB5upSu//zlRnWVd9y5f1H/SbLMQZk/9bU7zRuZbeBfoPqIy6ordRnZYNfr1znevBx+W//z5644wqvrd24oDp2OfAU77fn5N5+3SSbmPmty7FyslTH17870dQmLp+O6621usuy0nHksKFenGPtXfAPBCAAAQhAoEgCoXdybrvnSW/ftp766elfkXavlE2Te03yNTlSomRJpraiKCiunEH16QVbTS79crUVZsN117BzTjmyI1o65ZvAd2Tq4kT1+e1zMoudBIqBnqRfft7xnd6H0ORODsR9jz7XSQNNqLT1xUV+/Ws7mH9yqImn6takR7oon5hr8rTnLlvrsuSQXYcEyHHRlhcx7coJzmejJltybiUvX3ATMf8kXA6KJnz9+/bNV6xTvOpQfpXrlJDnQvnlOPn7UL72yCOi29Fqv0t+fZu3VdSvr9pcjoDSlMdVkK2r2lq/L/WpYp7Uv/vBp17/GTl8iBPZ6ah2fnbqq50edlSCherXb1HOvuzT7/XLm63XrW2L+k26McBvjGSqvxY7BmT/1rSKpaB+Ly6SrXZR0LkLuX4vLi37qDbVb1Pt5tIkW3UozS9b/XHXzMqWY6T8uWxS/NprjtEhZ9DvVRzU9jkzEAkBCEAAAhAoQCB0To6eKG7s+8KaboB6IuifaBWwt6QkN7l978PPSyrX3cyaNGnir+02/smE5LnJ8+x5C3RZtiAHZfbchV3K05YcTTg08fBn1rXi1Q7++HznLl//fn1MqzZqT/8TX5U7dP/dvNUjnZcS5CzJafJPrjSB1kRaE+psptmyZaMcMU1ks9O6ul5rjVW91Z7Tzr+206qWJnzbbLFBV8XLli7dZYPjXDbBWYLUZ9R3tOqUlWSauCpNebLTsq+1wlNsn1Z/Ub/JlqFr187+347rm3J+NCFXvnIEOQeS8/Nf/dFbOczFQOndCeUYA9TPtcKqlVptZ3R6aDXFbTfTGKoVGK0mufRSj455LvsV5/qAs0k6Sbd
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "# ============================================================\n",
    "# 2. PREPARE FLOWS\n",
    "# ============================================================\n",
    "\n",
    "flows_clean = (\n",
    "    flows\n",
    "    .groupby(\n",
    "        [\"Registrar Account - ID\",\"Product - Isin\",\"Centralisation Date\"],\n",
    "        as_index=False\n",
    "    )[\"Quantity - NetFlows\"]\n",
    "    .sum()\n",
    ")\n",
    "\n",
    "# ============================================================\n",
    "# 3. MERGE\n",
    "# ============================================================\n",
    "\n",
    "df = aum.merge(\n",
    "    flows_clean,\n",
    "    on=[\"Registrar Account - ID\",\"Product - Isin\",\"Centralisation Date\"],\n",
    "    how=\"left\"\n",
    ")\n",
    "\n",
    "df[\"Quantity - NetFlows\"] = df[\"Quantity - NetFlows\"].fillna(0)\n",
    "\n",
    "# ============================================================\n",
    "# 4. SORT\n",
    "# ============================================================\n",
    "\n",
    "df = df.sort_values(\n",
    "    [\"Registrar Account - ID\",\"Product - Isin\",\"Centralisation Date\"]\n",
    ")\n",
    "\n",
    "# ============================================================\n",
    "# REBUILD ACCOUNTING IDENTITY WITH REPAIRED AUM\n",
    "# ============================================================\n",
    "\n",
    "df[\"prev_aum\"] = df.groupby(\n",
    "    [\"Registrar Account - ID\",\"Product - Isin\"]\n",
    ")[\"Quantity - AUM\"].shift(1)\n",
    "\n",
    "df[\"prev_flow\"] = df.groupby(\n",
    "    [\"Registrar Account - ID\",\"Product - Isin\"]\n",
    ")[\"Quantity - NetFlows\"].shift(1).fillna(0)\n",
    "\n",
    "df[\"expected_aum\"] = df[\"prev_aum\"] + df[\"prev_flow\"]\n",
    "\n",
    "# ============================================================\n",
    "# COMPUTE GAP\n",
    "# ============================================================\n",
    "\n",
    "df[\"gap\"] = df[\"Quantity - AUM\"] - df[\"expected_aum\"]\n",
    "df[\"gap_abs\"] = df[\"gap\"].abs()\n",
    "\n",
    "EPS = 10\n",
    "\n",
    "df[\"rupture_flag\"] = (\n",
    "    df[\"prev_aum\"].notna()\n",
    "    & (df[\"gap_abs\"] > EPS)\n",
    ")\n",
    "# ============================================================\n",
    "# 6. COMPUTE GAP\n",
    "# ============================================================\n",
    "\n",
    "df[\"gap\"] = df[\"Quantity - AUM\"] - df[\"expected_aum\"]\n",
    "df[\"gap_abs\"] = df[\"gap\"].abs()\n",
    "\n",
    "EPS = 10\n",
    "\n",
    "df[\"rupture_flag\"] = (\n",
    "    df[\"prev_aum\"].notna()\n",
    "    & (df[\"gap_abs\"] > EPS)\n",
    ")\n",
    "\n",
    "# ============================================================\n",
    "# 7. BUILD RUPTURE SUMMARY\n",
    "# ============================================================\n",
    "\n",
    "rupture_summary = (\n",
    "    df.groupby([\"Registrar Account - ID\",\"Product - Isin\"])\n",
    "    .agg(\n",
    "        n_ruptures=(\"rupture_flag\",\"sum\"),\n",
    "        total_obs=(\"rupture_flag\",\"count\"),\n",
    "        rupture_ratio=(\"rupture_flag\",\"mean\")\n",
    "    )\n",
    "    .reset_index()\n",
    ")\n",
    "\n",
    "# ============================================================\n",
    "# 8. SAME CLASSIFICATION AS YOUR CODE\n",
    "# ============================================================\n",
    "\n",
    "rs = rupture_summary.copy()\n",
    "\n",
    "bins = [0, 0.01, 0.10, 0.30, 1.01]\n",
    "\n",
    "labels = [\n",
    "    \"Clean / quasi-clean (≤1%)\",\n",
    "    \"Moderate (1–10%)\",\n",
    "    \"High (10–30%)\",\n",
    "    \"Severe (>30%)\"\n",
    "]\n",
    "\n",
    "rs[\"rupture_class\"] = pd.cut(\n",
    "    rs[\"rupture_ratio\"],\n",
    "    bins=bins,\n",
    "    labels=labels,\n",
    "    include_lowest=True\n",
    ")\n",
    "\n",
    "# ============================================================\n",
    "# 9. DISTRIBUTION\n",
    "# ============================================================\n",
    "\n",
    "dist = (\n",
    "    rs[\"rupture_class\"]\n",
    "    .value_counts(normalize=True)\n",
    "    .sort_index()\n",
    "    * 100\n",
    ").round(1)\n",
    "\n",
    "# ============================================================\n",
    "# 10. DONUT CHART\n",
    "# ============================================================\n",
    "\n",
    "fig = go.Figure(\n",
    "    data=[go.Pie(\n",
    "        labels=dist.index,\n",
    "        values=dist.values,\n",
    "        hole=0.45,\n",
    "        textinfo=\"percent\",\n",
    "        hoverinfo=\"label+percent\"\n",
    "    )]\n",
    ")\n",
    "\n",
    "fig.update_layout(\n",
    "    title=\"Rupture intensity distribution (AUM repaired)\",\n",
    "    legend=dict(\n",
    "        orientation=\"h\",\n",
    "        yanchor=\"top\",\n",
    "        y=-0.15,\n",
    "        xanchor=\"center\",\n",
    "        x=0.5\n",
    "    ),\n",
    "    legend_title_text=\"Rupture ratio\"\n",
    ")\n",
    "\n",
    "fig.show()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "990898ea-ceca-46bb-bfb3-c87bf289d272",
   "metadata": {},
   "outputs": [],
   "source": [
    "import seaborn as sns\n",
    "import matplotlib.pyplot as plt\n",
    "\n",
    "df = merged_isin.copy()\n",
    "\n",
    "# Ajouter année / mois\n",
    "df[\"year\"] = df[\"Centralisation Date\"].dt.year\n",
    "df[\"month\"] = df[\"Centralisation Date\"].dt.month\n",
    "\n",
    "# 1. Nombre total de lignes par mois\n",
    "total = df.groupby([\"year\", \"month\"]).size().reset_index(name=\"total_lines\")\n",
    "\n",
    "# 2. Nombre de ruptures par mois\n",
    "ruptures = df[df[\"rupture_flag\"]].groupby([\"year\", \"month\"]).size().reset_index(name=\"n_ruptures\")\n",
    "\n",
    "# 3. Merge pour obtenir total + ruptures\n",
    "ratio = total.merge(ruptures, on=[\"year\",\"month\"], how=\"left\")\n",
    "ratio[\"n_ruptures\"] = ratio[\"n_ruptures\"].fillna(0)\n",
    "\n",
    "# 4. Proportion (en %)\n",
    "ratio[\"rupture_ratio\"] = ratio[\"n_ruptures\"] / ratio[\"total_lines\"]\n",
    "\n",
    "# 5. Pivot pour heatmap\n",
    "heatmap_ratio = ratio.pivot(index=\"year\", columns=\"month\", values=\"rupture_ratio\").fillna(0)\n",
    "\n",
    "# 6. Plot\n",
    "plt.figure(figsize=(14, 7))\n",
    "sns.heatmap(\n",
    "    heatmap_ratio, \n",
    "    cmap=\"Reds\",\n",
    "    linewidths=.3,\n",
    "    linecolor=\"grey\",\n",
    "    annot=True,\n",
    "    fmt=\".2%\",\n",
    "    cbar_kws={'label': 'Proportion de ruptures'}\n",
    ")\n",
    "\n",
    "plt.title(\"Heatmap de la proportion de ruptures (par année et mois)\", fontsize=16)\n",
    "plt.xlabel(\"Mois\")\n",
    "plt.ylabel(\"Année\")\n",
    "plt.show()\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "4d335589-c519-458d-857d-a051813b950b",
   "metadata": {},
   "outputs": [],
   "source": [
    "df = merged_isin.copy()\n",
    "\n",
    "# Ajouter year / month au cas où\n",
    "df[\"year\"] = df[\"Centralisation Date\"].dt.year\n",
    "df[\"month\"] = df[\"Centralisation Date\"].dt.month\n",
    "\n",
    "# Merge géographique\n",
    "df = df.merge(\n",
    "    geo[[\"Registrar Account - ID\", \"country\"]],\n",
    "    on=\"Registrar Account - ID\",\n",
    "    how=\"left\"\n",
    ")\n",
    "\n",
    "df[\"country\"] = df[\"country\"].fillna(\"UNKNOWN\")\n",
    "\n",
    "# Total des lignes par pays\n",
    "total_country = df.groupby(\"country\").size().reset_index(name=\"total_obs\")\n",
    "\n",
    "# Nombre de ruptures\n",
    "rupt_country = (\n",
    "    df[df[\"rupture_flag\"]]\n",
    "    .groupby(\"country\")\n",
    "    .size()\n",
    "    .reset_index(name=\"ruptures\")\n",
    ")\n",
    "\n",
    "# Merge + ratios\n",
    "country_stats = total_country.merge(rupt_country, on=\"country\", how=\"left\")\n",
    "country_stats[\"ruptures\"] = country_stats[\"ruptures\"].fillna(0)\n",
    "country_stats[\"rupture_ratio\"] = country_stats[\"ruptures\"] / country_stats[\"total_obs\"]\n",
    "\n",
    "# Tri (rupture ratio décroissant)\n",
    "country_stats = country_stats.sort_values(\"rupture_ratio\", ascending=False)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "8a45a111-25da-4f5c-9723-c3efd25c906d",
   "metadata": {},
   "outputs": [],
   "source": [
    "# On ajoute une colonne en % pour l’affichage\n",
    "import plotly.express as px\n",
    "\n",
    "country_stats_plot = country_stats.copy()\n",
    "country_stats_plot[\"rupture_pct\"] = country_stats_plot[\"rupture_ratio\"] * 100\n",
    "\n",
    "# Tri décroissant par proportion de ruptures\n",
    "country_stats_plot = country_stats_plot.sort_values(\"rupture_ratio\", ascending=False)\n",
    "\n",
    "fig = px.bar(\n",
    "    country_stats_plot,\n",
    "    x=\"country\",\n",
    "    y=\"rupture_ratio\",\n",
    "    hover_data={\n",
    "        \"rupture_pct\": ':.2f',\n",
    "        \"ruptures\": True,\n",
    "        \"total_obs\": True,\n",
    "        \"rupture_ratio\": False,  # on cache la version décimale\n",
    "    },\n",
    "    labels={\n",
    "        \"country\": \"Pays\",\n",
    "        \"rupture_ratio\": \"Proportion de ruptures\",\n",
    "        \"rupture_pct\": \"% de ruptures\",\n",
    "        \"ruptures\": \"Nb de ruptures\",\n",
    "        \"total_obs\": \"Nb d'observations\"\n",
    "    },\n",
    "    title=\"Proportion de ruptures par pays (avec volumes au survol)\"\n",
    ")\n",
    "\n",
    "# Format en %\n",
    "fig.update_yaxes(tickformat=\".1%\")\n",
    "\n",
    "fig.update_layout(\n",
    "    xaxis_tickangle=-45,\n",
    "    bargap=0.2\n",
    ")\n",
    "\n",
    "fig.show()\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "a4af9841-6cf9-4d27-8096-ac878e866bc6",
   "metadata": {},
   "outputs": [],
   "source": [
    "rs = rupture_summary.copy()\n",
    "\n",
    "# 1. Stats numériques classiques\n",
    "print(\"\\n=== BASIC NUMERIC STATS ===\")\n",
    "print(rs[\"rupture_ratio\"].describe(percentiles=[0.01, 0.05, 0.10, 0.25, 0.5, 0.75, 0.90, 0.95, 0.99]))\n",
    "\n",
    "\n",
    "# 2. Distribution par classes (bins)\n",
    "\n",
    "rs[\"rupture_bucket\"] = pd.cut(\n",
    "    rs[\"rupture_ratio\"],\n",
    "    bins=[0, 0.001, 0.01, 0.05, 0.10, 0.25, 0.50, 1.01],\n",
    "    labels=[\n",
    "        \"0–0.1%\",\n",
    "        \"0.1–1%\",\n",
    "        \"1–5%\",\n",
    "        \"5–10%\",\n",
    "        \"10–25%\",\n",
    "        \"25–50%\",\n",
    "        \"50–100%\"\n",
    "    ],\n",
    "    include_lowest=True\n",
    ")\n",
    "\n",
    "# Ajouter la catégorie \"0%\"\n",
    "rs[\"rupture_bucket\"] = rs[\"rupture_bucket\"].cat.add_categories(\"0%\")\n",
    "\n",
    "# Remplacer les 0% exacts\n",
    "rs.loc[rs[\"rupture_ratio\"] == 0, \"rupture_bucket\"] = \"0%\"\n",
    "\n",
    "bucket_counts = rs[\"rupture_bucket\"].value_counts().sort_index()\n",
    "print(bucket_counts)\n",
    "\n",
    "\n",
    "# 3. Pourcentages\n",
    "bucket_percent = (bucket_counts / len(rs) * 100).round(2)\n",
    "\n",
    "print(\"\\n=== DISTRIBUTION (PERCENT) ===\")\n",
    "print(bucket_percent)\n",
    "\n",
    "\n",
    "# 4. Nombre de comptes totalement propres\n",
    "no_rupture = (rs[\"n_ruptures\"] == 0).sum()\n",
    "print(f\"\\nComptes avec 0 rupture = {no_rupture} ({no_rupture/len(rs)*100:.2f}%)\")\n",
    "\n",
    "# 5. Comptes extrêmement problématiques\n",
    "severe = (rs[\"rupture_ratio\"] > 0.75).sum()\n",
    "print(f\"Comptes avec rupture_ratio > 75% = {severe} ({severe/len(rs)*100:.2f}%)\")\n",
    "\n",
    "medium = (rs[\"rupture_ratio\"] > 0.10).sum()\n",
    "print(f\"Comptes avec rupture_ratio > 10% = {medium} ({medium/len(rs)*100:.2f}%)\")\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "f39a9a5a-5f4e-4cac-9f63-e6952582b6ff",
   "metadata": {},
   "outputs": [],
   "source": [
    "import plotly.express as px\n",
    "\n",
    "fig = px.histogram(\n",
    "    rs,\n",
    "    x=\"rupture_ratio\",\n",
    "    nbins=50,\n",
    "    title=\"Distribution du rupture_ratio\",\n",
    "    labels={\"rupture_ratio\": \"Rupture Ratio\"},\n",
    ")\n",
    "fig.update_layout(bargap=0.05)\n",
    "fig.show()\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "70132995-8379-44b6-8ff6-f09524c4e4d0",
   "metadata": {},
   "outputs": [],
   "source": [
    "# --- 1. Filtres de base ---\n",
    "merged[\"year\"] = merged[\"Centralisation Date\"].dt.year\n",
    "\n",
    "# Filtrer uniquement l'année 2021\n",
    "ruptures_2021 = merged[(merged[\"year\"] == 2021) & (merged[\"rupture_flag\"] == True)].copy()\n",
    "\n",
    "print(\"Nombre total de ruptures en 2021 :\", len(ruptures_2021))\n",
    "\n",
    "# --- 2. Classification du type de gap ---\n",
    "ruptures_2021[\"gap_type\"] = np.where(ruptures_2021[\"gap\"] > 0, \"positive\", \"negative\")\n",
    "\n",
    "# --- 3. Statistiques globales ---\n",
    "gap_counts = ruptures_2021[\"gap_type\"].value_counts()\n",
    "gap_percent = ruptures_2021[\"gap_type\"].value_counts(normalize=True) * 100\n",
    "\n",
    "print(\"\\n=== RUPTURES 2021 — POSITIVES vs NEGATIVES ===\")\n",
    "print(gap_counts)\n",
    "print(\"\\n(%)\")\n",
    "print(gap_percent.map(lambda x: f\"{x:.2f}%\"))\n",
    "\n",
    "# --- 4. Intensité des écarts ---\n",
    "intensity_stats = ruptures_2021.groupby(\"gap_type\")[\"gap\"].describe()\n",
    "print(\"\\n=== STATISTIQUES DES GAPS ===\")\n",
    "print(intensity_stats)\n",
    "\n",
    "# --- 5. Visualisation rapide ---\n",
    "import seaborn as sns\n",
    "import matplotlib.pyplot as plt\n",
    "\n",
    "plt.figure(figsize=(10,5))\n",
    "sns.histplot(data=ruptures_2021, x=\"gap\", hue=\"gap_type\", bins=80, kde=True)\n",
    "plt.xlim(-merged[\"gap\"].abs().max(), merged[\"gap\"].abs().max())\n",
    "plt.title(\"Distribution des gaps de rupture en 2021\")\n",
    "plt.xlabel(\"Gap (AUM_{t} − Expected AUM_{t})\")\n",
    "plt.show()\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "1faf943a-4703-4b19-a867-2670ac3a5209",
   "metadata": {},
   "outputs": [],
   "source": [
    "# --- 1. ADD YEAR ---\n",
    "merged[\"year\"] = merged[\"Centralisation Date\"].dt.year\n",
    "\n",
    "# --- 2. DEFINE PERIODS ---\n",
    "conditions = [\n",
    "    merged[\"year\"] < 2021,\n",
    "    merged[\"year\"] == 2021,\n",
    "    merged[\"year\"] > 2021\n",
    "]\n",
    "\n",
    "period_labels = [\"before_2021\", \"during_2021\", \"after_2021\"]\n",
    "\n",
    "merged[\"period\"] = np.select(\n",
    "    conditions,\n",
    "    period_labels,\n",
    "    default=\"unknown\"\n",
    ")\n",
    "\n",
    "# --- 3. CREATE GAP TYPE & FILTER ONLY RUPTURES ---\n",
    "merged[\"gap_type\"] = np.where(\n",
    "    merged[\"gap\"] > 0, \"positive\",\n",
    "    np.where(merged[\"gap\"] < 0, \"negative\", \"zero\")\n",
    ")\n",
    "\n",
    "ruptures = merged[merged[\"rupture_flag\"] == True].copy()\n",
    "\n",
    "# --- 4. TOTAL OBS PER PERIOD ---\n",
    "total_obs = merged.groupby(\"period\").size().rename(\"total_obs\")\n",
    "\n",
    "# --- 5. TOTAL RUPTURES PER PERIOD ---\n",
    "rupture_counts = ruptures.groupby(\"period\").size().rename(\"rupture_count\")\n",
    "\n",
    "# --- 6. PROPORTION OF RUPTURES ---\n",
    "rupture_ratio = (rupture_counts / total_obs).rename(\"rupture_ratio\")\n",
    "\n",
    "# --- 7. POSITIVE / NEGATIVE GAPS (% among ruptures) ---\n",
    "gap_dist = (\n",
    "    ruptures.groupby([\"period\", \"gap_type\"])\n",
    "    .size()\n",
    "    .groupby(level=0)\n",
    "    .apply(lambda x: (x / x.sum()) * 100)   # % par période\n",
    ")\n",
    "\n",
    "\n",
    "# --- 8. MERGE AND DISPLAY ---\n",
    "summary = pd.concat([total_obs, rupture_counts, rupture_ratio], axis=1)\n",
    "summary[\"rupture_ratio\"] = (summary[\"rupture_ratio\"] * 100).round(2)\n",
    "\n",
    "print(\"\\n=== RUPTURE SUMMARY (in %) ===\")\n",
    "print(summary)\n",
    "\n",
    "print(\"\\n=== GAP POSITIVE / NEGATIVE DISTRIBUTION (in %) ===\")\n",
    "print(gap_dist)\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "5abee764-b890-4ea1-8f98-5a0ff1512611",
   "metadata": {},
   "outputs": [],
   "source": [
    "from plotly.subplots import make_subplots\n",
    "import plotly.graph_objects as go\n",
    "\n",
    "# --- 1. DEFINE PERIODS ---\n",
    "merged[\"period2\"] = np.where(\n",
    "    merged[\"Centralisation Date\"] < pd.Timestamp(\"2021-09-01\"),\n",
    "    \"Before Sep 2021\",\n",
    "    \"After Sep 2021\"\n",
    ")\n",
    "\n",
    "ruptures = merged[merged[\"rupture_flag\"] == True].copy()\n",
    "\n",
    "# --- 2. Ensure gap_type exists + no missing categories ---\n",
    "ruptures[\"gap_type\"] = ruptures[\"gap_type\"].replace({\"zero\": \"positive\"})  # zero is equivalent to no-flow change\n",
    "\n",
    "# --- 3. Compute gap counts ---\n",
    "gap_counts = (\n",
    "    ruptures.groupby([\"period2\", \"gap_type\"])\n",
    "    .size()\n",
    "    .unstack(fill_value=0)\n",
    ")\n",
    "\n",
    "# Ensure both columns exist\n",
    "for col in [\"positive\", \"negative\"]:\n",
    "    if col not in gap_counts.columns:\n",
    "        gap_counts[col] = 0\n",
    "\n",
    "gap_counts = gap_counts[[\"positive\", \"negative\"]]\n",
    "\n",
    "# --- 4. Extract values ---\n",
    "before_vals = gap_counts.loc[\"Before Sep 2021\"].values\n",
    "after_vals  = gap_counts.loc[\"After Sep 2021\"].values\n",
    "\n",
    "# --- 5. MAKE TWO DONUT CHARTS ---\n",
    "fig = make_subplots(\n",
    "    rows=1, cols=2,\n",
    "    specs=[[{\"type\": \"pie\"}, {\"type\": \"pie\"}]],\n",
    "    subplot_titles=(\"Before Sep 2021\", \"After Sep 2021\")\n",
    ")\n",
    "\n",
    "fig.add_trace(\n",
    "    go.Pie(\n",
    "        labels=[\"Negative gaps\", \"Positive gaps\"],\n",
    "        values=before_vals,\n",
    "        marker_colors=[\"#E67E22\", \"#3498DB\"],\n",
    "        hole=0.45,\n",
    "        textinfo=\"label+percent\"\n",
    "    ),\n",
    "    row=1, col=1\n",
    ")\n",
    "\n",
    "fig.add_trace(\n",
    "    go.Pie(\n",
    "        labels=[\"Negative gaps\", \"Positive gaps\"],\n",
    "        values=after_vals,\n",
    "        marker_colors=[\"#E67E22\", \"#3498DB\"],\n",
    "        hole=0.45,\n",
    "        textinfo=\"label+percent\"\n",
    "    ),\n",
    "    row=1, col=2\n",
    ")\n",
    "\n",
    "fig.update_layout(\n",
    "    title=\"Nature des ruptures (positive / negative)\\nAvant vs Après Septembre 2021\",\n",
    "    showlegend=True\n",
    ")\n",
    "\n",
    "fig.show()\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "3aa3b8a0-f499-495a-9171-2e09d0bb1e5f",
   "metadata": {},
   "outputs": [],
   "source": [
    "import plotly.graph_objects as go\n",
    "\n",
    "# --- 1. Compute gap counts by period ---\n",
    "gap_counts = (\n",
    "    ruptures.groupby([\"period2\", \"gap_type\"])\n",
    "    .size()\n",
    "    .unstack(fill_value=0)\n",
    ")\n",
    "\n",
    "# Ensure both columns exist\n",
    "for col in [\"positive\", \"negative\"]:\n",
    "    if col not in gap_counts.columns:\n",
    "        gap_counts[col] = 0\n",
    "\n",
    "gap_counts = gap_counts[[\"positive\", \"negative\"]]\n",
    "\n",
    "# --- 2. Extract values ---\n",
    "before_vals = gap_counts.loc[\"Before Sep 2021\"].values\n",
    "after_vals  = gap_counts.loc[\"After Sep 2021\"].values\n",
    "\n",
    "# --- 3. Plot : TWO PIE CHARTS side by side ---\n",
    "fig = make_subplots(\n",
    "    rows=1, cols=2,\n",
    "    specs=[[{\"type\": \"pie\"}, {\"type\": \"pie\"}]],\n",
    "    subplot_titles=(\"Before 2021\", \"After 2021\")\n",
    ")\n",
    "\n",
    "fig.add_trace(\n",
    "    go.Pie(\n",
    "        labels=[\"Negative gaps\", \"Positive gaps\"],\n",
    "        values=before_vals,\n",
    "        marker_colors=[\"#E67E22\", \"#3498DB\"],\n",
    "        hole=0.35\n",
    "    ),\n",
    "    row=1, col=1\n",
    ")\n",
    "\n",
    "fig.add_trace(\n",
    "    go.Pie(\n",
    "        labels=[\"Negative gaps\", \"Positive gaps\"],\n",
    "        values=after_vals,\n",
    "        marker_colors=[\"#E67E22\", \"#3498DB\"],\n",
    "        hole=0.35\n",
    "    ),\n",
    "    row=1, col=2\n",
    ")\n",
    "\n",
    "fig.update_layout(\n",
    "    title=\"Répartition des ruptures (positive / negative)\\nAvant vs Après 2021\"\n",
    ")\n",
    "\n",
    "fig.show()\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "d4f0dc74-649d-4105-9a1a-44a18d126a3c",
   "metadata": {},
   "outputs": [],
   "source": [
    "import plotly.graph_objects as go\n",
    "\n",
    "# --- 1. Define periods ---\n",
    "merged[\"period2\"] = np.where(\n",
    "    merged[\"Centralisation Date\"] < pd.Timestamp(\"2021-09-01\"),\n",
    "    \"Before Sep 2021\",\n",
    "    \"After Sep 2021\"\n",
    ")\n",
    "\n",
    "# --- 2. Keep only ruptures ---\n",
    "ruptures = merged[merged[\"rupture_flag\"] == True].copy()\n",
    "\n",
    "# --- 3. Count ruptures per period ---\n",
    "rupture_counts = ruptures[\"period2\"].value_counts().reindex(\n",
    "    [\"Before Sep 2021\", \"After Sep 2021\"]\n",
    ").fillna(0)\n",
    "\n",
    "# --- 4. Pie chart ---\n",
    "fig = go.Figure(data=[\n",
    "    go.Pie(\n",
    "        labels=rupture_counts.index,\n",
    "        values=rupture_counts.values,\n",
    "        hole=0.45,\n",
    "        marker_colors=[\"#2ECC71\", \"#E74C3C\"],\n",
    "        textinfo=\"percent+value\",\n",
    "    )\n",
    "])\n",
    "\n",
    "fig.update_layout(\n",
    "    title=\"Répartition des ruptures\"\n",
    ")\n",
    "\n",
    "fig.show()\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "ecccd73c-00a6-4ff3-b213-e85b98ec5a55",
   "metadata": {},
   "outputs": [],
   "source": [
    "import pandas as pd\n",
    "import numpy as np\n",
    "\n",
    "# 1. Filtre sur la période post-Sept 2021\n",
    "cutoff = pd.Timestamp(\"2021-09-01\")\n",
    "post = merged[merged[\"Centralisation Date\"] >= cutoff].copy()\n",
    "\n",
    "# 2. On ne garde que les ruptures\n",
    "post_rupt = post[post[\"rupture_flag\"] == True].copy()\n",
    "\n",
    "# 3. Gap absolu + gap relatif (% du stock)\n",
    "post_rupt[\"gap_abs\"] = post_rupt[\"gap\"].abs()\n",
    "post_rupt[\"gap_rel\"] = post_rupt[\"gap_abs\"] / post_rupt[\"Quantity - AUM\"].replace(0, np.nan)\n",
    "\n",
    "# 4. Percentiles globaux\n",
    "p90 = post_rupt[\"gap_abs\"].quantile(0.90)\n",
    "p95 = post_rupt[\"gap_abs\"].quantile(0.95)\n",
    "p99 = post_rupt[\"gap_abs\"].quantile(0.99)\n",
    "\n",
    "# 5. Classification automatique\n",
    "def classify_gap(gap, gap_rel, acct):\n",
    "    # RESET → énorme choc (technique)\n",
    "    if gap_abs >= p99 or gap_rel >= 0.90:\n",
    "        return \"reset\"\n",
    "\n",
    "    # SPIKE → très gros gap mais isolé\n",
    "    if gap_abs >= p95:\n",
    "        return \"spike\"\n",
    "\n",
    "    # SHIFT → décalage permanent\n",
    "    # Test : moyenne des gaps du compte\n",
    "    return None\n",
    "\n",
    "# Calcul du shift (décalage directionnel)\n",
    "shift_info = post_rupt.groupby(\"Registrar Account - ID\")[\"gap\"].mean().rename(\"avg_gap\")\n",
    "\n",
    "post_rupt = post_rupt.merge(shift_info, on=\"Registrar Account - ID\", how=\"left\")\n",
    "\n",
    "post_rupt[\"gap_type2\"] = np.where(\n",
    "    post_rupt[\"gap_abs\"] >= p99, \"reset\",\n",
    "    np.where(post_rupt[\"gap_abs\"] >= p95, \"spike\",\n",
    "    np.where(post_rupt[\"avg_gap\"].abs() > post_rupt[\"gap_abs\"].median(), \"shift\", \"micro\")))\n",
    " \n",
    "# 6. Statistiques globales\n",
    "stats = post_rupt[\"gap_type2\"].value_counts(normalize=True).round(3) * 100\n",
    "print(\"\\n=== DISTRIBUTION DES TYPES DE GAPS POST-2021 ===\")\n",
    "print(stats)\n",
    "\n",
    "# 7. Stats par client\n",
    "client_stats = (\n",
    "    post_rupt.groupby(\"Registrar Account - ID\")[\"gap_type2\"]\n",
    "    .value_counts(normalize=True)\n",
    "    .rename(\"ratio\")\n",
    "    .mul(100)\n",
    "    .reset_index()\n",
    ")\n",
    "\n",
    "# 8. Stats par ISIN\n",
    "isin_stats = (\n",
    "    post_rupt.groupby(\"Product - Isin\")[\"gap_type2\"]\n",
    "    .value_counts(normalize=True)\n",
    "    .rename(\"ratio\")\n",
    "    .mul(100)\n",
    "    .reset_index()\n",
    ")\n",
    "\n",
    "print(\"\\n=== TOP ISIN PAR RESET ===\")\n",
    "print(isin_stats[isin_stats[\"gap_type2\"]==\"reset\"].sort_values(\"ratio\", ascending=False).head(10))\n",
    "\n",
    "print(\"\\n=== TOP CLIENTS PAR RESET ===\")\n",
    "print(client_stats[client_stats[\"gap_type2\"]==\"reset\"].sort_values(\"ratio\", ascending=False).head(10))\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "c2efc5e0-bc35-4fa7-ab5d-6be616964446",
   "metadata": {},
   "outputs": [],
   "source": [
    "import plotly.graph_objects as go\n",
    "\n",
    "# --- Data from your output ---\n",
    "labels = [\"Micro-ruptures\", \"Décalage\", \"Anomalies ponctuelles\", \"Remise à zéro\"]\n",
    "values = [50.4, 44.6, 4.0, 1.0]\n",
    "\n",
    "# --- Pie chart ---\n",
    "fig = go.Figure(\n",
    "    data=[go.Pie(\n",
    "        labels=labels,\n",
    "        values=values,\n",
    "        hole=0.35,                  # donut style (plus lisible)\n",
    "        textinfo='percent',\n",
    "        marker=dict(colors=[\"#3498DB\", \"#E67E22\", \"#9B59B6\", \"#E74C3C\"])\n",
    "    )]\n",
    ")\n",
    "\n",
    "fig.update_layout(\n",
    "    title=\"Typologie des ruptures depuis Septembre 2021\",\n",
    "    legend_title=\"Type de gap\",\n",
    ")\n",
    "\n",
    "fig.show()\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "744e04b6-3f34-40c9-95fe-a5605e7c7f02",
   "metadata": {},
   "outputs": [],
   "source": [
    "merged[\"gap_abs\"] = merged[\"gap\"].abs()\n",
    "\n",
    "merged[\"gap_rel\"] = (\n",
    "    merged[\"gap_abs\"] /\n",
    "    merged[\"Quantity - AUM\"].replace(0, np.nan)\n",
    ")\n",
    "\n",
    "merged.loc[merged[\"rupture_flag\"], \"gap_rel\"].describe(\n",
    "    percentiles=[0.5, 0.75, 0.9, 0.95, 0.99]\n",
    ")\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "3d20625e-1045-4b7a-ab64-3381997e4131",
   "metadata": {},
   "outputs": [],
   "source": [
    "# uniquement sur les ruptures\n",
    "df_r = merged[merged[\"rupture_flag\"]].copy()\n",
    "\n",
    "# seuils globaux (descriptifs, pas \"optimisés\")\n",
    "q90 = df_r[\"gap_abs\"].quantile(0.90)\n",
    "q99 = df_r[\"gap_abs\"].quantile(0.99)\n",
    "\n",
    "# moyenne directionnelle par compte\n",
    "avg_gap_by_account = (\n",
    "    df_r.groupby(\"Registrar Account - ID\")[\"gap\"]\n",
    "    .mean()\n",
    "    .rename(\"avg_gap\")\n",
    ")\n",
    "\n",
    "df_r = df_r.merge(avg_gap_by_account, on=\"Registrar Account - ID\", how=\"left\")\n",
    "\n",
    "def classify_gap(row):\n",
    "    if row[\"gap_abs\"] >= q99:\n",
    "        return \"reset\"\n",
    "    if row[\"gap_abs\"] >= q90:\n",
    "        return \"spike\"\n",
    "    if abs(row[\"avg_gap\"]) > row[\"gap_abs\"]:\n",
    "        return \"shift\"\n",
    "    return \"micro\"\n",
    "\n",
    "df_r[\"discontinuity_type\"] = df_r.apply(classify_gap, axis=1)\n",
    "df_r[\"discontinuity_type\"].value_counts(normalize=True) * 100\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "02806629-e454-4e10-82be-6e2239091088",
   "metadata": {},
   "outputs": [],
   "source": [
    "merged[\"year\"] = merged[\"Centralisation Date\"].dt.year\n",
    "\n",
    "yearly_stats = merged.groupby(\"year\").agg(\n",
    "    total_obs=(\"gap\", \"count\"),\n",
    "    ruptures=(\"rupture_flag\", \"sum\")\n",
    ").reset_index()\n",
    "\n",
    "yearly_stats[\"rupture_rate\"] = (\n",
    "    yearly_stats[\"ruptures\"] / yearly_stats[\"total_obs\"]\n",
    ")\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "2edf2c55-45e7-4aad-b4f9-5c35178abad6",
   "metadata": {},
   "outputs": [],
   "source": [
    "import matplotlib.pyplot as plt\n",
    "import numpy as np\n",
    "\n",
    "df_r = merged[merged[\"rupture_flag\"]].copy()\n",
    "\n",
    "plt.figure(figsize=(12,4))\n",
    "plt.hist(df_r[\"gap_abs\"], bins=100, log=True)\n",
    "plt.title(\"Distribution of absolute gaps (log scale)\")\n",
    "plt.xlabel(\"Absolute gap\")\n",
    "plt.ylabel(\"Frequency (log)\")\n",
    "plt.show()\n",
    "\n",
    "plt.figure(figsize=(12,4))\n",
    "plt.hist(df_r[\"gap_rel\"].dropna(), bins=100, log=True)\n",
    "plt.title(\"Distribution of relative gaps (|gap| / AUM)\")\n",
    "plt.xlabel(\"Relative gap\")\n",
    "plt.ylabel(\"Frequency (log)\")\n",
    "plt.show()\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "981f2ec6-574b-41ea-b4bf-45be54aeda1f",
   "metadata": {},
   "outputs": [],
   "source": [
    "plt.figure(figsize=(10,4))\n",
    "plt.plot(yearly_stats[\"year\"], yearly_stats[\"rupture_rate\"], marker=\"o\")\n",
    "plt.title(\"Evolution of AUM–Flow inconsistency rate over time\")\n",
    "plt.xlabel(\"Year\")\n",
    "plt.ylabel(\"Rupture rate\")\n",
    "plt.grid(True)\n",
    "plt.show()\n"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.13.11"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}