Project_Carmignac/brouillon/analyse_rupture.ipynb

{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 51,
   "id": "338730e2-a6de-4d4f-b438-efe3feb139ab",
   "metadata": {},
   "outputs": [],
   "source": [
    "import pandas as pd\n",
    "import numpy as np\n",
    "import plotly.graph_objects as go\n",
    "import matplotlib.pyplot as plt"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 52,
   "id": "cfd11919-0941-400e-a516-72871881f733",
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/tmp/ipykernel_1311/1940519970.py:1: DtypeWarning:\n",
      "\n",
      "Columns (1,2,3,4) have mixed types. Specify dtype option on import or set low_memory=False.\n",
      "\n",
      "/tmp/ipykernel_1311/1940519970.py:2: DtypeWarning:\n",
      "\n",
      "Columns (1,2,3,4) have mixed types. Specify dtype option on import or set low_memory=False.\n",
      "\n"
     ]
    }
   ],
   "source": [
    "stocks=pd.read_csv('stocks.csv')\n",
    "flows = pd.read_csv('flows.csv')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 53,
   "id": "b99e3402-fe26-4f4e-8c1c-5f07847bce94",
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/tmp/ipykernel_1311/3613746644.py:1: DtypeWarning:\n",
      "\n",
      "Columns (1) have mixed types. Specify dtype option on import or set low_memory=False.\n",
      "\n"
     ]
    }
   ],
   "source": [
    "merged = pd.read_csv('merged.csv')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 54,
   "id": "34e5a815-7269-4312-bfe6-e2cd12595e57",
   "metadata": {},
   "outputs": [],
   "source": [
    "# 1. Prepare stock dataset ISIN-by-ISIN\n",
    "stocks_isin = stocks[[\n",
    "    \"Registrar Account - ID\",\n",
    "    \"Product - Isin\",\n",
    "    \"Centralisation Date\",\n",
    "    \"Quantity - AUM\"\n",
    "]].copy()\n",
    "\n",
    "stocks_isin[\"Centralisation Date\"] = pd.to_datetime(stocks_isin[\"Centralisation Date\"])\n",
    "\n",
    "stocks_isin = stocks_isin.sort_values(\n",
    "    [\"Registrar Account - ID\", \"Product - Isin\", \"Centralisation Date\"]\n",
    ")\n",
    "\n",
    "# 2. Prepare flows dataset ISIN-by-ISIN\n",
    "flows_isin = flows[[\n",
    "    \"Registrar Account - ID\",\n",
    "    \"Product - Isin\",\n",
    "    \"Centralisation Date\",\n",
    "    \"Quantity - NetFlows\"\n",
    "]].copy()\n",
    "\n",
    "flows_isin[\"Centralisation Date\"] = pd.to_datetime(flows_isin[\"Centralisation Date\"])\n",
    "\n",
    "flows_isin = (\n",
    "    flows_isin\n",
    "    .groupby(\n",
    "        [\"Registrar Account - ID\", \"Product - Isin\", \"Centralisation Date\"]\n",
    "    )[\"Quantity - NetFlows\"]\n",
    "    .sum()\n",
    "    .reset_index()\n",
    ")\n",
    "\n",
    "# 3. Merge stocks & flows ISIN-by-ISIN\n",
    "merged_isin = stocks_isin.merge(\n",
    "    flows_isin,\n",
    "    on=[\"Registrar Account - ID\", \"Product - Isin\", \"Centralisation Date\"],\n",
    "    how=\"left\"\n",
    ")\n",
    "\n",
    "merged_isin[\"Quantity - NetFlows\"] = merged_isin[\"Quantity - NetFlows\"].fillna(0)\n",
    "\n",
    "# 4. Compute expected stock per ISIN for each account\n",
    "merged_isin[\"prev_stock\"] = (\n",
    "    merged_isin\n",
    "    .groupby([\"Registrar Account - ID\", \"Product - Isin\"])[\"Quantity - AUM\"]\n",
    "    .shift(1)\n",
    ")\n",
    "\n",
    "merged_isin[\"prev_netflows\"] = (\n",
    "    merged_isin\n",
    "    .groupby([\"Registrar Account - ID\", \"Product - Isin\"])[\"Quantity - NetFlows\"]\n",
    "    .shift(1)\n",
    "    .fillna(0)\n",
    ")\n",
    "\n",
    "merged_isin[\"expected_stock\"] = (\n",
    "    merged_isin[\"prev_stock\"] + merged_isin[\"prev_netflows\"]\n",
    ")\n",
    "\n",
    "# 5. Detect ruptures ISIN-by-ISIN (no aggregation)\n",
    "TOL = 1e-6\n",
    "\n",
    "merged_isin[\"gap\"] = (\n",
    "    merged_isin[\"Quantity - AUM\"] - merged_isin[\"expected_stock\"]\n",
    ")\n",
    "\n",
    "merged_isin[\"rupture_flag\"] = (\n",
    "    merged_isin[\"prev_stock\"].notna()\n",
    "    & (merged_isin[\"gap\"].abs() > TOL)\n",
    ")\n",
    "\n",
    "# 6. Summarize ruptures per (Account, ISIN)\n",
    "rupture_isin_summary = (\n",
    "    merged_isin\n",
    "    .groupby([\"Registrar Account - ID\", \"Product - Isin\"])\n",
    "    .agg(\n",
    "        n_ruptures=(\"rupture_flag\", \"sum\"),\n",
    "        obs=(\"rupture_flag\", \"count\"),\n",
    "        rupture_ratio=(\"rupture_flag\", \"mean\"),\n",
    "        max_gap=(\"gap\", lambda x: x.abs().max())\n",
    "    )\n",
    "    .reset_index()\n",
    ")\n",
    "\n",
    "# Sort by worst ISIN trajectories\n",
    "rupture_isin_summary = rupture_isin_summary.sort_values(\n",
    "    \"rupture_ratio\",\n",
    "    ascending=False\n",
    ")"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "16213cb2-07d8-4e82-b9bb-252554ec47b9",
   "metadata": {},
   "source": [
    "# Détection des ruptures"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 55,
   "id": "78c3db70-e0b6-4de2-92ca-e29cf5bf6bd1",
   "metadata": {},
   "outputs": [],
   "source": [
    "# ============================================================\n",
    "# AUM–FLOW CONSISTENCY & RUPTURE DETECTION (FINAL VERSION)\n",
    "# ============================================================\n",
    "# ------------------------------------------------------------\n",
    "# 1. Keep relevant columns\n",
    "# ------------------------------------------------------------\n",
    "stocks_clean = stocks[[\n",
    "    \"Registrar Account - ID\",\n",
    "    \"Product - Isin\",\n",
    "    \"Centralisation Date\",\n",
    "    \"Quantity - AUM\"\n",
    "]].copy()\n",
    "\n",
    "flows_clean = flows[[\n",
    "    \"Registrar Account - ID\",\n",
    "    \"Product - Isin\",\n",
    "    \"Centralisation Date\",\n",
    "    \"Quantity - NetFlows\"\n",
    "]].copy()\n",
    "\n",
    "\n",
    "# ------------------------------------------------------------\n",
    "# 2. Date formatting\n",
    "# ------------------------------------------------------------\n",
    "stocks_clean[\"Centralisation Date\"] = pd.to_datetime(stocks_clean[\"Centralisation Date\"])\n",
    "flows_clean[\"Centralisation Date\"]  = pd.to_datetime(flows_clean[\"Centralisation Date\"])\n",
    "\n",
    "\n",
    "# ------------------------------------------------------------\n",
    "# 3. Aggregate flows per day\n",
    "# ------------------------------------------------------------\n",
    "flows_clean = (\n",
    "    flows_clean\n",
    "    .groupby(\n",
    "        [\"Registrar Account - ID\", \"Product - Isin\", \"Centralisation Date\"],\n",
    "        as_index=False\n",
    "    )[\"Quantity - NetFlows\"]\n",
    "    .sum()\n",
    ")\n",
    "\n",
    "# ------------------------------------------------------------\n",
    "# 4. Merge stocks and flows\n",
    "# ------------------------------------------------------------\n",
    "df = stocks_clean.merge(\n",
    "    flows_clean,\n",
    "    on=[\"Registrar Account - ID\", \"Product - Isin\", \"Centralisation Date\"],\n",
    "    how=\"left\"\n",
    ")\n",
    "\n",
    "df[\"Quantity - NetFlows\"] = df[\"Quantity - NetFlows\"].fillna(0)\n",
    "\n",
    "\n",
    "# ------------------------------------------------------------\n",
    "# 5. Sort and compute expected stock\n",
    "# ------------------------------------------------------------\n",
    "df = df.sort_values(\n",
    "    [\"Registrar Account - ID\", \"Product - Isin\", \"Centralisation Date\"]\n",
    ")\n",
    "\n",
    "df[\"prev_stock\"] = df.groupby(\n",
    "    [\"Registrar Account - ID\", \"Product - Isin\"]\n",
    ")[\"Quantity - AUM\"].shift(1)\n",
    "\n",
    "df[\"prev_flows\"] = df.groupby(\n",
    "    [\"Registrar Account - ID\", \"Product - Isin\"]\n",
    ")[\"Quantity - NetFlows\"].shift(1).fillna(0)\n",
    "\n",
    "df[\"expected_stock\"] = df[\"prev_stock\"] + df[\"prev_flows\"]\n",
    "\n",
    "\n",
    "# ------------------------------------------------------------\n",
    "# 6. Compute gaps\n",
    "# ------------------------------------------------------------\n",
    "df[\"gap\"] = df[\"Quantity - AUM\"] - df[\"expected_stock\"]\n",
    "df[\"gap_abs\"] = df[\"gap\"].abs()\n",
    "df[\"gap_rel\"] = df[\"gap_abs\"] / df[\"expected_stock\"].abs().clip(lower=1)\n",
    "\n",
    "\n",
    "# ------------------------------------------------------------\n",
    "# 7. Detect ruptures (economic rule)\n",
    "# ------------------------------------------------------------\n",
    "TAU_ABS = 10.0     # minimum absolute gap (shares)\n",
    "TAU_REL = 0.005    # minimum relative gap (0.5%)\n",
    "\n",
    "df[\"rupture_flag\"] = (\n",
    "    df[\"prev_stock\"].notna()\n",
    "    & (df[\"gap_abs\"] > TAU_ABS)\n",
    "    & (df[\"gap_rel\"] > TAU_REL)\n",
    ")\n",
    "\n",
    "\n",
    "# ------------------------------------------------------------\n",
    "# 8. Remove end-of-sample false positives (edge effects)\n",
    "# ------------------------------------------------------------\n",
    "last_date = df[\"Centralisation Date\"].max()\n",
    "\n",
    "df[\"rupture_flag\"] = np.where(\n",
    "    (df[\"rupture_flag\"]) & (df[\"Centralisation Date\"] == last_date),\n",
    "    False,\n",
    "    df[\"rupture_flag\"]\n",
    ")\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 56,
   "id": "a9783dc1-e225-4142-8b6f-6f9e620b4b3d",
   "metadata": {},
   "outputs": [],
   "source": [
    "# ------------------------------------------------------------\n",
    "# 9. ISIN-level summary (AFTER CLEANING)\n",
    "# ------------------------------------------------------------\n",
    "rupture_isin_summary = (\n",
    "    df.groupby([\"Registrar Account - ID\", \"Product - Isin\"])\n",
    "    .agg(\n",
    "        n_ruptures=(\"rupture_flag\", \"sum\"),\n",
    "        total_obs=(\"rupture_flag\", \"count\"),\n",
    "        rupture_ratio=(\"rupture_flag\", \"mean\"),\n",
    "        max_gap=(\"gap_abs\", \"max\")\n",
    "    )\n",
    "    .reset_index()\n",
    ")\n",
    "\n",
    "\n",
    "# ------------------------------------------------------------\n",
    "# 10. Account-level summary (AFTER CLEANING)\n",
    "# ------------------------------------------------------------\n",
    "rupture_summary = (\n",
    "    df.groupby(\"Registrar Account - ID\")\n",
    "    .agg(\n",
    "        n_ruptures=(\"rupture_flag\", \"sum\"),\n",
    "        total_obs=(\"rupture_flag\", \"count\"),\n",
    "        rupture_ratio=(\"rupture_flag\", \"mean\"),\n",
    "        max_gap=(\"gap_abs\", \"max\")\n",
    "    )\n",
    "    .reset_index()\n",
    ")\n",
    "\n",
    "\n",
    "# ------------------------------------------------------------\n",
    "# 11. Outputs\n",
    "# ------------------------------------------------------------\n",
    "df.to_csv(\"aum_flow_gaps.csv\", index=False)\n",
    "rupture_isin_summary.to_csv(\"rupture_isin_summary.csv\", index=False)\n",
    "rupture_summary.to_csv(\"rupture_summary.csv\", index=False)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 57,
   "id": "f5b62558-c27a-4428-a193-8b97e0ce6b6a",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "application/vnd.plotly.v1+json": {
       "config": {
        "plotlyServerURL": "https://plot.ly"
       },
       "data": [
        {
         "hole": 0.45,
         "hoverinfo": "label+percent",
         "labels": [
          "Clean / quasi-clean (≤1%)",
          "Moderate (1–10%)",
          "High (10–30%)",
          "Severe (>30%)"
         ],
         "textinfo": "percent",
         "type": "pie",
         "values": {
          "bdata": "AAAAAACASEAAAAAAAIBBQAAAAAAAAChAZmZmZmZmEEA=",
          "dtype": "f8"
         }
        }
       ],
       "layout": {
        "legend": {
         "orientation": "h",
         "title": {
          "text": "Rupture ratio"
         },
         "x": 0.5,
         "xanchor": "center",
         "y": -0.15,
         "yanchor": "top"
        },
        "template": {
         "data": {
          "bar": [
           {
            "error_x": {
             "color": "#2a3f5f"
            },
            "error_y": {
             "color": "#2a3f5f"
            },
            "marker": {
             "line": {
              "color": "#E5ECF6",
              "width": 0.5
             },
             "pattern": {
              "fillmode": "overlay",
              "size": 10,
              "solidity": 0.2
             }
            },
            "type": "bar"
           }
          ],
          "barpolar": [
           {
            "marker": {
             "line": {
              "color": "#E5ECF6",
              "width": 0.5
             },
             "pattern": {
              "fillmode": "overlay",
              "size": 10,
              "solidity": 0.2
             }
            },
            "type": "barpolar"
           }
          ],
          "carpet": [
           {
            "aaxis": {
             "endlinecolor": "#2a3f5f",
             "gridcolor": "white",
             "linecolor": "white",
             "minorgridcolor": "white",
             "startlinecolor": "#2a3f5f"
            },
            "baxis": {
             "endlinecolor": "#2a3f5f",
             "gridcolor": "white",
             "linecolor": "white",
             "minorgridcolor": "white",
             "startlinecolor": "#2a3f5f"
            },
            "type": "carpet"
           }
          ],
          "choropleth": [
           {
            "colorbar": {
             "outlinewidth": 0,
             "ticks": ""
            },
            "type": "choropleth"
           }
          ],
          "contour": [
           {
            "colorbar": {
             "outlinewidth": 0,
             "ticks": ""
            },
            "colorscale": [
             [
              0,
              "#0d0887"
             ],
             [
              0.1111111111111111,
              "#46039f"
             ],
             [
              0.2222222222222222,
              "#7201a8"
             ],
             [
              0.3333333333333333,
              "#9c179e"
             ],
             [
              0.4444444444444444,
              "#bd3786"
             ],
             [
              0.5555555555555556,
              "#d8576b"
             ],
             [
              0.6666666666666666,
              "#ed7953"
             ],
             [
              0.7777777777777778,
              "#fb9f3a"
             ],
             [
              0.8888888888888888,
              "#fdca26"
             ],
             [
              1,
              "#f0f921"
             ]
            ],
            "type": "contour"
           }
          ],
          "contourcarpet": [
           {
            "colorbar": {
             "outlinewidth": 0,
             "ticks": ""
            },
            "type": "contourcarpet"
           }
          ],
          "heatmap": [
           {
            "colorbar": {
             "outlinewidth": 0,
             "ticks": ""
            },
            "colorscale": [
             [
              0,
              "#0d0887"
             ],
             [
              0.1111111111111111,
              "#46039f"
             ],
             [
              0.2222222222222222,
              "#7201a8"
             ],
             [
              0.3333333333333333,
              "#9c179e"
             ],
             [
              0.4444444444444444,
              "#bd3786"
             ],
             [
              0.5555555555555556,
              "#d8576b"
             ],
             [
              0.6666666666666666,
              "#ed7953"
             ],
             [
              0.7777777777777778,
              "#fb9f3a"
             ],
             [
              0.8888888888888888,
              "#fdca26"
             ],
             [
              1,
              "#f0f921"
             ]
            ],
            "type": "heatmap"
           }
          ],
          "histogram": [
           {
            "marker": {
             "pattern": {
              "fillmode": "overlay",
              "size": 10,
              "solidity": 0.2
             }
            },
            "type": "histogram"
           }
          ],
          "histogram2d": [
           {
            "colorbar": {
             "outlinewidth": 0,
             "ticks": ""
            },
            "colorscale": [
             [
              0,
              "#0d0887"
             ],
             [
              0.1111111111111111,
              "#46039f"
             ],
             [
              0.2222222222222222,
              "#7201a8"
             ],
             [
              0.3333333333333333,
              "#9c179e"
             ],
             [
              0.4444444444444444,
              "#bd3786"
             ],
             [
              0.5555555555555556,
              "#d8576b"
             ],
             [
              0.6666666666666666,
              "#ed7953"
             ],
             [
              0.7777777777777778,
              "#fb9f3a"
             ],
             [
              0.8888888888888888,
              "#fdca26"
             ],
             [
              1,
              "#f0f921"
             ]
            ],
            "type": "histogram2d"
           }
          ],
          "histogram2dcontour": [
           {
            "colorbar": {
             "outlinewidth": 0,
             "ticks": ""
            },
            "colorscale": [
             [
              0,
              "#0d0887"
             ],
             [
              0.1111111111111111,
              "#46039f"
             ],
             [
              0.2222222222222222,
              "#7201a8"
             ],
             [
              0.3333333333333333,
              "#9c179e"
             ],
             [
              0.4444444444444444,
              "#bd3786"
             ],
             [
              0.5555555555555556,
              "#d8576b"
             ],
             [
              0.6666666666666666,
              "#ed7953"
             ],
             [
              0.7777777777777778,
              "#fb9f3a"
             ],
             [
              0.8888888888888888,
              "#fdca26"
             ],
             [
              1,
              "#f0f921"
             ]
            ],
            "type": "histogram2dcontour"
           }
          ],
          "mesh3d": [
           {
            "colorbar": {
             "outlinewidth": 0,
             "ticks": ""
            },
            "type": "mesh3d"
           }
          ],
          "parcoords": [
           {
            "line": {
             "colorbar": {
              "outlinewidth": 0,
              "ticks": ""
             }
            },
            "type": "parcoords"
           }
          ],
          "pie": [
           {
            "automargin": true,
            "type": "pie"
           }
          ],
          "scatter": [
           {
            "fillpattern": {
             "fillmode": "overlay",
             "size": 10,
             "solidity": 0.2
            },
            "type": "scatter"
           }
          ],
          "scatter3d": [
           {
            "line": {
             "colorbar": {
              "outlinewidth": 0,
              "ticks": ""
             }
            },
            "marker": {
             "colorbar": {
              "outlinewidth": 0,
              "ticks": ""
             }
            },
            "type": "scatter3d"
           }
          ],
          "scattercarpet": [
           {
            "marker": {
             "colorbar": {
              "outlinewidth": 0,
              "ticks": ""
             }
            },
            "type": "scattercarpet"
           }
          ],
          "scattergeo": [
           {
            "marker": {
             "colorbar": {
              "outlinewidth": 0,
              "ticks": ""
             }
            },
            "type": "scattergeo"
           }
          ],
          "scattergl": [
           {
            "marker": {
             "colorbar": {
              "outlinewidth": 0,
              "ticks": ""
             }
            },
            "type": "scattergl"
           }
          ],
          "scattermap": [
           {
            "marker": {
             "colorbar": {
              "outlinewidth": 0,
              "ticks": ""
             }
            },
            "type": "scattermap"
           }
          ],
          "scattermapbox": [
           {
            "marker": {
             "colorbar": {
              "outlinewidth": 0,
              "ticks": ""
             }
            },
            "type": "scattermapbox"
           }
          ],
          "scatterpolar": [
           {
            "marker": {
             "colorbar": {
              "outlinewidth": 0,
              "ticks": ""
             }
            },
            "type": "scatterpolar"
           }
          ],
          "scatterpolargl": [
           {
            "marker": {
             "colorbar": {
              "outlinewidth": 0,
              "ticks": ""
             }
            },
            "type": "scatterpolargl"
           }
          ],
          "scatterternary": [
           {
            "marker": {
             "colorbar": {
              "outlinewidth": 0,
              "ticks": ""
             }
            },
            "type": "scatterternary"
           }
          ],
          "surface": [
           {
            "colorbar": {
             "outlinewidth": 0,
             "ticks": ""
            },
            "colorscale": [
             [
              0,
              "#0d0887"
             ],
             [
              0.1111111111111111,
              "#46039f"
             ],
             [
              0.2222222222222222,
              "#7201a8"
             ],
             [
              0.3333333333333333,
              "#9c179e"
             ],
             [
              0.4444444444444444,
              "#bd3786"
             ],
             [
              0.5555555555555556,
              "#d8576b"
             ],
             [
              0.6666666666666666,
              "#ed7953"
             ],
             [
              0.7777777777777778,
              "#fb9f3a"
             ],
             [
              0.8888888888888888,
              "#fdca26"
             ],
             [
              1,
              "#f0f921"
             ]
            ],
            "type": "surface"
           }
          ],
          "table": [
           {
            "cells": {
             "fill": {
              "color": "#EBF0F8"
             },
             "line": {
              "color": "white"
             }
            },
            "header": {
             "fill": {
              "color": "#C8D4E3"
             },
             "line": {
              "color": "white"
             }
            },
            "type": "table"
           }
          ]
         },
         "layout": {
          "annotationdefaults": {
           "arrowcolor": "#2a3f5f",
           "arrowhead": 0,
           "arrowwidth": 1
          },
          "autotypenumbers": "strict",
          "coloraxis": {
           "colorbar": {
            "outlinewidth": 0,
            "ticks": ""
           }
          },
          "colorscale": {
           "diverging": [
            [
             0,
             "#8e0152"
            ],
            [
             0.1,
             "#c51b7d"
            ],
            [
             0.2,
             "#de77ae"
            ],
            [
             0.3,
             "#f1b6da"
            ],
            [
             0.4,
             "#fde0ef"
            ],
            [
             0.5,
             "#f7f7f7"
            ],
            [
             0.6,
             "#e6f5d0"
            ],
            [
             0.7,
             "#b8e186"
            ],
            [
             0.8,
             "#7fbc41"
            ],
            [
             0.9,
             "#4d9221"
            ],
            [
             1,
             "#276419"
            ]
           ],
           "sequential": [
            [
             0,
             "#0d0887"
            ],
            [
             0.1111111111111111,
             "#46039f"
            ],
            [
             0.2222222222222222,
             "#7201a8"
            ],
            [
             0.3333333333333333,
             "#9c179e"
            ],
            [
             0.4444444444444444,
             "#bd3786"
            ],
            [
             0.5555555555555556,
             "#d8576b"
            ],
            [
             0.6666666666666666,
             "#ed7953"
            ],
            [
             0.7777777777777778,
             "#fb9f3a"
            ],
            [
             0.8888888888888888,
             "#fdca26"
            ],
            [
             1,
             "#f0f921"
            ]
           ],
           "sequentialminus": [
            [
             0,
             "#0d0887"
            ],
            [
             0.1111111111111111,
             "#46039f"
            ],
            [
             0.2222222222222222,
             "#7201a8"
            ],
            [
             0.3333333333333333,
             "#9c179e"
            ],
            [
             0.4444444444444444,
             "#bd3786"
            ],
            [
             0.5555555555555556,
             "#d8576b"
            ],
            [
             0.6666666666666666,
             "#ed7953"
            ],
            [
             0.7777777777777778,
             "#fb9f3a"
            ],
            [
             0.8888888888888888,
             "#fdca26"
            ],
            [
             1,
             "#f0f921"
            ]
           ]
          },
          "colorway": [
           "#636efa",
           "#EF553B",
           "#00cc96",
           "#ab63fa",
           "#FFA15A",
           "#19d3f3",
           "#FF6692",
           "#B6E880",
           "#FF97FF",
           "#FECB52"
          ],
          "font": {
           "color": "#2a3f5f"
          },
          "geo": {
           "bgcolor": "white",
           "lakecolor": "white",
           "landcolor": "#E5ECF6",
           "showlakes": true,
           "showland": true,
           "subunitcolor": "white"
          },
          "hoverlabel": {
           "align": "left"
          },
          "hovermode": "closest",
          "mapbox": {
           "style": "light"
          },
          "paper_bgcolor": "white",
          "plot_bgcolor": "#E5ECF6",
          "polar": {
           "angularaxis": {
            "gridcolor": "white",
            "linecolor": "white",
            "ticks": ""
           },
           "bgcolor": "#E5ECF6",
           "radialaxis": {
            "gridcolor": "white",
            "linecolor": "white",
            "ticks": ""
           }
          },
          "scene": {
           "xaxis": {
            "backgroundcolor": "#E5ECF6",
            "gridcolor": "white",
            "gridwidth": 2,
            "linecolor": "white",
            "showbackground": true,
            "ticks": "",
            "zerolinecolor": "white"
           },
           "yaxis": {
            "backgroundcolor": "#E5ECF6",
            "gridcolor": "white",
            "gridwidth": 2,
            "linecolor": "white",
            "showbackground": true,
            "ticks": "",
            "zerolinecolor": "white"
           },
           "zaxis": {
            "backgroundcolor": "#E5ECF6",
            "gridcolor": "white",
            "gridwidth": 2,
            "linecolor": "white",
            "showbackground": true,
            "ticks": "",
            "zerolinecolor": "white"
           }
          },
          "shapedefaults": {
           "line": {
            "color": "#2a3f5f"
           }
          },
          "ternary": {
           "aaxis": {
            "gridcolor": "white",
            "linecolor": "white",
            "ticks": ""
           },
           "baxis": {
            "gridcolor": "white",
            "linecolor": "white",
            "ticks": ""
           },
           "bgcolor": "#E5ECF6",
           "caxis": {
            "gridcolor": "white",
            "linecolor": "white",
            "ticks": ""
           }
          },
          "title": {
           "x": 0.05
          },
          "xaxis": {
           "automargin": true,
           "gridcolor": "white",
           "linecolor": "white",
           "ticks": "",
           "title": {
            "standoff": 15
           },
           "zerolinecolor": "white",
           "zerolinewidth": 2
          },
          "yaxis": {
           "automargin": true,
           "gridcolor": "white",
           "linecolor": "white",
           "ticks": "",
           "title": {
            "standoff": 15
           },
           "zerolinecolor": "white",
           "zerolinewidth": 2
          }
         }
        }
       }
      },
      "image/png": "iVBORw0KGgoAAAANSUhEUgAAAxoAAAFoCAYAAADQNY2xAAAQAElEQVR4AezdB5xU1d3/8d+UBZalCCigYAF779gVe3lsUSGxRRProz722BOJGiyJJhpji0nMo8a/GhM1Ro2aaB5jNxgL9oZSBAFBlt1ld2fmP98Ld7077OzO7LRbPrw4O3fuPefcc97nzsz93TITz/APAQQQQAABBBBAAAEEECizQNz4hwACPhOgOQgggAACCCCAQPAFCDSCP4b0AAEEEECg0gLUjwACCCBQtACBRtFkFEAAAQQQQAABBBCotQDr978AgYb/x4gWIoAAAggggAACCCAQOAECjcANWakNpjwCCCCAAAIIIIAAApUXINCovDFrQAABBLoXYCkCCCCAAAIhFCDQCOGg0iUEEEAAAQQQKE2A0gggULoAgUbphtSAAAIIIIAAAggggAACOQJlDjRyaucpAggggAACCCCAAAIIRFKAQCOSw06nIyVAZxFAAAEEEEAAgRoIEGjUAJ1VIoAAAghEW4DeI4AAAlEQINCIwijTRwQQQAABBBBAAIHuBFhWAQECjQqgUiUCCCCAAAIIIIAAAlEXINCI+hZQav8pjwACCCCAAAIIIIBAFwIEGl2gMAsBBBAIsgBtRwABBBBAwA8CBBp+GAXagAACCCCAAAJhFqBvCERSgEAjksNOpxFAAAEEEEAAAQQQqKyAvwONyvad2hFAAAEEEEAAAQQQQKBCAgQaFYKlWgTCKkC/EEAAAQQQQACBQgQINApRIg8CCCCAAAL+FaBlCCCAgC8FCDR8OSw0CgEEEEAAAQQQQCC4ArRcAgQaUiAhgAACCCCAAAIIIIBAWQUINMrKSWWlClAeAQQQQAABBBBAIBwCBBrhGEd6gQACCFRKgHoRQAABBBDolQCBRq/YKIQAAggggAACCNRKgPUiEAwBAo1gjBOtRAABBBBAAAEEEEAgUAKRCjQCNTI0FgEEEEAAAQQQQACBAAsQaAR48Gg6AiEQoAsIIIAAAgggEFIBAo2QDizdQgABBBBAoHcClEIAAQTKI0CgUR5HakEAAQQQQAABBBBAoDICAa2VQCOgA0ezEUAAAQQQQAABBBDwswCBhp9Hh7aVKkB5BBBAAAEEEEAAgRoJEGjUCJ7VIoAAAtEUoNcIIIAAAlERINCIykjTTwQQQAABBBBAoCsB5iFQIQECjQrBUi0CCCCAAAIIIIAAAlEWINDo/ehTEgEEEEAAAQQQQAABBPIIEGjkgWE2AggEUYA2I4AAAggggIBfBAg0/DIStAMBBBBAAIEwCtAnBBCIrACBRmSHno4jgAACCCCAAAIIRFGgWn0m0KiWNOtBAAEEEEAAAQQQQCBCAgQaERpsulqqAOURQAABBBBAAAEEChUg0ChUinwIIIAAAv4ToEUIIIAAAr4VINDw7dDQMAQQQAABBBBAIHgCtBgBV4BAw5XgEQEEEEAAAQQQQAABBMomQKBRNspSK6I8AggggAACCCCAAALhESDQCM9Y0hMEECi3APUhgAACCCCAQK8FCDR6TUdBBBBAAAEEEKi2AOtDAIHgCBBoBGesaCkCCCCAAAIIIIAAAn4TyNseAo28NCxAAAEEEEAAAQQQQACB3goQaPRWjnIIlCpAeQQQQAABBBBAIMQCBBohHly6hgACCCBQnAC5EUAAAQTKJ0CgUT5LakIAAQQQQAABBBAorwC1BViAQCPAg0fTEUAAAQQQQAABBBDwqwCBhl9HptR2UR4BBBBAAAEEEEAAgRoKEGjUEJ9VI4BAtAToLQIIIIAAAlESINCI0mjTVwQQQAABBBDwCjCNAAIVFCDQqCAuVSOAAAIIIIAAAgggEFWB3gUaUdWi3wgggAACCCCAAAIIIFCQAIFGQUxkQsD/ArQQAQQQQAABBBDwkwCBhp9Gg7YggAACCIRJgL4ggAACkRYg0Ij08NN5BBBAAAEEEEAgSgL0tZoCBBrV1GZdCCCAAAIIIIAAAghERIBAIyIDXWo3KY8AAggggAACCCCAQDECBBrFaJEXAQQQ8I8ALUEAAQQQQMDXAgQavh4eGocAAggggAACwRGgpQgg4BUg0PBqMI0AAggggAACCCCAAAJlEfBFoFGWnlAJAggggAACCCCAAAII+EaAQMM3Q0FDEPCVAI1BAAEEEEAAAQRKEiDQKImPwggggAACCFRLgPUggAACwRIg0AjWeNFaBBBAAAEEEEAAAb8I0I5uBQg0uuVhIQIIIIAAAggggAACCPRGgECjN2qUKVWA8ggggAACCCCAAAIhFyDQCPkA0z0EEECgMAFyIYAAAgggUF4BAo3yelIbAggggAACCCBQHgFqQSDgAgQaAR9Amo8AAggggAACCCCAgB8Fwhho+NGZNiGAAAIIIIAAAgggECkBAo1IDTedRaBWAqwXAQQQQAABBKImQKARtRGnvwgggAACCEiAhAACCFRYgECjwsBUjwACCCCAAAIIIIBAIQJhy0OgEbYRpT8IIIAAAggggAACCPhAgEDDB4NAE0oVoDwCCCCAAAIIIICA3wQINPw2IrQHAQQQCIMAfUAAAQQQiLwAgUbkNwEAEEAAAQQQQCAKAvQRgWoLEGhUW5z1IYAAAggggAACCCAQAQECjR4HmQwIIIAAAggggAACCCBQrACBRrFi5EcAgdoL0AIEEEAAAQQQ8L0AgYbvh4gGIoAAAggg4H8BWogAAgjkChBo5IrwHAEEEEAAAQQQQACB4AvUvAcEGjUfAhqAAAIIIIAAAggggED4BAg0wjem9KhUAcojgAACCCCAAAIIlCxAoFEyIRUggAACCFRagPoRQAABBIInQKARvDGjxQgggAACCCCAQK0FWD8CPQoQaPRIRAYEEEAAAQQQQAABBBAoVoBAo1ixUvNTHgEEEEAAAQQQQACBCAgQaERgkOkiAgh0L8BSBBBAAAEEECi/AIFG+U2pEQEEEEAAAQRKE6A0AgiEQIBAIwSDSBcQQAABBBBAAAEEEKisQPG1E2gUb0YJBBBAAAEEEEAAAQQQ6EGAQKMHIBYjUKoA5RFAAAEEEEAAgSgKEGhEcdTpMwIIIJAVSM+Zae2vv2xtz//dWp940FoevNOa777Zmm77qS3+xY+s8Sdn26KLT7KvzzzCFp5wgC2YuKMdf0abnXVxm110RZtd/rN2u/ZX7XbTb9vtjj+k7N4HU/bXJ9L2witpe/+jjM2bn12JP//TKgQQQACBKggQaFQBmVUggAACNRdobrL2N16xlgfusMVXnWcLj9/fvj5tohNMLP7FpdZ0+8+s5Q+32pKH7rbWpx7KBh//cIKQ1AdTLT3zM8ss/KqjC4sazeZ8aTbt84y9837GpryesX+9lLYnn07bn/+ast/clbJrbmi383/c5gQm501qs6uvb7fb70zZQ4+l7M23M9ba2lEdEwgggICZgRBGAQKNMI4qfUIAgWgLpFKW+uhdW/K3P1nTjVc4ZyQWHLOXNV5xlrXce7u1TXneMl8vqJrR/GyM8sHHGXvx1bT95fG0XX9ru516XptdcW273f9Qyt6YmrGWJVVrDitCAAEEEKiSAIFGlaArtRrqRQABBCSgwGHJo/dZ46Wn2oLDd7FFFx5vzb+5zlr/73HnjITy+CllMmaffpaxv/0jbTfc1m6nZQOPy65ZGngoKPFTW2kLAggggEDvBAg0eudGKQQQQCCfQPXmtzRnA4m/WePkc2zhiQdZ8x03WPs7r1dv/WVe02czlgYeuszq/Elt9seHUzZ9ZjYiKfN6qA4BBBBAoDoCBBrVcWYtCCCAQHkEUinn0qfF10+yBcfvb003Xm7t/3nJLJ0qT/0+qWXeV2aP/z1tk65utx9ObrNH/pa2ufN80jiaEUABmowAArUQINCohTrrRAABBIoUSL33pjXffm32zMWBzs3cbc89ZdYajRsbZs02e/DRlF1wWZv95Np251utiuQjOwIIIIBADQS6DTRq0B5WiQACCCDgEWh75VlbdN6xtuiH/21LnvizZRYt9CyN3uQnn2Wcb7W64Mdt9o9n09baFj0DeowAAggERYBAIygjRTsRWCrA34gItE9
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "# Base\n",
    "rs = rupture_summary.copy()\n",
    "\n",
    "# Classes simplifiées\n",
    "bins = [0, 0.01, 0.10, 0.30, 1.01]\n",
    "labels = [\n",
    "    \"Clean / quasi-clean (≤1%)\",\n",
    "    \"Moderate (1–10%)\",\n",
    "    \"High (10–30%)\",\n",
    "    \"Severe (>30%)\"\n",
    "]\n",
    "\n",
    "rs[\"rupture_class\"] = pd.cut(\n",
    "    rs[\"rupture_ratio\"],\n",
    "    bins=bins,\n",
    "    labels=labels,\n",
    "    include_lowest=True\n",
    ")\n",
    "\n",
    "# Distribution en %\n",
    "dist = (\n",
    "    rs[\"rupture_class\"]\n",
    "    .value_counts(normalize=True)\n",
    "    .sort_index()\n",
    "    * 100\n",
    ").round(1)\n",
    "\n",
    "# Donut chart\n",
    "fig = go.Figure(\n",
    "    data=[go.Pie(\n",
    "        labels=dist.index,\n",
    "        values=dist.values,\n",
    "        hole=0.45,\n",
    "        textinfo=\"percent\",\n",
    "        hoverinfo=\"label+percent\"\n",
    "    )]\n",
    ")\n",
    "\n",
    "fig.update_layout(\n",
    "    legend=dict(\n",
    "        orientation=\"h\",      # horizontale\n",
    "        yanchor=\"top\",\n",
    "        y=-0.15,              # en dessous du graphe\n",
    "        xanchor=\"center\",\n",
    "        x=0.5\n",
    "    ),\n",
    "    legend_title_text=\"Rupture ratio\"\n",
    ")\n",
    "\n",
    "fig.show()\n"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "e52cd650-df05-490d-af59-e66c058f955d",
   "metadata": {},
   "source": [
    "## AUM–FLOW CONSISTENCY & DISCONTINUITY DETECTION"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 58,
   "id": "a7efe494-f5fa-43f8-8446-942fc2d3bd4c",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Detection threshold epsilon (trimmed 99th percentile): 40.03%\n"
     ]
    }
   ],
   "source": [
    "# ------------------------------------------------------------\n",
    "# 1. Keep relevant columns\n",
    "# ------------------------------------------------------------\n",
    "stocks_clean = stocks[\n",
    "    [\"Registrar Account - ID\", \"Product - Isin\", \"Centralisation Date\", \"Quantity - AUM\"]\n",
    "].copy()\n",
    "\n",
    "flows_clean = flows[\n",
    "    [\"Registrar Account - ID\", \"Product - Isin\", \"Centralisation Date\", \"Quantity - NetFlows\"]\n",
    "].copy()\n",
    "\n",
    "# ------------------------------------------------------------\n",
    "# 2. Date formatting\n",
    "# ------------------------------------------------------------\n",
    "stocks_clean[\"Centralisation Date\"] = pd.to_datetime(stocks_clean[\"Centralisation Date\"])\n",
    "flows_clean[\"Centralisation Date\"]  = pd.to_datetime(flows_clean[\"Centralisation Date\"])\n",
    "\n",
    "# ------------------------------------------------------------\n",
    "# 3. Aggregate flows per day\n",
    "# ------------------------------------------------------------\n",
    "flows_clean = (\n",
    "    flows_clean\n",
    "    .groupby(\n",
    "        [\"Registrar Account - ID\", \"Product - Isin\", \"Centralisation Date\"],\n",
    "        as_index=False\n",
    "    )[\"Quantity - NetFlows\"]\n",
    "    .sum()\n",
    ")\n",
    "\n",
    "# ------------------------------------------------------------\n",
    "# 4. Merge stocks and flows\n",
    "# ------------------------------------------------------------\n",
    "df = stocks_clean.merge(\n",
    "    flows_clean,\n",
    "    on=[\"Registrar Account - ID\", \"Product - Isin\", \"Centralisation Date\"],\n",
    "    how=\"left\"\n",
    ")\n",
    "\n",
    "df[\"Quantity - NetFlows\"] = df[\"Quantity - NetFlows\"].fillna(0)\n",
    "\n",
    "# ------------------------------------------------------------\n",
    "# 5. Sort and reconstruct expected stock\n",
    "# ------------------------------------------------------------\n",
    "df = df.sort_values(\n",
    "    [\"Registrar Account - ID\", \"Product - Isin\", \"Centralisation Date\"]\n",
    ")\n",
    "\n",
    "df[\"prev_stock\"] = (\n",
    "    df.groupby([\"Registrar Account - ID\", \"Product - Isin\"])\n",
    "      [\"Quantity - AUM\"]\n",
    "      .shift(1)\n",
    ")\n",
    "\n",
    "df[\"prev_flows\"] = (\n",
    "    df.groupby([\"Registrar Account - ID\", \"Product - Isin\"])\n",
    "      [\"Quantity - NetFlows\"]\n",
    "      .shift(1)\n",
    "      .fillna(0)\n",
    ")\n",
    "\n",
    "df[\"expected_stock\"] = df[\"prev_stock\"] + df[\"prev_flows\"]\n",
    "\n",
    "# ------------------------------------------------------------\n",
    "# 6. Compute accounting gaps\n",
    "# ------------------------------------------------------------\n",
    "df[\"gap\"] = df[\"Quantity - AUM\"] - df[\"expected_stock\"]\n",
    "df[\"gap_abs\"] = df[\"gap\"].abs()\n",
    "\n",
    "# Relative gap normalised by previous stock\n",
    "df[\"gap_rel\"] = (\n",
    "    df[\"gap_abs\"] /\n",
    "    df[\"prev_stock\"].abs().replace(0, np.nan)\n",
    ")\n",
    "\n",
    "# ------------------------------------------------------------\n",
    "# 7. Calibration sample (valid regime)\n",
    "# ------------------------------------------------------------\n",
    "valid_gaps = df.loc[\n",
    "    df[\"gap_rel\"].notna() & (df[\"prev_stock\"] > 0),\n",
    "    \"gap_rel\"\n",
    "]\n",
    "\n",
    "# ------------------------------------------------------------\n",
    "# 8. Robust, data-driven threshold (epsilon)\n",
    "# ------------------------------------------------------------\n",
    "# Step 1 — trim extreme breaks to avoid calibrating on resets\n",
    "gap_rel_trimmed = valid_gaps[\n",
    "    valid_gaps <= valid_gaps.quantile(0.90)\n",
    "]\n",
    "\n",
    "# Step 2 — define epsilon on the upper tail of the trimmed distribution\n",
    "EPSILON = gap_rel_trimmed.quantile(0.99)\n",
    "\n",
    "# ------------------------------------------------------------\n",
    "# 9. Detect discontinuities (diagnostic rule)\n",
    "# ------------------------------------------------------------\n",
    "df[\"rupture_flag\"] = (\n",
    "    df[\"prev_stock\"].notna()\n",
    "    & (df[\"prev_stock\"] > 0)\n",
    "    & (df[\"gap_rel\"] > EPSILON)\n",
    ")\n",
    "\n",
    "# ------------------------------------------------------------\n",
    "# 10. Remove end-of-sample edge effects\n",
    "# ------------------------------------------------------------\n",
    "last_date = df[\"Centralisation Date\"].max()\n",
    "\n",
    "df.loc[\n",
    "    (df[\"rupture_flag\"]) &\n",
    "    (df[\"Centralisation Date\"] == last_date),\n",
    "    \"rupture_flag\"\n",
    "] = False\n",
    "\n",
    "# ------------------------------------------------------------\n",
    "# 11. ISIN-level summary\n",
    "# ------------------------------------------------------------\n",
    "rupture_isin_summary = (\n",
    "    df.groupby([\"Registrar Account - ID\", \"Product - Isin\"])\n",
    "      .agg(\n",
    "          n_ruptures=(\"rupture_flag\", \"sum\"),\n",
    "          total_obs=(\"rupture_flag\", \"count\"),\n",
    "          rupture_ratio=(\"rupture_flag\", \"mean\"),\n",
    "          max_gap_abs=(\"gap_abs\", \"max\"),\n",
    "          max_gap_rel=(\"gap_rel\", \"max\")\n",
    "      )\n",
    "      .reset_index()\n",
    ")\n",
    "\n",
    "# ------------------------------------------------------------\n",
    "# 12. Account-level summary\n",
    "# ------------------------------------------------------------\n",
    "rupture_summary = (\n",
    "    df.groupby(\"Registrar Account - ID\")\n",
    "      .agg(\n",
    "          n_ruptures=(\"rupture_flag\", \"sum\"),\n",
    "          total_obs=(\"rupture_flag\", \"count\"),\n",
    "          rupture_ratio=(\"rupture_flag\", \"mean\"),\n",
    "          max_gap_abs=(\"gap_abs\", \"max\"),\n",
    "          max_gap_rel=(\"gap_rel\", \"max\")\n",
    "      )\n",
    "      .reset_index()\n",
    ")\n",
    "\n",
    "# ------------------------------------------------------------\n",
    "# 13. Outputs\n",
    "# ------------------------------------------------------------\n",
    "df.to_csv(\"aum_flow_gaps.csv\", index=False)\n",
    "rupture_isin_summary.to_csv(\"rupture_isin_summary.csv\", index=False)\n",
    "rupture_summary.to_csv(\"rupture_summary.csv\", index=False)\n",
    "\n",
    "print(f\"Detection threshold epsilon (trimmed 99th percentile): {EPSILON:.2%}\")\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 59,
   "id": "d7454212-1493-4715-a436-c331931f92fa",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>Registrar Account - ID</th>\n",
       "      <th>Product - Isin</th>\n",
       "      <th>n_ruptures</th>\n",
       "      <th>total_obs</th>\n",
       "      <th>rupture_ratio</th>\n",
       "      <th>max_gap_abs</th>\n",
       "      <th>max_gap_rel</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>59545</th>\n",
       "      <td>200127410</td>\n",
       "      <td>FR0010135103</td>\n",
       "      <td>384</td>\n",
       "      <td>436</td>\n",
       "      <td>0.880734</td>\n",
       "      <td>295985.42</td>\n",
       "      <td>3371.158214</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "      Registrar Account - ID Product - Isin  n_ruptures  total_obs  \\\n",
       "59545              200127410   FR0010135103         384        436   \n",
       "\n",
       "       rupture_ratio  max_gap_abs  max_gap_rel  \n",
       "59545       0.880734    295985.42  3371.158214  "
      ]
     },
     "execution_count": 59,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "rupture_isin_summary.sort_values(\"rupture_ratio\").head(1)\n",
    "rupture_isin_summary.sort_values(\"rupture_ratio\", ascending=False).head(1)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 60,
   "id": "b4040847-e0cf-4aa5-966c-d1fbf3935b7d",
   "metadata": {},
   "outputs": [],
   "source": [
    "def plot_isin_evolution(df, account_id, isin, title_suffix=\"\"):\n",
    "    sub = df[\n",
    "        (df[\"Registrar Account - ID\"] == account_id) &\n",
    "        (df[\"Product - Isin\"] == isin)\n",
    "    ].copy()\n",
    "\n",
    "    if sub.empty:\n",
    "        print(\"No data for this (account, ISIN).\")\n",
    "        return\n",
    "\n",
    "    plt.figure(figsize=(10,4))\n",
    "\n",
    "    # Stock observé\n",
    "    plt.plot(\n",
    "        sub[\"Centralisation Date\"],\n",
    "        sub[\"Quantity - AUM\"],\n",
    "        label=\"Observed stock\",\n",
    "        linewidth=2\n",
    "    )\n",
    "\n",
    "    # Stock attendu\n",
    "    plt.plot(\n",
    "        sub[\"Centralisation Date\"],\n",
    "        sub[\"expected_stock\"],\n",
    "        label=\"Expected stock\",\n",
    "        linestyle=\"--\"\n",
    "    )\n",
    "\n",
    "    # Ruptures\n",
    "    rupt = sub[sub[\"rupture_flag\"]]\n",
    "    plt.scatter(\n",
    "        rupt[\"Centralisation Date\"],\n",
    "        rupt[\"Quantity - AUM\"],\n",
    "        color=\"red\",\n",
    "        label=\"Rupture\",\n",
    "        zorder=5\n",
    "    )\n",
    "\n",
    "    plt.title(f\"ISIN {isin} — Account {account_id} {title_suffix}\")\n",
    "    plt.xlabel(\"Date\")\n",
    "    plt.ylabel(\"AUM (shares)\")\n",
    "    plt.legend()\n",
    "    plt.grid(True)\n",
    "    plt.tight_layout()\n",
    "    plt.show()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 61,
   "id": "e5d7a5ab-40bd-452d-a6ae-d56e220c592f",
   "metadata": {},
   "outputs": [
    {
     "ename": "NameError",
     "evalue": "name 'plot_isin_dynamics' is not defined",
     "output_type": "error",
     "traceback": [
      "\u001b[31m---------------------------------------------------------------------------\u001b[39m",
      "\u001b[31mNameError\u001b[39m                                 Traceback (most recent call last)",
      "\u001b[36mCell\u001b[39m\u001b[36m \u001b[39m\u001b[32mIn[61]\u001b[39m\u001b[32m, line 63\u001b[39m\n\u001b[32m     58\u001b[39m     plt.show()\n\u001b[32m     62\u001b[39m \u001b[38;5;28;01mfor\u001b[39;00m _, row \u001b[38;5;129;01min\u001b[39;00m sample_isin.iterrows():\n\u001b[32m---> \u001b[39m\u001b[32m63\u001b[39m     \u001b[43mplot_isin_dynamics\u001b[49m(\n\u001b[32m     64\u001b[39m         df,\n\u001b[32m     65\u001b[39m         row[\u001b[33m\"\u001b[39m\u001b[33mRegistrar Account - ID\u001b[39m\u001b[33m\"\u001b[39m],\n\u001b[32m     66\u001b[39m         row[\u001b[33m\"\u001b[39m\u001b[33mProduct - Isin\u001b[39m\u001b[33m\"\u001b[39m]\n\u001b[32m     67\u001b[39m     )\n",
      "\u001b[31mNameError\u001b[39m: name 'plot_isin_dynamics' is not defined"
     ]
    }
   ],
   "source": [
    "# Option B (alternative) : les plus sévères\n",
    "# sample_isin = problematic_isin.sort_values(\n",
    "#     \"rupture_ratio\", ascending=False\n",
    "# ).head(10)\n",
    "\n",
    "sample_isin = rupture_isin_summary.sort_values(\n",
    "    \"rupture_ratio\",\n",
    "    ascending=False\n",
    ").head(10)\n",
    "\n",
    "def plot_isin_dynamics_clean(df, account_id, isin):\n",
    "    sub = df[\n",
    "        (df[\"Registrar Account - ID\"] == account_id) &\n",
    "        (df[\"Product - Isin\"] == isin)\n",
    "    ].sort_values(\"Centralisation Date\")\n",
    "\n",
    "    if sub.empty:\n",
    "        return\n",
    "\n",
    "    fig, ax = plt.subplots(figsize=(7.5, 3))\n",
    "\n",
    "    # AUM observé\n",
    "    ax.plot(\n",
    "        sub[\"Centralisation Date\"],\n",
    "        sub[\"Quantity - AUM\"],\n",
    "        label=\"Observed AUM\",\n",
    "        linewidth=2,\n",
    "        color=\"black\"\n",
    "    )\n",
    "\n",
    "    # AUM attendu\n",
    "    ax.plot(\n",
    "        sub[\"Centralisation Date\"],\n",
    "        sub[\"expected_stock\"],\n",
    "        label=\"Flow-implied AUM\",\n",
    "        linestyle=\"--\",\n",
    "        linewidth=2,\n",
    "        color=\"grey\"\n",
    "    )\n",
    "\n",
    "    # Ruptures\n",
    "    rupt = sub[sub[\"rupture_flag\"]]\n",
    "    ax.scatter(\n",
    "        rupt[\"Centralisation Date\"],\n",
    "        rupt[\"Quantity - AUM\"],\n",
    "        color=\"red\",\n",
    "        s=25,\n",
    "        zorder=5,\n",
    "        label=\"Discontinuity\"\n",
    "    )\n",
    "\n",
    "    ax.set_title(f\"Account {account_id} — ISIN {isin}\", fontsize=11)\n",
    "    ax.set_xlabel(\"\")\n",
    "    ax.set_ylabel(\"AUM (shares)\")\n",
    "    ax.legend(loc=\"best\")\n",
    "\n",
    "    plt.tight_layout()\n",
    "    plt.show()\n",
    "\n",
    "\n",
    "\n",
    "for _, row in sample_isin.iterrows():\n",
    "    plot_isin_dynamics(\n",
    "        df,\n",
    "        row[\"Registrar Account - ID\"],\n",
    "        row[\"Product - Isin\"]\n",
    "    )"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 28,
   "id": "aef8ceb9-28a6-4908-ae24-a88d85b64309",
   "metadata": {},
   "outputs": [
    {
     "ename": "KeyError",
     "evalue": "\"Column(s) ['rupture_flag'] do not exist\"",
     "output_type": "error",
     "traceback": [
      "\u001b[31m---------------------------------------------------------------------------\u001b[39m",
      "\u001b[31mKeyError\u001b[39m                                  Traceback (most recent call last)",
      "\u001b[36mCell\u001b[39m\u001b[36m \u001b[39m\u001b[32mIn[28]\u001b[39m\u001b[32m, line 6\u001b[39m\n\u001b[32m      1\u001b[39m \u001b[38;5;66;03m# ------------------------------------------------------------\u001b[39;00m\n\u001b[32m      2\u001b[39m \u001b[38;5;66;03m# 1. Aggregate rupture rate over time\u001b[39;00m\n\u001b[32m      3\u001b[39m \u001b[38;5;66;03m# ------------------------------------------------------------\u001b[39;00m\n\u001b[32m      4\u001b[39m time_stats = (\n\u001b[32m      5\u001b[39m     \u001b[43mdf\u001b[49m\u001b[43m.\u001b[49m\u001b[43mgroupby\u001b[49m\u001b[43m(\u001b[49m\u001b[33;43m\"\u001b[39;49m\u001b[33;43mCentralisation Date\u001b[39;49m\u001b[33;43m\"\u001b[39;49m\u001b[43m)\u001b[49m\n\u001b[32m----> \u001b[39m\u001b[32m6\u001b[39m \u001b[43m      \u001b[49m\u001b[43m.\u001b[49m\u001b[43magg\u001b[49m\u001b[43m(\u001b[49m\n\u001b[32m      7\u001b[39m \u001b[43m          \u001b[49m\u001b[43mtotal_obs\u001b[49m\u001b[43m=\u001b[49m\u001b[43m(\u001b[49m\u001b[33;43m\"\u001b[39;49m\u001b[33;43mrupture_flag\u001b[39;49m\u001b[33;43m\"\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[33;43m\"\u001b[39;49m\u001b[33;43mcount\u001b[39;49m\u001b[33;43m\"\u001b[39;49m\u001b[43m)\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m      8\u001b[39m \u001b[43m          \u001b[49m\u001b[43mn_ruptures\u001b[49m\u001b[43m=\u001b[49m\u001b[43m(\u001b[49m\u001b[33;43m\"\u001b[39;49m\u001b[33;43mrupture_flag\u001b[39;49m\u001b[33;43m\"\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[33;43m\"\u001b[39;49m\u001b[33;43msum\u001b[39;49m\u001b[33;43m\"\u001b[39;49m\u001b[43m)\u001b[49m\n\u001b[32m      9\u001b[39m \u001b[43m      \u001b[49m\u001b[43m)\u001b[49m\n\u001b[32m     10\u001b[39m       .reset_index()\n\u001b[32m     11\u001b[39m )\n\u001b[32m     13\u001b[39m time_stats[\u001b[33m\"\u001b[39m\u001b[33mrupture_rate\u001b[39m\u001b[33m\"\u001b[39m] = (\n\u001b[32m     14\u001b[39m     time_stats[\u001b[33m\"\u001b[39m\u001b[33mn_ruptures\u001b[39m\u001b[33m\"\u001b[39m] / time_stats[\u001b[33m\"\u001b[39m\u001b[33mtotal_obs\u001b[39m\u001b[33m\"\u001b[39m]\n\u001b[32m     15\u001b[39m )\n\u001b[32m     17\u001b[39m \u001b[38;5;66;03m# ------------------------------------------------------------\u001b[39;00m\n\u001b[32m     18\u001b[39m \u001b[38;5;66;03m# 2. Smooth (optional but recommended for readability)\u001b[39;00m\n\u001b[32m     19\u001b[39m \u001b[38;5;66;03m# ------------------------------------------------------------\u001b[39;00m\n",
      "\u001b[36mFile \u001b[39m\u001b[32m/opt/python/lib/python3.13/site-packages/pandas/core/groupby/generic.py:1432\u001b[39m, in \u001b[36mDataFrameGroupBy.aggregate\u001b[39m\u001b[34m(self, func, engine, engine_kwargs, *args, **kwargs)\u001b[39m\n\u001b[32m   1429\u001b[39m     kwargs[\u001b[33m\"\u001b[39m\u001b[33mengine_kwargs\u001b[39m\u001b[33m\"\u001b[39m] = engine_kwargs\n\u001b[32m   1431\u001b[39m op = GroupByApply(\u001b[38;5;28mself\u001b[39m, func, args=args, kwargs=kwargs)\n\u001b[32m-> \u001b[39m\u001b[32m1432\u001b[39m result = \u001b[43mop\u001b[49m\u001b[43m.\u001b[49m\u001b[43magg\u001b[49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m\n\u001b[32m   1433\u001b[39m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m is_dict_like(func) \u001b[38;5;129;01mand\u001b[39;00m result \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m:\n\u001b[32m   1434\u001b[39m     \u001b[38;5;66;03m# GH #52849\u001b[39;00m\n\u001b[32m   1435\u001b[39m     \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;28mself\u001b[39m.as_index \u001b[38;5;129;01mand\u001b[39;00m is_list_like(func):\n",
      "\u001b[36mFile \u001b[39m\u001b[32m/opt/python/lib/python3.13/site-packages/pandas/core/apply.py:190\u001b[39m, in \u001b[36mApply.agg\u001b[39m\u001b[34m(self)\u001b[39m\n\u001b[32m    187\u001b[39m     \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28mself\u001b[39m.apply_str()\n\u001b[32m    189\u001b[39m \u001b[38;5;28;01mif\u001b[39;00m is_dict_like(func):\n\u001b[32m--> \u001b[39m\u001b[32m190\u001b[39m     \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28;43mself\u001b[39;49m\u001b[43m.\u001b[49m\u001b[43magg_dict_like\u001b[49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m\n\u001b[32m    191\u001b[39m \u001b[38;5;28;01melif\u001b[39;00m is_list_like(func):\n\u001b[32m    192\u001b[39m     \u001b[38;5;66;03m# we require a list, but not a 'str'\u001b[39;00m\n\u001b[32m    193\u001b[39m     \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28mself\u001b[39m.agg_list_like()\n",
      "\u001b[36mFile \u001b[39m\u001b[32m/opt/python/lib/python3.13/site-packages/pandas/core/apply.py:423\u001b[39m, in \u001b[36mApply.agg_dict_like\u001b[39m\u001b[34m(self)\u001b[39m\n\u001b[32m    415\u001b[39m \u001b[38;5;28;01mdef\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[34magg_dict_like\u001b[39m(\u001b[38;5;28mself\u001b[39m) -> DataFrame | Series:\n\u001b[32m    416\u001b[39m \u001b[38;5;250m    \u001b[39m\u001b[33;03m\"\"\"\u001b[39;00m\n\u001b[32m    417\u001b[39m \u001b[33;03m    Compute aggregation in the case of a dict-like argument.\u001b[39;00m\n\u001b[32m    418\u001b[39m \n\u001b[32m   (...)\u001b[39m\u001b[32m    421\u001b[39m \u001b[33;03m    Result of aggregation.\u001b[39;00m\n\u001b[32m    422\u001b[39m \u001b[33;03m    \"\"\"\u001b[39;00m\n\u001b[32m--> \u001b[39m\u001b[32m423\u001b[39m     \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28;43mself\u001b[39;49m\u001b[43m.\u001b[49m\u001b[43magg_or_apply_dict_like\u001b[49m\u001b[43m(\u001b[49m\u001b[43mop_name\u001b[49m\u001b[43m=\u001b[49m\u001b[33;43m\"\u001b[39;49m\u001b[33;43magg\u001b[39;49m\u001b[33;43m\"\u001b[39;49m\u001b[43m)\u001b[49m\n",
      "\u001b[36mFile \u001b[39m\u001b[32m/opt/python/lib/python3.13/site-packages/pandas/core/apply.py:1603\u001b[39m, in \u001b[36mGroupByApply.agg_or_apply_dict_like\u001b[39m\u001b[34m(self, op_name)\u001b[39m\n\u001b[32m   1598\u001b[39m     kwargs.update({\u001b[33m\"\u001b[39m\u001b[33mengine\u001b[39m\u001b[33m\"\u001b[39m: engine, \u001b[33m\"\u001b[39m\u001b[33mengine_kwargs\u001b[39m\u001b[33m\"\u001b[39m: engine_kwargs})\n\u001b[32m   1600\u001b[39m \u001b[38;5;28;01mwith\u001b[39;00m com.temp_setattr(\n\u001b[32m   1601\u001b[39m     obj, \u001b[33m\"\u001b[39m\u001b[33mas_index\u001b[39m\u001b[33m\"\u001b[39m, \u001b[38;5;28;01mTrue\u001b[39;00m, condition=\u001b[38;5;28mhasattr\u001b[39m(obj, \u001b[33m\"\u001b[39m\u001b[33mas_index\u001b[39m\u001b[33m\"\u001b[39m)\n\u001b[32m   1602\u001b[39m ):\n\u001b[32m-> \u001b[39m\u001b[32m1603\u001b[39m     result_index, result_data = \u001b[38;5;28;43mself\u001b[39;49m\u001b[43m.\u001b[49m\u001b[43mcompute_dict_like\u001b[49m\u001b[43m(\u001b[49m\n\u001b[32m   1604\u001b[39m \u001b[43m        \u001b[49m\u001b[43mop_name\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mselected_obj\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mselection\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mkwargs\u001b[49m\n\u001b[32m   1605\u001b[39m \u001b[43m    \u001b[49m\u001b[43m)\u001b[49m\n\u001b[32m   1606\u001b[39m result = \u001b[38;5;28mself\u001b[39m.wrap_results_dict_like(selected_obj, result_index, result_data)\n\u001b[32m   1607\u001b[39m \u001b[38;5;28;01mreturn\u001b[39;00m result\n",
      "\u001b[36mFile \u001b[39m\u001b[32m/opt/python/lib/python3.13/site-packages/pandas/core/apply.py:462\u001b[39m, in \u001b[36mApply.compute_dict_like\u001b[39m\u001b[34m(self, op_name, selected_obj, selection, kwargs)\u001b[39m\n\u001b[32m    460\u001b[39m is_groupby = \u001b[38;5;28misinstance\u001b[39m(obj, (DataFrameGroupBy, SeriesGroupBy))\n\u001b[32m    461\u001b[39m func = cast(AggFuncTypeDict, \u001b[38;5;28mself\u001b[39m.func)\n\u001b[32m--> \u001b[39m\u001b[32m462\u001b[39m func = \u001b[38;5;28;43mself\u001b[39;49m\u001b[43m.\u001b[49m\u001b[43mnormalize_dictlike_arg\u001b[49m\u001b[43m(\u001b[49m\u001b[43mop_name\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mselected_obj\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mfunc\u001b[49m\u001b[43m)\u001b[49m\n\u001b[32m    464\u001b[39m is_non_unique_col = (\n\u001b[32m    465\u001b[39m     selected_obj.ndim == \u001b[32m2\u001b[39m\n\u001b[32m    466\u001b[39m     \u001b[38;5;129;01mand\u001b[39;00m selected_obj.columns.nunique() < \u001b[38;5;28mlen\u001b[39m(selected_obj.columns)\n\u001b[32m    467\u001b[39m )\n\u001b[32m    469\u001b[39m \u001b[38;5;28;01mif\u001b[39;00m selected_obj.ndim == \u001b[32m1\u001b[39m:\n\u001b[32m    470\u001b[39m     \u001b[38;5;66;03m# key only used for output\u001b[39;00m\n",
      "\u001b[36mFile \u001b[39m\u001b[32m/opt/python/lib/python3.13/site-packages/pandas/core/apply.py:663\u001b[39m, in \u001b[36mApply.normalize_dictlike_arg\u001b[39m\u001b[34m(self, how, obj, func)\u001b[39m\n\u001b[32m    661\u001b[39m     cols = Index(\u001b[38;5;28mlist\u001b[39m(func.keys())).difference(obj.columns, sort=\u001b[38;5;28;01mTrue\u001b[39;00m)\n\u001b[32m    662\u001b[39m     \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28mlen\u001b[39m(cols) > \u001b[32m0\u001b[39m:\n\u001b[32m--> \u001b[39m\u001b[32m663\u001b[39m         \u001b[38;5;28;01mraise\u001b[39;00m \u001b[38;5;167;01mKeyError\u001b[39;00m(\u001b[33mf\u001b[39m\u001b[33m\"\u001b[39m\u001b[33mColumn(s) \u001b[39m\u001b[38;5;132;01m{\u001b[39;00m\u001b[38;5;28mlist\u001b[39m(cols)\u001b[38;5;132;01m}\u001b[39;00m\u001b[33m do not exist\u001b[39m\u001b[33m\"\u001b[39m)\n\u001b[32m    665\u001b[39m aggregator_types = (\u001b[38;5;28mlist\u001b[39m, \u001b[38;5;28mtuple\u001b[39m, \u001b[38;5;28mdict\u001b[39m)\n\u001b[32m    667\u001b[39m \u001b[38;5;66;03m# if we have a dict of any non-scalars\u001b[39;00m\n\u001b[32m    668\u001b[39m \u001b[38;5;66;03m# eg. {'A' : ['mean']}, normalize all to\u001b[39;00m\n\u001b[32m    669\u001b[39m \u001b[38;5;66;03m# be list-likes\u001b[39;00m\n\u001b[32m    670\u001b[39m \u001b[38;5;66;03m# Cannot use func.values() because arg may be a Series\u001b[39;00m\n",
      "\u001b[31mKeyError\u001b[39m: \"Column(s) ['rupture_flag'] do not exist\""
     ]
    }
   ],
   "source": [
    "# ------------------------------------------------------------\n",
    "# 1. Aggregate rupture rate over time\n",
    "# ------------------------------------------------------------\n",
    "time_stats = (\n",
    "    df.groupby(\"Centralisation Date\")\n",
    "      .agg(\n",
    "          total_obs=(\"rupture_flag\", \"count\"),\n",
    "          n_ruptures=(\"rupture_flag\", \"sum\")\n",
    "      )\n",
    "      .reset_index()\n",
    ")\n",
    "\n",
    "time_stats[\"rupture_rate\"] = (\n",
    "    time_stats[\"n_ruptures\"] / time_stats[\"total_obs\"]\n",
    ")\n",
    "\n",
    "# ------------------------------------------------------------\n",
    "# 2. Smooth (optional but recommended for readability)\n",
    "# ------------------------------------------------------------\n",
    "time_stats[\"rupture_rate_ma\"] = (\n",
    "    time_stats[\"rupture_rate\"]\n",
    "    .rolling(window=6, center=True)   # 6 periods ≈ half-year\n",
    "    .mean()\n",
    ")\n",
    "\n",
    "# ------------------------------------------------------------\n",
    "# 3. Professional plot\n",
    "# ------------------------------------------------------------\n",
    "plt.figure(figsize=(12, 5))\n",
    "\n",
    "plt.plot(\n",
    "    time_stats[\"Centralisation Date\"],\n",
    "    time_stats[\"rupture_rate\"] * 100,\n",
    "    color=\"lightgray\",\n",
    "    linewidth=1,\n",
    "    alpha=0.6,\n",
    "    label=\"Monthly rupture rate\"\n",
    ")\n",
    "\n",
    "plt.plot(\n",
    "    time_stats[\"Centralisation Date\"],\n",
    "    time_stats[\"rupture_rate_ma\"] * 100,\n",
    "    color=\"#1f77b4\",\n",
    "    linewidth=2.5,\n",
    "    label=\"6-month moving average\"\n",
    ")\n",
    "\n",
    "plt.ylabel(\"Rupture rate (%)\")\n",
    "plt.xlabel(\"Date\")\n",
    "\n",
    "plt.grid(True, linestyle=\"--\", alpha=0.4)\n",
    "plt.legend(frameon=False)\n",
    "\n",
    "plt.tight_layout()\n",
    "plt.show()\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 63,
   "id": "6624939b-f079-4e02-9989-60462e9f5356",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "image/png": "iVBORw0KGgoAAAANSUhEUgAABKUAAAHqCAYAAADVi/1VAAAAOnRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjEwLjgsIGh0dHBzOi8vbWF0cGxvdGxpYi5vcmcvwVt1zgAAAAlwSFlzAAAPYQAAD2EBqD+naQABAABJREFUeJzsnXmcE+X9xz8zuZNNspvsAQvLIbcIiFwCKuAFHtT7pFXUetUDL7yq9WhrW+vtz1pbK1iLeFSxHlWrKKiIB8gth3LDwh45N3cy8/z+WGdIdpPdZDfZZJLv+/XixWYymTzP5PnOPM9nvgfHGGMgCIIgCIIgCIIgCIIgiB6Ez3cDCIIgCIIgCIIgCIIgiNKDRCmCIAiCIAiCIAiCIAiixyFRiiAIgiAIgiAIgiAIguhxSJQiCIIgCIIgCIIgCIIgehwSpQiCIAiCIAiCIAiCIIgeh0QpgiAIgiAIgiAIgiAIoschUYogCIIgCIIgCIIgCILocUiUIgiCIAiCIAiCIAiCIHocdb4bkGtEUUR9fT3MZjM4jst3cwiCIAiCIAiCIAiCIIoaxhhaWlpQW1sLnk/tD1X0olR9fT3q6ury3QyCIAiCIAiCIAiCIIiSYu/evejbt2/K94telDKbzQBaT4TFYslza4hcI4oi9u7di7q6ug7VWIIoVsgGiFKHbIAodcgGiFKGxj9R6hSSDXi9XtTV1cmaTCqKXpSSQvYsFguJUiWAKIowm82wWCx5N0KCyAdkA0SpQzZAlDpkA0QpQ+OfKHUK0QY6S6NUGK0kiCzBcRzKy8spfxhRspANEKUO2QBR6pANEKUMjX+i1FGiDRS9pxRRWkhGSBClCtkAUeqQDRClDtkAUcrQ+CdKHSXaAHlKEUWFKIpoaGiAKIr5bgpB5AWyAaLUIRsgSh2yAaKUofFPlDpKtAESpYiiIxgM5rsJBJFXyAaIUodsgCh1yAaIUobGP1HqKM0GSJQiCIIgCIIgCIIgCIIgehwSpQiCIAiCIAiCIAiCIIgeh0QpoqjgOA52u11R1QYIIpuQDRClDtkAUeqQDRClDI1/otRRog1Q9T2iqOA4DmazOd/NIIi8QTZAlDpkA0SpQzZAlDI0/olSR4k2QJ5SRFEhiiL279+vqGoDBJFNyAaIUodsgCh1yAaIUobGP1HqKNEGSJQiio5oNJrvJhBEXiEbIEodsgGi1CEbIEoZGv9EqaM0GyBRiiAIgiAIgiAIgsg5HMfhrbfeSvn+smXLwHEc3G53j7WJ6BpLly7FiBEjIAhCTr9n+vTpuOmmm3L6HYwxXHXVVbDZbOA4DmvXru2R7+1JMrWt5uZmVFdXY9++fbltGEiUIgiCIAiCIAiCKFrmzp0LjuNwzTXXtHvvuuuuA8dxmDt3bla/8/7778eRRx6Z1WMWAgsXLkR5eXm+m9Elst3222+/Hffccw9UKlVWjpdKNHnzzTfx29/+NivfkYoPPvgACxcuxLvvvosDBw7giCOOyOn3KYHKykpccskluO+++3L+XSRKEUUFx3GoqalRVLUBgsgmZANEqUM2QJQ6ZANEMurq6vDKK68gGAzK20KhEF5++WX069cvjy3LLsnGfyQSyWOLUpOtEKt89O+LL77A9u3bcc455+T8u2w2W84Td2/fvh29e/fGlClT0KtXL6jVyq0Hl817wGWXXYZFixbB6XRmoWWpIVGKKCo4joPBYKCJGFGykA0QpQ7ZAFHqkA0QyTjqqKNQV1eHN998U9725ptvol+/fhg7dmzCvuFwGDfeeCOqq6uh1+txzDHH4Ntvv5Xflzxali5divHjx8NoNGLKlCnYunUrgFaPnAceeADr1q0Dx3HgOA4LFy6UP9/c3IyzzjoLRqMRQ4YMwdtvv520zX6/HxaLBf/+978Ttr/11lswmUxoaWlp9xmO43DKKafghhtuwE033YTKykrMnDkTu3btksOyJNxuNziOw7JlyxL69d5772H06NHQ6/U4+uijsXHjRvn9yy67DB6PR+7X/fffL39v27DE8vJyud/S97/66quYNm0a9Ho9Fi1aBAB4/vnnMWLECOj1egwfPhx/+ctfkp4PienTp+P6669P6B8APPbYYxg1ahRMJhPq6urwq1/9Cj6fr9O2h8Nh3HbbbejTpw9MJhMmTZokn5NUvPLKKzjppJOg1+vlbdu3b8cZZ5yBmpoalJWVYcKECfj4448TPhcOh3HHHXegrq4OOp0OgwcPxj/+8Q/s2rULM2bMAABUVFQkeO/Fh9HdfffdmDRpUrv2jBkzBg8++KD8OpNzOnfuXNxwww3Ys2cPOI7DgAEDku7ncrlwySWXoKKiAkajEaeccgp++OEHAK3hf1VVVQlj9cgjj0Tv3r3l11988QV0Oh0CgQAYY7j//vvRr18/6HQ61NbW4sYbb0zZRgB49tlnMWjQIGi1WgwbNgwvvfRSwvscx+H555/H2WefDbvdjqFDh3bbtkaOHIna2losWbKkw7Z1FxKliKJCFEXs3r1bUdUGCCKbkA0QpQ7ZAFHqkA0Qqbj88suxYMEC+fULL7yAyy67rN1+t99+O9544w28+OKL+O677zB48GDMnDmznbfEr3/9azz66KNYtWoV1Go1Lr/8cgDABRdcgFtvvRUjR47EgQMHcODAAVxwwQXy5x544AGcf/75WL9+PU499VTMmTMnqSeGyWTChRdemNBmAFiwYAHOPffcpN4zoigiFArhxRdfhFarxYoVK/DXv/41o/M0f/58PProo/j2229RVVWF2bNnIxqNYsqUKXjiiSdgsVjkft12220ZHfvOO+/EvHnzsHnzZsycOROLFi3Cb37zG/z+97/H5s2b8dBDD+Hee+/Fiy++2OFxkvWP53k89dRT2LRpE1588UV88sknuP322wGgw7Zff/31WLlyJV555RWsX78e5513HmbNmiULLsn4/PPPMX78+IRtPp8Pp556KpYuXYo1a9Zg1qxZmD17Nvbs2SPvc8kll2Dx4sV46qmnsHnzZjz33HMoKytDXV0d3njjDQDA1q1bceDAATz55JPtvnfOnDn45ptvsH37dnnbpk2bsH79elx88cUAkPE5ffLJJ/Hggw+ib9++OHDgQIIAG8/cuXOxatUqvP3221i5ciUYYzj11FMRjUbBcRyOO+44WcxzuVzYvHkzgsEgtmzZAgBYvnw5JkyYAKPRiDfeeAOPP/44nnvuOfzwww946623MGrUqJTne8mSJZg3bx5uvfVWbNy4EVdffTUuu+wyfPrppwn7PfDAAzj33HPx/vvv45RTTsmKbU2cOBGff/55yrZlBVbkeDweBoB5PJ58N6XbhMNhFo1G892MgkYQBLZz504mCEK+m0IQeYFsgCh1yAaIUodsoOcRBIGFw+Ee/ZfJ73vppZeyM844gzU2NjKdTsd27drFdu3axfR6PWtqamJnnHEGu/TSSxljjPl8PqbRaNiiRYvkz0ciEVZbW8sefvhhxhhjn376KQPAPv74Y3mf9957jwFgwWCQMcbYfffdx8aMGdOuLQDYPffcI7/2+XwMAHv//fcTju1yuRhjjH399ddMpVKx+vp6xhhjDQ0NTK1Ws2XLlqX8LSZNmsTGjh2bsH3nzp0MAFuzZo28zeVyMQDs008/TfjuV155Rd7H4XAwg8HAXn31VcYYYwsWLGBWqzVpv5YsWZKwzWq1sgULFiR8/xNPPJGwz6BBg9jLL7+csO23v/0tmzx5ctL+McbYtGnT2vUvGa+//jqz2+3y62Rt3717N1OpVGz//v0J20844QR21113pTy21Wpl//znPzttw8iRI9nTTz/NGGNs69atDAD76KOPku7b9reXmDZtGps3b578esyYMezBBx+UX991111s0qRJ8uuunNPHH3+c9e/fP+X3btu2jQFgK1askN9vbm5mBoOBvfbaa4wxxp566ik2cuRIxhhjb731Fps0aRI744wz2LPPPssYY+zEE09kd999N2OMsUcffZQNHTqURSKRlG2KZ8qUKezKK69M2HbeeeexU089VX4t2ZZ0D/B6vVmxrZtvvplNnz49rXa2JV0tRrnBkiU
      "text/plain": [
       "<Figure size 1200x500 with 1 Axes>"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "# ------------------------------------------------------------\n",
    "# 1. Aggregate rupture rate over time\n",
    "#    CORRECTED: denominator = only observations with active flows\n",
    "# ------------------------------------------------------------\n",
    "\n",
    "# Active observations = at least one non-zero flow at this date\n",
    "active_obs = df[df[\"Quantity - NetFlows\"] != 0]\n",
    "\n",
    "time_stats = (\n",
    "    active_obs\n",
    "    .groupby(\"Centralisation Date\")\n",
    "    .agg(\n",
    "        total_obs=(\"rupture_flag\", \"count\"),   # only active flow observations\n",
    "        n_ruptures=(\"rupture_flag\", \"sum\")\n",
    "    )\n",
    "    .reset_index()\n",
    ")\n",
    "\n",
    "time_stats[\"rupture_rate\"] = (\n",
    "    time_stats[\"n_ruptures\"] / time_stats[\"total_obs\"]\n",
    ")\n",
    "\n",
    "# ------------------------------------------------------------\n",
    "# 2. Smooth (6-month moving average)\n",
    "# ------------------------------------------------------------\n",
    "time_stats[\"rupture_rate_ma\"] = (\n",
    "    time_stats[\"rupture_rate\"]\n",
    "    .rolling(window=6, center=True)\n",
    "    .mean()\n",
    ")\n",
    "\n",
    "# ------------------------------------------------------------\n",
    "# 3. Plot\n",
    "# ------------------------------------------------------------\n",
    "plt.figure(figsize=(12, 5))\n",
    "\n",
    "plt.plot(\n",
    "    time_stats[\"Centralisation Date\"],\n",
    "    time_stats[\"rupture_rate\"] * 100,\n",
    "    color=\"lightgray\",\n",
    "    linewidth=1,\n",
    "    alpha=0.6,\n",
    "    label=\"Monthly rupture rate (active flows only)\"\n",
    ")\n",
    "\n",
    "plt.plot(\n",
    "    time_stats[\"Centralisation Date\"],\n",
    "    time_stats[\"rupture_rate_ma\"] * 100,\n",
    "    color=\"#1f77b4\",\n",
    "    linewidth=2.5,\n",
    "    label=\"6-month moving average\"\n",
    ")\n",
    "\n",
    "plt.ylabel(\"Rupture rate (%)\")\n",
    "plt.xlabel(\"Date\")\n",
    "plt.grid(True, linestyle=\"--\", alpha=0.4)\n",
    "plt.legend(frameon=False)\n",
    "plt.tight_layout()\n",
    "plt.show()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "id": "d6ee0c24-e14e-4c40-97d4-49879229790c",
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/tmp/ipykernel_1311/1047489516.py:6: FutureWarning:\n",
      "\n",
      "DataFrameGroupBy.apply operated on the grouping columns. This behavior is deprecated, and in a future version of pandas the grouping columns will be excluded from the operation. Either pass `include_groups=False` to exclude the groupings or explicitly select the grouping columns after groupby to silence this warning.\n",
      "\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "has_reset\n",
       "True     64192\n",
       "False    15545\n",
       "Name: count, dtype: int64"
      ]
     },
     "execution_count": 14,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "EPS = 1e-6  # seuil numérique\n",
    "\n",
    "reset_candidates = (\n",
    "    df\n",
    "    .groupby([\"Registrar Account - ID\", \"Product - Isin\"])\n",
    "    .apply(\n",
    "        lambda g: (\n",
    "            (g[\"Quantity - AUM\"].abs() < EPS) &\n",
    "            (g[\"expected_stock\"].abs() < EPS)\n",
    "        ).any()\n",
    "    )\n",
    "    .reset_index(name=\"has_reset\")\n",
    ")\n",
    "\n",
    "reset_candidates[\"has_reset\"].value_counts()\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 47,
   "id": "601f61b8-0115-431d-97de-6ec5a0f1d4f4",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "   Before repair  After repair  Repaired points\n",
      "0         756392         22357            18440\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/tmp/ipykernel_1311/3061846510.py:66: FutureWarning:\n",
      "\n",
      "DataFrameGroupBy.apply operated on the grouping columns. This behavior is deprecated, and in a future version of pandas the grouping columns will be excluded from the operation. Either pass `include_groups=False` to exclude the groupings or explicitly select the grouping columns after groupby to silence this warning.\n",
      "\n"
     ]
    }
   ],
   "source": [
    "GAP_TOL = 1e-6\n",
    "REL_GAP_THR = 0.05\n",
    "MIN_PERSISTENCE = 3\n",
    "\n",
    "df = merged_isin.copy().sort_values(\n",
    "    [\"Registrar Account - ID\", \"Product - Isin\", \"Centralisation Date\"]\n",
    ")\n",
    "\n",
    "df[\"corrected_aum\"] = df[\"Quantity - AUM\"]\n",
    "df[\"repair_flag\"] = False\n",
    "\n",
    "def repair_group(g):\n",
    "    g = g.copy()\n",
    "\n",
    "    obs = g[\"Quantity - AUM\"].values\n",
    "    flows = g[\"Quantity - NetFlows\"].values\n",
    "\n",
    "    corrected = obs.copy()\n",
    "\n",
    "    # Initial expected path\n",
    "    expected = np.empty_like(obs)\n",
    "    expected[0] = np.nan\n",
    "\n",
    "    for t in range(1, len(obs)):\n",
    "        expected[t] = corrected[t-1] + flows[t-1]\n",
    "\n",
    "    gap = obs - expected\n",
    "    rel_gap = np.abs(gap) / np.maximum(np.abs(expected), 1.0)\n",
    "\n",
    "    idx = None\n",
    "\n",
    "    for i in range(1, len(obs) - MIN_PERSISTENCE):\n",
    "        if (\n",
    "            rel_gap[i] > REL_GAP_THR\n",
    "            and np.all(np.abs(gap[i:i+MIN_PERSISTENCE] - gap[i]) < GAP_TOL)\n",
    "            and np.all(np.abs(np.diff(flows[i:i+MIN_PERSISTENCE])) < GAP_TOL)\n",
    "        ):\n",
    "            idx = i\n",
    "            break\n",
    "\n",
    "    if idx is None:\n",
    "        return g\n",
    "\n",
    "    # Apply correction\n",
    "    shift = gap[idx]\n",
    "    corrected[idx:] = obs[idx:] - shift\n",
    "\n",
    "    g.loc[g.index[idx]:, \"repair_flag\"] = True\n",
    "\n",
    "    # Rebuild expected stock AFTER correction\n",
    "    expected_corr = np.empty_like(obs)\n",
    "    expected_corr[0] = np.nan\n",
    "\n",
    "    for t in range(1, len(obs)):\n",
    "        expected_corr[t] = corrected[t-1] + flows[t-1]\n",
    "\n",
    "    g[\"corrected_aum\"] = corrected\n",
    "    g[\"expected_stock_corr\"] = expected_corr\n",
    "\n",
    "    return g\n",
    "\n",
    "\n",
    "df = (\n",
    "    df\n",
    "    .groupby([\"Registrar Account - ID\", \"Product - Isin\"], group_keys=False)\n",
    "    .apply(repair_group)\n",
    ")\n",
    "\n",
    "# Recompute gaps & ruptures\n",
    "df[\"gap_before\"] = df[\"Quantity - AUM\"] - df[\"expected_stock\"]\n",
    "df[\"gap_after\"] = df[\"corrected_aum\"] - df[\"expected_stock_corr\"]\n",
    "\n",
    "df[\"rupture_before\"] = df[\"gap_before\"].abs() > GAP_TOL\n",
    "df[\"rupture_after\"] = df[\"gap_after\"].abs() > GAP_TOL\n",
    "\n",
    "summary = pd.DataFrame({\n",
    "    \"Before repair\": [df[\"rupture_before\"].sum()],\n",
    "    \"After repair\": [df[\"rupture_after\"].sum()],\n",
    "    \"Repaired points\": [df[\"repair_flag\"].sum()]\n",
    "})\n",
    "\n",
    "print(summary)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 30,
   "id": "62583cfe-a6e7-4931-a63e-4273dca97ff7",
   "metadata": {},
   "outputs": [
    {
     "ename": "NameError",
     "evalue": "name 'df_final' is not defined",
     "output_type": "error",
     "traceback": [
      "\u001b[31m---------------------------------------------------------------------------\u001b[39m",
      "\u001b[31mNameError\u001b[39m                                 Traceback (most recent call last)",
      "\u001b[36mCell\u001b[39m\u001b[36m \u001b[39m\u001b[32mIn[30]\u001b[39m\u001b[32m, line 4\u001b[39m\n\u001b[32m      1\u001b[39m \u001b[38;5;28;01mimport\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[34;01mplotly\u001b[39;00m\u001b[34;01m.\u001b[39;00m\u001b[34;01mgraph_objects\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[38;5;28;01mas\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[34;01mgo\u001b[39;00m\n\u001b[32m      2\u001b[39m \u001b[38;5;28;01mimport\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[34;01mpandas\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[38;5;28;01mas\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[34;01mpd\u001b[39;00m\n\u001b[32m----> \u001b[39m\u001b[32m4\u001b[39m df_final = \u001b[43mdf_final\u001b[49m.rename(columns={\n\u001b[32m      5\u001b[39m     \u001b[33m\"\u001b[39m\u001b[33mQuantity - AUM\u001b[39m\u001b[33m\"\u001b[39m: \u001b[33m\"\u001b[39m\u001b[33maum_raw\u001b[39m\u001b[33m\"\u001b[39m,\n\u001b[32m      6\u001b[39m     \u001b[33m\"\u001b[39m\u001b[33mcorrected_aum\u001b[39m\u001b[33m\"\u001b[39m: \u001b[33m\"\u001b[39m\u001b[33maum_repaired\u001b[39m\u001b[33m\"\u001b[39m,\n\u001b[32m      7\u001b[39m     \u001b[33m\"\u001b[39m\u001b[33mQuantity - NetFlows\u001b[39m\u001b[33m\"\u001b[39m: \u001b[33m\"\u001b[39m\u001b[33mflows\u001b[39m\u001b[33m\"\u001b[39m,\n\u001b[32m      8\u001b[39m     \u001b[33m\"\u001b[39m\u001b[33mexpected_stock\u001b[39m\u001b[33m\"\u001b[39m: \u001b[33m\"\u001b[39m\u001b[33mexpected_aum_raw\u001b[39m\u001b[33m\"\u001b[39m,\n\u001b[32m      9\u001b[39m     \u001b[33m\"\u001b[39m\u001b[33mexpected_stock_corr\u001b[39m\u001b[33m\"\u001b[39m: \u001b[33m\"\u001b[39m\u001b[33mexpected_aum_repaired\u001b[39m\u001b[33m\"\u001b[39m\n\u001b[32m     10\u001b[39m })\n\u001b[32m     12\u001b[39m df[\u001b[33m\"\u001b[39m\u001b[33mgap_before\u001b[39m\u001b[33m\"\u001b[39m] = df[\u001b[33m\"\u001b[39m\u001b[33mQuantity - AUM\u001b[39m\u001b[33m\"\u001b[39m] - df[\u001b[33m\"\u001b[39m\u001b[33mexpected_stock\u001b[39m\u001b[33m\"\u001b[39m]\n\u001b[32m     13\u001b[39m df[\u001b[33m\"\u001b[39m\u001b[33mgap_after\u001b[39m\u001b[33m\"\u001b[39m] = df[\u001b[33m\"\u001b[39m\u001b[33mcorrected_aum\u001b[39m\u001b[33m\"\u001b[39m] - df[\u001b[33m\"\u001b[39m\u001b[33mexpected_stock_corr\u001b[39m\u001b[33m\"\u001b[39m]\n",
      "\u001b[31mNameError\u001b[39m: name 'df_final' is not defined"
     ]
    }
   ],
   "source": [
    "import plotly.graph_objects as go\n",
    "import pandas as pd\n",
    "\n",
    "# ============================================================\n",
    "# Parameters (fixed epsilon)\n",
    "# ============================================================\n",
    "GAP_EPS = 100   # fixed tolerance for accounting identity\n",
    "\n",
    "# ============================================================\n",
    "# 1. Define ruptures using a FIXED epsilon\n",
    "# ============================================================\n",
    "df = df.copy()\n",
    "\n",
    "df[\"rupture_before\"] = df[\"gap_before\"].abs() > GAP_EPS\n",
    "df[\"rupture_after\"]  = df[\"gap_after\"].abs()  > GAP_EPS\n",
    "\n",
    "# ============================================================\n",
    "# 2. Rupture ratios BEFORE repair\n",
    "# ============================================================\n",
    "rupture_summary_before = (\n",
    "    df\n",
    "    .groupby([\"Registrar Account - ID\", \"Product - Isin\"])\n",
    "    .agg(\n",
    "        n_obs=(\"rupture_before\", \"count\"),\n",
    "        n_ruptures=(\"rupture_before\", \"sum\")\n",
    "    )\n",
    "    .reset_index()\n",
    ")\n",
    "\n",
    "rupture_summary_before[\"rupture_ratio\"] = (\n",
    "    rupture_summary_before[\"n_ruptures\"] /\n",
    "    rupture_summary_before[\"n_obs\"]\n",
    ")\n",
    "\n",
    "# ============================================================\n",
    "# 3. Rupture ratios AFTER repair\n",
    "# ============================================================\n",
    "rupture_summary_after = (\n",
    "    df\n",
    "    .groupby([\"Registrar Account - ID\", \"Product - Isin\"])\n",
    "    .agg(\n",
    "        n_obs=(\"rupture_after\", \"count\"),\n",
    "        n_ruptures=(\"rupture_after\", \"sum\")\n",
    "    )\n",
    "    .reset_index()\n",
    ")\n",
    "\n",
    "rupture_summary_after[\"rupture_ratio\"] = (\n",
    "    rupture_summary_after[\"n_ruptures\"] /\n",
    "    rupture_summary_after[\"n_obs\"]\n",
    ")\n",
    "\n",
    "# ============================================================\n",
    "# 4. Rupture intensity classes (fixed bins)\n",
    "# ============================================================\n",
    "bins = [0.0, 0.01, 0.10, 0.30, 1.0]\n",
    "labels = [\n",
    "    \"Clean / quasi-clean (≤1%)\",\n",
    "    \"Moderate (1–10%)\",\n",
    "    \"High (10–30%)\",\n",
    "    \"Severe (>30%)\"\n",
    "]\n",
    "\n",
    "rupture_summary_before[\"rupture_class\"] = pd.cut(\n",
    "    rupture_summary_before[\"rupture_ratio\"],\n",
    "    bins=bins,\n",
    "    labels=labels,\n",
    "    include_lowest=True\n",
    ")\n",
    "\n",
    "rupture_summary_after[\"rupture_class\"] = pd.cut(\n",
    "    rupture_summary_after[\"rupture_ratio\"],\n",
    "    bins=bins,\n",
    "    labels=labels,\n",
    "    include_lowest=True\n",
    ")\n",
    "\n",
    "# ============================================================\n",
    "# 5. Distribution (%)\n",
    "# ============================================================\n",
    "dist_before = (\n",
    "    rupture_summary_before[\"rupture_class\"]\n",
    "    .value_counts(normalize=True)\n",
    "    .sort_index()\n",
    "    * 100\n",
    ").round(1)\n",
    "\n",
    "dist_after = (\n",
    "    rupture_summary_after[\"rupture_class\"]\n",
    "    .value_counts(normalize=True)\n",
    "    .sort_index()\n",
    "    * 100\n",
    ").round(1)\n",
    "\n",
    "# ============================================================\n",
    "# 6. Donut chart: BEFORE vs AFTER (fixed epsilon)\n",
    "# ============================================================\n",
    "fig = go.Figure()\n",
    "\n",
    "fig.add_trace(go.Pie(\n",
    "    labels=dist_before.index,\n",
    "    values=dist_before.values,\n",
    "    hole=0.45,\n",
    "    name=\"Before repair\",\n",
    "    domain=dict(x=[0.0, 0.48]),\n",
    "    textinfo=\"percent\",\n",
    "    hoverinfo=\"label+percent\"\n",
    "))\n",
    "\n",
    "fig.add_trace(go.Pie(\n",
    "    labels=dist_after.index,\n",
    "    values=dist_after.values,\n",
    "    hole=0.45,\n",
    "    name=\"After repair\",\n",
    "    domain=dict(x=[0.52, 1.0]),\n",
    "    textinfo=\"percent\",\n",
    "    hoverinfo=\"label+percent\"\n",
    "))\n",
    "\n",
    "fig.update_layout(\n",
    "    title=\"Distribution of AUM–flow rupture intensity before vs after repair (fixed ε)\",\n",
    "    annotations=[\n",
    "        dict(text=\"Before repair\", x=0.24, y=0.5, showarrow=False),\n",
    "        dict(text=\"After repair\",  x=0.76, y=0.5, showarrow=False),\n",
    "    ],\n",
    "    legend=dict(\n",
    "        orientation=\"h\",\n",
    "        yanchor=\"top\",\n",
    "        y=-0.15,\n",
    "        xanchor=\"center\",\n",
    "        x=0.5\n",
    "    ),\n",
    "    legend_title_text=\"Rupture ratio\"\n",
    ")\n",
    "\n",
    "fig.show()\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 31,
   "id": "70cf0a99-bd19-41a9-9574-88647fde09ca",
   "metadata": {},
   "outputs": [
    {
     "ename": "KeyError",
     "evalue": "\"['Quantity - AUM', 'corrected_aum', 'Quantity - NetFlows', 'expected_stock', 'expected_stock_corr', 'gap_before', 'gap_after'] not in index\"",
     "output_type": "error",
     "traceback": [
      "\u001b[31m---------------------------------------------------------------------------\u001b[39m",
      "\u001b[31mKeyError\u001b[39m                                  Traceback (most recent call last)",
      "\u001b[36mCell\u001b[39m\u001b[36m \u001b[39m\u001b[32mIn[31]\u001b[39m\u001b[32m, line 10\u001b[39m\n\u001b[32m      5\u001b[39m df_final = df.copy()\n\u001b[32m      7\u001b[39m \u001b[38;5;66;03m# ------------------------------------------------------------\u001b[39;00m\n\u001b[32m      8\u001b[39m \u001b[38;5;66;03m# Core variables (before / after)\u001b[39;00m\n\u001b[32m      9\u001b[39m \u001b[38;5;66;03m# ------------------------------------------------------------\u001b[39;00m\n\u001b[32m---> \u001b[39m\u001b[32m10\u001b[39m df_final = \u001b[43mdf_final\u001b[49m\u001b[43m[\u001b[49m\u001b[43m[\u001b[49m\n\u001b[32m     11\u001b[39m \u001b[43m    \u001b[49m\u001b[33;43m\"\u001b[39;49m\u001b[33;43mRegistrar Account - ID\u001b[39;49m\u001b[33;43m\"\u001b[39;49m\u001b[43m,\u001b[49m\n\u001b[32m     12\u001b[39m \u001b[43m    \u001b[49m\u001b[33;43m\"\u001b[39;49m\u001b[33;43mProduct - Isin\u001b[39;49m\u001b[33;43m\"\u001b[39;49m\u001b[43m,\u001b[49m\n\u001b[32m     13\u001b[39m \u001b[43m    \u001b[49m\u001b[33;43m\"\u001b[39;49m\u001b[33;43mCentralisation Date\u001b[39;49m\u001b[33;43m\"\u001b[39;49m\u001b[43m,\u001b[49m\n\u001b[32m     14\u001b[39m \u001b[43m    \u001b[49m\u001b[33;43m\"\u001b[39;49m\u001b[33;43mQuantity - AUM\u001b[39;49m\u001b[33;43m\"\u001b[39;49m\u001b[43m,\u001b[49m\n\u001b[32m     15\u001b[39m \u001b[43m    \u001b[49m\u001b[33;43m\"\u001b[39;49m\u001b[33;43mcorrected_aum\u001b[39;49m\u001b[33;43m\"\u001b[39;49m\u001b[43m,\u001b[49m\n\u001b[32m     16\u001b[39m \u001b[43m    \u001b[49m\u001b[33;43m\"\u001b[39;49m\u001b[33;43mQuantity - NetFlows\u001b[39;49m\u001b[33;43m\"\u001b[39;49m\u001b[43m,\u001b[49m\n\u001b[32m     17\u001b[39m \u001b[43m    \u001b[49m\u001b[33;43m\"\u001b[39;49m\u001b[33;43mexpected_stock\u001b[39;49m\u001b[33;43m\"\u001b[39;49m\u001b[43m,\u001b[49m\n\u001b[32m     18\u001b[39m \u001b[43m    \u001b[49m\u001b[33;43m\"\u001b[39;49m\u001b[33;43mexpected_stock_corr\u001b[39;49m\u001b[33;43m\"\u001b[39;49m\u001b[43m,\u001b[49m\n\u001b[32m     19\u001b[39m \u001b[43m    \u001b[49m\u001b[33;43m\"\u001b[39;49m\u001b[33;43mgap_before\u001b[39;49m\u001b[33;43m\"\u001b[39;49m\u001b[43m,\u001b[49m\n\u001b[32m     20\u001b[39m \u001b[43m    \u001b[49m\u001b[33;43m\"\u001b[39;49m\u001b[33;43mgap_after\u001b[39;49m\u001b[33;43m\"\u001b[39;49m\u001b[43m,\u001b[49m\n\u001b[32m     21\u001b[39m \u001b[43m    \u001b[49m\u001b[33;43m\"\u001b[39;49m\u001b[33;43mrepair_flag\u001b[39;49m\u001b[33;43m\"\u001b[39;49m\n\u001b[32m     22\u001b[39m \u001b[43m]\u001b[49m\u001b[43m]\u001b[49m.rename(columns={\n\u001b[32m     23\u001b[39m     \u001b[33m\"\u001b[39m\u001b[33mQuantity - AUM\u001b[39m\u001b[33m\"\u001b[39m: \u001b[33m\"\u001b[39m\u001b[33maum_raw\u001b[39m\u001b[33m\"\u001b[39m,\n\u001b[32m     24\u001b[39m     \u001b[33m\"\u001b[39m\u001b[33mcorrected_aum\u001b[39m\u001b[33m\"\u001b[39m: \u001b[33m\"\u001b[39m\u001b[33maum_repaired\u001b[39m\u001b[33m\"\u001b[39m,\n\u001b[32m     25\u001b[39m     \u001b[33m\"\u001b[39m\u001b[33mQuantity - NetFlows\u001b[39m\u001b[33m\"\u001b[39m: \u001b[33m\"\u001b[39m\u001b[33mflows\u001b[39m\u001b[33m\"\u001b[39m,\n\u001b[32m     26\u001b[39m     \u001b[33m\"\u001b[39m\u001b[33mexpected_stock\u001b[39m\u001b[33m\"\u001b[39m: \u001b[33m\"\u001b[39m\u001b[33mexpected_aum_raw\u001b[39m\u001b[33m\"\u001b[39m,\n\u001b[32m     27\u001b[39m     \u001b[33m\"\u001b[39m\u001b[33mexpected_stock_corr\u001b[39m\u001b[33m\"\u001b[39m: \u001b[33m\"\u001b[39m\u001b[33mexpected_aum_repaired\u001b[39m\u001b[33m\"\u001b[39m\n\u001b[32m     28\u001b[39m })\n\u001b[32m     30\u001b[39m \u001b[38;5;66;03m# ------------------------------------------------------------\u001b[39;00m\n\u001b[32m     31\u001b[39m \u001b[38;5;66;03m# Relative gaps\u001b[39;00m\n\u001b[32m     32\u001b[39m \u001b[38;5;66;03m# ------------------------------------------------------------\u001b[39;00m\n\u001b[32m     33\u001b[39m df_final[\u001b[33m\"\u001b[39m\u001b[33mgap_rel_before\u001b[39m\u001b[33m\"\u001b[39m] = (\n\u001b[32m     34\u001b[39m     df_final[\u001b[33m\"\u001b[39m\u0
      "\u001b[36mFile \u001b[39m\u001b[32m/opt/python/lib/python3.13/site-packages/pandas/core/frame.py:4119\u001b[39m, in \u001b[36mDataFrame.__getitem__\u001b[39m\u001b[34m(self, key)\u001b[39m\n\u001b[32m   4117\u001b[39m     \u001b[38;5;28;01mif\u001b[39;00m is_iterator(key):\n\u001b[32m   4118\u001b[39m         key = \u001b[38;5;28mlist\u001b[39m(key)\n\u001b[32m-> \u001b[39m\u001b[32m4119\u001b[39m     indexer = \u001b[38;5;28;43mself\u001b[39;49m\u001b[43m.\u001b[49m\u001b[43mcolumns\u001b[49m\u001b[43m.\u001b[49m\u001b[43m_get_indexer_strict\u001b[49m\u001b[43m(\u001b[49m\u001b[43mkey\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[33;43m\"\u001b[39;49m\u001b[33;43mcolumns\u001b[39;49m\u001b[33;43m\"\u001b[39;49m\u001b[43m)\u001b[49m[\u001b[32m1\u001b[39m]\n\u001b[32m   4121\u001b[39m \u001b[38;5;66;03m# take() does not accept boolean indexers\u001b[39;00m\n\u001b[32m   4122\u001b[39m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28mgetattr\u001b[39m(indexer, \u001b[33m\"\u001b[39m\u001b[33mdtype\u001b[39m\u001b[33m\"\u001b[39m, \u001b[38;5;28;01mNone\u001b[39;00m) == \u001b[38;5;28mbool\u001b[39m:\n",
      "\u001b[36mFile \u001b[39m\u001b[32m/opt/python/lib/python3.13/site-packages/pandas/core/indexes/base.py:6212\u001b[39m, in \u001b[36mIndex._get_indexer_strict\u001b[39m\u001b[34m(self, key, axis_name)\u001b[39m\n\u001b[32m   6209\u001b[39m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[32m   6210\u001b[39m     keyarr, indexer, new_indexer = \u001b[38;5;28mself\u001b[39m._reindex_non_unique(keyarr)\n\u001b[32m-> \u001b[39m\u001b[32m6212\u001b[39m \u001b[38;5;28;43mself\u001b[39;49m\u001b[43m.\u001b[49m\u001b[43m_raise_if_missing\u001b[49m\u001b[43m(\u001b[49m\u001b[43mkeyarr\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mindexer\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43maxis_name\u001b[49m\u001b[43m)\u001b[49m\n\u001b[32m   6214\u001b[39m keyarr = \u001b[38;5;28mself\u001b[39m.take(indexer)\n\u001b[32m   6215\u001b[39m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28misinstance\u001b[39m(key, Index):\n\u001b[32m   6216\u001b[39m     \u001b[38;5;66;03m# GH 42790 - Preserve name from an Index\u001b[39;00m\n",
      "\u001b[36mFile \u001b[39m\u001b[32m/opt/python/lib/python3.13/site-packages/pandas/core/indexes/base.py:6264\u001b[39m, in \u001b[36mIndex._raise_if_missing\u001b[39m\u001b[34m(self, key, indexer, axis_name)\u001b[39m\n\u001b[32m   6261\u001b[39m     \u001b[38;5;28;01mraise\u001b[39;00m \u001b[38;5;167;01mKeyError\u001b[39;00m(\u001b[33mf\u001b[39m\u001b[33m\"\u001b[39m\u001b[33mNone of [\u001b[39m\u001b[38;5;132;01m{\u001b[39;00mkey\u001b[38;5;132;01m}\u001b[39;00m\u001b[33m] are in the [\u001b[39m\u001b[38;5;132;01m{\u001b[39;00maxis_name\u001b[38;5;132;01m}\u001b[39;00m\u001b[33m]\u001b[39m\u001b[33m\"\u001b[39m)\n\u001b[32m   6263\u001b[39m not_found = \u001b[38;5;28mlist\u001b[39m(ensure_index(key)[missing_mask.nonzero()[\u001b[32m0\u001b[39m]].unique())\n\u001b[32m-> \u001b[39m\u001b[32m6264\u001b[39m \u001b[38;5;28;01mraise\u001b[39;00m \u001b[38;5;167;01mKeyError\u001b[39;00m(\u001b[33mf\u001b[39m\u001b[33m\"\u001b[39m\u001b[38;5;132;01m{\u001b[39;00mnot_found\u001b[38;5;132;01m}\u001b[39;00m\u001b[33m not in index\u001b[39m\u001b[33m\"\u001b[39m)\n",
      "\u001b[31mKeyError\u001b[39m: \"['Quantity - AUM', 'corrected_aum', 'Quantity - NetFlows', 'expected_stock', 'expected_stock_corr', 'gap_before', 'gap_after'] not in index\""
     ]
    }
   ],
   "source": [
    "# ============================================================\n",
    "# FINAL DATASETS AFTER REPAIR\n",
    "# ============================================================\n",
    "\n",
    "df_final = df.copy()\n",
    "\n",
    "# ------------------------------------------------------------\n",
    "# Core variables (before / after)\n",
    "# ------------------------------------------------------------\n",
    "df_final = df_final[[\n",
    "    \"Registrar Account - ID\",\n",
    "    \"Product - Isin\",\n",
    "    \"Centralisation Date\",\n",
    "    \"Quantity - AUM\",\n",
    "    \"corrected_aum\",\n",
    "    \"Quantity - NetFlows\",\n",
    "    \"expected_stock\",\n",
    "    \"expected_stock_corr\",\n",
    "    \"gap_before\",\n",
    "    \"gap_after\",\n",
    "    \"repair_flag\"\n",
    "]].rename(columns={\n",
    "    \"Quantity - AUM\": \"aum_raw\",\n",
    "    \"corrected_aum\": \"aum_repaired\",\n",
    "    \"Quantity - NetFlows\": \"flows\",\n",
    "    \"expected_stock\": \"expected_aum_raw\",\n",
    "    \"expected_stock_corr\": \"expected_aum_repaired\"\n",
    "})\n",
    "\n",
    "# ------------------------------------------------------------\n",
    "# Relative gaps\n",
    "# ------------------------------------------------------------\n",
    "df_final[\"gap_rel_before\"] = (\n",
    "    df_final[\"gap_before\"].abs() /\n",
    "    df_final[\"expected_aum_raw\"].abs().clip(lower=1)\n",
    ")\n",
    "\n",
    "df_final[\"gap_rel_after\"] = (\n",
    "    df_final[\"gap_after\"].abs() /\n",
    "    df_final[\"expected_aum_repaired\"].abs().clip(lower=1)\n",
    ")\n",
    "df_final.to_csv('df_repaired.csv')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "befb2962-73fb-4cb8-b86e-3218ec103204",
   "metadata": {},
   "outputs": [],
   "source": [
    "# ============================================================\n",
    "# TYPE 3 REPAIR — TEMPORARY RESET TO ZERO (ONE BLOCK)\n",
    "# ============================================================\n",
    "\n",
    "df_type3 = df_repaired.copy()\n",
    "df_type3 = df_type3.sort_values(\n",
    "    [\"Registrar Account - ID\", \"Product - Isin\", \"Centralisation Date\"]\n",
    ")\n",
    "\n",
    "# Create lead/lag variables\n",
    "df_type3[\"aum_prev\"] = df_type3.groupby(\n",
    "    [\"Registrar Account - ID\", \"Product - Isin\"]\n",
    ")[\"Quantity - AUM\"].shift(1)\n",
    "\n",
    "df_type3[\"aum_next\"] = df_type3.groupby(\n",
    "    [\"Registrar Account - ID\", \"Product - Isin\"]\n",
    ")[\"Quantity - AUM\"].shift(-1)\n",
    "\n",
    "df_type3[\"flow_prev\"] = df_type3.groupby(\n",
    "    [\"Registrar Account - ID\", \"Product - Isin\"]\n",
    ")[\"Quantity - NetFlows\"].shift(1)\n",
    "\n",
    "df_type3[\"flow_next\"] = df_type3.groupby(\n",
    "    [\"Registrar Account - ID\", \"Product - Isin\"]\n",
    ")[\"Quantity - NetFlows\"].shift(-1)\n",
    "\n",
    "# ------------------------------------------------------------\n",
    "# Detection of temporary reset\n",
    "# ------------------------------------------------------------\n",
    "df_type3[\"type3_flag\"] = (\n",
    "    (df_type3[\"Quantity - AUM\"] == 0)\n",
    "    & (df_type3[\"aum_prev\"] > 0)\n",
    "    & (df_type3[\"aum_next\"] == df_type3[\"aum_prev\"])\n",
    "    & (df_type3[\"flow_prev\"].fillna(0) == 0)\n",
    "    & (df_type3[\"Quantity - NetFlows\"] == 0)\n",
    "    & (df_type3[\"flow_next\"].fillna(0) == 0)\n",
    ")\n",
    "\n",
    "# ------------------------------------------------------------\n",
    "# Repair: smooth the glitch (replace 0 by previous stock)\n",
    "# ------------------------------------------------------------\n",
    "df_type3.loc[df_type3[\"type3_flag\"], \"Quantity - AUM\"] = (\n",
    "    df_type3.loc[df_type3[\"type3_flag\"], \"aum_prev\"]\n",
    ")\n",
    "\n",
    "# ------------------------------------------------------------\n",
    "# Recompute temporal chain AFTER repair\n",
    "# ------------------------------------------------------------\n",
    "df_type3[\"prev_stock\"] = df_type3.groupby(\n",
    "    [\"Registrar Account - ID\", \"Product - Isin\"]\n",
    ")[\"Quantity - AUM\"].shift(1)\n",
    "\n",
    "df_type3[\"prev_flows\"] = df_type3.groupby(\n",
    "    [\"Registrar Account - ID\", \"Product - Isin\"]\n",
    ")[\"Quantity - NetFlows\"].shift(1).fillna(0)\n",
    "\n",
    "df_type3[\"expected_stock\"] = (\n",
    "    df_type3[\"prev_stock\"] + df_type3[\"prev_flows\"]\n",
    ")\n",
    "\n",
    "df_type3[\"gap\"] = df_type3[\"Quantity - AUM\"] - df_type3[\"expected_stock\"]\n",
    "df_type3[\"gap_abs\"] = df_type3[\"gap\"].abs()\n",
    "df_type3[\"gap_rel\"] = (\n",
    "    df_type3[\"gap_abs\"] /\n",
    "    df_type3[\"expected_stock\"].abs().clip(lower=1)\n",
    ")\n",
    "\n",
    "df_type3[\"rupture_flag\"] = (\n",
    "    df_type3[\"prev_stock\"].notna()\n",
    "    & (df_type3[\"gap_abs\"] > TAU_ABS)\n",
    "    & (df_type3[\"gap_rel\"] > TAU_REL)\n",
    ")\n",
    "\n",
    "# ------------------------------------------------------------\n",
    "# Diagnostic output\n",
    "# ------------------------------------------------------------\n",
    "n_type3 = df_type3[\"type3_flag\"].sum()\n",
    "print(f\"Temporary reset glitches repaired (Type 3): {n_type3}\")\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "1fc44ed4-829f-4a8a-985a-31350bdbdf6d",
   "metadata": {},
   "outputs": [],
   "source": [
    "import numpy as np\n",
    "import pandas as pd\n",
    "\n",
    "# ------------------------------------------------------------\n",
    "# 1. Sélection des ISIN avec exactement 1 rupture\n",
    "# ------------------------------------------------------------\n",
    "one_rupture_isin = rupture_isin_summary[\n",
    "    rupture_isin_summary[\"n_ruptures\"] == 1\n",
    "][[\"Registrar Account - ID\", \"Product - Isin\"]].head(100)\n",
    "\n",
    "results = []\n",
    "\n",
    "# ------------------------------------------------------------\n",
    "# 2. Boucle de correction test\n",
    "# ------------------------------------------------------------\n",
    "for _, row in one_rupture_isin.iterrows():\n",
    "    acc = row[\"Registrar Account - ID\"]\n",
    "    isin = row[\"Product - Isin\"]\n",
    "\n",
    "    sub = df[\n",
    "        (df[\"Registrar Account - ID\"] == acc) &\n",
    "        (df[\"Product - Isin\"] == isin)\n",
    "    ].sort_values(\"Centralisation Date\").copy()\n",
    "\n",
    "    # Localiser la rupture\n",
    "    rupture_idx = sub.index[sub[\"rupture_flag\"]]\n",
    "\n",
    "    if sub.index.get_loc(rupture_idx[0]) > 1:\n",
    "        #print(sub[[\"Centralisation Date\", \"Quantity - AUM\", \"expected_stock\", \"gap\", \"rupture_flag\"]].head(100))\n",
    "        continue\n",
    "\n",
    "    # Vérifier si la rupture est à la première date\n",
    "    first_idx = sub.index[0]\n",
    "    if rupture_idx[0] != first_idx:\n",
    "        continue\n",
    "\n",
    "    # ----- Réparation : décaler expected_stock -----\n",
    "    sub[\"expected_stock_fixed\"] = sub[\"expected_stock\"].shift(-1)\n",
    "\n",
    "    # Recalcul des gaps\n",
    "    sub[\"gap_fixed\"] = sub[\"Quantity - AUM\"] - sub[\"expected_stock_fixed\"]\n",
    "    sub[\"gap_abs_fixed\"] = sub[\"gap_fixed\"].abs()\n",
    "    sub[\"gap_rel_fixed\"] = sub[\"gap_abs_fixed\"] / sub[\"expected_stock_fixed\"].abs().clip(lower=1)\n",
    "\n",
    "    # Recalcul rupture\n",
    "    sub[\"rupture_fixed\"] = (\n",
    "        sub[\"expected_stock_fixed\"].notna()\n",
    "        & (sub[\"gap_abs_fixed\"] > TAU_ABS)\n",
    "        & (sub[\"gap_rel_fixed\"] > TAU_REL)\n",
    "    )\n",
    "\n",
    "    results.append({\n",
    "        \"Registrar Account - ID\": acc,\n",
    "        \"Product - Isin\": isin,\n",
    "        \"ruptures_before\": sub[\"rupture_flag\"].sum(),\n",
    "        \"ruptures_after\": sub[\"rupture_fixed\"].sum()\n",
    "    })\n",
    "\n",
    "# ------------------------------------------------------------\n",
    "# 3. Résultats agrégés\n",
    "# ------------------------------------------------------------\n",
    "repair_test = pd.DataFrame(results)\n",
    "\n",
    "summary = repair_test.groupby(\n",
    "    [\"ruptures_before\", \"ruptures_after\"]\n",
    ").size().reset_index(name=\"count\")\n",
    "\n",
    "repair_test, summary\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 50,
   "id": "d85728ca-55ba-4266-b881-23536eee4ba3",
   "metadata": {},
   "outputs": [
    {
     "ename": "KeyError",
     "evalue": "\"['corrected_aum'] not in index\"",
     "output_type": "error",
     "traceback": [
      "\u001b[31m---------------------------------------------------------------------------\u001b[39m",
      "\u001b[31mKeyError\u001b[39m                                  Traceback (most recent call last)",
      "\u001b[36mCell\u001b[39m\u001b[36m \u001b[39m\u001b[32mIn[50]\u001b[39m\u001b[32m, line 16\u001b[39m\n\u001b[32m     10\u001b[39m stocks_repaired[\u001b[33m\"\u001b[39m\u001b[33mCentralisation Date\u001b[39m\u001b[33m\"\u001b[39m] = pd.to_datetime(\n\u001b[32m     11\u001b[39m     stocks_repaired[\u001b[33m\"\u001b[39m\u001b[33mCentralisation Date\u001b[39m\u001b[33m\"\u001b[39m]\n\u001b[32m     12\u001b[39m )\n\u001b[32m     14\u001b[39m \u001b[38;5;66;03m# 2. Build repair map\u001b[39;00m\n\u001b[32m     15\u001b[39m repair_map = (\n\u001b[32m---> \u001b[39m\u001b[32m16\u001b[39m     \u001b[43mdf\u001b[49m\u001b[43m[\u001b[49m\u001b[43m[\u001b[49m\n\u001b[32m     17\u001b[39m \u001b[43m        \u001b[49m\u001b[33;43m\"\u001b[39;49m\u001b[33;43mRegistrar Account - ID\u001b[39;49m\u001b[33;43m\"\u001b[39;49m\u001b[43m,\u001b[49m\n\u001b[32m     18\u001b[39m \u001b[43m        \u001b[49m\u001b[33;43m\"\u001b[39;49m\u001b[33;43mProduct - Isin\u001b[39;49m\u001b[33;43m\"\u001b[39;49m\u001b[43m,\u001b[49m\n\u001b[32m     19\u001b[39m \u001b[43m        \u001b[49m\u001b[33;43m\"\u001b[39;49m\u001b[33;43mCentralisation Date\u001b[39;49m\u001b[33;43m\"\u001b[39;49m\u001b[43m,\u001b[49m\n\u001b[32m     20\u001b[39m \u001b[43m        \u001b[49m\u001b[33;43m\"\u001b[39;49m\u001b[33;43mcorrected_aum\u001b[39;49m\u001b[33;43m\"\u001b[39;49m\u001b[43m,\u001b[49m\n\u001b[32m     21\u001b[39m \u001b[43m        \u001b[49m\u001b[33;43m\"\u001b[39;49m\u001b[33;43mrepair_flag\u001b[39;49m\u001b[33;43m\"\u001b[39;49m\n\u001b[32m     22\u001b[39m \u001b[43m    \u001b[49m\u001b[43m]\u001b[49m\u001b[43m]\u001b[49m\n\u001b[32m     23\u001b[39m     .rename(columns={\u001b[33m\"\u001b[39m\u001b[33mcorrected_aum\u001b[39m\u001b[33m\"\u001b[39m: \u001b[33m\"\u001b[39m\u001b[33mQuantity - AUM repaired\u001b[39m\u001b[33m\"\u001b[39m})\n\u001b[32m     24\u001b[39m )\n\u001b[32m     26\u001b[39m \u001b[38;5;66;03m# 3. Merge repaired quantities\u001b[39;00m\n\u001b[32m     27\u001b[39m stocks_repaired = stocks_repaired.merge(\n\u001b[32m     28\u001b[39m     repair_map,\n\u001b[32m     29\u001b[39m     on=[\u001b[33m\"\u001b[39m\u001b[33mRegistrar Account - ID\u001b[39m\u001b[33m\"\u001b[39m, \u001b[33m\"\u001b[39m\u001b[33mProduct - Isin\u001b[39m\u001b[33m\"\u001b[39m, \u001b[33m\"\u001b[39m\u001b[33mCentralisation Date\u001b[39m\u001b[33m\"\u001b[39m],\n\u001b[32m     30\u001b[39m     how=\u001b[33m\"\u001b[39m\u001b[33mleft\u001b[39m\u001b[33m\"\u001b[39m\n\u001b[32m     31\u001b[39m )\n",
      "\u001b[36mFile \u001b[39m\u001b[32m/opt/python/lib/python3.13/site-packages/pandas/core/frame.py:4119\u001b[39m, in \u001b[36mDataFrame.__getitem__\u001b[39m\u001b[34m(self, key)\u001b[39m\n\u001b[32m   4117\u001b[39m     \u001b[38;5;28;01mif\u001b[39;00m is_iterator(key):\n\u001b[32m   4118\u001b[39m         key = \u001b[38;5;28mlist\u001b[39m(key)\n\u001b[32m-> \u001b[39m\u001b[32m4119\u001b[39m     indexer = \u001b[38;5;28;43mself\u001b[39;49m\u001b[43m.\u001b[49m\u001b[43mcolumns\u001b[49m\u001b[43m.\u001b[49m\u001b[43m_get_indexer_strict\u001b[49m\u001b[43m(\u001b[49m\u001b[43mkey\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[33;43m\"\u001b[39;49m\u001b[33;43mcolumns\u001b[39;49m\u001b[33;43m\"\u001b[39;49m\u001b[43m)\u001b[49m[\u001b[32m1\u001b[39m]\n\u001b[32m   4121\u001b[39m \u001b[38;5;66;03m# take() does not accept boolean indexers\u001b[39;00m\n\u001b[32m   4122\u001b[39m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28mgetattr\u001b[39m(indexer, \u001b[33m\"\u001b[39m\u001b[33mdtype\u001b[39m\u001b[33m\"\u001b[39m, \u001b[38;5;28;01mNone\u001b[39;00m) == \u001b[38;5;28mbool\u001b[39m:\n",
      "\u001b[36mFile \u001b[39m\u001b[32m/opt/python/lib/python3.13/site-packages/pandas/core/indexes/base.py:6212\u001b[39m, in \u001b[36mIndex._get_indexer_strict\u001b[39m\u001b[34m(self, key, axis_name)\u001b[39m\n\u001b[32m   6209\u001b[39m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[32m   6210\u001b[39m     keyarr, indexer, new_indexer = \u001b[38;5;28mself\u001b[39m._reindex_non_unique(keyarr)\n\u001b[32m-> \u001b[39m\u001b[32m6212\u001b[39m \u001b[38;5;28;43mself\u001b[39;49m\u001b[43m.\u001b[49m\u001b[43m_raise_if_missing\u001b[49m\u001b[43m(\u001b[49m\u001b[43mkeyarr\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mindexer\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43maxis_name\u001b[49m\u001b[43m)\u001b[49m\n\u001b[32m   6214\u001b[39m keyarr = \u001b[38;5;28mself\u001b[39m.take(indexer)\n\u001b[32m   6215\u001b[39m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28misinstance\u001b[39m(key, Index):\n\u001b[32m   6216\u001b[39m     \u001b[38;5;66;03m# GH 42790 - Preserve name from an Index\u001b[39;00m\n",
      "\u001b[36mFile \u001b[39m\u001b[32m/opt/python/lib/python3.13/site-packages/pandas/core/indexes/base.py:6264\u001b[39m, in \u001b[36mIndex._raise_if_missing\u001b[39m\u001b[34m(self, key, indexer, axis_name)\u001b[39m\n\u001b[32m   6261\u001b[39m     \u001b[38;5;28;01mraise\u001b[39;00m \u001b[38;5;167;01mKeyError\u001b[39;00m(\u001b[33mf\u001b[39m\u001b[33m\"\u001b[39m\u001b[33mNone of [\u001b[39m\u001b[38;5;132;01m{\u001b[39;00mkey\u001b[38;5;132;01m}\u001b[39;00m\u001b[33m] are in the [\u001b[39m\u001b[38;5;132;01m{\u001b[39;00maxis_name\u001b[38;5;132;01m}\u001b[39;00m\u001b[33m]\u001b[39m\u001b[33m\"\u001b[39m)\n\u001b[32m   6263\u001b[39m not_found = \u001b[38;5;28mlist\u001b[39m(ensure_index(key)[missing_mask.nonzero()[\u001b[32m0\u001b[39m]].unique())\n\u001b[32m-> \u001b[39m\u001b[32m6264\u001b[39m \u001b[38;5;28;01mraise\u001b[39;00m \u001b[38;5;167;01mKeyError\u001b[39;00m(\u001b[33mf\u001b[39m\u001b[33m\"\u001b[39m\u001b[38;5;132;01m{\u001b[39;00mnot_found\u001b[38;5;132;01m}\u001b[39;00m\u001b[33m not in index\u001b[39m\u001b[33m\"\u001b[39m)\n",
      "\u001b[31mKeyError\u001b[39m: \"['corrected_aum'] not in index\""
     ]
    }
   ],
   "source": [
    "import pandas as pd\n",
    "import numpy as np\n",
    "\n",
    "# ============================================================\n",
    "# Rebuild STOCKS dataset using repaired AUM quantities\n",
    "# ============================================================\n",
    "\n",
    "# 1. Copy original stocks\n",
    "stocks_repaired = stocks.copy()\n",
    "stocks_repaired[\"Centralisation Date\"] = pd.to_datetime(\n",
    "    stocks_repaired[\"Centralisation Date\"]\n",
    ")\n",
    "\n",
    "# 2. Build repair map\n",
    "repair_map = (\n",
    "    df[[\n",
    "        \"Registrar Account - ID\",\n",
    "        \"Product - Isin\",\n",
    "        \"Centralisation Date\",\n",
    "        \"corrected_aum\",\n",
    "        \"repair_flag\"\n",
    "    ]]\n",
    "    .rename(columns={\"corrected_aum\": \"Quantity - AUM repaired\"})\n",
    ")\n",
    "\n",
    "# 3. Merge repaired quantities\n",
    "stocks_repaired = stocks_repaired.merge(\n",
    "    repair_map,\n",
    "    on=[\"Registrar Account - ID\", \"Product - Isin\", \"Centralisation Date\"],\n",
    "    how=\"left\"\n",
    ")\n",
    "\n",
    "# 4. Store original quantity\n",
    "stocks_repaired[\"Quantity - AUM original\"] = stocks_repaired[\"Quantity - AUM\"]\n",
    "\n",
    "# 5. Replace Quantity - AUM where repaired\n",
    "stocks_repaired[\"Quantity - AUM\"] = np.where(\n",
    "    stocks_repaired[\"repair_flag\"] == True,\n",
    "    stocks_repaired[\"Quantity - AUM repaired\"],\n",
    "    stocks_repaired[\"Quantity - AUM\"]\n",
    ")\n",
    "\n",
    "# 6. Recompute monetary values (unit value unchanged)\n",
    "stocks_repaired[\"nav_ccy\"] = (\n",
    "    stocks_repaired[\"Value - AUM CCY\"] /\n",
    "    stocks_repaired[\"Quantity - AUM original\"]\n",
    ")\n",
    "\n",
    "stocks_repaired[\"nav_eur\"] = (\n",
    "    stocks_repaired[\"Value - AUM €\"] /\n",
    "    stocks_repaired[\"Quantity - AUM original\"]\n",
    ")\n",
    "\n",
    "stocks_repaired[\"Value - AUM CCY\"] = (\n",
    "    stocks_repaired[\"Quantity - AUM\"] *\n",
    "    stocks_repaired[\"nav_ccy\"]\n",
    ")\n",
    "\n",
    "stocks_repaired[\"Value - AUM €\"] = (\n",
    "    stocks_repaired[\"Quantity - AUM\"] *\n",
    "    stocks_repaired[\"nav_eur\"]\n",
    ")\n",
    "\n",
    "# 7. Cleanup helper columns\n",
    "stocks_repaired = stocks_repaired.drop(\n",
    "    columns=[\n",
    "        \"Quantity - AUM repaired\",\n",
    "        \"Quantity - AUM original\",\n",
    "        \"nav_ccy\",\n",
    "        \"nav_eur\"\n",
    "    ]\n",
    ")\n",
    "\n",
    "# ============================================================\n",
    "# Sanity checks (CORRECT WAY)\n",
    "# ============================================================\n",
    "\n",
    "# Share of observations repaired\n",
    "repair_share = stocks_repaired[\"repair_flag\"].mean()\n",
    "\n",
    "# Ensure only repaired points were modified\n",
    "n_modified = stocks_repaired[\"repair_flag\"].sum()\n",
    "\n",
    "print(f\"Share of repaired observations: {repair_share:.4%}\")\n",
    "print(f\"Number of repaired rows: {n_modified:,}\")\n",
    "\n",
    "stocks_repaired.to_csv('AUM_repaired.csv')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "5f262605-49e8-4304-b11e-38c8bcfc6e3f",
   "metadata": {},
   "outputs": [],
   "source": [
    "print(stocks[\"Registrar Account - ID\"].nunique())\n",
    "print(df[\"Registrar Account - ID\"].nunique())"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "37e9b599-aa51-4e03-b23c-2dd24e77fe38",
   "metadata": {},
   "outputs": [],
   "source": [
    "import pandas as pd\n",
    "\n",
    "df = pd.read_csv(\"AUM_repaired.csv\")\n",
    "\n",
    "print(df.columns)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 20,
   "id": "5cfb4526-7435-4e4a-ae48-0a8d40e39d81",
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/tmp/ipykernel_1311/55327206.py:8: DtypeWarning:\n",
      "\n",
      "Columns (1,2,3,4) have mixed types. Specify dtype option on import or set low_memory=False.\n",
      "\n",
      "/tmp/ipykernel_1311/55327206.py:9: DtypeWarning:\n",
      "\n",
      "Columns (2,3,4,5) have mixed types. Specify dtype option on import or set low_memory=False.\n",
      "\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Merged dataset size: (9033269, 6)\n",
      "\n",
      "NUMBER OF MODIFIED OBSERVATIONS: 2263602\n",
      "Share modified: 25.06 %\n",
      "\n",
      "NEGATIVE AUM\n",
      "Before repair: 34374\n",
      "After repair : 36320\n",
      "\n",
      "RAW AUM DISTRIBUTION\n",
      "count    9.033269e+06\n",
      "mean     9.106935e+03\n",
      "std      1.915018e+05\n",
      "min     -9.918641e+06\n",
      "25%      0.000000e+00\n",
      "50%      0.000000e+00\n",
      "75%      3.091340e+02\n",
      "max      4.256300e+07\n",
      "Name: Quantity - AUM_raw, dtype: float64\n",
      "\n",
      "REPAIRED AUM DISTRIBUTION\n",
      "count    9.033269e+06\n",
      "mean     9.104329e+03\n",
      "std      1.914988e+05\n",
      "min     -9.918641e+06\n",
      "25%      0.000000e+00\n",
      "50%      0.000000e+00\n",
      "75%      3.088430e+02\n",
      "max      4.256300e+07\n",
      "Name: Quantity - AUM_repaired, dtype: float64\n",
      "\n",
      "TOTAL AUM\n",
      "Raw total : 82265397351.45718\n",
      "Repaired total : 82241848877.5126\n",
      "\n",
      "TOP 20 AUM CHANGES\n",
      "        Registrar Account - ID Product - Isin Centralisation Date  \\\n",
      "8532368       OFF DISTRIBUTION   LU0992627611          2021-12-31   \n",
      "8532369       OFF DISTRIBUTION   LU0992627611          2021-12-31   \n",
      "8532370       OFF DISTRIBUTION   LU0992627611          2021-12-31   \n",
      "8477988       OFF DISTRIBUTION   LU0992627611          2021-12-31   \n",
      "8477987       OFF DISTRIBUTION   LU0992627611          2021-12-31   \n",
      "8477986       OFF DISTRIBUTION   LU0992627611          2021-12-31   \n",
      "8477989       OFF DISTRIBUTION   LU0992627611          2021-12-31   \n",
      "8532371       OFF DISTRIBUTION   LU0992627611          2021-12-31   \n",
      "8477994       OFF DISTRIBUTION   LU0992627611          2021-12-31   \n",
      "8477996       OFF DISTRIBUTION   LU0992627611          2021-12-31   \n",
      "8477997       OFF DISTRIBUTION   LU0992627611          2021-12-31   \n",
      "8928641       OFF DISTRIBUTION   LU0992627611          2021-12-31   \n",
      "8928642       OFF DISTRIBUTION   LU0992627611          2021-12-31   \n",
      "8928643       OFF DISTRIBUTION   LU0992627611          2021-12-31   \n",
      "8928644       OFF DISTRIBUTION   LU0992627611          2021-12-31   \n",
      "8477995       OFF DISTRIBUTION   LU0992627611          2021-12-31   \n",
      "8532359       OFF DISTRIBUTION   LU0992627611          2021-11-30   \n",
      "8713983       OFF DISTRIBUTION   LU0992627611          2021-11-30   \n",
      "8713984       OFF DISTRIBUTION   LU0992627611          2021-11-30   \n",
      "8532357       OFF DISTRIBUTION   LU0992627611          2021-11-30   \n",
      "\n",
      "         Quantity - AUM_raw  Quantity - AUM_repaired     aum_diff  \n",
      "8532368           41251.971              5298781.613  5257529.642  \n",
      "8532369           41251.971              5298781.613  5257529.642  \n",
      "8532370           41251.971              5298781.613  5257529.642  \n",
      "8477988         5298781.613                41251.971 -5257529.642  \n",
      "8477987         5298781.613                41251.971 -5257529.642  \n",
      "8477986         5298781.613                41251.971 -5257529.642  \n",
      "8477989         5298781.613                41251.971 -5257529.642  \n",
      "8532371           41251.971              5298781.613  5257529.642  \n",
      "8477994         5298781.613               128141.894 -5170639.719  \n",
      "8477996         5298781.613               128141.894 -5170639.719  \n",
      "8477997         5298781.613               128141.894 -5170639.719  \n",
      "8928641          128141.894              5298781.613  5170639.719  \n",
      "8928642          128141.894              5298781.613  5170639.719  \n",
      "8928643          128141.894              5298781.613  5170639.719  \n",
      "8928644          128141.894              5298781.613  5170639.719  \n",
      "8477995         5298781.613               128141.894 -5170639.719  \n",
      "8532359           41251.971              5059704.980  5018453.009  \n",
      "8713983         5059704.980                41251.971 -5018453.009  \n",
      "8713984         5059704.980                41251.971 -5018453.009  \n",
      "8532357           41251.971              5059704.980  5018453.009  \n",
      "\n",
      "ISIN WITH MOST MODIFICATIONS\n",
      "Product - Isin\n",
      "LU1623762769    0.535539\n",
      "LU2490324410    0.525588\n",
      "FR0013516044    0.524862\n",
      "LU2931971050    0.500000\n",
      "LU2931971217    0.500000\n",
      "FR001400TVB3    0.500000\n",
      "FR001400TU23    0.500000\n",
      "FR00140139F6    0.500000\n",
      "FR001400TVD9    0.500000\n",
      "LU2931971134    0.500000\n",
      "Name: aum_diff, dtype: float64\n",
      "\n",
      "REPAIR FLAG ERRORS: 2260454\n",
      "\n",
      "==============================\n",
      "COMPARISON COMPLETED\n",
      "==============================\n"
     ]
    }
   ],
   "source": [
    "import pandas as pd\n",
    "import numpy as np\n",
    "\n",
    "# ============================================================\n",
    "# LOAD DATA\n",
    "# ============================================================\n",
    "\n",
    "aum_raw = pd.read_csv(\"stocks.csv\")          # fichier original\n",
    "aum_rep = pd.read_csv(\"AUM_repaired.csv\")    # fichier réparé\n",
    "\n",
    "aum_raw[\"Centralisation Date\"] = pd.to_datetime(aum_raw[\"Centralisation Date\"])\n",
    "aum_rep[\"Centralisation Date\"] = pd.to_datetime(aum_rep[\"Centralisation Date\"])\n",
    "\n",
    "\n",
    "# ============================================================\n",
    "# KEEP SAME KEYS\n",
    "# ============================================================\n",
    "\n",
    "keys = [\n",
    "    \"Registrar Account - ID\",\n",
    "    \"Product - Isin\",\n",
    "    \"Centralisation Date\"\n",
    "]\n",
    "\n",
    "aum_raw = aum_raw[keys + [\"Quantity - AUM\"]]\n",
    "aum_rep = aum_rep[keys + [\"Quantity - AUM\", \"repair_flag\"]]\n",
    "\n",
    "\n",
    "# ============================================================\n",
    "# MERGE DATASETS\n",
    "# ============================================================\n",
    "\n",
    "df = aum_raw.merge(\n",
    "    aum_rep,\n",
    "    on=keys,\n",
    "    how=\"inner\",\n",
    "    suffixes=(\"_raw\", \"_repaired\")\n",
    ")\n",
    "\n",
    "print(\"Merged dataset size:\", df.shape)\n",
    "\n",
    "\n",
    "# ============================================================\n",
    "# 1. HOW MANY VALUES CHANGED\n",
    "# ============================================================\n",
    "\n",
    "df[\"aum_diff\"] = df[\"Quantity - AUM_repaired\"] - df[\"Quantity - AUM_raw\"]\n",
    "\n",
    "n_changed = (df[\"aum_diff\"] != 0).sum()\n",
    "\n",
    "print(\"\\nNUMBER OF MODIFIED OBSERVATIONS:\", n_changed)\n",
    "print(\"Share modified:\", round(n_changed / len(df) * 100, 2), \"%\")\n",
    "\n",
    "\n",
    "# ============================================================\n",
    "# 2. NEGATIVE AUM BEFORE / AFTER\n",
    "# ============================================================\n",
    "\n",
    "neg_before = (df[\"Quantity - AUM_raw\"] < 0).sum()\n",
    "neg_after = (df[\"Quantity - AUM_repaired\"] < 0).sum()\n",
    "\n",
    "print(\"\\nNEGATIVE AUM\")\n",
    "print(\"Before repair:\", neg_before)\n",
    "print(\"After repair :\", neg_after)\n",
    "\n",
    "\n",
    "# ============================================================\n",
    "# 3. DISTRIBUTION COMPARISON\n",
    "# ============================================================\n",
    "\n",
    "print(\"\\nRAW AUM DISTRIBUTION\")\n",
    "print(df[\"Quantity - AUM_raw\"].describe())\n",
    "\n",
    "print(\"\\nREPAIRED AUM DISTRIBUTION\")\n",
    "print(df[\"Quantity - AUM_repaired\"].describe())\n",
    "\n",
    "\n",
    "# ============================================================\n",
    "# 4. TOTAL AUM COMPARISON\n",
    "# ============================================================\n",
    "\n",
    "print(\"\\nTOTAL AUM\")\n",
    "\n",
    "print(\"Raw total :\", df[\"Quantity - AUM_raw\"].sum())\n",
    "print(\"Repaired total :\", df[\"Quantity - AUM_repaired\"].sum())\n",
    "\n",
    "\n",
    "# ============================================================\n",
    "# 5. LARGEST MODIFICATIONS\n",
    "# ============================================================\n",
    "\n",
    "largest_changes = df.sort_values(\n",
    "    \"aum_diff\",\n",
    "    key=lambda x: x.abs(),\n",
    "    ascending=False\n",
    ").head(20)\n",
    "\n",
    "print(\"\\nTOP 20 AUM CHANGES\")\n",
    "\n",
    "print(\n",
    "    largest_changes[\n",
    "        [\n",
    "            \"Registrar Account - ID\",\n",
    "            \"Product - Isin\",\n",
    "            \"Centralisation Date\",\n",
    "            \"Quantity - AUM_raw\",\n",
    "            \"Quantity - AUM_repaired\",\n",
    "            \"aum_diff\"\n",
    "        ]\n",
    "    ]\n",
    ")\n",
    "\n",
    "\n",
    "# ============================================================\n",
    "# 6. WHICH ISIN WERE MOST MODIFIED\n",
    "# ============================================================\n",
    "\n",
    "isin_changes = (\n",
    "    df.groupby(\"Product - Isin\")[\"aum_diff\"]\n",
    "    .apply(lambda x: (x != 0).mean())\n",
    "    .sort_values(ascending=False)\n",
    "    .head(10)\n",
    ")\n",
    "\n",
    "print(\"\\nISIN WITH MOST MODIFICATIONS\")\n",
    "print(isin_changes)\n",
    "\n",
    "\n",
    "# ============================================================\n",
    "# 7. CHECK REPAIR FLAG CONSISTENCY\n",
    "# ============================================================\n",
    "\n",
    "if \"repair_flag\" in df.columns:\n",
    "\n",
    "    repair_flag_errors = (\n",
    "        (df[\"repair_flag\"] == False) &\n",
    "        (df[\"Quantity - AUM_raw\"] != df[\"Quantity - AUM_repaired\"])\n",
    "    ).sum()\n",
    "\n",
    "    print(\"\\nREPAIR FLAG ERRORS:\", repair_flag_errors)\n",
    "\n",
    "\n",
    "print(\"\\n==============================\")\n",
    "print(\"COMPARISON COMPLETED\")\n",
    "print(\"==============================\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 48,
   "id": "976dd82c-5c16-44e6-aa5d-65d085714b25",
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/tmp/ipykernel_1311/1498669893.py:8: DtypeWarning:\n",
      "\n",
      "Columns (2,3,4,5) have mixed types. Specify dtype option on import or set low_memory=False.\n",
      "\n"
     ]
    }
   ],
   "source": [
    "import pandas as pd\n",
    "import numpy as np\n",
    "import plotly.graph_objects as go\n",
    "\n",
    "# ============================================================\n",
    "# 1. LOAD DATA\n",
    "# ============================================================\n",
    "aum = pd.read_csv(\"AUM_repaired.csv\")\n",
    "\n",
    "flows[\"Centralisation Date\"] = pd.to_datetime(flows[\"Centralisation Date\"])\n",
    "aum[\"Centralisation Date\"] = pd.to_datetime(aum[\"Centralisation Date\"])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 49,
   "id": "66c011b5-aed1-428e-bd18-44d8d814c283",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "application/vnd.plotly.v1+json": {
       "config": {
        "plotlyServerURL": "https://plot.ly"
       },
       "data": [
        {
         "hole": 0.45,
         "hoverinfo": "label+percent",
         "labels": [
          "Clean / quasi-clean (≤1%)",
          "Moderate (1–10%)",
          "High (10–30%)",
          "Severe (>30%)"
         ],
         "textinfo": "percent",
         "type": "pie",
         "values": {
          "bdata": "mpmZmZlZR0BmZmZmZmY7QM3MzMzMzCpAAAAAAAAAKUA=",
          "dtype": "f8"
         }
        }
       ],
       "layout": {
        "legend": {
         "orientation": "h",
         "title": {
          "text": "Rupture ratio"
         },
         "x": 0.5,
         "xanchor": "center",
         "y": -0.15,
         "yanchor": "top"
        },
        "template": {
         "data": {
          "bar": [
           {
            "error_x": {
             "color": "#2a3f5f"
            },
            "error_y": {
             "color": "#2a3f5f"
            },
            "marker": {
             "line": {
              "color": "#E5ECF6",
              "width": 0.5
             },
             "pattern": {
              "fillmode": "overlay",
              "size": 10,
              "solidity": 0.2
             }
            },
            "type": "bar"
           }
          ],
          "barpolar": [
           {
            "marker": {
             "line": {
              "color": "#E5ECF6",
              "width": 0.5
             },
             "pattern": {
              "fillmode": "overlay",
              "size": 10,
              "solidity": 0.2
             }
            },
            "type": "barpolar"
           }
          ],
          "carpet": [
           {
            "aaxis": {
             "endlinecolor": "#2a3f5f",
             "gridcolor": "white",
             "linecolor": "white",
             "minorgridcolor": "white",
             "startlinecolor": "#2a3f5f"
            },
            "baxis": {
             "endlinecolor": "#2a3f5f",
             "gridcolor": "white",
             "linecolor": "white",
             "minorgridcolor": "white",
             "startlinecolor": "#2a3f5f"
            },
            "type": "carpet"
           }
          ],
          "choropleth": [
           {
            "colorbar": {
             "outlinewidth": 0,
             "ticks": ""
            },
            "type": "choropleth"
           }
          ],
          "contour": [
           {
            "colorbar": {
             "outlinewidth": 0,
             "ticks": ""
            },
            "colorscale": [
             [
              0,
              "#0d0887"
             ],
             [
              0.1111111111111111,
              "#46039f"
             ],
             [
              0.2222222222222222,
              "#7201a8"
             ],
             [
              0.3333333333333333,
              "#9c179e"
             ],
             [
              0.4444444444444444,
              "#bd3786"
             ],
             [
              0.5555555555555556,
              "#d8576b"
             ],
             [
              0.6666666666666666,
              "#ed7953"
             ],
             [
              0.7777777777777778,
              "#fb9f3a"
             ],
             [
              0.8888888888888888,
              "#fdca26"
             ],
             [
              1,
              "#f0f921"
             ]
            ],
            "type": "contour"
           }
          ],
          "contourcarpet": [
           {
            "colorbar": {
             "outlinewidth": 0,
             "ticks": ""
            },
            "type": "contourcarpet"
           }
          ],
          "heatmap": [
           {
            "colorbar": {
             "outlinewidth": 0,
             "ticks": ""
            },
            "colorscale": [
             [
              0,
              "#0d0887"
             ],
             [
              0.1111111111111111,
              "#46039f"
             ],
             [
              0.2222222222222222,
              "#7201a8"
             ],
             [
              0.3333333333333333,
              "#9c179e"
             ],
             [
              0.4444444444444444,
              "#bd3786"
             ],
             [
              0.5555555555555556,
              "#d8576b"
             ],
             [
              0.6666666666666666,
              "#ed7953"
             ],
             [
              0.7777777777777778,
              "#fb9f3a"
             ],
             [
              0.8888888888888888,
              "#fdca26"
             ],
             [
              1,
              "#f0f921"
             ]
            ],
            "type": "heatmap"
           }
          ],
          "histogram": [
           {
            "marker": {
             "pattern": {
              "fillmode": "overlay",
              "size": 10,
              "solidity": 0.2
             }
            },
            "type": "histogram"
           }
          ],
          "histogram2d": [
           {
            "colorbar": {
             "outlinewidth": 0,
             "ticks": ""
            },
            "colorscale": [
             [
              0,
              "#0d0887"
             ],
             [
              0.1111111111111111,
              "#46039f"
             ],
             [
              0.2222222222222222,
              "#7201a8"
             ],
             [
              0.3333333333333333,
              "#9c179e"
             ],
             [
              0.4444444444444444,
              "#bd3786"
             ],
             [
              0.5555555555555556,
              "#d8576b"
             ],
             [
              0.6666666666666666,
              "#ed7953"
             ],
             [
              0.7777777777777778,
              "#fb9f3a"
             ],
             [
              0.8888888888888888,
              "#fdca26"
             ],
             [
              1,
              "#f0f921"
             ]
            ],
            "type": "histogram2d"
           }
          ],
          "histogram2dcontour": [
           {
            "colorbar": {
             "outlinewidth": 0,
             "ticks": ""
            },
            "colorscale": [
             [
              0,
              "#0d0887"
             ],
             [
              0.1111111111111111,
              "#46039f"
             ],
             [
              0.2222222222222222,
              "#7201a8"
             ],
             [
              0.3333333333333333,
              "#9c179e"
             ],
             [
              0.4444444444444444,
              "#bd3786"
             ],
             [
              0.5555555555555556,
              "#d8576b"
             ],
             [
              0.6666666666666666,
              "#ed7953"
             ],
             [
              0.7777777777777778,
              "#fb9f3a"
             ],
             [
              0.8888888888888888,
              "#fdca26"
             ],
             [
              1,
              "#f0f921"
             ]
            ],
            "type": "histogram2dcontour"
           }
          ],
          "mesh3d": [
           {
            "colorbar": {
             "outlinewidth": 0,
             "ticks": ""
            },
            "type": "mesh3d"
           }
          ],
          "parcoords": [
           {
            "line": {
             "colorbar": {
              "outlinewidth": 0,
              "ticks": ""
             }
            },
            "type": "parcoords"
           }
          ],
          "pie": [
           {
            "automargin": true,
            "type": "pie"
           }
          ],
          "scatter": [
           {
            "fillpattern": {
             "fillmode": "overlay",
             "size": 10,
             "solidity": 0.2
            },
            "type": "scatter"
           }
          ],
          "scatter3d": [
           {
            "line": {
             "colorbar": {
              "outlinewidth": 0,
              "ticks": ""
             }
            },
            "marker": {
             "colorbar": {
              "outlinewidth": 0,
              "ticks": ""
             }
            },
            "type": "scatter3d"
           }
          ],
          "scattercarpet": [
           {
            "marker": {
             "colorbar": {
              "outlinewidth": 0,
              "ticks": ""
             }
            },
            "type": "scattercarpet"
           }
          ],
          "scattergeo": [
           {
            "marker": {
             "colorbar": {
              "outlinewidth": 0,
              "ticks": ""
             }
            },
            "type": "scattergeo"
           }
          ],
          "scattergl": [
           {
            "marker": {
             "colorbar": {
              "outlinewidth": 0,
              "ticks": ""
             }
            },
            "type": "scattergl"
           }
          ],
          "scattermap": [
           {
            "marker": {
             "colorbar": {
              "outlinewidth": 0,
              "ticks": ""
             }
            },
            "type": "scattermap"
           }
          ],
          "scattermapbox": [
           {
            "marker": {
             "colorbar": {
              "outlinewidth": 0,
              "ticks": ""
             }
            },
            "type": "scattermapbox"
           }
          ],
          "scatterpolar": [
           {
            "marker": {
             "colorbar": {
              "outlinewidth": 0,
              "ticks": ""
             }
            },
            "type": "scatterpolar"
           }
          ],
          "scatterpolargl": [
           {
            "marker": {
             "colorbar": {
              "outlinewidth": 0,
              "ticks": ""
             }
            },
            "type": "scatterpolargl"
           }
          ],
          "scatterternary": [
           {
            "marker": {
             "colorbar": {
              "outlinewidth": 0,
              "ticks": ""
             }
            },
            "type": "scatterternary"
           }
          ],
          "surface": [
           {
            "colorbar": {
             "outlinewidth": 0,
             "ticks": ""
            },
            "colorscale": [
             [
              0,
              "#0d0887"
             ],
             [
              0.1111111111111111,
              "#46039f"
             ],
             [
              0.2222222222222222,
              "#7201a8"
             ],
             [
              0.3333333333333333,
              "#9c179e"
             ],
             [
              0.4444444444444444,
              "#bd3786"
             ],
             [
              0.5555555555555556,
              "#d8576b"
             ],
             [
              0.6666666666666666,
              "#ed7953"
             ],
             [
              0.7777777777777778,
              "#fb9f3a"
             ],
             [
              0.8888888888888888,
              "#fdca26"
             ],
             [
              1,
              "#f0f921"
             ]
            ],
            "type": "surface"
           }
          ],
          "table": [
           {
            "cells": {
             "fill": {
              "color": "#EBF0F8"
             },
             "line": {
              "color": "white"
             }
            },
            "header": {
             "fill": {
              "color": "#C8D4E3"
             },
             "line": {
              "color": "white"
             }
            },
            "type": "table"
           }
          ]
         },
         "layout": {
          "annotationdefaults": {
           "arrowcolor": "#2a3f5f",
           "arrowhead": 0,
           "arrowwidth": 1
          },
          "autotypenumbers": "strict",
          "coloraxis": {
           "colorbar": {
            "outlinewidth": 0,
            "ticks": ""
           }
          },
          "colorscale": {
           "diverging": [
            [
             0,
             "#8e0152"
            ],
            [
             0.1,
             "#c51b7d"
            ],
            [
             0.2,
             "#de77ae"
            ],
            [
             0.3,
             "#f1b6da"
            ],
            [
             0.4,
             "#fde0ef"
            ],
            [
             0.5,
             "#f7f7f7"
            ],
            [
             0.6,
             "#e6f5d0"
            ],
            [
             0.7,
             "#b8e186"
            ],
            [
             0.8,
             "#7fbc41"
            ],
            [
             0.9,
             "#4d9221"
            ],
            [
             1,
             "#276419"
            ]
           ],
           "sequential": [
            [
             0,
             "#0d0887"
            ],
            [
             0.1111111111111111,
             "#46039f"
            ],
            [
             0.2222222222222222,
             "#7201a8"
            ],
            [
             0.3333333333333333,
             "#9c179e"
            ],
            [
             0.4444444444444444,
             "#bd3786"
            ],
            [
             0.5555555555555556,
             "#d8576b"
            ],
            [
             0.6666666666666666,
             "#ed7953"
            ],
            [
             0.7777777777777778,
             "#fb9f3a"
            ],
            [
             0.8888888888888888,
             "#fdca26"
            ],
            [
             1,
             "#f0f921"
            ]
           ],
           "sequentialminus": [
            [
             0,
             "#0d0887"
            ],
            [
             0.1111111111111111,
             "#46039f"
            ],
            [
             0.2222222222222222,
             "#7201a8"
            ],
            [
             0.3333333333333333,
             "#9c179e"
            ],
            [
             0.4444444444444444,
             "#bd3786"
            ],
            [
             0.5555555555555556,
             "#d8576b"
            ],
            [
             0.6666666666666666,
             "#ed7953"
            ],
            [
             0.7777777777777778,
             "#fb9f3a"
            ],
            [
             0.8888888888888888,
             "#fdca26"
            ],
            [
             1,
             "#f0f921"
            ]
           ]
          },
          "colorway": [
           "#636efa",
           "#EF553B",
           "#00cc96",
           "#ab63fa",
           "#FFA15A",
           "#19d3f3",
           "#FF6692",
           "#B6E880",
           "#FF97FF",
           "#FECB52"
          ],
          "font": {
           "color": "#2a3f5f"
          },
          "geo": {
           "bgcolor": "white",
           "lakecolor": "white",
           "landcolor": "#E5ECF6",
           "showlakes": true,
           "showland": true,
           "subunitcolor": "white"
          },
          "hoverlabel": {
           "align": "left"
          },
          "hovermode": "closest",
          "mapbox": {
           "style": "light"
          },
          "paper_bgcolor": "white",
          "plot_bgcolor": "#E5ECF6",
          "polar": {
           "angularaxis": {
            "gridcolor": "white",
            "linecolor": "white",
            "ticks": ""
           },
           "bgcolor": "#E5ECF6",
           "radialaxis": {
            "gridcolor": "white",
            "linecolor": "white",
            "ticks": ""
           }
          },
          "scene": {
           "xaxis": {
            "backgroundcolor": "#E5ECF6",
            "gridcolor": "white",
            "gridwidth": 2,
            "linecolor": "white",
            "showbackground": true,
            "ticks": "",
            "zerolinecolor": "white"
           },
           "yaxis": {
            "backgroundcolor": "#E5ECF6",
            "gridcolor": "white",
            "gridwidth": 2,
            "linecolor": "white",
            "showbackground": true,
            "ticks": "",
            "zerolinecolor": "white"
           },
           "zaxis": {
            "backgroundcolor": "#E5ECF6",
            "gridcolor": "white",
            "gridwidth": 2,
            "linecolor": "white",
            "showbackground": true,
            "ticks": "",
            "zerolinecolor": "white"
           }
          },
          "shapedefaults": {
           "line": {
            "color": "#2a3f5f"
           }
          },
          "ternary": {
           "aaxis": {
            "gridcolor": "white",
            "linecolor": "white",
            "ticks": ""
           },
           "baxis": {
            "gridcolor": "white",
            "linecolor": "white",
            "ticks": ""
           },
           "bgcolor": "#E5ECF6",
           "caxis": {
            "gridcolor": "white",
            "linecolor": "white",
            "ticks": ""
           }
          },
          "title": {
           "x": 0.05
          },
          "xaxis": {
           "automargin": true,
           "gridcolor": "white",
           "linecolor": "white",
           "ticks": "",
           "title": {
            "standoff": 15
           },
           "zerolinecolor": "white",
           "zerolinewidth": 2
          },
          "yaxis": {
           "automargin": true,
           "gridcolor": "white",
           "linecolor": "white",
           "ticks": "",
           "title": {
            "standoff": 15
           },
           "zerolinecolor": "white",
           "zerolinewidth": 2
          }
         }
        },
        "title": {
         "text": "Rupture intensity distribution (AUM repaired)"
        }
       }
      },
      "image/png": "iVBORw0KGgoAAAANSUhEUgAAAwsAAAFoCAYAAAAYWWdnAAAQAElEQVR4AeydB5wcZfnHn929S7lUkpAEAoHQOwYB6SR0kF5FRPhrKCIdaRE1IoYiXURAUBQRpagg0qUpIEV6b6ElJCEJqXe53O3uf3+TvJe5vd273bstM7PffPLezLzleZ/n+74z8z7zvjMbT/MPAhCAAAQgAAEIQAACEIBADgJx4x8EIBAhApgCAQhAAAIQgAAESkcAZ6F0LJEEAQhAAAIQKC0BpEEAAhCoMgGchSo3ANVDAAIQgAAEIAABCNQGgTBaibMQxlZDZwhAAAIQgAAEIAABCFSAAM5CBSBTRVgJoDcEIAABCEAAAhCobQI4C7Xd/lgPAQhAoHYIYCkEIAABCBRNAGehaGQUgAAEIAABCEAAAhCoNgHqrwwBnIXKcKYWCEAAAhCAAAQgAAEIhI4AzkLomiysCqM3BCAAAQhAAAIQgEDYCOAshK3F0BcCEIBAEAigAwQgAAEI1AQBnIWaaGaMhAAEIAABCEAAAvkJkAKBfARwFvKRIR4CEIAABCAAAQhAAAI1TgBnIZQdAKUhAAEIQAACEIAABCBQfgI4C+VnTA0QgAAEOidAKgQgAAEIQCCgBHAWAtowqAUBCEAAAhCAQDgJoDUEokQAZyFKrYktEIAABCAAAQhAAAIQKCEBnAUrIU1EQQACEIAABCAAAQhAIEIEcBYi1JiYAgEImBkQIAABCEAAAhAoGQGchZKhRBAEIAABCEAAAqUmgDwIQKC6BHAWqsuf2iEAAQhAAAIQgAAEIBBYAiV2FgJrJ4pBAAIQgAAEIAABCEAAAkUSwFkoEhjZIVBTBDAWAhCAAAQgAIGaJoCzUNPNj/EQgAAEIFBLBLAVAhCAQLEEcBaKJUZ+CEAAAhCAAAQgAAEIVJ9ARTTAWagIZiqBAAQgAAEIQAACEIBA+AjgLISvzdA4rATQGwIQgAAEIAABCISMAM5CyBoMdSEAAQhAIBgE0AICEIBALRDAWaiFVsZGCEAAAhCAAAQgAIHOCJCWhwDOQh4wREMAAhCAAAQgAAEIQKDWCeAs1HoPCKv96A0BCEAAAhCAAAQgUHYCOAtlR0wFEIAABCDQFQHSIQABCEAgmARwFoLZLmgFAQhAAAIQgAAEwkoAvSNEAGchQo2JKRCAAAQgAAEIQAACECglAZyFUtIMqyz0hgAEIAABCEAAAhCAQA4COAs5oBAFAQhAIMwE0B0CEIAABCBQKgI4C6UiiRwIQAACEIAABCBQegJIhEBVCeAsVBU/lQeZwE233Wd7fPMs+2L23MCoOfHC3wROp0rAeezpl2zDcUfbd0672BqbFleiSuooIwH1Y39b0r5lhI1oCEAAAj0kEHhnwd1ENFDwh633PsHeeOcjq8Q/DRoLrq8SCuWoQzff7uoojiorO3OIrniUBucapPsHE0HQMZdehcARV/GVDYXkd3lUTn1e54CLK8dW9eTTryf9qlS6ivvF19xm++2+rf32irOtoW+fDqLFVjaIl+zpkCETIVvUryQvc9juvxwQ9TcF7StR+ZS/M5mqS+nKp/wqF7YgvaW/3/ZK2zB+m7F2+/WT7K33PrYLrryl0tVTHwQgAAEIdEIg8M6C0/2ayafYG4/f3BbGbzvWDj1ukpV7IOXqZwsBCFSHwBU33OFVfNqxh3jbXH/+++KbXvQqK61oTz33WklnHyT4jn883mGGSYNsxSud0HMCG667uk04Ym977KmXKvYgqOdaIwECEIBA9AmExlnIbgoNHFZdebjdcsdDJR8YZNcVhuPJ5x5jz9x7remGGwZ9O9NxxaGD7YE/XZL3KXJnZcuZVmm9vnv4Xp5zPD7z1LWcdnUmu9r9SjMGGjwess84E/9cumomQA6C9wBh3/He0+kpn0zPlbVbcV8bu75X7p6Hnva27o87dukuPmxbcQ3K+bbvbtvYoIH97da/Phw2jOgbHgJoCgEIFEkgtM5CLjv1pE/T6Voa4E/XYEJT7FqG4OI1CNGyBeVV0FICF/yzFUq7/Prbbf7CRm8mIzuPZEq26nCytVU56SKddKytjpXf1S1Z/rIuj+Jd8OsiOfmC5Eq+ZCiPtjpWvGQ4edpKN+VRkC4TzrjEs092Kl1B5ZSuINukp+Jd8MtQHh2L53Mvve2tK3f59six5j+XPMlXvGT5dddxZzo627RVXn9QnPTQ1h+fa9/Vqfwu3P/os+2yujx+Nsog+a6M2ypOaeIirp31HzH66NPpbdzEUTarrNuXLH9wurj6/PyUT+VV1umhOAVXTrJ1rG0h+qmc8rugcq5ubbPrVz5xkm1vvvuR956F8inkyqv8uYKbMdhqsw1yJXtxz770linsuuPm5vK5cl6GHv4ZOXyIbbbx2qZZBMdBfVUOipwYpRdShcqKh7i49snm4fIo3oXsNlR5yemKq3QUayfHbRWnNKezq1NyXZzTT+2s+l1Zfx5/vNKlk2Q5GW7rZCmPgvrl/1591yW328pxEdMXX3uvw0xOu4wcQAACEIBAxQgE21noBMPMWXNt3vyFtu2WG+dcw9xJ0XZJGigpwi1x0nKnEyde1ba8SU93Tz/uUBvYv8FbU+vydfdp790PPmWXXfcX+9cdl3tPjd0abN14xx10qulG6erQGt6Jk39jumFLx+4E1ffwEy94dUmu7JPNqk/yNBNx42VnefbJTuVR0BNlpetGv/Mhp5sGRIpXePyuK72Bk3/goLwaEJ/yo6vtjOMP8+p7/v7rbOURQ+3sC65vm/3RIEV8lV/pTt60GbPzrlXuTEc91VXINcMku5WmoPryBdm471ETvQGh9HFhnTVWyVekLV4cZY+4unLiqHaT3EL6z6fTZtrh3zu/jVtXM0TKf/qka+2WX05s4yyFpIf4ar/QUIh+2bLU7ho4qx/IZrWj8hz43R93GOBl66oynbW15LggWzQgX3/t1WzM6JEuusNW7axZxo3WHePlU36VU/kOmbsZccSBu3rXm9ffmeJJkHMiO/Qk3Iso4o/OyZ5eA7K5ujbI7gP9+/dtd93Kly+f+rpWiK/aWcFdF9QH1Md1jVK8gq5dOo/U7508nR9aLqrlRcqjoGvfqJHDXJYO2zVWW8lkn2PdIQMREIAABCBQUQKhdBY0CNDNVtPV3blZ+wlrYKcBk4vTwFIh1+DT5enJNtdLmrJH9SnNr4sGyeO3HWsamOV6YleIHpLpbvDKL9sUNADQcVdBywE0+Drv1CPbsrqnf1oe4h8YyKGS4yG9lVkvoh55yG7tloVoeYheYvQ7eZL315vOt003WFPFigq56pAA6SX9/PUoPlfIZaPyrbn6KG06DeLoBqouo9rwqp+d7A673Kr8Pb+fXPASMuWXoyBuEu4YaACroLhyBQ3+NNg9+8TD25YFqf6LzzvOq9K9X+AdZP5k6yqdNags5MnxosbFpgG5HFXVkRHX4b/OC8nSk3/JVj71OXFQ6FCgmxHq0+Mz56LOU9WprexQncWK1DnpHhK4ssVeA7K55rJbcVdn+qF0d/UoTnx0DupcdPH5tro++q8fyufOrckTj2nXZ3Ut1jVZ55Py5bNJOqhNlSdXkNMn+z78+PNcycRBAAIQgECFCYTGWdATsw3HHW0KW+x5vOmGp3W23blZd8bY3cg0SNFgpbO8pUrTTVs3by2jyJapOM2gaCYlO60nx9Nnzml72p9PjgZFGojlGnC75R4zZ3+Zr3hbvGYcXL7hwwZ7a5L1xFIDT5dJ3A/bbyd3WNRWgwv/IEWFtQxFcRrA6DhfcDZq8CId8uXLFy+HQk9B/bMnyrvl2PXaDaQUV84wfOgK3uxQuQdYkq+BnJj77enX0MebRSqkX6lcMX1ajFUmV9DTZ/HXeeLSpZt0lCPn4kqx1eyCzlM5RNq6c6AUsktxDcjXBzQzqeumC7qW9kRnd26Js1+OrsVy2lwfcDZ11n7+8uyHkgBKQwACNUAgNM6CW+ahZQwaCOhTihrolaONKn1z00BaA2rdxN0N3W0VVw4bC5EpB0WDOg3snT5uq6UF0rkQOf48GlDoqbjaULY5efnWO/vL5tuXTD3llWO
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "# ============================================================\n",
    "# 2. PREPARE FLOWS\n",
    "# ============================================================\n",
    "\n",
    "flows_clean = (\n",
    "    flows\n",
    "    .groupby(\n",
    "        [\"Registrar Account - ID\",\"Product - Isin\",\"Centralisation Date\"],\n",
    "        as_index=False\n",
    "    )[\"Quantity - NetFlows\"]\n",
    "    .sum()\n",
    ")\n",
    "\n",
    "# ============================================================\n",
    "# 3. MERGE\n",
    "# ============================================================\n",
    "\n",
    "df = aum.merge(\n",
    "    flows_clean,\n",
    "    on=[\"Registrar Account - ID\",\"Product - Isin\",\"Centralisation Date\"],\n",
    "    how=\"left\"\n",
    ")\n",
    "\n",
    "df[\"Quantity - NetFlows\"] = df[\"Quantity - NetFlows\"].fillna(0)\n",
    "\n",
    "# ============================================================\n",
    "# 4. SORT\n",
    "# ============================================================\n",
    "\n",
    "df = df.sort_values(\n",
    "    [\"Registrar Account - ID\",\"Product - Isin\",\"Centralisation Date\"]\n",
    ")\n",
    "\n",
    "# ============================================================\n",
    "# REBUILD ACCOUNTING IDENTITY WITH REPAIRED AUM\n",
    "# ============================================================\n",
    "\n",
    "df[\"prev_aum\"] = df.groupby(\n",
    "    [\"Registrar Account - ID\",\"Product - Isin\"]\n",
    ")[\"Quantity - AUM\"].shift(1)\n",
    "\n",
    "df[\"prev_flow\"] = df.groupby(\n",
    "    [\"Registrar Account - ID\",\"Product - Isin\"]\n",
    ")[\"Quantity - NetFlows\"].shift(1).fillna(0)\n",
    "\n",
    "df[\"expected_aum\"] = df[\"prev_aum\"] + df[\"prev_flow\"]\n",
    "\n",
    "# ============================================================\n",
    "# COMPUTE GAP\n",
    "# ============================================================\n",
    "\n",
    "df[\"gap\"] = df[\"Quantity - AUM\"] - df[\"expected_aum\"]\n",
    "df[\"gap_abs\"] = df[\"gap\"].abs()\n",
    "\n",
    "EPS = 10\n",
    "\n",
    "df[\"rupture_flag\"] = (\n",
    "    df[\"prev_aum\"].notna()\n",
    "    & (df[\"gap_abs\"] > EPS)\n",
    ")\n",
    "# ============================================================\n",
    "# 6. COMPUTE GAP\n",
    "# ============================================================\n",
    "\n",
    "df[\"gap\"] = df[\"Quantity - AUM\"] - df[\"expected_aum\"]\n",
    "df[\"gap_abs\"] = df[\"gap\"].abs()\n",
    "\n",
    "EPS = 10\n",
    "\n",
    "df[\"rupture_flag\"] = (\n",
    "    df[\"prev_aum\"].notna()\n",
    "    & (df[\"gap_abs\"] > EPS)\n",
    ")\n",
    "\n",
    "# ============================================================\n",
    "# 7. BUILD RUPTURE SUMMARY\n",
    "# ============================================================\n",
    "\n",
    "rupture_summary = (\n",
    "    df.groupby([\"Registrar Account - ID\",\"Product - Isin\"])\n",
    "    .agg(\n",
    "        n_ruptures=(\"rupture_flag\",\"sum\"),\n",
    "        total_obs=(\"rupture_flag\",\"count\"),\n",
    "        rupture_ratio=(\"rupture_flag\",\"mean\")\n",
    "    )\n",
    "    .reset_index()\n",
    ")\n",
    "\n",
    "# ============================================================\n",
    "# 8. SAME CLASSIFICATION AS YOUR CODE\n",
    "# ============================================================\n",
    "\n",
    "rs = rupture_summary.copy()\n",
    "\n",
    "bins = [0, 0.01, 0.10, 0.30, 1.01]\n",
    "\n",
    "labels = [\n",
    "    \"Clean / quasi-clean (≤1%)\",\n",
    "    \"Moderate (1–10%)\",\n",
    "    \"High (10–30%)\",\n",
    "    \"Severe (>30%)\"\n",
    "]\n",
    "\n",
    "rs[\"rupture_class\"] = pd.cut(\n",
    "    rs[\"rupture_ratio\"],\n",
    "    bins=bins,\n",
    "    labels=labels,\n",
    "    include_lowest=True\n",
    ")\n",
    "\n",
    "# ============================================================\n",
    "# 9. DISTRIBUTION\n",
    "# ============================================================\n",
    "\n",
    "dist = (\n",
    "    rs[\"rupture_class\"]\n",
    "    .value_counts(normalize=True)\n",
    "    .sort_index()\n",
    "    * 100\n",
    ").round(1)\n",
    "\n",
    "# ============================================================\n",
    "# 10. DONUT CHART\n",
    "# ============================================================\n",
    "\n",
    "fig = go.Figure(\n",
    "    data=[go.Pie(\n",
    "        labels=dist.index,\n",
    "        values=dist.values,\n",
    "        hole=0.45,\n",
    "        textinfo=\"percent\",\n",
    "        hoverinfo=\"label+percent\"\n",
    "    )]\n",
    ")\n",
    "\n",
    "fig.update_layout(\n",
    "    title=\"Rupture intensity distribution (AUM repaired)\",\n",
    "    legend=dict(\n",
    "        orientation=\"h\",\n",
    "        yanchor=\"top\",\n",
    "        y=-0.15,\n",
    "        xanchor=\"center\",\n",
    "        x=0.5\n",
    "    ),\n",
    "    legend_title_text=\"Rupture ratio\"\n",
    ")\n",
    "\n",
    "fig.show()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "990898ea-ceca-46bb-bfb3-c87bf289d272",
   "metadata": {},
   "outputs": [],
   "source": [
    "import seaborn as sns\n",
    "import matplotlib.pyplot as plt\n",
    "\n",
    "df = merged_isin.copy()\n",
    "\n",
    "# Ajouter année / mois\n",
    "df[\"year\"] = df[\"Centralisation Date\"].dt.year\n",
    "df[\"month\"] = df[\"Centralisation Date\"].dt.month\n",
    "\n",
    "# 1. Nombre total de lignes par mois\n",
    "total = df.groupby([\"year\", \"month\"]).size().reset_index(name=\"total_lines\")\n",
    "\n",
    "# 2. Nombre de ruptures par mois\n",
    "ruptures = df[df[\"rupture_flag\"]].groupby([\"year\", \"month\"]).size().reset_index(name=\"n_ruptures\")\n",
    "\n",
    "# 3. Merge pour obtenir total + ruptures\n",
    "ratio = total.merge(ruptures, on=[\"year\",\"month\"], how=\"left\")\n",
    "ratio[\"n_ruptures\"] = ratio[\"n_ruptures\"].fillna(0)\n",
    "\n",
    "# 4. Proportion (en %)\n",
    "ratio[\"rupture_ratio\"] = ratio[\"n_ruptures\"] / ratio[\"total_lines\"]\n",
    "\n",
    "# 5. Pivot pour heatmap\n",
    "heatmap_ratio = ratio.pivot(index=\"year\", columns=\"month\", values=\"rupture_ratio\").fillna(0)\n",
    "\n",
    "# 6. Plot\n",
    "plt.figure(figsize=(14, 7))\n",
    "sns.heatmap(\n",
    "    heatmap_ratio, \n",
    "    cmap=\"Reds\",\n",
    "    linewidths=.3,\n",
    "    linecolor=\"grey\",\n",
    "    annot=True,\n",
    "    fmt=\".2%\",\n",
    "    cbar_kws={'label': 'Proportion de ruptures'}\n",
    ")\n",
    "\n",
    "plt.title(\"Heatmap de la proportion de ruptures (par année et mois)\", fontsize=16)\n",
    "plt.xlabel(\"Mois\")\n",
    "plt.ylabel(\"Année\")\n",
    "plt.show()\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "4d335589-c519-458d-857d-a051813b950b",
   "metadata": {},
   "outputs": [],
   "source": [
    "df = merged_isin.copy()\n",
    "\n",
    "# Ajouter year / month au cas où\n",
    "df[\"year\"] = df[\"Centralisation Date\"].dt.year\n",
    "df[\"month\"] = df[\"Centralisation Date\"].dt.month\n",
    "\n",
    "# Merge géographique\n",
    "df = df.merge(\n",
    "    geo[[\"Registrar Account - ID\", \"country\"]],\n",
    "    on=\"Registrar Account - ID\",\n",
    "    how=\"left\"\n",
    ")\n",
    "\n",
    "df[\"country\"] = df[\"country\"].fillna(\"UNKNOWN\")\n",
    "\n",
    "# Total des lignes par pays\n",
    "total_country = df.groupby(\"country\").size().reset_index(name=\"total_obs\")\n",
    "\n",
    "# Nombre de ruptures\n",
    "rupt_country = (\n",
    "    df[df[\"rupture_flag\"]]\n",
    "    .groupby(\"country\")\n",
    "    .size()\n",
    "    .reset_index(name=\"ruptures\")\n",
    ")\n",
    "\n",
    "# Merge + ratios\n",
    "country_stats = total_country.merge(rupt_country, on=\"country\", how=\"left\")\n",
    "country_stats[\"ruptures\"] = country_stats[\"ruptures\"].fillna(0)\n",
    "country_stats[\"rupture_ratio\"] = country_stats[\"ruptures\"] / country_stats[\"total_obs\"]\n",
    "\n",
    "# Tri (rupture ratio décroissant)\n",
    "country_stats = country_stats.sort_values(\"rupture_ratio\", ascending=False)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "8a45a111-25da-4f5c-9723-c3efd25c906d",
   "metadata": {},
   "outputs": [],
   "source": [
    "# On ajoute une colonne en % pour l’affichage\n",
    "import plotly.express as px\n",
    "\n",
    "country_stats_plot = country_stats.copy()\n",
    "country_stats_plot[\"rupture_pct\"] = country_stats_plot[\"rupture_ratio\"] * 100\n",
    "\n",
    "# Tri décroissant par proportion de ruptures\n",
    "country_stats_plot = country_stats_plot.sort_values(\"rupture_ratio\", ascending=False)\n",
    "\n",
    "fig = px.bar(\n",
    "    country_stats_plot,\n",
    "    x=\"country\",\n",
    "    y=\"rupture_ratio\",\n",
    "    hover_data={\n",
    "        \"rupture_pct\": ':.2f',\n",
    "        \"ruptures\": True,\n",
    "        \"total_obs\": True,\n",
    "        \"rupture_ratio\": False,  # on cache la version décimale\n",
    "    },\n",
    "    labels={\n",
    "        \"country\": \"Pays\",\n",
    "        \"rupture_ratio\": \"Proportion de ruptures\",\n",
    "        \"rupture_pct\": \"% de ruptures\",\n",
    "        \"ruptures\": \"Nb de ruptures\",\n",
    "        \"total_obs\": \"Nb d'observations\"\n",
    "    },\n",
    "    title=\"Proportion de ruptures par pays (avec volumes au survol)\"\n",
    ")\n",
    "\n",
    "# Format en %\n",
    "fig.update_yaxes(tickformat=\".1%\")\n",
    "\n",
    "fig.update_layout(\n",
    "    xaxis_tickangle=-45,\n",
    "    bargap=0.2\n",
    ")\n",
    "\n",
    "fig.show()\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "a4af9841-6cf9-4d27-8096-ac878e866bc6",
   "metadata": {},
   "outputs": [],
   "source": [
    "rs = rupture_summary.copy()\n",
    "\n",
    "# 1. Stats numériques classiques\n",
    "print(\"\\n=== BASIC NUMERIC STATS ===\")\n",
    "print(rs[\"rupture_ratio\"].describe(percentiles=[0.01, 0.05, 0.10, 0.25, 0.5, 0.75, 0.90, 0.95, 0.99]))\n",
    "\n",
    "\n",
    "# 2. Distribution par classes (bins)\n",
    "\n",
    "rs[\"rupture_bucket\"] = pd.cut(\n",
    "    rs[\"rupture_ratio\"],\n",
    "    bins=[0, 0.001, 0.01, 0.05, 0.10, 0.25, 0.50, 1.01],\n",
    "    labels=[\n",
    "        \"0–0.1%\",\n",
    "        \"0.1–1%\",\n",
    "        \"1–5%\",\n",
    "        \"5–10%\",\n",
    "        \"10–25%\",\n",
    "        \"25–50%\",\n",
    "        \"50–100%\"\n",
    "    ],\n",
    "    include_lowest=True\n",
    ")\n",
    "\n",
    "# Ajouter la catégorie \"0%\"\n",
    "rs[\"rupture_bucket\"] = rs[\"rupture_bucket\"].cat.add_categories(\"0%\")\n",
    "\n",
    "# Remplacer les 0% exacts\n",
    "rs.loc[rs[\"rupture_ratio\"] == 0, \"rupture_bucket\"] = \"0%\"\n",
    "\n",
    "bucket_counts = rs[\"rupture_bucket\"].value_counts().sort_index()\n",
    "print(bucket_counts)\n",
    "\n",
    "\n",
    "# 3. Pourcentages\n",
    "bucket_percent = (bucket_counts / len(rs) * 100).round(2)\n",
    "\n",
    "print(\"\\n=== DISTRIBUTION (PERCENT) ===\")\n",
    "print(bucket_percent)\n",
    "\n",
    "\n",
    "# 4. Nombre de comptes totalement propres\n",
    "no_rupture = (rs[\"n_ruptures\"] == 0).sum()\n",
    "print(f\"\\nComptes avec 0 rupture = {no_rupture} ({no_rupture/len(rs)*100:.2f}%)\")\n",
    "\n",
    "# 5. Comptes extrêmement problématiques\n",
    "severe = (rs[\"rupture_ratio\"] > 0.75).sum()\n",
    "print(f\"Comptes avec rupture_ratio > 75% = {severe} ({severe/len(rs)*100:.2f}%)\")\n",
    "\n",
    "medium = (rs[\"rupture_ratio\"] > 0.10).sum()\n",
    "print(f\"Comptes avec rupture_ratio > 10% = {medium} ({medium/len(rs)*100:.2f}%)\")\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "f39a9a5a-5f4e-4cac-9f63-e6952582b6ff",
   "metadata": {},
   "outputs": [],
   "source": [
    "import plotly.express as px\n",
    "\n",
    "fig = px.histogram(\n",
    "    rs,\n",
    "    x=\"rupture_ratio\",\n",
    "    nbins=50,\n",
    "    title=\"Distribution du rupture_ratio\",\n",
    "    labels={\"rupture_ratio\": \"Rupture Ratio\"},\n",
    ")\n",
    "fig.update_layout(bargap=0.05)\n",
    "fig.show()\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "70132995-8379-44b6-8ff6-f09524c4e4d0",
   "metadata": {},
   "outputs": [],
   "source": [
    "# --- 1. Filtres de base ---\n",
    "merged[\"year\"] = merged[\"Centralisation Date\"].dt.year\n",
    "\n",
    "# Filtrer uniquement l'année 2021\n",
    "ruptures_2021 = merged[(merged[\"year\"] == 2021) & (merged[\"rupture_flag\"] == True)].copy()\n",
    "\n",
    "print(\"Nombre total de ruptures en 2021 :\", len(ruptures_2021))\n",
    "\n",
    "# --- 2. Classification du type de gap ---\n",
    "ruptures_2021[\"gap_type\"] = np.where(ruptures_2021[\"gap\"] > 0, \"positive\", \"negative\")\n",
    "\n",
    "# --- 3. Statistiques globales ---\n",
    "gap_counts = ruptures_2021[\"gap_type\"].value_counts()\n",
    "gap_percent = ruptures_2021[\"gap_type\"].value_counts(normalize=True) * 100\n",
    "\n",
    "print(\"\\n=== RUPTURES 2021 — POSITIVES vs NEGATIVES ===\")\n",
    "print(gap_counts)\n",
    "print(\"\\n(%)\")\n",
    "print(gap_percent.map(lambda x: f\"{x:.2f}%\"))\n",
    "\n",
    "# --- 4. Intensité des écarts ---\n",
    "intensity_stats = ruptures_2021.groupby(\"gap_type\")[\"gap\"].describe()\n",
    "print(\"\\n=== STATISTIQUES DES GAPS ===\")\n",
    "print(intensity_stats)\n",
    "\n",
    "# --- 5. Visualisation rapide ---\n",
    "import seaborn as sns\n",
    "import matplotlib.pyplot as plt\n",
    "\n",
    "plt.figure(figsize=(10,5))\n",
    "sns.histplot(data=ruptures_2021, x=\"gap\", hue=\"gap_type\", bins=80, kde=True)\n",
    "plt.xlim(-merged[\"gap\"].abs().max(), merged[\"gap\"].abs().max())\n",
    "plt.title(\"Distribution des gaps de rupture en 2021\")\n",
    "plt.xlabel(\"Gap (AUM_{t} − Expected AUM_{t})\")\n",
    "plt.show()\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "1faf943a-4703-4b19-a867-2670ac3a5209",
   "metadata": {},
   "outputs": [],
   "source": [
    "# --- 1. ADD YEAR ---\n",
    "merged[\"year\"] = merged[\"Centralisation Date\"].dt.year\n",
    "\n",
    "# --- 2. DEFINE PERIODS ---\n",
    "conditions = [\n",
    "    merged[\"year\"] < 2021,\n",
    "    merged[\"year\"] == 2021,\n",
    "    merged[\"year\"] > 2021\n",
    "]\n",
    "\n",
    "period_labels = [\"before_2021\", \"during_2021\", \"after_2021\"]\n",
    "\n",
    "merged[\"period\"] = np.select(\n",
    "    conditions,\n",
    "    period_labels,\n",
    "    default=\"unknown\"\n",
    ")\n",
    "\n",
    "# --- 3. CREATE GAP TYPE & FILTER ONLY RUPTURES ---\n",
    "merged[\"gap_type\"] = np.where(\n",
    "    merged[\"gap\"] > 0, \"positive\",\n",
    "    np.where(merged[\"gap\"] < 0, \"negative\", \"zero\")\n",
    ")\n",
    "\n",
    "ruptures = merged[merged[\"rupture_flag\"] == True].copy()\n",
    "\n",
    "# --- 4. TOTAL OBS PER PERIOD ---\n",
    "total_obs = merged.groupby(\"period\").size().rename(\"total_obs\")\n",
    "\n",
    "# --- 5. TOTAL RUPTURES PER PERIOD ---\n",
    "rupture_counts = ruptures.groupby(\"period\").size().rename(\"rupture_count\")\n",
    "\n",
    "# --- 6. PROPORTION OF RUPTURES ---\n",
    "rupture_ratio = (rupture_counts / total_obs).rename(\"rupture_ratio\")\n",
    "\n",
    "# --- 7. POSITIVE / NEGATIVE GAPS (% among ruptures) ---\n",
    "gap_dist = (\n",
    "    ruptures.groupby([\"period\", \"gap_type\"])\n",
    "    .size()\n",
    "    .groupby(level=0)\n",
    "    .apply(lambda x: (x / x.sum()) * 100)   # % par période\n",
    ")\n",
    "\n",
    "\n",
    "# --- 8. MERGE AND DISPLAY ---\n",
    "summary = pd.concat([total_obs, rupture_counts, rupture_ratio], axis=1)\n",
    "summary[\"rupture_ratio\"] = (summary[\"rupture_ratio\"] * 100).round(2)\n",
    "\n",
    "print(\"\\n=== RUPTURE SUMMARY (in %) ===\")\n",
    "print(summary)\n",
    "\n",
    "print(\"\\n=== GAP POSITIVE / NEGATIVE DISTRIBUTION (in %) ===\")\n",
    "print(gap_dist)\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "5abee764-b890-4ea1-8f98-5a0ff1512611",
   "metadata": {},
   "outputs": [],
   "source": [
    "from plotly.subplots import make_subplots\n",
    "import plotly.graph_objects as go\n",
    "\n",
    "# --- 1. DEFINE PERIODS ---\n",
    "merged[\"period2\"] = np.where(\n",
    "    merged[\"Centralisation Date\"] < pd.Timestamp(\"2021-09-01\"),\n",
    "    \"Before Sep 2021\",\n",
    "    \"After Sep 2021\"\n",
    ")\n",
    "\n",
    "ruptures = merged[merged[\"rupture_flag\"] == True].copy()\n",
    "\n",
    "# --- 2. Ensure gap_type exists + no missing categories ---\n",
    "ruptures[\"gap_type\"] = ruptures[\"gap_type\"].replace({\"zero\": \"positive\"})  # zero is equivalent to no-flow change\n",
    "\n",
    "# --- 3. Compute gap counts ---\n",
    "gap_counts = (\n",
    "    ruptures.groupby([\"period2\", \"gap_type\"])\n",
    "    .size()\n",
    "    .unstack(fill_value=0)\n",
    ")\n",
    "\n",
    "# Ensure both columns exist\n",
    "for col in [\"positive\", \"negative\"]:\n",
    "    if col not in gap_counts.columns:\n",
    "        gap_counts[col] = 0\n",
    "\n",
    "gap_counts = gap_counts[[\"positive\", \"negative\"]]\n",
    "\n",
    "# --- 4. Extract values ---\n",
    "before_vals = gap_counts.loc[\"Before Sep 2021\"].values\n",
    "after_vals  = gap_counts.loc[\"After Sep 2021\"].values\n",
    "\n",
    "# --- 5. MAKE TWO DONUT CHARTS ---\n",
    "fig = make_subplots(\n",
    "    rows=1, cols=2,\n",
    "    specs=[[{\"type\": \"pie\"}, {\"type\": \"pie\"}]],\n",
    "    subplot_titles=(\"Before Sep 2021\", \"After Sep 2021\")\n",
    ")\n",
    "\n",
    "fig.add_trace(\n",
    "    go.Pie(\n",
    "        labels=[\"Negative gaps\", \"Positive gaps\"],\n",
    "        values=before_vals,\n",
    "        marker_colors=[\"#E67E22\", \"#3498DB\"],\n",
    "        hole=0.45,\n",
    "        textinfo=\"label+percent\"\n",
    "    ),\n",
    "    row=1, col=1\n",
    ")\n",
    "\n",
    "fig.add_trace(\n",
    "    go.Pie(\n",
    "        labels=[\"Negative gaps\", \"Positive gaps\"],\n",
    "        values=after_vals,\n",
    "        marker_colors=[\"#E67E22\", \"#3498DB\"],\n",
    "        hole=0.45,\n",
    "        textinfo=\"label+percent\"\n",
    "    ),\n",
    "    row=1, col=2\n",
    ")\n",
    "\n",
    "fig.update_layout(\n",
    "    title=\"Nature des ruptures (positive / negative)\\nAvant vs Après Septembre 2021\",\n",
    "    showlegend=True\n",
    ")\n",
    "\n",
    "fig.show()\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "3aa3b8a0-f499-495a-9171-2e09d0bb1e5f",
   "metadata": {},
   "outputs": [],
   "source": [
    "import plotly.graph_objects as go\n",
    "\n",
    "# --- 1. Compute gap counts by period ---\n",
    "gap_counts = (\n",
    "    ruptures.groupby([\"period2\", \"gap_type\"])\n",
    "    .size()\n",
    "    .unstack(fill_value=0)\n",
    ")\n",
    "\n",
    "# Ensure both columns exist\n",
    "for col in [\"positive\", \"negative\"]:\n",
    "    if col not in gap_counts.columns:\n",
    "        gap_counts[col] = 0\n",
    "\n",
    "gap_counts = gap_counts[[\"positive\", \"negative\"]]\n",
    "\n",
    "# --- 2. Extract values ---\n",
    "before_vals = gap_counts.loc[\"Before Sep 2021\"].values\n",
    "after_vals  = gap_counts.loc[\"After Sep 2021\"].values\n",
    "\n",
    "# --- 3. Plot : TWO PIE CHARTS side by side ---\n",
    "fig = make_subplots(\n",
    "    rows=1, cols=2,\n",
    "    specs=[[{\"type\": \"pie\"}, {\"type\": \"pie\"}]],\n",
    "    subplot_titles=(\"Before 2021\", \"After 2021\")\n",
    ")\n",
    "\n",
    "fig.add_trace(\n",
    "    go.Pie(\n",
    "        labels=[\"Negative gaps\", \"Positive gaps\"],\n",
    "        values=before_vals,\n",
    "        marker_colors=[\"#E67E22\", \"#3498DB\"],\n",
    "        hole=0.35\n",
    "    ),\n",
    "    row=1, col=1\n",
    ")\n",
    "\n",
    "fig.add_trace(\n",
    "    go.Pie(\n",
    "        labels=[\"Negative gaps\", \"Positive gaps\"],\n",
    "        values=after_vals,\n",
    "        marker_colors=[\"#E67E22\", \"#3498DB\"],\n",
    "        hole=0.35\n",
    "    ),\n",
    "    row=1, col=2\n",
    ")\n",
    "\n",
    "fig.update_layout(\n",
    "    title=\"Répartition des ruptures (positive / negative)\\nAvant vs Après 2021\"\n",
    ")\n",
    "\n",
    "fig.show()\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "d4f0dc74-649d-4105-9a1a-44a18d126a3c",
   "metadata": {},
   "outputs": [],
   "source": [
    "import plotly.graph_objects as go\n",
    "\n",
    "# --- 1. Define periods ---\n",
    "merged[\"period2\"] = np.where(\n",
    "    merged[\"Centralisation Date\"] < pd.Timestamp(\"2021-09-01\"),\n",
    "    \"Before Sep 2021\",\n",
    "    \"After Sep 2021\"\n",
    ")\n",
    "\n",
    "# --- 2. Keep only ruptures ---\n",
    "ruptures = merged[merged[\"rupture_flag\"] == True].copy()\n",
    "\n",
    "# --- 3. Count ruptures per period ---\n",
    "rupture_counts = ruptures[\"period2\"].value_counts().reindex(\n",
    "    [\"Before Sep 2021\", \"After Sep 2021\"]\n",
    ").fillna(0)\n",
    "\n",
    "# --- 4. Pie chart ---\n",
    "fig = go.Figure(data=[\n",
    "    go.Pie(\n",
    "        labels=rupture_counts.index,\n",
    "        values=rupture_counts.values,\n",
    "        hole=0.45,\n",
    "        marker_colors=[\"#2ECC71\", \"#E74C3C\"],\n",
    "        textinfo=\"percent+value\",\n",
    "    )\n",
    "])\n",
    "\n",
    "fig.update_layout(\n",
    "    title=\"Répartition des ruptures\"\n",
    ")\n",
    "\n",
    "fig.show()\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "ecccd73c-00a6-4ff3-b213-e85b98ec5a55",
   "metadata": {},
   "outputs": [],
   "source": [
    "import pandas as pd\n",
    "import numpy as np\n",
    "\n",
    "# 1. Filtre sur la période post-Sept 2021\n",
    "cutoff = pd.Timestamp(\"2021-09-01\")\n",
    "post = merged[merged[\"Centralisation Date\"] >= cutoff].copy()\n",
    "\n",
    "# 2. On ne garde que les ruptures\n",
    "post_rupt = post[post[\"rupture_flag\"] == True].copy()\n",
    "\n",
    "# 3. Gap absolu + gap relatif (% du stock)\n",
    "post_rupt[\"gap_abs\"] = post_rupt[\"gap\"].abs()\n",
    "post_rupt[\"gap_rel\"] = post_rupt[\"gap_abs\"] / post_rupt[\"Quantity - AUM\"].replace(0, np.nan)\n",
    "\n",
    "# 4. Percentiles globaux\n",
    "p90 = post_rupt[\"gap_abs\"].quantile(0.90)\n",
    "p95 = post_rupt[\"gap_abs\"].quantile(0.95)\n",
    "p99 = post_rupt[\"gap_abs\"].quantile(0.99)\n",
    "\n",
    "# 5. Classification automatique\n",
    "def classify_gap(gap, gap_rel, acct):\n",
    "    # RESET → énorme choc (technique)\n",
    "    if gap_abs >= p99 or gap_rel >= 0.90:\n",
    "        return \"reset\"\n",
    "\n",
    "    # SPIKE → très gros gap mais isolé\n",
    "    if gap_abs >= p95:\n",
    "        return \"spike\"\n",
    "\n",
    "    # SHIFT → décalage permanent\n",
    "    # Test : moyenne des gaps du compte\n",
    "    return None\n",
    "\n",
    "# Calcul du shift (décalage directionnel)\n",
    "shift_info = post_rupt.groupby(\"Registrar Account - ID\")[\"gap\"].mean().rename(\"avg_gap\")\n",
    "\n",
    "post_rupt = post_rupt.merge(shift_info, on=\"Registrar Account - ID\", how=\"left\")\n",
    "\n",
    "post_rupt[\"gap_type2\"] = np.where(\n",
    "    post_rupt[\"gap_abs\"] >= p99, \"reset\",\n",
    "    np.where(post_rupt[\"gap_abs\"] >= p95, \"spike\",\n",
    "    np.where(post_rupt[\"avg_gap\"].abs() > post_rupt[\"gap_abs\"].median(), \"shift\", \"micro\")))\n",
    " \n",
    "# 6. Statistiques globales\n",
    "stats = post_rupt[\"gap_type2\"].value_counts(normalize=True).round(3) * 100\n",
    "print(\"\\n=== DISTRIBUTION DES TYPES DE GAPS POST-2021 ===\")\n",
    "print(stats)\n",
    "\n",
    "# 7. Stats par client\n",
    "client_stats = (\n",
    "    post_rupt.groupby(\"Registrar Account - ID\")[\"gap_type2\"]\n",
    "    .value_counts(normalize=True)\n",
    "    .rename(\"ratio\")\n",
    "    .mul(100)\n",
    "    .reset_index()\n",
    ")\n",
    "\n",
    "# 8. Stats par ISIN\n",
    "isin_stats = (\n",
    "    post_rupt.groupby(\"Product - Isin\")[\"gap_type2\"]\n",
    "    .value_counts(normalize=True)\n",
    "    .rename(\"ratio\")\n",
    "    .mul(100)\n",
    "    .reset_index()\n",
    ")\n",
    "\n",
    "print(\"\\n=== TOP ISIN PAR RESET ===\")\n",
    "print(isin_stats[isin_stats[\"gap_type2\"]==\"reset\"].sort_values(\"ratio\", ascending=False).head(10))\n",
    "\n",
    "print(\"\\n=== TOP CLIENTS PAR RESET ===\")\n",
    "print(client_stats[client_stats[\"gap_type2\"]==\"reset\"].sort_values(\"ratio\", ascending=False).head(10))\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "c2efc5e0-bc35-4fa7-ab5d-6be616964446",
   "metadata": {},
   "outputs": [],
   "source": [
    "import plotly.graph_objects as go\n",
    "\n",
    "# --- Data from your output ---\n",
    "labels = [\"Micro-ruptures\", \"Décalage\", \"Anomalies ponctuelles\", \"Remise à zéro\"]\n",
    "values = [50.4, 44.6, 4.0, 1.0]\n",
    "\n",
    "# --- Pie chart ---\n",
    "fig = go.Figure(\n",
    "    data=[go.Pie(\n",
    "        labels=labels,\n",
    "        values=values,\n",
    "        hole=0.35,                  # donut style (plus lisible)\n",
    "        textinfo='percent',\n",
    "        marker=dict(colors=[\"#3498DB\", \"#E67E22\", \"#9B59B6\", \"#E74C3C\"])\n",
    "    )]\n",
    ")\n",
    "\n",
    "fig.update_layout(\n",
    "    title=\"Typologie des ruptures depuis Septembre 2021\",\n",
    "    legend_title=\"Type de gap\",\n",
    ")\n",
    "\n",
    "fig.show()\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "744e04b6-3f34-40c9-95fe-a5605e7c7f02",
   "metadata": {},
   "outputs": [],
   "source": [
    "merged[\"gap_abs\"] = merged[\"gap\"].abs()\n",
    "\n",
    "merged[\"gap_rel\"] = (\n",
    "    merged[\"gap_abs\"] /\n",
    "    merged[\"Quantity - AUM\"].replace(0, np.nan)\n",
    ")\n",
    "\n",
    "merged.loc[merged[\"rupture_flag\"], \"gap_rel\"].describe(\n",
    "    percentiles=[0.5, 0.75, 0.9, 0.95, 0.99]\n",
    ")\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "3d20625e-1045-4b7a-ab64-3381997e4131",
   "metadata": {},
   "outputs": [],
   "source": [
    "# uniquement sur les ruptures\n",
    "df_r = merged[merged[\"rupture_flag\"]].copy()\n",
    "\n",
    "# seuils globaux (descriptifs, pas \"optimisés\")\n",
    "q90 = df_r[\"gap_abs\"].quantile(0.90)\n",
    "q99 = df_r[\"gap_abs\"].quantile(0.99)\n",
    "\n",
    "# moyenne directionnelle par compte\n",
    "avg_gap_by_account = (\n",
    "    df_r.groupby(\"Registrar Account - ID\")[\"gap\"]\n",
    "    .mean()\n",
    "    .rename(\"avg_gap\")\n",
    ")\n",
    "\n",
    "df_r = df_r.merge(avg_gap_by_account, on=\"Registrar Account - ID\", how=\"left\")\n",
    "\n",
    "def classify_gap(row):\n",
    "    if row[\"gap_abs\"] >= q99:\n",
    "        return \"reset\"\n",
    "    if row[\"gap_abs\"] >= q90:\n",
    "        return \"spike\"\n",
    "    if abs(row[\"avg_gap\"]) > row[\"gap_abs\"]:\n",
    "        return \"shift\"\n",
    "    return \"micro\"\n",
    "\n",
    "df_r[\"discontinuity_type\"] = df_r.apply(classify_gap, axis=1)\n",
    "df_r[\"discontinuity_type\"].value_counts(normalize=True) * 100\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "02806629-e454-4e10-82be-6e2239091088",
   "metadata": {},
   "outputs": [],
   "source": [
    "merged[\"year\"] = merged[\"Centralisation Date\"].dt.year\n",
    "\n",
    "yearly_stats = merged.groupby(\"year\").agg(\n",
    "    total_obs=(\"gap\", \"count\"),\n",
    "    ruptures=(\"rupture_flag\", \"sum\")\n",
    ").reset_index()\n",
    "\n",
    "yearly_stats[\"rupture_rate\"] = (\n",
    "    yearly_stats[\"ruptures\"] / yearly_stats[\"total_obs\"]\n",
    ")\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "2edf2c55-45e7-4aad-b4f9-5c35178abad6",
   "metadata": {},
   "outputs": [],
   "source": [
    "import matplotlib.pyplot as plt\n",
    "import numpy as np\n",
    "\n",
    "df_r = merged[merged[\"rupture_flag\"]].copy()\n",
    "\n",
    "plt.figure(figsize=(12,4))\n",
    "plt.hist(df_r[\"gap_abs\"], bins=100, log=True)\n",
    "plt.title(\"Distribution of absolute gaps (log scale)\")\n",
    "plt.xlabel(\"Absolute gap\")\n",
    "plt.ylabel(\"Frequency (log)\")\n",
    "plt.show()\n",
    "\n",
    "plt.figure(figsize=(12,4))\n",
    "plt.hist(df_r[\"gap_rel\"].dropna(), bins=100, log=True)\n",
    "plt.title(\"Distribution of relative gaps (|gap| / AUM)\")\n",
    "plt.xlabel(\"Relative gap\")\n",
    "plt.ylabel(\"Frequency (log)\")\n",
    "plt.show()\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "981f2ec6-574b-41ea-b4bf-45be54aeda1f",
   "metadata": {},
   "outputs": [],
   "source": [
    "plt.figure(figsize=(10,4))\n",
    "plt.plot(yearly_stats[\"year\"], yearly_stats[\"rupture_rate\"], marker=\"o\")\n",
    "plt.title(\"Evolution of AUM–Flow inconsistency rate over time\")\n",
    "plt.xlabel(\"Year\")\n",
    "plt.ylabel(\"Rupture rate\")\n",
    "plt.grid(True)\n",
    "plt.show()\n"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.13.11"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}