2026-04-13 21:51:04 +02:00
25 changed files with 655861 additions and 208 deletions
--- a/.gitignore
+++ b/.gitignore
@ -0,0 +1,3 @@
 data/
 data_exploration/
 *.csv
--- a/.ipynb_checkpoints/dataloader-checkpoint.ipynb
+++ b/.ipynb_checkpoints/dataloader-checkpoint.ipynb
@ -1,6 +0,0 @@
 {
 "cells": [],
 "metadata": {},
 "nbformat": 4,
 "nbformat_minor": 5
 }
--- a/README.md
+++ b/README.md
@ -0,0 +1 @@
 Carmignac Project 
--- a/Readme.md
+++ b/Readme.md
@ -1 +0,0 @@
 #Carmignac Project 
--- a/dataloader.ipynb
+++ b/dataloader.ipynb
@ -1,201 +0,0 @@
 {
 "cells": [
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "126c8a80-d9ad-4816-84f0-0c3d580f62c8",
   "metadata": {},
   "outputs": [],
   "source": [
    "import pandas as pd"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "id": "ff2261fb-9516-4410-b42d-3acc8dc1a460",
   "metadata": {},
   "outputs": [],
   "source": [
    "import os\n",
    "import s3fs\n",
    "\n",
    "# Credentials are read from the environment; never hardcode keys or session tokens in a notebook.\n",
    "# The key/secret/token that used to be committed in this cell must be treated as compromised and revoked.\n",
    "_missing = [name for name in (\"AWS_ACCESS_KEY_ID\", \"AWS_SECRET_ACCESS_KEY\", \"AWS_SESSION_TOKEN\") if not os.environ.get(name)]\n",
    "if _missing:\n",
    "    raise RuntimeError(\"Missing S3 credentials in the environment: \" + \", \".join(_missing))\n",
    "os.environ[\"AWS_DEFAULT_REGION\"] = 'us-east-1'\n",
    "fs = s3fs.S3FileSystem(\n",
    "    client_kwargs={'endpoint_url': 'https://'+'minio-simple.lab.groupe-genes.fr'},\n",
    "    key = os.environ[\"AWS_ACCESS_KEY_ID\"], \n",
    "    secret = os.environ[\"AWS_SECRET_ACCESS_KEY\"], \n",
    "    token = os.environ[\"AWS_SESSION_TOKEN\"])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "id": "dc546698-76dc-4eaf-b9e2-7602953bf8f5",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>Morningstar Global Asset Class</th>\n",
       "      <th>Morningstar Global Category</th>\n",
       "      <th>Morningstar Category</th>\n",
       "      <th>Combined Country</th>\n",
       "      <th>Combined Channel Type</th>\n",
       "      <th>Combined Type</th>\n",
       "      <th>Month/Year (Record Date)</th>\n",
       "      <th>Combined Net Assets</th>\n",
       "      <th>Combined Net Sales</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>Luxembourg</td>\n",
       "      <td>Proprietary</td>\n",
       "      <td>Domestic</td>\n",
       "      <td>Jan 2015</td>\n",
       "      <td>11170000.0</td>\n",
       "      <td>799301.47</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>Luxembourg</td>\n",
       "      <td>Proprietary</td>\n",
       "      <td>Domestic</td>\n",
       "      <td>Feb 2015</td>\n",
       "      <td>21210000.0</td>\n",
       "      <td>8922456.46</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>Luxembourg</td>\n",
       "      <td>Proprietary</td>\n",
       "      <td>Domestic</td>\n",
       "      <td>Mar 2015</td>\n",
       "      <td>23670000.0</td>\n",
       "      <td>1718627.81</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>Luxembourg</td>\n",
       "      <td>Proprietary</td>\n",
       "      <td>Domestic</td>\n",
       "      <td>Apr 2015</td>\n",
       "      <td>22720000.0</td>\n",
       "      <td>-670097.35</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>Luxembourg</td>\n",
       "      <td>Proprietary</td>\n",
       "      <td>Domestic</td>\n",
       "      <td>May 2015</td>\n",
       "      <td>23550000.0</td>\n",
       "      <td>204625.93</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "  Morningstar Global Asset Class Morningstar Global Category  \\\n",
       "0                            NaN                         NaN   \n",
       "1                            NaN                         NaN   \n",
       "2                            NaN                         NaN   \n",
       "3                            NaN                         NaN   \n",
       "4                            NaN                         NaN   \n",
       "\n",
       "  Morningstar Category Combined Country Combined Channel Type Combined Type  \\\n",
       "0                  NaN       Luxembourg           Proprietary      Domestic   \n",
       "1                  NaN       Luxembourg           Proprietary      Domestic   \n",
       "2                  NaN       Luxembourg           Proprietary      Domestic   \n",
       "3                  NaN       Luxembourg           Proprietary      Domestic   \n",
       "4                  NaN       Luxembourg           Proprietary      Domestic   \n",
       "\n",
       "  Month/Year (Record Date)  Combined Net Assets  Combined Net Sales  \n",
       "0                 Jan 2015           11170000.0           799301.47  \n",
       "1                 Feb 2015           21210000.0          8922456.46  \n",
       "2                 Mar 2015           23670000.0          1718627.81  \n",
       "3                 Apr 2015           22720000.0          -670097.35  \n",
       "4                 May 2015           23550000.0           204625.93  "
      ]
     },
     "execution_count": 8,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "with fs.open('projet-bdc-data/carmignac/Data Modélisation/market data/broadridge_Global Market data MS.csv', 'rb') as f:\n",
    "    df = pd.read_csv(f)\n",
    "\n",
    "df.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "7494bffd-83b5-42e2-b17e-d04c90f3b59e",
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.13.8"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
 }
--- a/dataset_features.csv
+++ b/dataset_features.csv
--- a/notebooks/aum_flows_analysis.ipynb
+++ b/notebooks/aum_flows_analysis.ipynb
--- a/notebooks/competitors_analysis.ipynb
+++ b/notebooks/competitors_analysis.ipynb
--- a/notebooks/dataloader.ipynb
+++ b/notebooks/dataloader.ipynb
--- a/notebooks/push_s3.ipynb
+++ b/notebooks/push_s3.ipynb
@ -0,0 +1,70 @@
 {
 "cells": [
  {
   "cell_type": "markdown",
   "id": "d2701d07",
   "metadata": {},
   "source": [
    "# Helper notebook for pushing data to S3"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "id": "5c8fc6c5",
   "metadata": {},
   "outputs": [],
   "source": [
    "import os\n",
    "import shutil\n",
    "import s3fs\n",
    "\n",
    "def push_file(local_path, s3_path):\n",
    "    \"\"\"Upload the local file `local_path` to the object-store path `s3_path`.\n",
    "\n",
    "    Credentials are read from the AWS_ACCESS_KEY_ID, AWS_SECRET_ACCESS_KEY and\n",
    "    AWS_SESSION_TOKEN environment variables; a KeyError is raised if one is missing.\n",
    "    \"\"\"\n",
    "    fs = s3fs.S3FileSystem(\n",
    "        client_kwargs={'endpoint_url': 'https://' + 'minio-simple.lab.groupe-genes.fr'},\n",
    "        key=os.environ[\"AWS_ACCESS_KEY_ID\"],\n",
    "        secret=os.environ[\"AWS_SECRET_ACCESS_KEY\"],\n",
    "        token=os.environ[\"AWS_SESSION_TOKEN\"]\n",
    "    )\n",
    "\n",
    "    # Stream the file in chunks instead of loading it entirely into memory.\n",
    "    with open(local_path, 'rb') as local_f, fs.open(s3_path, 'wb') as s3_f:\n",
    "        shutil.copyfileobj(local_f, s3_f)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "d43b725e",
   "metadata": {},
   "outputs": [],
   "source": [
    "# (local file, destination) pairs; destinations are kept exactly as they were.\n",
    "# NOTE(review): every destination has a double slash after the bucket name - confirm this is intended.\n",
    "UPLOADS = [\n",
    "    ('repair_challenge/alpha_5%/carmignac_broken_months.csv', 'projet-bdc-carmignac-g3//paco/carmignac_broken_months.csv'),\n",
    "    ('repair_challenge/alpha_5%/carmignac_error_account_agg.csv', 'projet-bdc-carmignac-g3//paco/carmignac_error_account_agg.csv'),\n",
    "    ('repair_challenge/alpha_5%/carmignac_error_account.csv', 'projet-bdc-carmignac-g3//paco/carmignac_error_account.csv'),\n",
    "    ('AUM_repaired.csv', 'projet-bdc-carmignac-g3//paco/AUM_repaired.csv'),\n",
    "    ('AUM_paths.csv', 'projet-bdc-carmignac-g3//paco/AUM_paths.csv'),\n",
    "    ('AUM_repair_audit.csv', 'projet-bdc-carmignac-g3//paco/AUM_repair_audit.csv'),\n",
    "]\n",
    "for local_path, s3_path in UPLOADS:\n",
    "    push_file(local_path, s3_path)"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.13.12"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
 }
--- a/peers_summary.csv
+++ b/peers_summary.csv
@ -0,0 +1,28 @@
 strategy,n_carmignac_sc,n_competitors,n_index_funds,ms_categories,broad_category
 CAD,2,27,2,"EAA Fund Asia ex-Japan Equity, EAA Fund Asia ex-Japan Small/Mid-Cap Equity, EAA Fund Asia-Pacific Equity, EAA Fund Asia-Pacific ex-Japan Equity, EAA Fund Global Emerging Markets ex-China Equity",Equity
 CARE,2,22,0,"EAA Fund Equity Market Neutral EUR, EAA Fund Long/Short Equity - Global, EAA Fund Long/Short Equity - Europe, EAA Fund Macro Trading EUR",Alternative
 CCNE,1,28,0,"EAA Fund Greater China Equity, EAA Fund China Equity, EAA Fund China Equity - A Shares",Equity
 CCR,3,36,1,"EAA Fund EUR Corporate Bond, EAA Fund EUR Flexible Bond, EAA Fund Global Flexible Bond - EUR Hedged, EAA Fund EUR High Yield Bond, EAA Fund Global Corporate Bond - EUR Hedged",Fixed Income
 CEMD,1,34,0,"EAA Fund Global Emerging Markets Bond, EAA Fund Global Emerging Markets Bond - EUR Hedged, EAA Fund Other Bond, EAA Fund Global Emerging Markets Bond - Local Currency, Global Emerging Markets Bond, Global Emerging Markets Bond - EUR Hedged, Global Emerging Markets Bond - Local Currency",Fixed Income
 CEMP,2,11,0,"EAA Fund Global Emerging Markets Allocation, EAA Fund Other Allocation, EAA Fund Asia Allocation, EAA Fund Greater China Allocation, Global Emerging Markets Allocation",Allocation
 CE,3,40,1,"EAA Fund Global Emerging Markets Equity, Global Emerging Markets Equity",Equity
 CFB,2,20,1,"EAA Fund EUR Flexible Bond, EAA Fund EUR Diversified Bond, EAA Fund Global Flexible Bond - EUR Hedged, EAA Fund Global Diversified Bond - EUR Hedged, EUR Flexible Bond",Fixed Income
 CFG,1,10,0,"EAA Fund Europe ex-UK Small/Mid-Cap Equity, EAA Fund Europe Flex-Cap Equity, EAA Fund Europe Mid-Cap Equity, EAA Fund Europe Small-Cap Equity, EAA Fund Eurozone Large-Cap Equity, EAA Fund Eurozone Mid-Cap Equity, EAA Fund Global Flex-Cap Equity, EAA Fund Global Large-Cap Growth Equity",Equity
 CGB,2,35,2,"EAA Fund Global Diversified Bond, EAA Fund Global Flexible Bond - EUR Hedged, Global Diversified Bond, EAA Fund Global Flexible Bond, EAA Fund Other Bond, EAA Fund EUR Diversified Bond - Short Term, EAA Fund EUR Flexible Bond, EAA Fund Global Government Bond, EAA Fund Global Corporate Bond - EUR Hedged, EAA Fund Global Diversified Bond - EUR Hedged, EAA Fund Global Government Bond - EUR Hedged",Fixed Income
 CGC,1,22,0,"EAA Fund Global Large-Cap Growth Equity, EAA Fund Other Equity, EAA Fund Global Large-Cap Blend Equity",Equity
 CGE,2,52,0,"EAA Fund Europe Large-Cap Blend Equity, EAA Fund Europe Large-Cap Growth Equity, EAA Fund Europe Large-Cap Value Equity, EAA Fund Eurozone Large-Cap Equity, EAA Fund Europe Flex-Cap Equity, EAA Fund Europe Equity Income, Europe Large-Cap Growth Equity",Equity
 CHX,1,10,0,"EAA Fund Europe Large-Cap Blend Equity, EAA Fund Europe Mid-Cap Equity, EAA Fund Eurozone Flex-Cap Equity, EAA Fund Eurozone Large-Cap Equity, EAA Fund Global Large-Cap Blend Equity, EAA Fund Global Large-Cap Growth Equity, EAA Fund Other Equity, EAA Fund Sector Equity Consumer Goods & Services, EAA Fund Sector Equity Ecology",Equity
 CIL,2,12,0,"EAA Fund EUR Flexible Allocation - Global, EAA Fund EUR Flexible Allocation, EAA Fund EUR Moderate Allocation - Global, EAA Fund EUR Cautious Allocation - Global, EUR Flexible Allocation - Global",Allocation
 CI,3,28,0,"EAA Fund Global Large-Cap Growth Equity, EAA Fund Global Large-Cap Value Equity, EAA Fund Global Large-Cap Blend Equity, EAA Fund Other Equity, EAA Fund Global Equity Income, EAA Fund Global Flex-Cap Equity, EAA Fund Europe Flex-Cap Equity",Equity
 CMAP,1,21,0,"EAA Fund Event Driven, EAA Fund Relative Value Arbitrage",Alternative
 CMA,1,4,0,EAA Fund Event Driven,Alternative
 CPE,2,19,0,"EAA Fund EUR Moderate Allocation, EAA Fund EUR Cautious Allocation, EAA Fund EUR Flexible Allocation, EAA Fund EUR Aggressive Allocation, EAA Fund EUR Moderate Allocation - Global, EUR Moderate Allocation",Allocation
 CPI,2,18,0,"EAA Fund EUR Flexible Allocation - Global, EAA Fund EUR Moderate Allocation - Global, EAA Fund EUR Flexible Allocation, EAA Fund EUR Cautious Allocation - Global, EAA Fund Other Allocation, EAA Fund USD Moderate Allocation, EAA Fund EUR Cautious Allocation, EAA Fund Macro Trading EUR, EAA Fund GBP Flexible Allocation, EAA Fund Global Inflation-Linked Bond - EUR Hedged, EAA Fund Commodities - Broad Basket",Allocation
 CP,2,34,0,"EAA Fund EUR Moderate Allocation - Global, EAA Fund USD Moderate Allocation, EAA Fund EUR Flexible Allocation - Global, EAA Fund EUR Cautious Allocation - Global, EAA Fund EUR Aggressive Allocation - Global, EAA Fund EUR Cautious Allocation, EAA Fund EUR Flexible Allocation, EAA Fund EUR Diversified Bond, EAA Fund EUR Moderate Allocation, EUR Moderate Allocation - Global",Allocation
 CS,2,27,2,"EAA Fund EUR Diversified Bond - Short Term, EAA Fund EUR Government Bond - Short Term, EAA Fund Global Flexible Bond - EUR Hedged, EAA Fund EUR Ultra Short-Term Bond, EAA Fund EUR Flexible Bond, EAA Fund EUR Corporate Bond - Short Term, EAA Fund EUR Diversified Bond, EAA Fund EUR Corporate Bond",Fixed Income
 CTS,2,24,0,"EAA Fund Sector Equity Technology, EAA Fund US Flex-Cap Equity, Sector Equity Technology",Equity
 PLSEE,2,27,0,"EAA Fund Long/Short Equity - Global, EAA Fund Equity Market Neutral EUR, EAA Fund Long/Short Equity - Europe, EAA Fund Long/Short Equity - Other, EAA Fund Europe Large-Cap Blend Equity",Equity
 UKCEL,2,27,0,"EAA Fund Europe ex-UK Equity, EAA Fund Europe ex-UK Small/Mid-Cap Equity, EAA Fund Other Equity, EAA Fund Europe Large-Cap Blend Equity",Equity
 UKCE,2,21,0,EAA Fund Global Emerging Markets Equity,Equity
 UKCGB,5,26,0,"EAA Fund Global Flexible Bond - GBP Hedged, EAA Fund Global Flexible Bond, EAA Fund Global Diversified Bond, EAA Fund Global Diversified Bond - GBP Hedged, EAA Fund GBP Allocation 0-20% Equity",Fixed Income
 UKCGEC,3,17,0,"EAA Fund Global Large-Cap Growth Equity, EAA Fund Global Large-Cap Blend Equity",Equity
--- a/relative_performance.csv
+++ b/relative_performance.csv
--- a/src/pycache/feature_engineering.cpython-313.pyc
+++ b/src/pycache/feature_engineering.cpython-313.pyc
--- a/src/pycache/peers_loader.cpython-313.pyc
+++ b/src/pycache/peers_loader.cpython-313.pyc
--- a/src/pycache/predictive_model.cpython-313.pyc
+++ b/src/pycache/predictive_model.cpython-313.pyc
--- a/src/pycache/relative_performance.cpython-313.pyc
+++ b/src/pycache/relative_performance.cpython-313.pyc
--- a/src/repair_challenge/carmignac_analysis.py
+++ b/src/repair_challenge/carmignac_analysis.py
@ -0,0 +1,328 @@
 """
 Pipeline Results Analysis
 =====================================================
 Analyses the CSV outputs produced by carmignac_repair.py:
  - carmignac_scores.csv      (post-surgery score history)
  - carmignac_mapping.csv     (reg_id mapping history)
  - carmignac_surgery_log.csv (surgery operations)
 Produces a self-contained HTML report with interactive charts.
 Usage:
    python carmignac_analysis.py
    python carmignac_analysis.py --scores path/to/scores.csv \
                                  --mapping path/to/mapping.csv \
                                  --surgery path/to/surgery_log.csv \
                                  --out report.html
 """
 import argparse
 import os
 import sys
 import numpy as np
 import pandas as pd
 from helpers import build_html_repair
 # ─────────────────────────────────────────────────────────────
 # 1. LOAD & VALIDATE
 # ─────────────────────────────────────────────────────────────
 def load_outputs(
    scores_path, mapping_path, surgery_path, err_isin_path=None, err_agg_path=None
 ):
    """
    Read the pipeline CSV outputs and normalise their column dtypes.

    The scores, mapping and surgery files are mandatory; each is read with its
    ``date`` column parsed.  Identifier columns are cast to ``str``, the
    mapping ``changed`` flag to ``bool``, and a ``lookback_months`` column
    (value 1) is added to the surgery log when absent.

    The two error-account files are optional: each is read only when its path
    is given and exists on disk, otherwise ``None`` is returned in its place.

    Returns
    -------
    (scores, mapping, surgery, err_isin, err_agg)
    """

    def _read_optional(csv_path):
        # No path, or a path that does not exist, means "not provided".
        if not (csv_path and os.path.exists(csv_path)):
            return None
        return pd.read_csv(csv_path, parse_dates=["date"])

    # Mandatory outputs, read in the order scores -> mapping -> surgery.
    scores, mapping, surgery = (
        pd.read_csv(csv_path, parse_dates=["date"])
        for csv_path in (scores_path, mapping_path, surgery_path)
    )

    # Normalise dtypes: identifiers as text, the change flag as bool.
    scores["reg_id"] = scores["reg_id"].astype(str)
    for id_col in ("reg_orig", "reg_used"):
        mapping[id_col] = mapping[id_col].astype(str)
    mapping["changed"] = mapping["changed"].astype(bool)
    for id_col in ("reg_orig", "reg_from", "reg_to"):
        surgery[id_col] = surgery[id_col].astype(str)
    if "lookback_months" not in surgery.columns:
        surgery["lookback_months"] = 1  # backwards compat

    # Error account (optional)
    err_isin = _read_optional(err_isin_path)
    if err_isin is not None:
        err_isin["isin"] = err_isin["isin"].astype(str)
    err_agg = _read_optional(err_agg_path)

    return scores, mapping, surgery, err_isin, err_agg
 # ─────────────────────────────────────────────────────────────
 # LOAD ERROR ACCOUNT (optional)
 # ─────────────────────────────────────────────────────────────
 def load_error_account(isin_path, agg_path):
    """
    Read the two error-account CSVs (per-ISIN and aggregated), described in
    the original notes as produced by carmignac_diagnostics.py.

    Both paths are required: if either one is falsy nothing is read.  Any
    failure while reading or normalising the files is reported with a
    ``[WARN]`` line on stdout instead of being raised.

    Returns
    -------
    (df_err_isin, df_err_agg), or (None, None) when a path is missing or the
    files could not be loaded.
    """
    if not (isin_path and agg_path):
        return None, None
    try:
        isin_frame = pd.read_csv(isin_path, parse_dates=["date"])
        agg_frame = pd.read_csv(agg_path, parse_dates=["date"])
        # ISIN codes are kept as text.
        isin_frame["isin"] = isin_frame["isin"].astype(str)
        n_rows = len(isin_frame)
        n_isins = isin_frame["isin"].nunique()
        print(f"[Load] error account (ISIN) : {n_rows} rows, {n_isins} ISINs")
        print(f"[Load] error account (agg)  : {len(agg_frame)} rows")
    except Exception as exc:
        print(f"[WARN] Could not load error account: {exc}")
        return None, None
    return isin_frame, agg_frame
 # ─────────────────────────────────────────────────────────────
 # 2. COMPUTE ANALYTICS
 # ─────────────────────────────────────────────────────────────
 def compute_analytics(scores, mapping, surgery):
    """
    Derive the tables consumed by the console summary and the HTML report.

    Parameters
    ----------
    scores : DataFrame
        Needs the columns ``date``, ``reg_id`` and ``score``.
    mapping : DataFrame
        Needs the columns ``date``, ``reg_orig`` and a boolean ``changed``.
    surgery : DataFrame
        Needs the columns ``date``, ``reg_orig``, ``gain_vs_no_surgery``,
        ``jaccard_composite``, ``score_before`` and ``score_after``.

    Returns
    -------
    dict with the keys
        ``timeline``       per-date frame: ``sum_post``, ``sum_pre``,
                           ``recovery_pct`` and ``entropy``
        ``surgery_stats``  per-date surgery aggregates (0-filled where no surgery)
        ``pivot``          scores in wide form (rows = dates, columns = reg_ids)
        ``churn``          per-date sum of the mapping ``changed`` flag
        ``ever_remapped``  set of ``reg_orig`` values with ``changed`` true at least once
        ``surgery_detail`` copy of ``surgery`` plus ``gain_pct_of_score``
        ``dates``          the sorted unique score dates as ``YYYY-MM-DD`` strings
    """
    dates = sorted(scores["date"].unique())
    # ── 2.1  Sum of scores per date (post-surgery) ──────────────
    sum_post = scores.groupby("date")["score"].sum().reindex(dates).rename("sum_post")
    # ── 2.2  Reconstruct pre-surgery (counterfactual) ───────────
    # Without surgery, every reg_id that had a hard break would score 0
    # from that date backwards.  We propagate the surgery "gain" as a
    # cumulative deficit going back in time.
    gain_by_date = surgery.groupby("date")["gain_vs_no_surgery"].sum()
    # cumulative deficit = sum of gains for all surgeries at or after date t
    cumulative_deficit = pd.Series(
        [gain_by_date[gain_by_date.index >= d].sum() for d in dates],
        index=dates,
        dtype=float,
    )
    sum_pre = (sum_post - cumulative_deficit).clip(lower=0).rename("sum_pre")
    timeline = pd.DataFrame({"sum_post": sum_post, "sum_pre": sum_pre})
    timeline.index = pd.to_datetime(timeline.index)
    # Share of the post-surgery total attributable to surgery; the clip only
    # guards the division when the post-surgery sum is 0.
    timeline["recovery_pct"] = np.where(
        sum_pre < sum_post,
        (sum_post - sum_pre) / sum_post.clip(lower=1e-9) * 100,
        0.0,
    )
    # ── 2.3  Per-date surgery stats ─────────────────────────────
    surgery_stats = (
        surgery.groupby("date")
        .agg(
            n_surgeries=("reg_orig", "count"),
            total_gain=("gain_vs_no_surgery", "sum"),
            avg_gain=("gain_vs_no_surgery", "mean"),
            avg_jaccard=("jaccard_composite", "mean"),
            avg_score_before=("score_before", "mean"),
            avg_score_after=("score_after", "mean"),
        )
        .reindex(dates, fill_value=0)
    )
    # ── 2.4  Score distribution over time ───────────────────────
    # Wide format: rows=dates, cols=reg_ids
    pivot = scores.pivot_table(
        index="date", columns="reg_id", values="score", aggfunc="last"
    )
    pivot = pivot.reindex(dates)
    pivot.index = pd.to_datetime(pivot.index)
    # ── 2.5  Mapping churn ──────────────────────────────────────
    # For each date, how many reg_ids are remapped (not using their original code)?
    churn = (
        mapping.groupby("date")["changed"]
        .sum()
        .reindex(dates, fill_value=0)
        .rename("n_remapped")
    )
    # ── 2.6  Score entropy (distribution spread) ────────────────
    def entropy(row):
        # Shannon entropy (natural log) of the strictly positive scores of one date.
        p = row.dropna()
        p = p[p > 0]
        if len(p) == 0:
            return np.nan
        p = p / p.sum()
        return -(p * np.log(p)).sum()
    timeline["entropy"] = pivot.apply(entropy, axis=1).values
    # ── 2.7  Individual score trajectories ──────────────────────
    # Identify which reg_ids were ever remapped
    ever_remapped = set(mapping.loc[mapping["changed"], "reg_orig"].unique())
    # ── 2.8  Surgery detail table ───────────────────────────────
    surgery_detail = surgery.copy()
    surgery_detail["gain_pct_of_score"] = (
        surgery_detail["gain_vs_no_surgery"]
        / surgery_detail["score_before"].clip(lower=1e-9)
        * 100
    ).round(2)
    return {
        "timeline": timeline,
        "surgery_stats": surgery_stats,
        "pivot": pivot,
        "churn": churn,
        "ever_remapped": ever_remapped,
        "surgery_detail": surgery_detail,
        # pd.Timestamp() accepts Timestamps, numpy datetime64 values and ISO
        # strings, so formatting no longer depends on what ``unique()`` yielded.
        "dates": [pd.Timestamp(d).strftime("%Y-%m-%d") for d in dates],
    }
 # ─────────────────────────────────────────────────────────────
 # 3. PRINT CONSOLE SUMMARY
 # ─────────────────────────────────────────────────────────────
 def print_summary(analytics, surgery):
    """
    Print a human-readable digest of the analytics to stdout.

    Parameters
    ----------
    analytics : dict
        Must provide ``timeline`` (date-indexed frame with ``sum_post``,
        ``sum_pre``, ``recovery_pct`` and ``entropy``), ``surgery_stats``,
        ``pivot``, ``churn`` (date-indexed series) and ``ever_remapped``.
    surgery : DataFrame
        Surgery log; when non-empty its ``date``, ``reg_orig``, ``reg_from``,
        ``reg_to``, ``jaccard_composite`` and ``gain_vs_no_surgery`` columns
        are listed one operation per line, ordered by date.

    Returns nothing; the report is written with ``print`` only.
    """
    timeline = analytics["timeline"]
    # The key is looked up here as before, although nothing from it is printed.
    _ = analytics["surgery_stats"]

    banner = "=" * 65
    print("\n" + banner)
    print("  CARMIGNAC PIPELINE — RESULTS SUMMARY")
    print(banner)
    print(
        "\n  Date range   : "
        f"{timeline.index.min().date()} → {timeline.index.max().date()}"
    )
    print(f"  Total months : {len(timeline)}")
    print(f"  Reg IDs      : {analytics['pivot'].shape[1]}")

    print("\n  ── Score (Σ) ──────────────────────────────────────────")
    post = timeline["sum_post"]
    print(f"  At t_ref (latest)  : {post.iloc[-1]:.6f}")
    print(f"  At t_min (earliest): {post.iloc[0]:.6f}")
    print(f"  Min (post-surgery) : {post.min():.6f}  ({post.idxmin().date()})")
    pre = timeline["sum_pre"]
    print(f"  Min (pre-surgery)  : {pre.min():.6f}  ({pre.idxmin().date()})")
    print(f"  Max recovery (pct) : {timeline['recovery_pct'].max():.2f}%")

    print("\n  ── Surgeries ─────────────────────────────────────────")
    if len(surgery) > 0:
        print(f"  Total operations   : {len(surgery)}")
        gains = surgery["gain_vs_no_surgery"]
        print(f"  Total score gained : {gains.sum():.6f}")
        print(f"  Avg Jaccard        : {surgery['jaccard_composite'].mean():.4f}")
        print(f"  Avg gain / surgery : {gains.mean():.6f}")
        print()
        # Fixed-width table: header, rule, then one line per operation.
        print(
            "  {:12s} {:12s} {:15s} {:15s} {:>8s} {:>10s}".format(
                "Date", "Reg orig", "From", "To", "Jaccard", "Gain"
            )
        )
        print("  " + "-" * 78)
        for _, op in surgery.sort_values("date").iterrows():
            print(
                "  {:12s} {:12s} {:15s} {:15s} {:8.4f} {:10.6f}".format(
                    str(op["date"].date()),
                    op["reg_orig"],
                    op["reg_from"],
                    op["reg_to"],
                    op["jaccard_composite"],
                    op["gain_vs_no_surgery"],
                )
            )
    else:
        print("  No surgeries performed.")

    print("\n  ── Mapping churn ─────────────────────────────────────")
    churn = analytics["churn"]
    peak = int(churn.max())
    # The peak date is only meaningful when at least one reg_id was remapped.
    peak_date = churn.idxmax().date() if churn.max() > 0 else "N/A"
    print(f"  Max remapped at one date  : {peak}  ({peak_date})")
    print(f"  Reg IDs ever remapped     : {len(analytics['ever_remapped'])}")

    print("\n  ── Score entropy (distribution spread) ───────────────")
    spread = analytics["timeline"]["entropy"]
    print(f"  Mean entropy : {spread.mean():.4f}")
    print(f"  Std  entropy : {spread.std():.4f}")
    print()
 # ─────────────────────────────────────────────────────────────
 # MAIN
 # ─────────────────────────────────────────────────────────────
 def main():
    """
    Command-line entry point: load the pipeline CSVs, print the console
    summary and write the HTML report.

    Input paths are tried as given first, then relative to this script's
    directory; a missing mandatory file aborts with an error message, a
    missing optional file only produces a warning.

    A relative ``--out`` is written one directory above the current working
    directory (the historical behaviour); an absolute ``--out`` is used as is.
    """
    parser = argparse.ArgumentParser(description="Carmignac pipeline results analyser")
    parser.add_argument("--scores", default="repair_results/carmignac_scores.csv")
    parser.add_argument("--mapping", default="repair_results/carmignac_mapping.csv")
    parser.add_argument("--surgery", default="repair_results/carmignac_surgery_log.csv")
    parser.add_argument("--out", default="repair_results/carmignac_report.html")
    parser.add_argument(
        "--error-account-isin",
        default=None,
        dest="error_isin",
        help="Path to carmignac_error_account.csv (optional)",
    )
    parser.add_argument(
        "--error-account-agg",
        default=None,
        dest="error_agg",
        help="Path to carmignac_error_account_agg.csv (optional)",
    )
    args = parser.parse_args()
    # Resolve paths relative to this script's directory if files not found
    base = os.path.dirname(os.path.abspath(__file__))
    def resolve(p, required=True):
        # Returns an existing path (as given, or under the script directory),
        # None for an absent optional file, and exits for an absent required one.
        if p is None:
            return None
        if os.path.exists(p):
            return p
        alt = os.path.join(base, p)
        if os.path.exists(alt):
            return alt
        if required:
            sys.exit(f"[ERROR] File not found: {p}")
        print(f"[WARN] Optional file not found: {p}")
        return None
    scores_path = resolve(args.scores)
    mapping_path = resolve(args.mapping)
    surgery_path = resolve(args.surgery)
    error_isin_path = resolve(args.error_isin, required=False)
    error_agg_path = resolve(args.error_agg, required=False)
    print(f"[Load] scores  : {scores_path}")
    print(f"[Load] mapping : {mapping_path}")
    print(f"[Load] surgery : {surgery_path}")
    scores, mapping, surgery, df_err_isin, df_err_agg = load_outputs(
        scores_path,
        mapping_path,
        surgery_path,
        err_isin_path=error_isin_path,
        err_agg_path=error_agg_path,
    )
    analytics = compute_analytics(scores, mapping, surgery)
    print_summary(analytics, surgery)
    html = build_html_repair(
        analytics,
        surgery,
        scores,
        mapping,
        df_err_isin=df_err_isin,
        df_err_agg=df_err_agg,
    )
    # Relative outputs keep the historical "../<out>" location; an absolute
    # path is honoured verbatim instead of being mangled into "..//abs/path".
    out_path = args.out if os.path.isabs(args.out) else "../" + args.out
    out_dir = os.path.dirname(out_path)
    if out_dir:
        os.makedirs(out_dir, exist_ok=True)
    with open(out_path, "w", encoding="utf-8") as f:
        f.write(html)
    print(f"\n[Report] Written to → {out_path}")
 # Run the command-line entry point only when this file is executed as a
 # script, not when it is imported as a module.
 if __name__ == "__main__":
    main()
--- a/src/repair_challenge/carmignac_branch.py
+++ b/src/repair_challenge/carmignac_branch.py
@ -0,0 +1,410 @@
 """
 AUM Branching / Repair
 ==================================================
 Takes as input:
  - The original AUM file (pre-repair)
  - The mapping CSV produced by carmignac_repair.py
  - (Optionally) the surgery log, for audit annotation
 Produces:
  - A repaired AUM file where every Registrar Account ID is replaced
    by its canonical identity (reg_orig) as determined by the pipeline.
 Core logic
 ----------
 The mapping table encodes, for every (date, reg_orig) pair, which
 physical code (reg_used) was actually present in the data at that date.
  reg_orig  = the stable canonical identity  (output label)
  reg_used  = the code that appeared in the raw data at that date
 For rows where reg_used != reg_orig (changed=True), the raw code is a
 historical alias that the surgery pass identified as belonging to
 reg_orig.  The repair simply relabels those rows.
 For accounts not in the repair universe (below the AUM threshold, or
 excluded categories), rows are passed through unchanged.
 Self-mapped surgeries (reg_from == reg_to in the surgery log) do not
 require any relabelling — they signal a data quality issue on that
 month, not a code change.
 Usage
 -----
    python carmignac_branch.py                          # default paths
    python carmignac_branch.py \\
        --mapping  carmignac_mapping.csv \\
        --surgery  carmignac_surgery_log.csv \\
        --out      AUM_repaired.csv
 """
 import argparse
 import os
 import sys
 import pandas as pd
 from helpers import load_inputs_branch
 # ─────────────────────────────────────────────────────────────
 # BUILD RENAME LOOKUP
 # ─────────────────────────────────────────────────────────────
 def build_rename_lookup(mapping):
    """
    Build the alias -> canonical rename table from the mapping.

    Only mapping rows flagged ``changed`` whose ``reg_used`` differs from
    ``reg_orig`` contribute an entry; stable rows and self-mapped
    surgeries need no rename.

    Parameters
    ----------
    mapping : DataFrame with columns ``date``, ``reg_orig``, ``reg_used``
        and ``changed`` (boolean).

    Returns
    -------
    dict
        ``{(date, reg_used): reg_orig}``.  If two rows disagree on the
        canonical id for the same key, a warning is printed and the first
        value encountered is kept.
    """
    is_alias = mapping["changed"] & (mapping["reg_orig"] != mapping["reg_used"])
    alias_rows = mapping[is_alias]
    renames = {}
    triples = zip(alias_rows["date"], alias_rows["reg_used"], alias_rows["reg_orig"])
    for when, alias, canonical in triples:
        pair = (when, alias)
        # First sighting, or a repeat that agrees with what we already hold.
        if pair not in renames or renames[pair] == canonical:
            renames[pair] = canonical
            continue
        print(
            f"  [WARN] Conflicting mapping at {when.date()} "
            f"reg_used={alias}: "
            f"{renames[pair]} vs {canonical} — keeping first"
        )
    return renames
 # ─────────────────────────────────────────────────────────────
 # BRANCHING
 # ─────────────────────────────────────────────────────────────
 def apply_branching(aum, lookup):
    """
    Relabel ``Registrar Account - ID`` in the AUM dataframe according to
    the lookup ``{(date, reg_used) -> reg_orig}``.

    Rows whose (date, id) pair is not in the lookup are left untouched.
    The input dataframe is not modified (a copy is taken).

    Parameters
    ----------
    aum : DataFrame with at least the columns ``Centralisation Date``,
        ``Registrar Account - ID``, ``Product - Isin``, ``Quantity - AUM``
        and ``Value - AUM €``.
    lookup : dict ``{(date, reg_used): reg_orig}``; may be empty.

    Returns
    -------
    repaired : the full AUM dataframe with corrected IDs
    audit    : only the renamed rows, with both the original and the
               canonical ID for verification
    """
    id_col = "Registrar Account - ID"
    key_cols = ["_date_key", "_reg_key"]

    aum = aum.copy()
    aum["Centralisation Date"] = pd.to_datetime(aum["Centralisation Date"])
    aum["_date_key"] = aum["Centralisation Date"]
    aum["_reg_key"] = aum[id_col].astype(str)

    # Vectorised lookup via merge
    lookup_df = pd.DataFrame(
        [(d, reg_used, reg_orig) for (d, reg_used), reg_orig in lookup.items()],
        columns=key_cols + ["_canonical_id"],
    )
    # The AUM keys are (datetime, str).  Bring the lookup keys to the same
    # dtypes so that numeric-looking ids or string dates coming from a CSV
    # round-trip still match instead of raising / silently missing.
    lookup_df["_date_key"] = pd.to_datetime(lookup_df["_date_key"])
    lookup_df["_reg_key"] = lookup_df["_reg_key"].astype(str)
    # Normalisation can make two keys collide (e.g. 101 and "101"); keep the
    # first so the left merge can never duplicate AUM rows.
    lookup_df = lookup_df.drop_duplicates(subset=key_cols, keep="first")

    merged = aum.merge(lookup_df, on=key_cols, how="left")

    # Audit: rows that were actually renamed
    renamed_mask = merged["_canonical_id"].notna()
    audit = merged[renamed_mask].copy()
    audit["original_reg_id"] = audit["_reg_key"]
    audit["canonical_reg_id"] = audit["_canonical_id"]
    audit = audit[
        [
            "Centralisation Date",
            "original_reg_id",
            "canonical_reg_id",
            "Product - Isin",
            "Quantity - AUM",
            "Value - AUM €",
        ]
    ]

    # Rename
    merged.loc[renamed_mask, id_col] = merged.loc[renamed_mask, "_canonical_id"]

    # Drop helper columns
    repaired = merged.drop(columns=key_cols + ["_canonical_id"])
    return repaired, audit
 # ─────────────────────────────────────────────────────────────
 # CONSISTENCY CHECK
 # ─────────────────────────────────────────────────────────────
 def consistency_check(original, repaired, mapping, surgery):
    """
    Print a post-branching sanity report (nothing is returned or raised).

    Checks, in order:
    1. the repaired frame has as many rows as the original;
    2. none of the aliased codes (``reg_used`` of changed mapping rows)
       is still present as an id in the repaired frame;
    3. no duplicate (registrar id, isin, date) key exists in the repaired
       frame — up to 10 offending rows are printed otherwise;
    4. one line per surgery operation, self-maps being tagged as such.
    """
    id_col = "Registrar Account - ID"
    print("\n[Consistency checks]")

    # 1. Row count
    n_before, n_after = len(original), len(repaired)
    if n_before != n_after:
        print(f"  ✗ Row count changed   : {n_before} → {n_after}")
    else:
        print(f"  ✓ Row count preserved : {n_after}")

    # 2. Aliases eliminated
    alias_rows = mapping[
        mapping["changed"] & (mapping["reg_orig"] != mapping["reg_used"])
    ]
    aliases = set(alias_rows["reg_used"].unique())
    leftovers = set(repaired[id_col].astype(str)) & aliases
    if leftovers:
        print(f"  ✗ {len(leftovers)} aliased code(s) still present: {leftovers}")
    else:
        print(f"  ✓ All {len(aliases)} aliased code(s) successfully relabelled")

    # 3. Duplicates
    key_cols = [id_col, "Product - Isin", "Centralisation Date"]
    n_dups = repaired.duplicated(subset=key_cols).sum()
    if n_dups != 0:
        print(
            f"  ✗ {n_dups} duplicate (reg_id, isin, date) rows found — inspect manually"
        )
        offenders = repaired[repaired.duplicated(subset=key_cols, keep=False)]
        preview = offenders[key_cols + ["Quantity - AUM"]].head(10)
        print(preview.to_string(index=False))
    else:
        print("  ✓ No duplicate (reg_id, isin, date) keys")

    # 4. Surgery summary
    if surgery.empty:
        return
    print("\n[Surgery operations applied]")
    for op in surgery.sort_values("date").to_dict("records"):
        if op["reg_from"] == op["reg_to"]:
            note = " [self-map — data quality flag, no rename]"
        else:
            note = ""
        print(
            f"  {op['date'].date()} | {op['reg_orig']} : "
            f"{op['reg_from']} → {op['reg_to']}"
            f"  (Jaccard={op['jaccard_composite']:.4f}, "
            f"gain={op['gain_vs_no_surgery']:.6f}){note}"
        )
 # ─────────────────────────────────────────────────────────────
 # EXPORT PATHS (branched accounts only)
 # ─────────────────────────────────────────────────────────────
 def export_paths(aum, mapping, surgery, repaired):
    """
    Build the condensed "paths" table for every account of the repair
    universe, i.e. every ``reg_orig`` present in the mapping.

    Each row of the repaired AUM that belongs to a universe account is
    annotated with ``reg_used``, the physical code recorded in the mapping
    for that (date, reg_orig).  Where the mapping has no row for that
    pair, ``reg_used`` falls back to the canonical id itself.

        reg_orig | reg_used      | date       | isin | qty_aum | ...
        REG_001  | REG_001       | 2020-01-31 | ...  | ...     |  <- stable
        REG_002  | REG_002_OLD   | 2020-01-31 | ...  | ...     |  <- leg 1
        REG_002  | REG_002       | 2022-07-31 | ...  | ...     |  <- leg 2

    An account is tagged "branched" in the printed summary when the
    surgery log holds at least one operation for it with
    ``reg_from != reg_to``; self-mapped surgeries do not count.

    Parameters
    ----------
    aum      : not used by this function (kept for call compatibility).
    mapping  : DataFrame with ``date``, ``reg_orig``, ``reg_used``.
    surgery  : DataFrame with ``reg_orig``, ``reg_from``, ``reg_to``
               (may be empty).
    repaired : relabelled AUM dataframe.

    Returns
    -------
    DataFrame with ``reg_orig`` and ``reg_used`` first, then the remaining
    AUM columns, sorted by (reg_orig, date, isin).
    """
    id_col = "Registrar Account - ID"
    date_col = "Centralisation Date"
    join_keys = ["_date_key", "_reg_key"]

    # Universe of canonical accounts, and the subset with a genuine code change
    universe = sorted(mapping["reg_orig"].astype(str).unique())
    if surgery.empty:
        branched = set()
    else:
        real_moves = surgery[surgery["reg_from"] != surgery["reg_to"]]
        branched = set(real_moves["reg_orig"].astype(str).unique())
    print(
        f"\n[Paths] {len(universe)} account(s) in universe, "
        f"{len(branched)} branched: "
        f"{sorted(branched) or 'none'}"
    )

    # (date, reg_orig) -> reg_used, keyed like the AUM helper columns below
    leg_lookup = (
        mapping[["date", "reg_orig", "reg_used"]]
        .assign(
            date=lambda m: pd.to_datetime(m["date"]),
            reg_orig=lambda m: m["reg_orig"].astype(str),
            reg_used=lambda m: m["reg_used"].astype(str),
        )
        .rename(columns={"date": "_date_key", "reg_orig": "_reg_key"})
    )

    # Universe rows of the repaired AUM, with join keys attached
    in_universe = repaired[id_col].astype(str).isin(universe)
    rows = repaired[in_universe].copy()
    rows[date_col] = pd.to_datetime(rows[date_col])
    rows["_date_key"] = rows[date_col]
    rows["_reg_key"] = rows[id_col].astype(str)

    joined = rows.merge(
        leg_lookup[join_keys + ["reg_used"]], on=join_keys, how="left"
    )
    joined = joined.drop(columns=join_keys)
    # No mapping row for that (date, account): the account used its own code.
    joined["reg_used"] = joined["reg_used"].fillna(joined[id_col].astype(str))
    joined = joined.rename(columns={id_col: "reg_orig"})

    # Identity columns first, then everything else in its existing order
    leading = ["reg_orig", "reg_used"]
    trailing = [c for c in joined.columns if c not in leading]
    paths = (
        joined[leading + trailing]
        .sort_values(["reg_orig", date_col, "Product - Isin"])
        .reset_index(drop=True)
    )

    # Per-account summary
    for acc in universe:
        acc_legs = paths.loc[paths["reg_orig"] == acc, "reg_used"]
        tag = " [branched]" if acc in branched else " [stable]"
        print(f"  {acc}: {len(acc_legs)} rows, legs = {list(acc_legs.unique())}{tag}")
    return paths
 # ─────────────────────────────────────────────────────────────
 # MAIN
 # ─────────────────────────────────────────────────────────────
 def main():
    """
    Command-line entry point.

    Loads the mapping (and the optional surgery log), relabels the raw AUM,
    prints the consistency report and writes up to three CSV files: the
    repaired AUM (``--out``), the audit of renamed rows (``--audit``,
    written only if at least one row was renamed) and the per-account
    paths table (``--paths``).  Exits with an error message if the mapping
    file cannot be found.
    """
    parser = argparse.ArgumentParser(
        description="Apply Carmignac repair mapping to the raw AUM file"
    )
    parser.add_argument(
        "--mapping",
        default="repair_results/carmignac_mapping.csv",
        help="Path to mapping CSV from carmignac_repair.py",
    )
    parser.add_argument(
        "--surgery",
        default="repair_results/carmignac_surgery_log.csv",
        help="Path to surgery log CSV (optional, for audit)",
    )
    parser.add_argument(
        "--out", default="AUM_repaired.csv", help="Output path for repaired AUM CSV"
    )
    parser.add_argument(
        "--audit",
        default="AUM_repair_audit.csv",
        help="Output path for audit CSV (renamed rows only)",
    )
    parser.add_argument(
        "--paths",
        default="AUM_paths.csv",
        # export_paths() covers every account of the mapping, not only the
        # branched ones, so the help text says so.
        help="Output path for condensed paths CSV (all accounts in the repair universe)",
    )
    args = parser.parse_args()

    def resolve(p, required=True):
        """Return p as given, or relative to this script's folder, if it exists."""
        if os.path.exists(p):
            return p
        alt = os.path.join(os.path.dirname(os.path.abspath(__file__)), p)
        if os.path.exists(alt):
            return alt
        if required:
            sys.exit(f"[ERROR] File not found: {p}")
        return None

    def ensure_parent_dir(path):
        """Create the directory that will contain ``path`` if it is missing."""
        os.makedirs(os.path.dirname(os.path.abspath(path)), exist_ok=True)

    mapping_path = resolve(args.mapping)
    surgery_path = resolve(args.surgery, required=False)
    print("=" * 60)
    print("CARMIGNAC — AUM Branching / Repair")
    print("=" * 60)
    print(f"  Mapping : {mapping_path}")
    print(f"  Surgery : {surgery_path or '(not provided)'}")

    # Load
    aum, mapping, surgery = load_inputs_branch(mapping_path, surgery_path)
    print(f"\n  Raw AUM rows           : {len(aum)}")
    print(f"  Mapping rows           : {len(mapping)}")
    print(f"  Mapping changed rows   : {mapping['changed'].sum()}")
    print(f"  Surgery operations     : {len(surgery)}")

    # Build lookup
    lookup = build_rename_lookup(mapping)
    print(f"\n  Rename operations      : {len(lookup)}")
    if lookup:
        sample = list(lookup.items())[:3]
        for (d, used), orig in sample:
            print(f"    ({d.date()}, {used}) → {orig}")
        if len(lookup) > 3:
            print(f"    ... and {len(lookup) - 3} more")

    # Apply
    repaired, audit = apply_branching(aum, lookup)
    print(f"\n  Rows renamed           : {len(audit)}")

    # Checks
    consistency_check(aum, repaired, mapping, surgery)

    # Save — every output gets its parent directory created, not only --out
    ensure_parent_dir(args.out)
    repaired.to_csv(args.out, index=False)
    print(f"\n  ✓ Repaired AUM  → {args.out}")
    if len(audit) > 0:
        ensure_parent_dir(args.audit)
        audit.to_csv(args.audit, index=False)
        print(f"  ✓ Audit log     → {args.audit}")
    else:
        print("(No rows renamed — audit log not written)")

    # Paths: condensed AUM for all accounts of the repair universe
    df_paths = export_paths(aum, mapping, surgery, repaired)
    if df_paths is not None:
        ensure_parent_dir(args.paths)
        df_paths.to_csv(args.paths, index=False)
        print(f"  ✓ Paths file    → {args.paths}")

    # Print renamed reg_ids summary
    if len(audit) > 0:
        print("\n[Renamed identifiers]")
        summary = (
            audit.groupby(["original_reg_id", "canonical_reg_id"])
            .size()
            .reset_index(name="n_rows")
        )
        for _, row in summary.iterrows():
            print(
                f"  {row['original_reg_id']:20s} → {row['canonical_reg_id']:20s} "
                f"({row['n_rows']} rows)"
            )
    print("\nDone.")
 if __name__ == "__main__":
    main()
--- a/src/repair_challenge/carmignac_diagnostics.py
+++ b/src/repair_challenge/carmignac_diagnostics.py
@ -0,0 +1,632 @@
 """
 Broken Months Diagnostics
 =====================================================
 Detects months where the aggregate stock-flow equation is violated at the ISIN level (across all accounts)
 The residual is the "missing flow":
    missing_{s}(t) = [Q_agg(t) - Q_agg(t-1)] - F_agg(t)
 This is a market-level check, independent of individual account identity.
 It captures:
  - Genuinely missing flow records
  - End-of-month accounting lags (transactions dated at boundary)
  - Corporate actions (dividends, splits) not reflected in flows
 Outputs
 -------
  carmignac_broken_months.csv   — machine-readable, loaded by carmignac_repair.py
  carmignac_diagnostics.html    — interactive HTML report
 Usage
 -----
    python carmignac_diagnostics.py
    python carmignac_diagnostics.py \\
        --out   carmignac_broken_months.csv \\
        --html  carmignac_diagnostics.html \\
        --alpha 0.02
 """
 import argparse
 import os
 import sys
 import numpy as np
 import pandas as pd
 from helpers import build_html_diagnostics, load_data_diagnostics
 # ─────────────────────────────────────────────────────────────
 # AGGREGATE AND DETECT BROKEN MONTHS
 # ─────────────────────────────────────────────────────────────
 def detect_broken_months(aum, flows, alpha=0.02, lag_days=3):
    """
    For each (isin, month-end t), compute:
      - Q_agg(t)      : total shares held across all accounts
      - Q_agg(t-1)    : idem previous month (forward-filled)
      - F_agg(t)      : total net flows recorded in ]EOM(t-1), EOM(t)]
      - missing(t)    : [Q_agg(t) - Q_agg(t-1)] - F_agg(t)
      - missing_pct   : |missing| / max(Q_agg(t), Q_agg(t-1))
    A month is flagged as "broken" when missing_pct > alpha.
    Additionally, a month is flagged as a potential "lag" when:
      - It is broken with the standard window
      - But would NOT be broken if flows dated within lag_days of EOM
        are shifted to the adjacent month
    Parameters :
      alpha     : tolerance threshold (same as ALPHA in carmignac_repair.py)
      lag_days  : number of boundary days to test for accounting lag
    Returns :
      df_broken   : DataFrame with all (isin, date) pairs where missing_pct > alpha
      df_all      : Full DataFrame including non-broken months (for plotting)
    """
    # Monthly calendar
    t_min = aum["Centralisation Date"].min()
    t_max = aum["Centralisation Date"].max()
    all_months = pd.date_range(t_min, t_max, freq="ME")
    # ── Aggregate AUM per (isin, month-end) ──────────────────────
    aum_agg = (
        aum.groupby(["Product - Isin", "Centralisation Date"])["Quantity - AUM"]
        .sum()
        .reset_index()
        .rename(
            columns={
                "Product - Isin": "isin",
                "Centralisation Date": "date",
                "Quantity - AUM": "qty_agg",
            }
        )
    )
    # Forward-fill sparse panel
    aum_pivot = aum_agg.pivot(index="date", columns="isin", values="qty_agg")
    aum_pivot = aum_pivot.reindex(all_months).ffill()
    # ── Aggregate flows per (isin, month-end) — standard window ──
    def bucket_flows(flows_df, months, lower_offset=0, upper_offset=0):
        """Aggregate flows with optional boundary extension (in days)."""
        fc = flows_df.copy()
        def assign_month(d):
            # Extended window: ]EOM(t-1) - lower_offset, EOM(t) + upper_offset]
            for m in months:
                eom_prev = m - pd.offsets.MonthEnd(1)
                lo = eom_prev - pd.Timedelta(days=lower_offset)
                hi = m + pd.Timedelta(days=upper_offset)
                if lo < d <= hi:
                    return m
            return pd.NaT
        fc["month_end"] = fc["Centralisation Date"].apply(assign_month)
        fc = fc.dropna(subset=["month_end"])
        agg = (
            fc.groupby(["Product - Isin", "month_end"])["Quantity - NetFlows"]
            .sum()
            .reset_index()
            .rename(
                columns={
                    "Product - Isin": "isin",
                    "month_end": "date",
                    "Quantity - NetFlows": "flow_agg",
                }
            )
        )
        return agg
    flows_std = bucket_flows(flows, all_months)
    flows_lag = bucket_flows(
        flows, all_months, lower_offset=lag_days, upper_offset=lag_days
    )
    def flows_to_pivot(df, months):
        piv = df.pivot(index="date", columns="isin", values="flow_agg")
        return piv.reindex(months).fillna(0.0)
    fpiv_std = flows_to_pivot(flows_std, all_months)
    fpiv_lag = flows_to_pivot(flows_lag, all_months)
    # ── Compute residuals ─────────────────────────────────────────
    rows = []
    isins = aum_pivot.columns.tolist()
    for i in range(1, len(all_months)):
        t_curr = all_months[i]
        t_prev = all_months[i - 1]
        for isin in isins:
            q_curr = (
                aum_pivot[isin].get(t_curr, np.nan)
                if isin in aum_pivot.columns
                else np.nan
            )
            q_prev = (
                aum_pivot[isin].get(t_prev, np.nan)
                if isin in aum_pivot.columns
                else np.nan
            )
            if pd.isna(q_curr) or pd.isna(q_prev):
                continue
            delta = q_curr - q_prev
            # Standard window
            f_std = fpiv_std[isin].get(t_curr, 0.0) if isin in fpiv_std.columns else 0.0
            missing_std = delta - f_std
            # Extended lag window
            f_lag = fpiv_lag[isin].get(t_curr, 0.0) if isin in fpiv_lag.columns else 0.0
            missing_lag = delta - f_lag
            # ── Denominator choice ────────────────────────────────
            # Normalise by the size of the *movement* (max of delta_AUM
            # and recorded flow), not by the stock level.  This avoids
            # astronomically large percentages when a position is tiny
            # but the missing flow is a normal-sized number.
            #
            # Interpretation: "what fraction of the expected movement
            # is unaccounted for?"
            #
            # A minimum absolute threshold (min_abs_shares) suppresses
            # noise from residual micro-positions (rounding artefacts).
            min_abs_shares = 1.0  # ignore positions smaller than 1 share
            movement = max(abs(delta), abs(f_std), min_abs_shares)
            denom_std = movement
            movement_lag = max(abs(delta), abs(f_lag), min_abs_shares)
            denom_lag = movement_lag
            pct_std = abs(missing_std) / denom_std
            pct_lag = abs(missing_lag) / denom_lag
            broken_std = pct_std > alpha
            broken_lag = pct_lag > alpha
            # A "lag" month: broken with standard, NOT broken with extended window
            is_lag = broken_std and (not broken_lag)
            rows.append(
                {
                    "date": t_curr,
                    "isin": isin,
                    "q_agg_prev": round(q_prev, 3),
                    "q_agg_curr": round(q_curr, 3),
                    "delta_aum": round(delta, 3),
                    "flow_agg": round(f_std, 3),
                    "missing_flow": round(missing_std, 3),
                    "missing_pct": round(pct_std, 6),
                    "broken": broken_std,
                    "is_lag": is_lag,
                }
            )
    df_all = pd.DataFrame(rows)
    df_broken = df_all[df_all["broken"]].sort_values("missing_pct", ascending=False)
    return df_broken, df_all
 # ─────────────────────────────────────────────────────────────
 # AGGREGATE (CROSS-ISIN) BROKEN MONTHS
 # ─────────────────────────────────────────────────────────────
 def detect_aggregate_broken_months(aum, flows, alpha=0.02, lag_days=3):
    """
    Same stock-flow check as detect_broken_months, but aggregated
    across ALL ISINs for each month:
        Q_total(t) - Q_total(t-1)  !=  F_total(t)
    where Q_total(t) = sum over all (reg_id, isin) of Q_{r,s}(t).
    This catches months where the global portfolio is incoherent even
    if every individual ISIN is fine (e.g. cross-ISIN netting errors),
    and provides a cleaner high-level view.
    Returns :
      df_agg : DataFrame indexed by month with columns:
          q_total_prev, q_total_curr, delta_aum,
          flow_total, missing_flow, missing_pct, broken, is_lag
    """
    t_min = aum["Centralisation Date"].min()
    t_max = aum["Centralisation Date"].max()
    all_months = pd.date_range(t_min, t_max, freq="ME")
    # ── Total AUM per month (all ISIN, all accounts) ─────────────
    aum_monthly = (
        aum.groupby("Centralisation Date")["Quantity - AUM"]
        .sum()
        .reindex(all_months)
        .ffill()
        .rename("q_total")
    )
    # ── Bucket flows helper (reuse same window logic) ─────────────
    def bucket_total_flows(flows_df, months, lower_offset=0, upper_offset=0):
        fc = flows_df.copy()
        def assign_month(d):
            for m in months:
                eom_prev = m - pd.offsets.MonthEnd(1)
                lo = eom_prev - pd.Timedelta(days=lower_offset)
                hi = m + pd.Timedelta(days=upper_offset)
                if lo < d <= hi:
                    return m
            return pd.NaT
        fc["month_end"] = fc["Centralisation Date"].apply(assign_month)
        fc = fc.dropna(subset=["month_end"])
        return (
            fc.groupby("month_end")["Quantity - NetFlows"]
            .sum()
            .reindex(months)
            .fillna(0.0)
        )
    flow_std = bucket_total_flows(flows, all_months)
    flow_lag = bucket_total_flows(
        flows, all_months, lower_offset=lag_days, upper_offset=lag_days
    )
    # ── Compute residuals ─────────────────────────────────────────
    rows = []
    min_abs_shares = 1.0
    for i in range(1, len(all_months)):
        t_curr = all_months[i]
        t_prev = all_months[i - 1]
        q_curr = aum_monthly.get(t_curr, np.nan)
        q_prev = aum_monthly.get(t_prev, np.nan)
        if pd.isna(q_curr) or pd.isna(q_prev):
            continue
        delta = q_curr - q_prev
        f_std = flow_std.get(t_curr, 0.0)
        f_lag = flow_lag.get(t_curr, 0.0)
        miss_std = delta - f_std
        miss_lag = delta - f_lag
        movement_std = max(abs(delta), abs(f_std), min_abs_shares)
        movement_lag = max(abs(delta), abs(f_lag), min_abs_shares)
        pct_std = abs(miss_std) / movement_std
        pct_lag = abs(miss_lag) / movement_lag
        broken_std = pct_std > alpha
        broken_lag = pct_lag > alpha
        is_lag = broken_std and (not broken_lag)
        rows.append(
            {
                "date": t_curr,
                "q_total_prev": round(q_prev, 3),
                "q_total_curr": round(q_curr, 3),
                "delta_aum": round(delta, 3),
                "flow_total": round(f_std, 3),
                "missing_flow": round(miss_std, 3),
                "missing_pct": round(pct_std, 6),
                "broken": broken_std,
                "is_lag": is_lag,
            }
        )
    df_agg = pd.DataFrame(rows)
    return df_agg
 # ─────────────────────────────────────────────────────────────
 # ERROR ACCOUNT
 # ─────────────────────────────────────────────────────────────
 def build_error_account(aum, flows, lag_days=3):
    """
    Build a synthetic "error account" that absorbs the stock-flow
    residuals that cannot be explained by recorded flows.

    Construction (backwards from t_ref, the last month of the calendar):
        Stock_error(t_ref)  = 0   (by definition)
        Stock_error(t-1)    = Stock_error(t) - Residual(t)
    where Residual(t) = [Σ_r Q_{r,s}(t) - Σ_r Q_{r,s}(t-1)] - Σ_r F_{r,s}(t)
    for each ISIN s independently, with flows bucketed on the standard
    window ]EOM(t-1), EOM(t)].  Months for which the residual cannot be
    computed (no AUM yet at t or t-1) count as a residual of 0.

    By construction, adding this error account to the AUM restores the
    stock-flow equality at every (isin, month).

    An aggregated error account is built the same way from the sum of the
    ISIN residuals.

    Parameters
    ----------
    aum      : DataFrame with ``Centralisation Date``, ``Product - Isin``
               and ``Quantity - AUM``.
    flows    : DataFrame with ``Centralisation Date``, ``Product - Isin``
               and ``Quantity - NetFlows``.
    lag_days : accepted for signature compatibility; not used by this
               function.

    Returns
    -------
    df_err_isin : DataFrame with columns
        (date, isin, residual, stock_error, stock_error_pct)
        where stock_error_pct = |stock_error| / max_aum * 100 and max_aum is
        the largest monthly aggregate quantity of the ISIN (floored at 1)
    df_err_agg  : DataFrame with columns
        (date, residual_agg, stock_error_agg, stock_error_agg_pct)
        normalised the same way by the largest monthly total across ISINs
    """
    t_min = aum["Centralisation Date"].min()
    t_max = aum["Centralisation Date"].max()
    all_months = pd.date_range(t_min, t_max, freq="ME")
    # ── ISIN-level AUM panel (forward-filled) ────────────────────
    aum_agg = (
        aum.groupby(["Product - Isin", "Centralisation Date"])["Quantity - AUM"]
        .sum()
        .reset_index()
        .rename(
            columns={
                "Product - Isin": "isin",
                "Centralisation Date": "date",
                "Quantity - AUM": "qty",
            }
        )
    )
    aum_pivot = aum_agg.pivot(index="date", columns="isin", values="qty")
    aum_pivot = aum_pivot.reindex(all_months).ffill()
    # ── ISIN-level flow aggregation (standard window) ─────────────
    def bucket_isin_flows(flows_df, months):
        # Returns a month x ISIN table of summed net flows (0.0 where none).
        fc = flows_df.copy()
        def assign_month(d):
            # Month-end m such that EOM(m-1) < d <= m, or NaT if d falls
            # outside the calendar.
            for m in months:
                eom_prev = m - pd.offsets.MonthEnd(1)
                if eom_prev < d <= m:
                    return m
            return pd.NaT
        fc["month_end"] = fc["Centralisation Date"].apply(assign_month)
        fc = fc.dropna(subset=["month_end"])
        return (
            fc.groupby(["Product - Isin", "month_end"])["Quantity - NetFlows"]
            .sum()
            .unstack("Product - Isin")
            .reindex(months)
            .fillna(0.0)
        )
    flow_pivot = bucket_isin_flows(flows, all_months)
    # ── Compute residuals per (isin, month) ───────────────────────
    isins = aum_pivot.columns.tolist()
    # residual[t] = delta_AUM[t] - flow[t]
    residuals = {}  # {isin: Series indexed by month}
    for isin in isins:
        res_series = {}
        # Starts at the second month: the first one has no predecessor.
        for i in range(1, len(all_months)):
            t_curr = all_months[i]
            t_prev = all_months[i - 1]
            q_curr = aum_pivot[isin].get(t_curr, np.nan)
            q_prev = aum_pivot[isin].get(t_prev, np.nan)
            if pd.isna(q_curr) or pd.isna(q_prev):
                # No entry is stored; later lookups default to 0.0.
                continue
            delta = q_curr - q_prev
            # An ISIN with no recorded flow has no column in flow_pivot.
            f = flow_pivot[isin].get(t_curr, 0.0) if isin in flow_pivot.columns else 0.0
            res_series[t_curr] = delta - f
        residuals[isin] = pd.Series(res_series)
    # ── Build error stock backwards from t_ref ────────────────────
    t_ref = all_months[-1]
    rows_isin = []
    for isin in isins:
        res = residuals[isin]
        # Maximum AUM for this ISIN (for normalisation), floored at 1
        max_aum = aum_pivot[isin].max()
        if pd.isna(max_aum) or max_aum < 1:
            max_aum = 1.0
        # Propagate backwards: stock(t_ref) = 0
        stock = 0.0
        # Build dict keyed by date
        stock_by_date = {t_ref: 0.0}
        # Walk from the last month to the first; each step removes the
        # residual of the later month from the running stock.
        for i in range(len(all_months) - 2, -1, -1):
            t_curr = all_months[i + 1]
            t_prev = all_months[i]
            r = res.get(t_curr, 0.0)
            stock = stock - r
            stock_by_date[t_prev] = stock
        # One output row per month of the calendar.
        for t in all_months:
            s = stock_by_date.get(t, np.nan)
            r = res.get(t, 0.0)
            rows_isin.append(
                {
                    "date": t,
                    "isin": isin,
                    "residual": round(r, 4),
                    "stock_error": round(s, 4) if not pd.isna(s) else np.nan,
                    "stock_error_pct": round(abs(s) / max_aum * 100, 4)
                    if not pd.isna(s)
                    else np.nan,
                }
            )
    df_err_isin = pd.DataFrame(rows_isin).sort_values(["date", "isin"])
    # ── Aggregated error account ──────────────────────────────────
    # Total AUM across all ISINs at each month (from the forward-filled panel)
    total_aum_by_month = aum_pivot.sum(axis=1)
    max_total_aum = total_aum_by_month.max()
    if pd.isna(max_total_aum) or max_total_aum < 1:
        max_total_aum = 1.0
    # Aggregate residual = sum of ISIN residuals
    agg_res = {}
    for i in range(1, len(all_months)):
        t_curr = all_months[i]
        total_r = sum(residuals[isin].get(t_curr, 0.0) for isin in isins)
        agg_res[t_curr] = total_r
    # Same backward recursion as the ISIN level, on the summed residuals.
    agg_stock = 0.0
    agg_stock_by_date = {t_ref: 0.0}
    for i in range(len(all_months) - 2, -1, -1):
        t_curr = all_months[i + 1]
        t_prev = all_months[i]
        agg_stock = agg_stock - agg_res.get(t_curr, 0.0)
        agg_stock_by_date[t_prev] = agg_stock
    rows_agg = []
    for t in all_months:
        s = agg_stock_by_date.get(t, np.nan)
        r = agg_res.get(t, 0.0)
        rows_agg.append(
            {
                "date": t,
                "residual_agg": round(r, 4),
                "stock_error_agg": round(s, 4) if not pd.isna(s) else np.nan,
                "stock_error_agg_pct": round(abs(s) / max_total_aum * 100, 4)
                if not pd.isna(s)
                else np.nan,
            }
        )
    df_err_agg = pd.DataFrame(rows_agg).sort_values("date")
    return df_err_isin, df_err_agg
 # ─────────────────────────────────────────────────────────────
 # PRINT SUMMARY
 # ─────────────────────────────────────────────────────────────
 def print_summary(df_broken, df_all, alpha):
    """
    Print the console summary of the ISIN-level broken-month detection.

    Parameters
    ----------
    df_broken : broken (isin, month) rows, with at least the columns
                date, isin, missing_flow, missing_pct, is_lag.
    df_all    : every examined (isin, month) row; only its length is used.
    alpha     : tolerance threshold, shown as a percentage in the header.

    The function tolerates empty inputs: with no examined pair the broken
    share is reported as 0.0% instead of dividing by zero.
    """
    total = len(df_all)
    n_broken = len(df_broken)
    n_lag = int(df_broken["is_lag"].sum()) if n_broken else 0
    # Guard against an empty df_all (e.g. a single month of data).
    broken_share = n_broken / total * 100 if total else 0.0

    print("\n" + "=" * 60)
    print("  CARMIGNAC — Broken Months Diagnostics")
    print("=" * 60)
    print(f"  (isin, month) pairs examined : {total}")
    print(
        f"  Broken (missing_pct > {alpha:.0%})  : {n_broken} "
        f"({broken_share:.1f}%)"
    )
    print(f"  Of which likely lag           : {n_lag}")
    print(f"  Of which genuine gap          : {n_broken - n_lag}")

    if n_broken:
        print("\n  Top 10 by missing_pct:")
        cols = ["date", "isin", "missing_flow", "missing_pct", "is_lag"]
        print(df_broken[cols].head(10).to_string(index=False))

        # Monthly breakdown: the five months with the most broken ISINs
        by_month = (
            df_broken.groupby("date")
            .agg(
                n_broken=("isin", "count"),
                total_missing=("missing_flow", lambda x: x.abs().sum()),
            )
            .sort_values("n_broken", ascending=False)
            .head(5)
        )
        if len(by_month):
            print("\n  Most affected months:")
            print(by_month.to_string())
    print()
 # ─────────────────────────────────────────────────────────────
 # MAIN
 # ─────────────────────────────────────────────────────────────
 def main():
    """
    Command-line entry point of the broken-months diagnostics.

    Loads AUM and flows through ``load_data_diagnostics()``, runs the
    ISIN-level and aggregate detections, builds the error account, prints
    the summaries and writes four files: the broken-months CSV (``--out``),
    the two error-account CSVs (paths derived from ``--out``) and the HTML
    report (``--html``).
    """
    parser = argparse.ArgumentParser(
        description="Detect broken months in Carmignac AUM/Flows data"
    )
    parser.add_argument(
        "--out",
        default="carmignac_broken_months.csv",
        help="Machine-readable output (loaded by carmignac_repair.py)",
    )
    parser.add_argument("--html", default="carmignac_diagnostics.html")
    parser.add_argument(
        "--alpha",
        type=float,
        default=0.05,
        help="Tolerance threshold (default 0.05 = 5%%)",
    )
    parser.add_argument(
        "--lag",
        type=int,
        default=3,
        help="Boundary days to test for accounting lag (default 3)",
    )
    args = parser.parse_args()

    def error_account_paths(out_path):
        """
        Derive the ISIN-level and aggregated error-account CSV paths.

        Keeps the historical naming (``broken_months`` replaced by
        ``error_account`` / ``error_account_agg``).  When that substitution
        would make two outputs share a path — e.g. ``--out results.csv`` —
        a suffix is appended to the file stem instead, so no output ever
        overwrites another.
        """
        err_path = out_path.replace("broken_months", "error_account")
        err_agg_path = err_path.replace("error_account", "error_account_agg")
        if len({out_path, err_path, err_agg_path}) < 3:
            stem, ext = os.path.splitext(out_path)
            err_path = f"{stem}_error_account{ext}"
            err_agg_path = f"{stem}_error_account_agg{ext}"
        return err_path, err_agg_path

    print("[Load] AUM")
    print("[Load] Flows")
    aum, flows = load_data_diagnostics()
    print(
        f"\n[Detect] Running broken-month detection (α={args.alpha:.1%}, lag=±{args.lag}d)..."
    )
    df_broken, df_all = detect_broken_months(
        aum, flows, alpha=args.alpha, lag_days=args.lag
    )
    df_agg = detect_aggregate_broken_months(
        aum, flows, alpha=args.alpha, lag_days=args.lag
    )
    print("\n[Error account] Building error account...")
    df_err_isin, df_err_agg = build_error_account(aum, flows, lag_days=args.lag)
    print_summary(df_broken, df_all, args.alpha)
    n_agg_broken = int(df_agg["broken"].sum())
    print(
        f"  Aggregate broken months      : {n_agg_broken} "
        f"(of which lags: {int(df_agg['is_lag'].sum())})"
    )
    max_err = float(df_err_agg["stock_error_agg"].abs().max())
    print(
        f"  Max aggregate error stock    : {max_err:,.1f} shares "
        f"({float(df_err_agg['stock_error_agg_pct'].max()):.3f}% of total AUM)"
    )

    # CSV output — this is what carmignac_repair.py loads
    if len(df_broken):
        df_broken.to_csv(args.out, index=False)
        print(f"[Export] Broken months CSV     → {args.out}")
    else:
        pd.DataFrame(columns=["date", "isin", "missing_pct", "is_lag"]).to_csv(
            args.out, index=False
        )
        print(f"[Export] No broken months — empty CSV → {args.out}")

    # Error account CSVs
    err_out, err_agg_out = error_account_paths(args.out)
    df_err_isin.to_csv(err_out, index=False)
    df_err_agg.to_csv(err_agg_out, index=False)
    print(f"[Export] Error account (ISIN)  → {err_out}")
    print(f"[Export] Error account (agg)   → {err_agg_out}")

    html = build_html_diagnostics(
        df_broken, df_all, df_agg, df_err_isin, df_err_agg, args.alpha
    )
    with open(args.html, "w", encoding="utf-8") as f:
        f.write(html)
    print(f"[Export] HTML report       → {args.html}")
 if __name__ == "__main__":
    main()
--- a/src/repair_challenge/carmignac_repair.py
+++ b/src/repair_challenge/carmignac_repair.py
@ -0,0 +1,995 @@
 """
 Registrar ID Repair Pipeline
 =========================================================
 Étape 1 : Filtrage & univers de référence à t=31/10/2025
 Étape 2 : Score de cohérence temporelle (propagation vers le passé)
 Étape 3 : Chirurgie de code (matching 1-to-1)
 À appliquer après le diagnostic de broken months
 """
 import os
 import pandas as pd
 import numpy as np
 from helpers import load_data_repair
 # ─────────────────────────────────────────────
 # PARAMETERS
 # ─────────────────────────────────────────────
 ALPHA = 0.05  # reconciliation tolerance: 5% of the stock at t
 MIN_AUM_EUR = 5e6  # step-1 filter: minimum AUM (EUR) of a registrar at t_ref
 MIN_JACCARD = 0.3  # minimum ISIN-portfolio similarity for a surgery candidate
 SCORE_DROP_THRESHOLD = 0.15  # a score drop above 15% makes an account a surgery candidate
 SYMMETRY_ATTENUATION = (
    0.05  # attenuation factor applied when a symmetric break is detected (cases 1/3)
 )
 # ── Broken months ──────────────────────────────────────────────
 # Attenuation factor applied to reconciliation errors on months flagged
 # as "broken" by carmignac_diagnostics.py.  On a broken month the error
 # is multiplied by this factor before degrading the score, so a genuine
 # data-quality problem at market level does not unfairly penalise an
 # account.  Set to 1.0 to disable attenuation.
 BROKEN_MONTH_ATTENUATION = 0.2  # reduce error to 20% on broken months
 # ── Accounting lag window ──────────────────────────────────────
 # Transactions dated within a few days of a month-end boundary are
 # considered "boundary" flows.  When the standard-window reconciliation
 # fails but the lag-adjusted reconciliation passes, the error is
 # attenuated by this factor.
 # NOTE(review): the previous comment said "same factor as broken months",
 # but this value differs from BROKEN_MONTH_ATTENUATION — confirm which one
 # is intended.
 LAG_ATTENUATION = 0.1  # reduce error to 10% on likely lag months
 # ── Extended surgery window ────────────────────────────────────
 # When no good candidate is found at t-1, surgery looks back up to
 # MAX_SURGERY_LOOKBACK months.  For every extra month k the composite
 # score is multiplied by a decreasing confidence factor:
 # confidence(k) = 1 - (k-1)/MAX_SURGERY_LOOKBACK.
 # Carmignac suggests 6 months (maximum resolution delay of asymmetric
 # transfers, tied to the quarterly retrocession payment cycle).
 # This constant used to be assigned twice in this section with the same
 # value; it is now defined once, here.
 MAX_SURGERY_LOOKBACK = 6
 # Registrar / region labels excluded from the reference universe.
 EXCLUDE_REGISTRAR = ["Off Distribution", "Private Clients"]
 # ─────────────────────────────────────────────
 # CHARGEMENT
 # ─────────────────────────────────────────────
 def load_broken_months(broken_months_path):
    """
    Load the broken-months CSV produced by carmignac_diagnostics.py.

    Parameters
    ----------
    broken_months_path : str or None
        Path to the CSV.  The file must contain ``date`` and ``isin``
        columns; ``is_lag`` is optional.

    Returns
    -------
    (set, set)
        ``broken`` : (Timestamp, isin-as-str) tuples, one per CSV row.
        ``lags``   : the subset of those tuples whose ``is_lag`` value is
                     True.  Empty when the column is absent.
        Both sets are empty when the path is falsy, does not exist, or the
        file cannot be read (best-effort: the error is printed, not raised).
    """
    if not broken_months_path or not os.path.exists(broken_months_path):
        print("Could not find the path")
        return set(), set()
    try:
        df = pd.read_csv(broken_months_path, parse_dates=["date"])
        dates = pd.to_datetime(df["date"])
        isins = df["isin"].astype(str)
        broken = set(zip(dates, isins))
        if "is_lag" in df.columns:
            # Build a real boolean mask: an empty or NaN-containing column is
            # read as object dtype, which is not a valid boolean indexer.
            is_lag = df["is_lag"].eq(True)
            lags = set(zip(dates[is_lag], isins[is_lag]))
        else:
            # A file without the lag flag still carries valid broken months;
            # do not throw them away because of the missing column.
            lags = set()
        print(
            f"[Broken months] Loaded {len(broken)} flagged (isin, month) pairs "
            f"({len(lags)} likely lags)"
        )
        return broken, lags
    except Exception as e:
        # Deliberate best-effort: a malformed file disables attenuation
        # instead of aborting the caller.
        print(f"[Broken months] Could not load '{broken_months_path}': {e}")
        return set(), set()
 # ─────────────────────────────────────────────
 # ÉTAPE 1 — Univers de référence à T_REF
 # ─────────────────────────────────────────────
 def build_reference_universe(aum, t_ref=None):
    """
    Build the reference universe at ``t_ref`` (latest AUM date by default).

    Rows whose ``reg_id`` (or ``region``, when that column exists) is listed
    in EXCLUDE_REGISTRAR are dropped first.

    Returns a 4-tuple:
      - aum_ref  : AUM rows at t_ref (reg_id, isin, qty_aum, val_eur), not
                   restricted to the retained universe
      - weights  : {reg_id → share of the universe's total val_eur}
      - universe : set of reg_id whose total val_eur at t_ref >= MIN_AUM_EUR
      - t_ref    : the reference date actually used
    """
    ref_date = aum["date"].max() if t_ref is None else t_ref
    print(f"\n[Étape 1] Date de référence : {ref_date.date()}")

    # Exclusion by registrar name and, when available, by region.
    excluded = aum["reg_id"].isin(EXCLUDE_REGISTRAR)
    if "region" in aum.columns:
        excluded = excluded | aum["region"].isin(EXCLUDE_REGISTRAR)

    # Snapshot at the reference date.
    snapshot_rows = ~excluded & (aum["date"] == ref_date)
    aum_ref = aum.loc[snapshot_rows, ["reg_id", "isin", "qty_aum", "val_eur"]].copy()

    # Total EUR per registrar, then the size filter.
    totals = aum_ref.groupby("reg_id")["val_eur"].sum().rename("total_eur")
    retained = totals[totals >= MIN_AUM_EUR]
    universe = set(retained.index)

    total_eur_universe = retained.sum()
    total_eur_all = totals.sum()
    if total_eur_all > 0:
        coverage = total_eur_universe / total_eur_all
    else:
        coverage = 0

    print(f"  Registrar IDs à t_ref          : {len(totals)}")
    print(f"  Dont >= {MIN_AUM_EUR / 1e6:.0f}M€                : {len(universe)}")
    print(f"  Couverture encours              : {coverage:.1%}")

    # Initial weights double as the scores at t_ref.
    weights = (retained / total_eur_universe).to_dict()
    return aum_ref, weights, universe, ref_date
 # ─────────────────────────────────────────────
 # 3. PANEL AUM MENSUEL (forward-fill)
 # ─────────────────────────────────────────────
 def build_monthly_panel(aum, universe, t_ref):
    """
    Construit un panel mensuel complet (forward-fill des quantités AUM)
    pour TOUS les reg_ids présents dans l'historique AUM — y compris les codes
    historiques hors univers de référence, nécessaires pour la chirurgie.
    """
    # Toutes les fin de mois entre la première date et t_ref
    date_min = aum["date"].min()
    all_months = pd.date_range(start=date_min, end=t_ref, freq="ME")
    # Pivot : (reg_id, isin) → série temporelle de qty_aum
    aum_sorted = aum.sort_values(["reg_id", "isin", "date"])
    # On ne garde que les lignes jusqu'à t_ref
    aum_sorted = aum_sorted[aum_sorted["date"] <= t_ref]
    # Multi-index pivot
    panel = aum_sorted.pivot_table(
        index="date", columns=["reg_id", "isin"], values="qty_aum", aggfunc="last"
    )
    # Réindexer sur toutes les fins de mois
    panel = panel.reindex(all_months)
    # Forward-fill : si pas de mouvement, la quantité reste la même
    panel = panel.ffill()
    # Backward-fill initial pour les comptes qui démarrent après la première date
    # (on ne remonte pas avant leur première apparition → on garde NaN)
    print(
        f"\n[Panel mensuel] {len(all_months)} mois, {panel.shape[1]} (reg_id, isin) paires"
    )
    return panel, all_months
 # ─────────────────────────────────────────────
 # 4. FLOWS AGRÉGÉS PAR MOIS
 # ─────────────────────────────────────────────
 def aggregate_flows_monthly(flows, all_months, lag_days=3):
    """
    Aggregate intra-month flows on each window ]month_end(t-1), month_end(t)].

    Parameters
    ----------
    flows : DataFrame with columns ``date``, ``reg_id``, ``isin``, ``qty_net``.
    all_months : month-end dates, assumed sorted ascending and tz-naive.
    lag_days : widening (in days) applied to both bounds of the lag window.

    Returns
    -------
    (monthly_flows, monthly_flows_lag)
        Two DataFrames with columns ``date`` (month-end), ``reg_id``,
        ``isin``, ``qty_net_month``:
          - monthly_flows     : standard aggregation (exact window)
          - monthly_flows_lag : aggregation on the widened window
            ]month_end(t-1) - lag_days, month_end(t) + lag_days], used to
            detect breaks caused by a month-end accounting lag.

    A flow is attributed to the FIRST month whose window contains it, so in
    the lag variant a flow falling in two overlapping windows goes to the
    earlier month.  Flows dated after the last month-end are ignored, and
    flows that match no window are dropped.

    The month assignment uses a sorted search instead of scanning every
    month for every flow row, so cost is O(n log m) rather than O(n · m).
    """
    months = pd.DatetimeIndex(all_months)
    flows_f = flows[flows["date"] <= months[-1]]

    flow_dates = pd.to_datetime(flows_f["date"]).to_numpy().astype("datetime64[ns]")
    month_values = months.to_numpy().astype("datetime64[ns]")
    # Lower bound of each standard window: the month-end preceding m.
    window_open = pd.DatetimeIndex([m - pd.offsets.MonthEnd(1) for m in months])

    def _assign_month(lower_offset=0, upper_offset=0):
        """Month-end owning each flow date (NaT when no window contains it)."""
        lo = (
            (window_open - pd.Timedelta(days=lower_offset))
            .to_numpy()
            .astype("datetime64[ns]")
        )
        hi = (
            (months + pd.Timedelta(days=upper_offset))
            .to_numpy()
            .astype("datetime64[ns]")
        )
        # First window whose upper bound is >= the flow date.  Because the
        # lower bounds are increasing too, if that window does not contain
        # the date no later window can.
        pos = np.searchsorted(hi, flow_dates, side="left")
        pos = np.minimum(pos, len(months) - 1)
        inside = (lo[pos] < flow_dates) & (flow_dates <= hi[pos])
        assigned = month_values[pos]
        assigned[~inside] = np.datetime64("NaT")
        return assigned

    def _sum_by_month(month_end):
        """Sum qty_net per (month_end, reg_id, isin), dropping unassigned rows."""
        tagged = flows_f.assign(month_end=month_end).dropna(subset=["month_end"])
        out = (
            tagged.groupby(["month_end", "reg_id", "isin"])["qty_net"]
            .sum()
            .reset_index()
        )
        out.columns = ["date", "reg_id", "isin", "qty_net_month"]
        return out

    # Standard window
    monthly_flows = _sum_by_month(_assign_month())
    # Lag window (±lag_days around each month-end)
    monthly_flows_lag = _sum_by_month(
        _assign_month(lower_offset=lag_days, upper_offset=lag_days)
    )
    print(
        f"\n[Flows mensuels] {len(monthly_flows)} enregistrements (standard) | "
        f"{len(monthly_flows_lag)} (lag window ±{lag_days}d)"
    )
    return monthly_flows, monthly_flows_lag
 # ─────────────────────────────────────────────
 # ÉTAPE 2 — Score de cohérence temporelle
 # ─────────────────────────────────────────────
 def compute_reconciliation_error(qty_t_minus1, qty_t, net_flow, alpha=ALPHA):
    """
    Normalised reconciliation error for one (reg_id, isin, month).

    Expected identity : qty_t_minus1 + net_flow ≈ qty_t
    Error             : |qty_t_minus1 + net_flow - qty_t|
                        / max(|qty_t|, |qty_t_minus1|, 1e-9)
    (the 1e-9 floor avoids a division by zero when both stocks are 0).

    Returns (err_ratio, is_break):
      - err_ratio : relative error (0 = perfect reconciliation)
      - is_break  : True when err_ratio > alpha
    """
    expected_qty = qty_t_minus1 + net_flow
    scale = max(abs(qty_t), abs(qty_t_minus1), 1e-9)
    err_ratio = abs(expected_qty - qty_t) / scale
    is_break = err_ratio > alpha
    return err_ratio, is_break
 def score_propagation(
    panel,
    monthly_flows,
    monthly_flows_lag,
    weights,
    universe,
    all_months,
    broken_months=None,
    lag_months=None,
 ):
    """
    Propagate account scores backwards in time, from t_ref to the first month.

    Walking back one month at a time, for every reg_id of the universe:
      - compute the reconciliation error of each ISIN, weighted by |qty| at t
      - attenuate the error when the ISIN shows a symmetric break that month,
        or when the (month, isin) pair is flagged as broken / lag
      - degrade the score: score(t-1) = score(t) * (1 - min(weighted_err, 1))

    Parameters
    ----------
    panel : month-end DataFrame with (reg_id, isin) MultiIndex columns.
    monthly_flows, monthly_flows_lag : DataFrames with columns ``date``,
        ``reg_id``, ``isin``, ``qty_net_month`` (standard / lag window).
    weights : {reg_id → score at t_ref}.
    universe : reg_ids to follow.
    all_months : month-end dates, oldest first.
    broken_months : set of (date, isin) pairs flagged as broken by diagnostics.
    lag_months : set of (date, isin) pairs identified as likely accounting lags.

    Returns
    -------
    (scores_history, errors_history, mapping)
      - scores_history : dict {date → {reg_id → score}}
      - errors_history : dict {date → {reg_id → weighted error}}
      - mapping        : dict {reg_id → reg_id}; this pass never performs
                         surgery, so it is the identity on ``universe``.
    """
    broken_months = broken_months or set()
    lag_months = lag_months or set()
    # Initial state: the scores at t_ref are the reference weights.
    scores = dict(weights)
    scores_history = {all_months[-1]: dict(scores)}
    errors_history = {}
    # Current mapping (identity; never modified in this pass).
    mapping = {r: r for r in universe}
    # Flows indexed for direct (date, reg_id, isin) lookup.
    flows_idx = monthly_flows.set_index(["date", "reg_id", "isin"])["qty_net_month"]
    flows_idx_lag = monthly_flows_lag.set_index(["date", "reg_id", "isin"])[
        "qty_net_month"
    ]

    def _net_flow(series, key, default=0.0):
        """Monthly net flow stored under ``key``; ``default`` when absent."""
        try:
            return series.loc[key]
        except KeyError:
            return default

    # get_level_values(0) yields one entry PER COLUMN, i.e. a registrar
    # holding k ISINs appears k times.  Iterating over it directly visited
    # every (reg, isin) pair k times and inflated that registrar's residuals
    # in the symmetry test, so work on the de-duplicated registrar list.
    panel_regs = list(panel.columns.get_level_values(0).unique())
    panel_reg_set = set(panel_regs)

    # Walk back in time.
    for i in range(len(all_months) - 2, -1, -1):
        t_prev = all_months[i]
        t_curr = all_months[i + 1]
        # ── Symmetric-break detection for this time step ──────────────
        # A symmetric break = one account loses X units on an ISIN while
        # another gains X.  For every ISIN, sum the signed and the absolute
        # reconciliation residuals of the accounts that break (error >=
        # ALPHA); if the net residual is small relative to the gross one,
        # the individual breaks compensate each other.
        isin_residuals = {}
        isin_total_abs = {}
        for reg in panel_regs:
            reg_panel = panel[reg]
            for isin in reg_panel.columns:
                q_t = reg_panel[isin].get(t_curr, np.nan)
                q_prev = reg_panel[isin].get(t_prev, np.nan)
                if pd.isna(q_t) or pd.isna(q_prev):
                    continue
                f = _net_flow(flows_idx, (t_curr, reg, isin))
                residual = (q_t - q_prev) - f
                denom = max(abs(q_t), abs(q_prev), 1e-9)
                err = abs(residual) / denom
                if err < ALPHA:
                    continue  # no break on this account / ISIN
                isin_residuals[isin] = isin_residuals.get(isin, 0.0) + residual
                isin_total_abs[isin] = isin_total_abs.get(isin, 0.0) + abs(residual)
        # An ISIN is "symmetric" when the net residual is below 20% of the
        # gross residual (individual errors largely cancel out).
        symmetric_isins = set()
        for isin, net in isin_residuals.items():
            total = isin_total_abs.get(isin, 0.0)
            if total > 0 and abs(net) / total < 0.20:
                symmetric_isins.add(isin)

        errors_at_t = {}
        new_scores = {}
        for reg_orig, reg_curr in mapping.items():
            score_curr = scores.get(reg_orig, 0)
            if score_curr == 0:
                new_scores[reg_orig] = 0
                continue
            if reg_curr not in panel_reg_set:
                # reg_curr is absent from the panel altogether → total break
                new_scores[reg_orig] = 0
                errors_at_t[reg_orig] = 1.0
                continue
            reg_panel = panel[reg_curr]
            total_aum_t = 0
            weighted_err = 0
            valid_isin_count = 0
            for isin in reg_panel.columns.tolist():
                qty_t = reg_panel[isin].get(t_curr, np.nan)
                qty_t_prev = reg_panel[isin].get(t_prev, np.nan)
                if pd.isna(qty_t):
                    continue
                if pd.isna(qty_t_prev):
                    # ISIN held at t_curr but unknown at t_prev → break on
                    # this ISIN, counted as a maximal error weighted by |qty|.
                    weight_isin = abs(qty_t)
                    weighted_err += 1.0 * weight_isin
                    total_aum_t += weight_isin
                    valid_isin_count += 1
                    continue
                if qty_t == 0 and qty_t_prev == 0:
                    continue
                # Flow aggregated on ]t_prev, t_curr]
                net_flow = _net_flow(flows_idx, (t_curr, reg_curr, isin))
                err_ratio, _ = compute_reconciliation_error(
                    qty_t_prev, qty_t, net_flow, alpha=ALPHA
                )
                # ── Attenuation on broken / lag / symmetric months ───
                # Priority: symmetric > broken > lag
                if err_ratio > 0:
                    key = (t_curr, isin)
                    if isin in symmetric_isins:
                        # Break compensated at the aggregate level (case 1
                        # or 3): no net data loss → strong attenuation.
                        err_ratio = err_ratio * SYMMETRY_ATTENUATION
                    elif key in broken_months or key in lag_months:
                        # Try the lag-window flow to distinguish a lag from
                        # a genuine gap; fall back to the standard flow.
                        net_flow_lag = _net_flow(
                            flows_idx_lag, (t_curr, reg_curr, isin), default=net_flow
                        )
                        err_lag, _ = compute_reconciliation_error(
                            qty_t_prev, qty_t, net_flow_lag, alpha=ALPHA
                        )
                        # Use whichever flow window gives the smaller error,
                        # then attenuate the result.
                        best_err = min(err_ratio, err_lag)
                        # NOTE(review): when both sets come from
                        # load_broken_months, lag_months is a subset of
                        # broken_months, so LAG_ATTENUATION is never selected
                        # here — confirm the intended priority.
                        attenuation = (
                            BROKEN_MONTH_ATTENUATION
                            if key in broken_months
                            else LAG_ATTENUATION
                        )
                        err_ratio = best_err * attenuation
                # Weight by |qty| at t_curr
                weight_isin = abs(qty_t)
                weighted_err += err_ratio * weight_isin
                total_aum_t += weight_isin
                valid_isin_count += 1
            if total_aum_t > 0 and valid_isin_count > 0:
                avg_err = weighted_err / total_aum_t
            else:
                avg_err = 0.0
            errors_at_t[reg_orig] = avg_err
            # Score degradation: score(t-1) = score(t) * (1 - weighted error),
            # with the error capped at 1 so the score never turns negative.
            degradation = min(avg_err, 1.0)
            new_scores[reg_orig] = score_curr * (1.0 - degradation)
        scores = new_scores
        scores_history[t_prev] = dict(scores)
        errors_history[t_prev] = dict(errors_at_t)
        total_score = sum(scores.values())
        print(
            f"  {t_prev.date()} | Σ scores = {total_score:.4f} | "
            f"Comptes actifs = {sum(1 for v in scores.values() if v > 0)}"
        )
    return scores_history, errors_history, mapping
 # ─────────────────────────────────────────────
 # ÉTAPE 3 — Chirurgie de code
 # ─────────────────────────────────────────────
 def jaccard_isin(set_a, set_b):
    """
    Jaccard coefficient |A ∩ B| / |A ∪ B| between two sets of ISINs.

    Returns 0.0 when either set is empty.
    """
    if set_a and set_b:
        union_size = len(set_a | set_b)
        if union_size > 0:
            return len(set_a & set_b) / union_size
    return 0.0
 def find_best_candidate(
    reg_orig,
    reg_curr,
    t_prev,
    t_curr,
    panel,
    flows_idx,
    all_regs_at_t_prev,
    mapping_inv,
 ):
    """
    Search the best replacement code for an account whose score dropped.

    A candidate j (taken from ``all_regs_at_t_prev``) is eligible when:
      - it is not already a key of ``mapping_inv`` (already mapped),
      - it is not ``reg_curr`` itself and it exists in the panel,
      - its non-zero ISIN set at t_prev has a Jaccard similarity of at least
        MIN_JACCARD with the non-zero ISIN set of ``reg_curr`` at t_curr.

    Eligible candidates are ranked by
        composite = jaccard * (1 - min(weighted reconciliation error, 1))
    where the error is computed on the shared ISINs (stock of j at t_prev,
    flows of j, stock of reg_curr at t_curr) and weighted by |qty| at t_curr.

    ``reg_orig`` is not used by the computation.

    Returns (best_candidate, best_composite), or (None, 0.0) when nothing
    qualifies.
    """
    known_regs = panel.columns.get_level_values(0)
    if reg_curr not in known_regs:
        return None, 0.0

    # ISINs actually held (non-NaN, non-zero) by the target account at t_curr.
    target = panel[reg_curr]
    target_row = target.loc[t_curr]
    isin_curr = set(target.columns[target_row.notna() & (target_row != 0)].tolist())
    if not isin_curr:
        return None, 0.0

    best_candidate, best_composite = None, 0.0
    for j in all_regs_at_t_prev:
        # Skip codes already mapped, the account itself, and unknown codes.
        if j in mapping_inv or j == reg_curr or j not in known_regs:
            continue

        # ISINs held by j at t_prev.
        col_j = panel[j]
        if t_prev not in col_j.index:
            continue
        row_j = col_j.loc[t_prev]
        isin_j = set(col_j.columns[row_j.notna() & (row_j != 0)].tolist())
        if not isin_j:
            continue

        jac = jaccard_isin(isin_curr, isin_j)
        if jac < MIN_JACCARD:
            continue

        # Reconciliation error restricted to the ISINs both accounts hold.
        common_isin = isin_curr & isin_j
        total_aum = 0
        weighted_err = 0
        for isin in common_isin:
            qty_t = target[isin].get(t_curr, np.nan)
            qty_t_prev = col_j[isin].get(t_prev, np.nan)
            if pd.isna(qty_t) or pd.isna(qty_t_prev):
                continue
            try:
                net_flow = flows_idx.loc[(t_curr, j, isin)]
            except KeyError:
                net_flow = 0.0
            err_ratio, _ = compute_reconciliation_error(qty_t_prev, qty_t, net_flow)
            weight_isin = abs(qty_t)
            weighted_err += err_ratio * weight_isin
            total_aum += weight_isin

        if total_aum > 0:
            avg_err = weighted_err / total_aum
        else:
            avg_err = 1.0
        composite = jac * (1.0 - min(avg_err, 1.0))
        if composite > best_composite:
            best_candidate, best_composite = j, composite

    return best_candidate, best_composite
 def _recompute_score_with_candidate(
    reg_orig, candidate, t_prev, t_curr, panel, flows_idx, score_curr
 ):
    """
    Score obtained if ``candidate`` is used as the past identity of ``reg_orig``.

    For every ISIN held (non-NaN, non-zero) by ``reg_orig`` at t_curr, the
    stock of ``candidate`` at t_prev plus the candidate's flow for t_curr is
    reconciled against the stock of ``reg_orig`` at t_curr.  An ISIN the
    candidate does not hold at t_prev counts as a full error (1.0).  Errors
    are weighted by |qty| at t_curr.

    Returns score_curr * (1 - min(weighted error, 1)); the result is
    score_curr * 0 when the candidate is absent from the panel, and 0-scaled
    as well (error 1.0) when no ISIN could be evaluated.
    """
    panel_regs = panel.columns.get_level_values(0)
    if candidate not in panel_regs:
        return score_curr * 0  # unknown candidate

    cand_frame = panel[candidate]
    if reg_orig in panel_regs:
        orig_frame = panel[reg_orig]
        orig_isins = orig_frame.columns.tolist()
    else:
        orig_frame = None
        orig_isins = []

    total_aum = 0
    weighted_err = 0
    for isin in orig_isins:
        qty_t = orig_frame[isin].get(t_curr, np.nan)
        if pd.isna(qty_t) or qty_t == 0:
            continue

        if isin in cand_frame.columns:
            qty_t_prev = cand_frame[isin].get(t_prev, np.nan)
        else:
            qty_t_prev = np.nan

        try:
            net_flow = flows_idx.loc[(t_curr, candidate, isin)]
        except KeyError:
            net_flow = 0.0

        if not pd.isna(qty_t_prev):
            err_ratio, _ = compute_reconciliation_error(qty_t_prev, qty_t, net_flow)
        else:
            err_ratio = 1.0

        weight_isin = abs(qty_t)
        weighted_err += err_ratio * weight_isin
        total_aum += weight_isin

    if total_aum > 0:
        avg_err = weighted_err / total_aum
    else:
        avg_err = 1.0
    return score_curr * (1.0 - min(avg_err, 1.0))
 def run_surgery_pass(
    scores_history,
    errors_history,
    panel,
    monthly_flows,
    monthly_flows_lag,
    weights,
    universe,
    all_months,
    broken_months=None,
    lag_months=None,
 ):
    """
    Second pass: walk back in time and, for every account whose score would
    drop by more than SCORE_DROP_THRESHOLD, try a "code surgery" (re-attach
    the account to another registrar code) and recompute its score.

    Behaviour:
      - the no-surgery error of each month comes from ``errors_history``
      - candidates are all panel codes not currently mapped, plus the current
        code itself; they must share at least MIN_JACCARD of their ISINs
      - the search starts at t-1 and extends up to MAX_SURGERY_LOOKBACK
        months back, with confidence(k) = 1 - (k-1)/MAX_SURGERY_LOOKBACK;
        it stops at the first distance yielding an improvement
      - a surgery is applied only if it beats the no-surgery score
      - candidates are scanned in a fixed order (sorted by their string
        form), so ties are resolved identically on every run

    ``scores_history``, ``monthly_flows_lag``, ``broken_months`` and
    ``lag_months`` are accepted for interface compatibility and are not used
    by this pass.

    Returns a 4-tuple:
      - mapping_history          : {date → {reg_orig → reg_used}}
      - surgery_log              : list of dicts, one per surgery performed
      - scores                   : {reg_orig → score} at the earliest month
      - scores_history_corrected : {date → {reg_orig → score}} after surgery
    """
    flows_idx = monthly_flows.set_index(["date", "reg_id", "isin"])["qty_net_month"]
    # Every reg_id present in the panel (universe + historical codes).
    all_regs_in_panel = set(panel.columns.get_level_values(0))
    # Pre-computation for the fast pre-filter:
    # {reg_id → {date → set of ISINs held (non-NaN, non-zero)}}
    reg_isin_at_date = {}
    for reg in all_regs_in_panel:
        reg_isin_at_date[reg] = {}
        col = panel[reg]
        for date in col.index:
            row = col.loc[date]
            active = set(col.columns[(row.notna()) & (row != 0)].tolist())
            if active:
                reg_isin_at_date[reg][date] = active
    # Current mapping: reg_orig → reg_used, and its inverse.
    mapping = {r: r for r in universe}
    mapping_inv = {r: r for r in universe}
    surgery_log = []
    mapping_history = {all_months[-1]: dict(mapping)}
    scores_history_corrected = {all_months[-1]: dict(weights)}
    # Current scores (initialised at t_ref).
    scores = dict(weights)
    for i in range(len(all_months) - 2, -1, -1):
        t_prev = all_months[i]
        t_curr = all_months[i + 1]
        new_scores = {}
        new_mapping = {}
        n_surgeries = 0  # surgeries performed for this month
        for reg_orig in list(mapping.keys()):
            reg_curr = mapping[reg_orig]
            score_curr = scores.get(reg_orig, 0)
            if score_curr == 0:
                new_scores[reg_orig] = 0
                new_mapping[reg_orig] = reg_curr
                continue
            # Error without surgery (from step 2).
            err = errors_history.get(t_prev, {}).get(reg_orig, 0.0)
            score_prev_no_surgery = score_curr * (1.0 - min(err, 1.0))
            drop_ratio = (
                1.0 - (score_prev_no_surgery / score_curr) if score_curr > 0 else 0
            )
            if drop_ratio > SCORE_DROP_THRESHOLD:
                # ── ISINs of the current code at t_curr (for the pre-filter) ──
                isin_curr = reg_isin_at_date.get(reg_curr, {}).get(t_curr, set())
                # ── Available candidates ──
                # Codes already mapped to another account are excluded, but
                # reg_curr itself stays a valid candidate (self-mapping).
                # Sorted so that the scan order — and therefore the winner
                # among equally scored candidates — does not depend on set
                # iteration order.
                available = sorted(
                    (all_regs_in_panel - set(mapping_inv.keys())) | {reg_curr},
                    key=str,
                )
                best_candidate = None
                best_score_after = score_prev_no_surgery  # baseline = no surgery
                best_composite = 0.0
                best_lookback = 0  # months walked back to find the candidate
                # ── Extended search window: up to MAX_SURGERY_LOOKBACK months ──
                # Try t-1 first (k=1), then t-2 … t-MAX if nothing was found.
                # Confidence decreases with distance: 1 - (k-1)/MAX.
                for k in range(1, MAX_SURGERY_LOOKBACK + 1):
                    if i - (k - 1) < 0:
                        break  # reached the beginning of the history
                    t_lookup = all_months[
                        i - (k - 1)
                    ]  # candidate date = t_prev - (k-1) months
                    confidence = 1.0 - (k - 1) / MAX_SURGERY_LOOKBACK
                    for j in available:
                        # Fast pre-filter: minimal ISIN overlap.
                        isin_j = reg_isin_at_date.get(j, {}).get(t_lookup, set())
                        if not isin_curr or not isin_j:
                            continue
                        inter = len(isin_curr & isin_j)
                        if inter == 0:
                            continue
                        jac = inter / len(isin_curr | isin_j)
                        if jac < MIN_JACCARD:
                            continue
                        # Score after surgery with this candidate at t_lookup
                        # (t_curr is the stock reference, t_lookup the prior).
                        score_after_raw = _recompute_score_with_candidate(
                            reg_curr, j, t_lookup, t_curr, panel, flows_idx, score_curr
                        )
                        # Apply the distance-based confidence factor.
                        score_after = (
                            score_curr * confidence * (score_after_raw / score_curr)
                            if score_curr > 0
                            else score_after_raw
                        )
                        composite = (
                            jac * confidence * (score_after_raw / score_curr)
                            if score_curr > 0
                            else 0
                        )
                        if score_after > best_score_after:
                            best_score_after = score_after
                            best_candidate = j
                            best_composite = composite
                            best_lookback = k
                    # Stop at the first distance that produced a candidate.
                    if best_candidate is not None:
                        break
                # Compare against None: a falsy registrar code (e.g. 0) is a
                # legitimate candidate and must not be discarded.
                if best_candidate is not None:
                    lookback_note = (
                        f", lookback={best_lookback}m" if best_lookback > 1 else ""
                    )
                    surgery_log.append(
                        {
                            "date": t_prev,
                            "reg_orig": reg_orig,
                            "reg_from": reg_curr,
                            "reg_to": best_candidate,
                            "jaccard_composite": round(best_composite, 4),
                            "score_before": round(score_curr, 6),
                            "score_after": round(best_score_after, 6),
                            "drop_without_surgery": round(drop_ratio, 4),
                            "gain_vs_no_surgery": round(
                                best_score_after - score_prev_no_surgery, 6
                            ),
                            "lookback_months": best_lookback,
                        }
                    )
                    n_surgeries += 1
                    print(
                        f"  🔧 CHIRURGIE {t_prev.date()} | {reg_orig} : "
                        f"{reg_curr} → {best_candidate} "
                        f"(composite={best_composite:.3f}, "
                        f"score {score_curr:.4f}→{best_score_after:.4f}"
                        f"{lookback_note})"
                    )
                    # Keep the inverse mapping in sync so the next accounts of
                    # this month cannot pick the code that was just taken.
                    if best_candidate != reg_curr:
                        if reg_curr in mapping_inv:
                            del mapping_inv[reg_curr]
                        mapping_inv[best_candidate] = reg_orig
                    new_mapping[reg_orig] = best_candidate
                    new_scores[reg_orig] = best_score_after
                else:
                    new_mapping[reg_orig] = reg_curr
                    new_scores[reg_orig] = score_prev_no_surgery
            else:
                new_mapping[reg_orig] = reg_curr
                new_scores[reg_orig] = score_prev_no_surgery
        mapping = new_mapping
        mapping_inv = {v: k for k, v in mapping.items()}
        scores = new_scores
        mapping_history[t_prev] = dict(mapping)
        scores_history_corrected[t_prev] = dict(scores)
        total_score = sum(s for s in scores.values() if not np.isnan(s))
        print(
            f"  {t_prev.date()} | Σ scores = {total_score:.4f} | "
            f"Chirurgies = {n_surgeries}"
        )
    return mapping_history, surgery_log, scores, scores_history_corrected
 # ─────────────────────────────────────────────
 # EXPORT RÉSULTATS
 # ─────────────────────────────────────────────
 def export_results(
    scores_history,
    mapping_history,
    surgery_log,
    all_months,
    out_prefix="carmignac",
    out_dir="repair_challenge/repair_results",
 ):
    """Export the key pipeline results (scores, mapping, surgery log) as CSV.

    Parameters
    ----------
    scores_history : dict
        Maps each date to a ``{reg_id: score}`` dict.
    mapping_history : dict
        Maps each date to a ``{reg_orig: reg_used}`` dict.
    surgery_log : list of dict
        One record per surgery operation. Records are sorted on their
        ``"date"`` key before being written. When the list is empty no
        surgery-log file is written at all.
    all_months
        Not used by this function; kept so existing callers keep working.
    out_prefix : str
        Prefix of the three output file names
        (``<prefix>_scores.csv``, ``<prefix>_mapping.csv``,
        ``<prefix>_surgery_log.csv``).
    out_dir : str or path-like
        Destination directory. It is created (with parents) when missing,
        so the export no longer fails on a fresh checkout.

    Returns
    -------
    tuple of (pandas.DataFrame, pandas.DataFrame)
        ``(df_scores, df_mapping)`` exactly as written to disk.
    """
    # Local import keeps this block self-contained whatever the file imports.
    import os

    # An empty out_dir means "current directory"; makedirs("") would raise.
    target_dir = os.fspath(out_dir) or "."
    os.makedirs(target_dir, exist_ok=True)
    scores_path = os.path.join(target_dir, f"{out_prefix}_scores.csv")
    mapping_path = os.path.join(target_dir, f"{out_prefix}_mapping.csv")
    surgery_path = os.path.join(target_dir, f"{out_prefix}_surgery_log.csv")

    # Score history: one row per (date, account).
    score_rows = [
        {"date": date, "reg_id": reg, "score": score}
        for date, sc in scores_history.items()
        for reg, score in sc.items()
    ]
    # Passing `columns` keeps the schema stable even when there are no rows.
    df_scores = pd.DataFrame(score_rows, columns=["date", "reg_id", "score"])
    if not df_scores.empty:
        # Chronological order, best score first within each date.
        df_scores = df_scores.sort_values(["date", "score"], ascending=[True, False])
    df_scores.to_csv(scores_path, index=False)

    # Mapping history: one row per (date, original account).
    mapping_rows = [
        {
            "date": date,
            "reg_orig": reg_orig,
            "reg_used": reg_used,
            "changed": reg_orig != reg_used,
        }
        for date, mp in mapping_history.items()
        for reg_orig, reg_used in mp.items()
    ]
    df_mapping = pd.DataFrame(
        mapping_rows, columns=["date", "reg_orig", "reg_used", "changed"]
    )
    if not df_mapping.empty:
        df_mapping = df_mapping.sort_values(["date", "reg_orig"])
    df_mapping.to_csv(mapping_path, index=False)

    # Surgery log: only written when at least one operation was recorded.
    if surgery_log:
        df_surgery = pd.DataFrame(surgery_log).sort_values("date")
        df_surgery.to_csv(surgery_path, index=False)
        print(f"\n[Export] {len(surgery_log)} opérations de chirurgie sauvegardées.")
    else:
        print("\n[Export] Aucune chirurgie effectuée sur ce subset.")
    print(f"[Export] Scores    → {out_prefix}_scores.csv")
    print(f"[Export] Mapping   → {out_prefix}_mapping.csv")
    return df_scores, df_mapping
 # ─────────────────────────────────────────────
 # PIPELINE PRINCIPAL
 # ─────────────────────────────────────────────
 def run_pipeline(
    broken_months_path="repair_challenge/alpha_5%/carmignac_broken_months.csv",
 ):
    """Run the Registrar-ID repair pipeline end to end and export its results.

    The stages run in this order: load data, load the broken-months file,
    build the reference universe, build the monthly panel, aggregate monthly
    flows, propagate scores without surgery, run the surgery pass, export,
    then print a final summary.

    Parameters
    ----------
    broken_months_path : str
        Path forwarded to ``load_broken_months``.

    Returns
    -------
    tuple
        ``(df_scores, df_mapping, surgery_log, scores_history_corrected,
        mapping_history)``.
    """
    banner = "=" * 60
    print(banner)
    print("CARMIGNAC — Pipeline de réparation des Registrar IDs")
    print(banner)
    # Load the raw inputs.
    aum, flows = load_data_repair()
    # Broken months (optional — produced by carmignac_diagnostics.py).
    broken_months, lag_months = load_broken_months(broken_months_path)
    # Step 1 — reference universe.
    aum_ref, weights, universe, t_ref = build_reference_universe(aum)
    print("\n  Top 5 comptes par poids :")
    heaviest_first = sorted(weights.items(), key=lambda item: -item[1])
    for account, weight in heaviest_first[:5]:
        percent = weight * 100
        print(f"    {account} : {weight:.4f} ({percent:.2f}%)")
    # Monthly panel.
    panel, all_months = build_monthly_panel(aum, universe, t_ref)
    # Aggregated monthly flows (standard + lag window).
    monthly_flows, monthly_flows_lag = aggregate_flows_monthly(flows, all_months)
    # Arguments shared by the scoring pass and the surgery pass.
    shared_args = (
        panel,
        monthly_flows,
        monthly_flows_lag,
        weights,
        universe,
        all_months,
    )
    shared_kwargs = {"broken_months": broken_months, "lag_months": lag_months}
    # Step 2 — coherence score (no surgery).
    print("\n[Étape 2] Propagation des scores (sans chirurgie)...")
    scores_history, errors_history, _ = score_propagation(
        *shared_args, **shared_kwargs
    )
    # Step 3 — surgery.
    print("\n[Étape 3] Passe de chirurgie...")
    surgery_outputs = run_surgery_pass(
        scores_history, errors_history, *shared_args, **shared_kwargs
    )
    mapping_history, surgery_log, _final_scores, scores_history_corrected = (
        surgery_outputs
    )
    # Export — the corrected (post-surgery) scores are the reference.
    print("\n[Export des résultats...]")
    exported = export_results(
        scores_history_corrected, mapping_history, surgery_log, all_months
    )
    df_scores, df_mapping = exported
    # Final summary.
    print("\n" + banner)
    print("RÉSUMÉ FINAL")
    print(banner)
    first_month = all_months[0]
    last_month = all_months[-1]
    print(
        f"  Dates couvertes        : {first_month.date()} → {last_month.date()}"
    )
    print(f"  Comptes dans l'univers : {len(universe)}")
    print(f"  Chirurgies effectuées  : {len(surgery_log)}")
    score_by_date = {}
    for date, per_account in scores_history_corrected.items():
        # `value == value` is False only for NaN, so NaN scores are skipped.
        score_by_date[date] = sum(
            value for value in per_account.values() if value == value
        )
    print(f"  Σ scores à t_ref       : {score_by_date[t_ref]:.4f}")
    print(f"  Σ scores à t_min       : {score_by_date[first_month]:.4f}")
    return df_scores, df_mapping, surgery_log, scores_history_corrected, mapping_history
 if __name__ == "__main__":
    # Script entry point: run the whole pipeline and keep its five outputs as
    # module-level names. The path argument is optional; run_pipeline's
    # default is the same file.
    (
        df_scores,
        df_mapping,
        surgery_log,
        scores_history,
        mapping_history,
    ) = run_pipeline(
        broken_months_path="repair_challenge/alpha_5%/carmignac_broken_months.csv"
    )
--- a/src/repair_challenge/diagnostics_results/alpha_2%/carmignac_diagnostics.html
+++ b/src/repair_challenge/diagnostics_results/alpha_2%/carmignac_diagnostics.html
--- a/src/repair_challenge/diagnostics_results/alpha_5%/carmignac_diagnostics.html
+++ b/src/repair_challenge/diagnostics_results/alpha_5%/carmignac_diagnostics.html
--- a/src/repair_challenge/helpers.py
+++ b/src/repair_challenge/helpers.py
--- a/src/repair_challenge/repair_results/carmignac_report.html
+++ b/src/repair_challenge/repair_results/carmignac_report.html
--- a/test_upload.py
+++ b/test_upload.py