2026-04-13 21:51:04 +02:00
7 changed files with 6620 additions and 809 deletions
--- a/.gitignore
+++ b/.gitignore
@ -1 +1,3 @@
-*.csv
+data/
+data_exploration/
+notebooks/
--- a/carmignac_broken_months.csv
+++ b/carmignac_broken_months.csv
--- a/carmignac_diagnostics.html
+++ b/carmignac_diagnostics.html
--- a/carmignac_diagnostics.py
+++ b/carmignac_diagnostics.py
@ -0,0 +1,962 @@
+"""
+Carmignac Data Challenge — Broken Months Diagnostics
+=====================================================
+Detects months where the aggregate stock-flow equation is violated
+at the ISIN level (across all accounts):
+
+    Σ_r Q_{r,s}(t) - Σ_r Q_{r,s}(t-1)  ≠  Σ_r F_{r,s}(t-1→t)
+
+The residual is the "missing flow":
+    missing_{s}(t) = [Q_agg(t) - Q_agg(t-1)] - F_agg(t)
+
+This is a market-level check, independent of individual account identity.
+It captures:
+  - Genuinely missing flow records
+  - End-of-month accounting lags (transactions dated at boundary)
+  - Corporate actions (dividends, splits) not reflected in flows
+
+Outputs
+-------
+  carmignac_broken_months.csv   — machine-readable, loaded by carmignac_repair.py
+  carmignac_diagnostics.html    — interactive HTML report
+
+Usage
+-----
+    python carmignac_diagnostics.py
+    python carmignac_diagnostics.py \\
+        --aum   raw_AUM.csv \\
+        --flows raw_flows.csv \\
+        --out   carmignac_broken_months.csv \\
+        --html  carmignac_diagnostics.html \\
+        --alpha 0.02
+"""
+
+import argparse
+import json
+import os
+import sys
+
+import numpy as np
+import pandas as pd
+
+
+# ─────────────────────────────────────────────────────────────
+# 1. LOAD
+# ─────────────────────────────────────────────────────────────
+
+def load_data(aum_path, flows_path):
+    aum   = pd.read_csv(aum_path,   parse_dates=["Centralisation Date"])
+    flows = pd.read_csv(flows_path, parse_dates=["Centralisation Date"])
+    aum["Product - Isin"]   = aum["Product - Isin"].astype(str)
+    flows["Product - Isin"] = flows["Product - Isin"].astype(str)
+    return aum, flows
+
+
+# ─────────────────────────────────────────────────────────────
+# 2. AGGREGATE AND DETECT BROKEN MONTHS
+# ─────────────────────────────────────────────────────────────
+
+def detect_broken_months(aum, flows, alpha=0.02, lag_days=3):
+    """
+    For each (isin, month-end t), compute:
+      - Q_agg(t)      : total shares held across all accounts
+      - Q_agg(t-1)    : idem previous month (forward-filled)
+      - F_agg(t)      : total net flows recorded in ]EOM(t-1), EOM(t)]
+      - missing(t)    : [Q_agg(t) - Q_agg(t-1)] - F_agg(t)
+      - missing_pct   : |missing| / max(Q_agg(t), Q_agg(t-1))
+
+    A month is flagged as "broken" when missing_pct > alpha.
+
+    Additionally, a month is flagged as a potential "lag" when:
+      - It is broken with the standard window
+      - But would NOT be broken if flows dated within lag_days of EOM
+        are shifted to the adjacent month
+
+    Parameters
+    ----------
+    alpha     : tolerance threshold (same as ALPHA in carmignac_repair.py)
+    lag_days  : number of boundary days to test for accounting lag
+
+    Returns
+    -------
+    df_broken   : DataFrame with all (isin, date) pairs where missing_pct > alpha
+    df_all      : Full DataFrame including non-broken months (for plotting)
+    """
+    # Monthly calendar
+    t_min = aum["Centralisation Date"].min()
+    t_max = aum["Centralisation Date"].max()
+    all_months = pd.date_range(t_min, t_max, freq="ME")
+
+    # ── Aggregate AUM per (isin, month-end) ──────────────────────
+    aum_agg = (
+        aum.groupby(["Product - Isin", "Centralisation Date"])["Quantity - AUM"]
+           .sum()
+           .reset_index()
+           .rename(columns={"Product - Isin": "isin",
+                             "Centralisation Date": "date",
+                             "Quantity - AUM": "qty_agg"})
+    )
+    # Forward-fill sparse panel
+    aum_pivot = aum_agg.pivot(index="date", columns="isin", values="qty_agg")
+    aum_pivot = aum_pivot.reindex(all_months).ffill()
+
+    # ── Aggregate flows per (isin, month-end) — standard window ──
+    def bucket_flows(flows_df, months, lower_offset=0, upper_offset=0):
+        """Aggregate flows with optional boundary extension (in days)."""
+        fc = flows_df.copy()
+        def assign_month(d):
+            # Extended window: ]EOM(t-1) - lower_offset, EOM(t) + upper_offset]
+            for m in months:
+                eom_prev = m - pd.offsets.MonthEnd(1)
+                lo = eom_prev - pd.Timedelta(days=lower_offset)
+                hi = m + pd.Timedelta(days=upper_offset)
+                if lo < d <= hi:
+                    return m
+            return pd.NaT
+
+        fc["month_end"] = fc["Centralisation Date"].apply(assign_month)
+        fc = fc.dropna(subset=["month_end"])
+        agg = (fc.groupby(["Product - Isin", "month_end"])["Quantity - NetFlows"]
+                 .sum()
+                 .reset_index()
+                 .rename(columns={"Product - Isin": "isin",
+                                  "month_end": "date",
+                                  "Quantity - NetFlows": "flow_agg"}))
+        return agg
+
+    flows_std = bucket_flows(flows, all_months)
+    flows_lag = bucket_flows(flows, all_months,
+                             lower_offset=lag_days,
+                             upper_offset=lag_days)
+
+    def flows_to_pivot(df, months):
+        piv = df.pivot(index="date", columns="isin", values="flow_agg")
+        return piv.reindex(months).fillna(0.0)
+
+    fpiv_std = flows_to_pivot(flows_std, all_months)
+    fpiv_lag = flows_to_pivot(flows_lag, all_months)
+
+    # ── Compute residuals ─────────────────────────────────────────
+    rows = []
+    isins = aum_pivot.columns.tolist()
+
+    for i in range(1, len(all_months)):
+        t_curr = all_months[i]
+        t_prev = all_months[i - 1]
+
+        for isin in isins:
+            q_curr = aum_pivot[isin].get(t_curr, np.nan) if isin in aum_pivot.columns else np.nan
+            q_prev = aum_pivot[isin].get(t_prev, np.nan) if isin in aum_pivot.columns else np.nan
+
+            if pd.isna(q_curr) or pd.isna(q_prev):
+                continue
+
+            delta = q_curr - q_prev
+
+            # Standard window
+            f_std = fpiv_std[isin].get(t_curr, 0.0) if isin in fpiv_std.columns else 0.0
+            missing_std = delta - f_std
+
+            # Extended lag window
+            f_lag = fpiv_lag[isin].get(t_curr, 0.0) if isin in fpiv_lag.columns else 0.0
+            missing_lag = delta - f_lag
+
+            # ── Denominator choice ────────────────────────────────
+            # Normalise by the size of the *movement* (max of delta_AUM
+            # and recorded flow), not by the stock level.  This avoids
+            # astronomically large percentages when a position is tiny
+            # but the missing flow is a normal-sized number.
+            #
+            # Interpretation: "what fraction of the expected movement
+            # is unaccounted for?"  100% = the entire movement is missing.
+            #
+            # A minimum absolute threshold (min_abs_shares) suppresses
+            # noise from residual micro-positions (rounding artefacts).
+            min_abs_shares = 1.0   # ignore positions smaller than 1 share
+            movement = max(abs(delta), abs(f_std), min_abs_shares)
+            denom_std = movement
+
+            movement_lag = max(abs(delta), abs(f_lag), min_abs_shares)
+            denom_lag = movement_lag
+
+            pct_std = abs(missing_std) / denom_std
+            pct_lag = abs(missing_lag) / denom_lag
+
+            broken_std = pct_std > alpha
+            broken_lag = pct_lag > alpha
+
+            # A "lag" month: broken with standard, NOT broken with extended window
+            is_lag = broken_std and (not broken_lag)
+
+            rows.append({
+                "date":         t_curr,
+                "isin":         isin,
+                "q_agg_prev":   round(q_prev, 3),
+                "q_agg_curr":   round(q_curr, 3),
+                "delta_aum":    round(delta, 3),
+                "flow_agg":     round(f_std, 3),
+                "missing_flow": round(missing_std, 3),
+                "missing_pct":  round(pct_std, 6),
+                "broken":       broken_std,
+                "is_lag":       is_lag,
+            })
+
+    df_all    = pd.DataFrame(rows)
+    df_broken = df_all[df_all["broken"]].sort_values("missing_pct", ascending=False)
+    return df_broken, df_all
+
+
+
+# ─────────────────────────────────────────────────────────────
+# 2b. AGGREGATE (CROSS-ISIN) BROKEN MONTHS
+# ─────────────────────────────────────────────────────────────
+
+def detect_aggregate_broken_months(aum, flows, alpha=0.02, lag_days=3):
+    """
+    Same stock-flow check as detect_broken_months, but aggregated
+    across ALL ISINs for each month:
+
+        Q_total(t) - Q_total(t-1)  !=  F_total(t)
+
+    where Q_total(t) = sum over all (reg_id, isin) of Q_{r,s}(t).
+
+    This catches months where the global portfolio is incoherent even
+    if every individual ISIN is fine (e.g. cross-ISIN netting errors),
+    and provides a cleaner high-level view.
+
+    Returns
+    -------
+    df_agg : DataFrame indexed by month with columns:
+        q_total_prev, q_total_curr, delta_aum, flow_total,
+        missing_flow, missing_pct, broken, is_lag
+    """
+    t_min = aum["Centralisation Date"].min()
+    t_max = aum["Centralisation Date"].max()
+    all_months = pd.date_range(t_min, t_max, freq="ME")
+
+    # ── Total AUM per month (all ISIN, all accounts) ─────────────
+    aum_monthly = (
+        aum.groupby("Centralisation Date")["Quantity - AUM"]
+           .sum()
+           .reindex(all_months)
+           .ffill()
+           .rename("q_total")
+    )
+
+    # ── Bucket flows helper (reuse same window logic) ─────────────
+    def bucket_total_flows(flows_df, months, lower_offset=0, upper_offset=0):
+        fc = flows_df.copy()
+        def assign_month(d):
+            for m in months:
+                eom_prev = m - pd.offsets.MonthEnd(1)
+                lo = eom_prev - pd.Timedelta(days=lower_offset)
+                hi = m + pd.Timedelta(days=upper_offset)
+                if lo < d <= hi:
+                    return m
+            return pd.NaT
+        fc["month_end"] = fc["Centralisation Date"].apply(assign_month)
+        fc = fc.dropna(subset=["month_end"])
+        return (fc.groupby("month_end")["Quantity - NetFlows"]
+                  .sum()
+                  .reindex(months)
+                  .fillna(0.0))
+
+    flow_std = bucket_total_flows(flows, all_months)
+    flow_lag = bucket_total_flows(flows, all_months,
+                                  lower_offset=lag_days, upper_offset=lag_days)
+
+    # ── Compute residuals ─────────────────────────────────────────
+    rows = []
+    min_abs_shares = 1.0
+
+    for i in range(1, len(all_months)):
+        t_curr = all_months[i]
+        t_prev = all_months[i - 1]
+
+        q_curr = aum_monthly.get(t_curr, np.nan)
+        q_prev = aum_monthly.get(t_prev, np.nan)
+        if pd.isna(q_curr) or pd.isna(q_prev):
+            continue
+
+        delta = q_curr - q_prev
+
+        f_std     = flow_std.get(t_curr, 0.0)
+        f_lag     = flow_lag.get(t_curr, 0.0)
+        miss_std  = delta - f_std
+        miss_lag  = delta - f_lag
+
+        movement_std = max(abs(delta), abs(f_std), min_abs_shares)
+        movement_lag = max(abs(delta), abs(f_lag), min_abs_shares)
+        pct_std = abs(miss_std) / movement_std
+        pct_lag = abs(miss_lag) / movement_lag
+
+        broken_std = pct_std > alpha
+        broken_lag = pct_lag > alpha
+        is_lag     = broken_std and (not broken_lag)
+
+        rows.append({
+            "date":          t_curr,
+            "q_total_prev":  round(q_prev, 3),
+            "q_total_curr":  round(q_curr, 3),
+            "delta_aum":     round(delta, 3),
+            "flow_total":    round(f_std, 3),
+            "missing_flow":  round(miss_std, 3),
+            "missing_pct":   round(pct_std, 6),
+            "broken":        broken_std,
+            "is_lag":        is_lag,
+        })
+
+    df_agg = pd.DataFrame(rows)
+    return df_agg
+
+# ─────────────────────────────────────────────────────────────
+# 3. PRINT SUMMARY
+# ─────────────────────────────────────────────────────────────
+
+def print_summary(df_broken, df_all, alpha):
+    total = len(df_all)
+    n_broken = len(df_broken)
+    n_lag    = df_broken["is_lag"].sum()
+
+    print("\n" + "=" * 60)
+    print("  CARMIGNAC — Broken Months Diagnostics")
+    print("=" * 60)
+    print(f"  (isin, month) pairs examined : {total}")
+    print(f"  Broken (missing_pct > {alpha:.0%})  : {n_broken} "
+          f"({n_broken/total*100:.1f}%)")
+    print(f"  Of which likely lag           : {n_lag}")
+    print(f"  Of which genuine gap          : {n_broken - n_lag}")
+
+    if n_broken:
+        print("\n  Top 10 by missing_pct:")
+        cols = ["date", "isin", "missing_flow", "missing_pct", "is_lag"]
+        print(df_broken[cols].head(10).to_string(index=False))
+
+    # Monthly breakdown
+    by_month = (df_broken.groupby("date")
+                         .agg(n_broken=("isin", "count"),
+                              total_missing=("missing_flow", lambda x: x.abs().sum()))
+                         .sort_values("n_broken", ascending=False)
+                         .head(5))
+    if len(by_month):
+        print("\n  Most affected months:")
+        print(by_month.to_string())
+    print()
+
+
+# ─────────────────────────────────────────────────────────────
+# 4. BUILD HTML REPORT
+# ─────────────────────────────────────────────────────────────
+
+def build_html(df_broken, df_all, df_agg, alpha):
+    # ── JS-ready data ────────────────────────────────────────────
+    # Timeline: n_broken and total_missing per month
+    tl = (df_all[df_all["broken"]]
+          .groupby("date")
+          .agg(n_broken=("isin", "count"),
+               total_missing=("missing_flow", lambda x: x.abs().sum()),
+               n_lag=("is_lag", "sum"))
+          .reindex(df_all["date"].sort_values().unique())
+          .fillna(0))
+    tl.index = pd.to_datetime(tl.index)
+    dates_str = json.dumps([d.strftime("%Y-%m-%d") for d in tl.index])
+
+    def jf(arr, dec=4):
+        return json.dumps([round(float(v), dec) if not np.isnan(v) else None for v in arr])
+
+    n_broken_js     = jf(tl["n_broken"].values, 0)
+    total_miss_js   = jf(tl["total_missing"].values)
+    n_lag_js        = jf(tl["n_lag"].values, 0)
+
+    # Aggregate (cross-ISIN) JS data
+    agg_dates_str  = json.dumps([d.strftime("%Y-%m-%d") for d in pd.to_datetime(df_agg["date"])])
+    agg_delta_js   = jf(df_agg["delta_aum"].values)
+    agg_flow_js    = jf(df_agg["flow_total"].values)
+    agg_missing_js = jf(df_agg["missing_flow"].values)
+    agg_pct_js     = jf((df_agg["missing_pct"] * 100).values)
+
+    # Aggregate KPIs
+    n_agg_broken = int(df_agg["broken"].sum())
+    n_agg_lag    = int(df_agg["is_lag"].sum())
+    n_agg_genuine = n_agg_broken - n_agg_lag
+    max_agg_pct  = float(df_agg["missing_pct"].max() * 100) if len(df_agg) else 0
+
+    # Aggregate detail table rows
+    agg_rows = []
+    for _, r in df_agg[df_agg["broken"]].iterrows():
+        lb  = '<span class="lag-badge">lag</span>' if r["is_lag"] else ""
+        pc  = "pct-high" if r["missing_pct"] > 0.1 else "pct-med"
+        ds  = r["date"].strftime("%Y-%m-%d") if hasattr(r["date"], "strftime") else str(r["date"])[:10]
+        mc  = "miss-neg" if r["missing_flow"] < 0 else "miss-pos"
+        agg_rows.append(
+            f'<tr><td>{ds}</td>'
+            f'<td class="mono right">{r["q_total_prev"]:,.1f}</td>'
+            f'<td class="mono right">{r["q_total_curr"]:,.1f}</td>'
+            f'<td class="mono right">{r["flow_total"]:,.1f}</td>'
+            f'<td class="mono right {mc}">{r["missing_flow"]:+,.1f}</td>'
+            f'<td class="mono right {pc}">{r["missing_pct"]*100:.2f}%</td>'
+            f'<td>{lb}</td></tr>'
+        )
+    agg_detail_rows = "".join(agg_rows) if agg_rows else (
+        '<tr><td colspan="7" style="padding:24px;text-align:center;'
+        'color:var(--success);font-family:var(--mono)">✓ No broken months at aggregate level</td></tr>'
+    )
+
+    # Per-ISIN summary
+    isin_sum = (df_broken.groupby("isin")
+                         .agg(n_months=("date", "count"),
+                              avg_pct=("missing_pct", "mean"),
+                              total_abs=("missing_flow", lambda x: x.abs().sum()))
+                         .sort_values("total_abs", ascending=False))
+
+    ISIN_COLORS = [
+        "#2563eb","#16a34a","#dc2626","#d97706","#7c3aed",
+        "#0891b2","#db2777","#65a30d","#ea580c","#6366f1",
+    ]
+
+    # Per-ISIN missing_pct timeseries for the top 5 ISINs
+    top_isins = isin_sum.head(5).index.tolist()
+    all_dates = sorted(df_all["date"].unique())
+    isin_ts_datasets = []
+    for idx, isin in enumerate(top_isins):
+        sub = df_all[df_all["isin"] == isin].set_index("date")["missing_pct"].reindex(all_dates).fillna(0)
+        isin_ts_datasets.append({
+            "label":           isin,
+            "data":            [round(float(v) * 100, 3) for v in sub.values],
+            "borderColor":     ISIN_COLORS[idx % len(ISIN_COLORS)],
+            "backgroundColor": ISIN_COLORS[idx % len(ISIN_COLORS)] + "22",
+            "borderWidth":     2,
+            "pointRadius":     0,
+            "tension":         0.3,
+            "fill":            False,
+        })
+    isin_ts_json  = json.dumps(isin_ts_datasets)
+    all_dates_str = json.dumps([d.strftime("%Y-%m-%d") if hasattr(d, 'strftime')
+                                 else str(d)[:10] for d in all_dates])
+
+    # Detail table rows
+    detail_rows = ""
+    for _, r in df_broken.head(200).iterrows():
+        lag_badge = '<span class="lag-badge">lag</span>' if r["is_lag"] else ""
+        pct_class = "pct-high" if r["missing_pct"] > 0.1 else "pct-med"
+        detail_rows += f"""
+        <tr>
+          <td>{r['date'].strftime('%Y-%m-%d') if hasattr(r['date'], 'strftime') else str(r['date'])[:10]}</td>
+          <td class="mono">{r['isin']}</td>
+          <td class="mono right">{r['q_agg_prev']:,.1f}</td>
+          <td class="mono right">{r['q_agg_curr']:,.1f}</td>
+          <td class="mono right">{r['flow_agg']:,.1f}</td>
+          <td class="mono right {'miss-neg' if r['missing_flow'] < 0 else 'miss-pos'}">{r['missing_flow']:+,.1f}</td>
+          <td class="mono right {pct_class}">{r['missing_pct']*100:.2f}%</td>
+          <td>{lag_badge}</td>
+        </tr>"""
+
+    # ISIN summary table
+    isin_rows = ""
+    for isin, row in isin_sum.iterrows():
+        isin_rows += f"""
+        <tr>
+          <td class="mono">{isin}</td>
+          <td class="mono right">{int(row['n_months'])}</td>
+          <td class="mono right">{row['avg_pct']*100:.2f}%</td>
+          <td class="mono right">{row['total_abs']:,.1f}</td>
+        </tr>"""
+
+    # KPIs
+    total       = len(df_all)
+    n_broken_kpi = len(df_broken)
+    n_lag_kpi    = int(df_broken["is_lag"].sum())
+    n_genuine    = n_broken_kpi - n_lag_kpi
+    max_pct      = df_broken["missing_pct"].max() * 100 if len(df_broken) else 0
+    n_isins      = df_broken["isin"].nunique()
+
+    no_broken_msg = ""
+    if n_broken_kpi == 0:
+        no_broken_msg = '<div class="no-broken">✓ No broken months detected at this threshold.</div>'
+
+    html = f"""<!DOCTYPE html>
+<html lang="en">
+<head>
+<meta charset="UTF-8">
+<meta name="viewport" content="width=device-width,initial-scale=1">
+<title>Carmignac — Broken Months Diagnostics</title>
+<script src="https://cdn.jsdelivr.net/npm/chart.js@4.4.0/dist/chart.umd.min.js"></script>
+<style>
+  @import url('https://fonts.googleapis.com/css2?family=IBM+Plex+Mono:wght@400;600&family=IBM+Plex+Sans:wght@300;400;600;700&display=swap');
+
+  :root {{
+    --bg:      #0d0f12; --surface: #151820; --border: #252a35;
+    --accent:  #3b82f6; --warn:    #f59e0b; --danger: #ef4444;
+    --success: #10b981; --text:    #e2e8f0; --muted:  #64748b;
+    --mono: 'IBM Plex Mono', monospace;
+    --sans: 'IBM Plex Sans', sans-serif;
+  }}
+  *, *::before, *::after {{ box-sizing: border-box; margin: 0; padding: 0; }}
+  body {{ font-family: var(--sans); background: var(--bg); color: var(--text);
+          padding: 0 0 60px; }}
+
+  .header {{ background: linear-gradient(135deg,#0d1117,#111827,#1a0a0a);
+             border-bottom: 1px solid var(--border); padding: 40px 48px 36px; }}
+  .header-eyebrow {{ font-family: var(--mono); font-size: 11px; letter-spacing:.15em;
+                     color: var(--danger); text-transform: uppercase; margin-bottom:10px; }}
+  .header h1 {{ font-size: 2rem; font-weight: 700; letter-spacing:-.02em; margin-bottom:8px; }}
+  .header-sub {{ font-size:.85rem; color: var(--muted); font-family: var(--mono); }}
+
+  .kpi-strip {{ display: grid; grid-template-columns: repeat(auto-fit,minmax(160px,1fr));
+               gap: 1px; background: var(--border); border-bottom: 1px solid var(--border); }}
+  .kpi {{ background: var(--surface); padding: 22px 28px;
+          display: flex; flex-direction: column; gap: 4px; }}
+  .kpi-label {{ font-size:.7rem; letter-spacing:.1em; text-transform:uppercase;
+                color: var(--muted); font-family: var(--mono); }}
+  .kpi-value {{ font-size:1.6rem; font-weight:700; font-family: var(--mono); line-height:1; }}
+  .kpi-value.danger  {{ color: var(--danger); }}
+  .kpi-value.warn    {{ color: var(--warn); }}
+  .kpi-value.success {{ color: var(--success); }}
+  .kpi-sub {{ font-size:.7rem; color: var(--muted); font-family: var(--mono); }}
+
+  .main {{ max-width:1400px; margin:0 auto; padding:36px 48px;
+           display:flex; flex-direction:column; gap:32px; }}
+
+  .card {{ background: var(--surface); border: 1px solid var(--border);
+           border-radius:8px; overflow:hidden; }}
+  .card-header {{ padding:18px 24px 14px; border-bottom:1px solid var(--border);
+                  display:flex; align-items:baseline; gap:12px; }}
+  .card-title {{ font-size:.8rem; font-weight:600; letter-spacing:.1em;
+                 text-transform:uppercase; color: var(--muted); font-family: var(--mono); }}
+  .card-desc {{ font-size:.78rem; color: #475569; }}
+  .card-body {{ padding:24px; }}
+  .chart-wrap {{ position:relative; height:260px; }}
+  .chart-wrap-tall {{ position:relative; height:320px; }}
+
+  .grid-2 {{ display:grid; grid-template-columns:1fr 1fr; gap:24px; }}
+  @media(max-width:900px) {{ .grid-2 {{ grid-template-columns:1fr; }}
+                              .main  {{ padding:24px 20px; }} }}
+
+  .section-label {{ font-family: var(--mono); font-size:.68rem; letter-spacing:.15em;
+                    text-transform:uppercase; color: var(--muted);
+                    padding-left:10px; border-left:3px solid var(--danger);
+                    margin-bottom:-8px; }}
+
+  table {{ width:100%; border-collapse:collapse; font-size:.82rem; }}
+  th {{ font-family: var(--mono); font-size:.68rem; letter-spacing:.08em;
+        text-transform:uppercase; color: var(--muted); padding:10px 14px;
+        text-align:left; border-bottom:1px solid var(--border); background:#0f1218; }}
+  td {{ padding:10px 14px; border-bottom:1px solid #1a1f2a; vertical-align:middle; }}
+  tr:last-child td {{ border-bottom:none; }}
+  tr:hover td {{ background:#181e2b; }}
+  .mono {{ font-family: var(--mono); font-size:.78rem; }}
+  .right {{ text-align:right; }}
+  .miss-pos {{ color: var(--warn); }}
+  .miss-neg {{ color: var(--danger); }}
+  .pct-high {{ color: var(--danger); font-weight:600; }}
+  .pct-med  {{ color: var(--warn); }}
+  .lag-badge {{ font-family: var(--mono); font-size:.65rem; padding:2px 6px;
+                background:#f59e0b22; border:1px solid #f59e0b66; border-radius:3px;
+                color: var(--warn); }}
+  .no-broken {{ padding:40px; text-align:center; color: var(--success);
+               font-family: var(--mono); font-size:.9rem; }}
+
+  .footer {{ text-align:center; font-family: var(--mono); font-size:.68rem;
+             color:#334155; margin-top:16px; letter-spacing:.05em; }}
+  .alpha-note {{ font-family: var(--mono); font-size:.75rem; color: var(--muted);
+                 padding:10px 24px 0; }}
+</style>
+</head>
+<body>
+
+<div class="header">
+  <div class="header-eyebrow">Carmignac × ENSAE · Data Challenge 2025</div>
+  <h1>Broken Months Diagnostics</h1>
+  <div class="header-sub">
+    Aggregate stock-flow equation check · ISIN level · threshold α = {alpha:.1%}<br>
+    <span style='font-size:.78rem'>Missing % = |missing flow| / max(|ΔAUM|, |recorded flow|, 1 share) — capped at movement size, not stock level</span>
+  </div>
+</div>
+
+<div class="kpi-strip">
+  <div class="kpi">
+    <span class="kpi-label">(ISIN, month) pairs</span>
+    <span class="kpi-value">{total:,}</span>
+    <span class="kpi-sub">examined</span>
+  </div>
+  <div class="kpi">
+    <span class="kpi-label">Broken months</span>
+    <span class="kpi-value {'danger' if n_broken_kpi > 0 else 'success'}">{n_broken_kpi:,}</span>
+    <span class="kpi-sub">{n_broken_kpi/total*100:.1f}% of pairs</span>
+  </div>
+  <div class="kpi">
+    <span class="kpi-label">Likely lags</span>
+    <span class="kpi-value warn">{n_lag_kpi}</span>
+    <span class="kpi-sub">resolved by ±{3}d window</span>
+  </div>
+  <div class="kpi">
+    <span class="kpi-label">Genuine gaps</span>
+    <span class="kpi-value {'danger' if n_genuine > 0 else 'success'}">{n_genuine}</span>
+    <span class="kpi-sub">unresolved by lag fix</span>
+  </div>
+  <div class="kpi">
+    <span class="kpi-label">ISINs affected</span>
+    <span class="kpi-value">{n_isins}</span>
+    <span class="kpi-sub">distinct ISINs</span>
+  </div>
+  <div class="kpi">
+    <span class="kpi-label">Max missing %</span>
+    <span class="kpi-value {'danger' if max_pct > 10 else 'warn'}">{max_pct:.1f}%</span>
+    <span class="kpi-sub">worst single (isin, month)</span>
+  </div>
+</div>
+
+<div class="main">
+
+  <div class="section-label">00 · Aggregate view — all ISINs combined</div>
+
+  <div class="card">
+    <div class="card-header">
+      <span class="card-title">Stock-flow equation — total portfolio</span>
+      <span class="card-desc">
+        Σ Q(t) − Σ Q(t−1) vs Σ F(t) across all ISINs and accounts.
+        Detects months where the global portfolio is incoherent, independent of ISIN-level breakdown.
+      </span>
+    </div>
+    <div class="card-body">
+      <div class="chart-wrap-tall"><canvas id="chartAggOverlay"></canvas></div>
+    </div>
+  </div>
+
+  <div class="grid-2">
+    <div class="card">
+      <div class="card-header">
+        <span class="card-title">Aggregate missing flow over time</span>
+        <span class="card-desc">Σ Q(t) − Σ Q(t−1) − Σ F(t) — should be near zero every month</span>
+      </div>
+      <div class="card-body">
+        <div class="chart-wrap"><canvas id="chartAggMissing"></canvas></div>
+      </div>
+    </div>
+    <div class="card">
+      <div class="card-header">
+        <span class="card-title">Aggregate missing % of movement</span>
+        <span class="card-desc">|missing| / max(|ΔAUM|, |flow|) — months above α flagged in red</span>
+      </div>
+      <div class="card-body">
+        <div class="chart-wrap"><canvas id="chartAggPct"></canvas></div>
+      </div>
+    </div>
+  </div>
+
+  <div class="card">
+    <div class="card-header">
+      <span class="card-title">Aggregate broken months — detail</span>
+    </div>
+    <div class="card-body" style="padding:0">
+      <table>
+        <thead><tr>
+          <th>Date</th>
+          <th class="right">Σ Q(t−1)</th><th class="right">Σ Q(t)</th>
+          <th class="right">Σ Flow</th><th class="right">Missing</th>
+          <th class="right">Missing %</th><th></th>
+        </tr></thead>
+        <tbody>{agg_detail_rows}</tbody>
+      </table>
+    </div>
+  </div>
+
+  <div class="section-label">01 · Timeline — per ISIN</div>
+
+  <div class="card">
+    <div class="card-header">
+      <span class="card-title">Broken (isin, month) pairs per month</span>
+      <span class="card-desc">Stacked: genuine gaps (red) vs likely accounting lags (amber)</span>
+    </div>
+    <div class="card-body">
+      <div class="chart-wrap-tall"><canvas id="chartTimeline"></canvas></div>
+    </div>
+  </div>
+
+  <div class="grid-2">
+    <div class="card">
+      <div class="card-header">
+        <span class="card-title">Total absolute missing flow per month</span>
+        <span class="card-desc">Sum of |missing flow| across all broken ISINs</span>
+      </div>
+      <div class="card-body">
+        <div class="chart-wrap"><canvas id="chartMissing"></canvas></div>
+      </div>
+    </div>
+
+    <div class="card">
+      <div class="card-header">
+        <span class="card-title">Missing % — top 5 ISINs over time</span>
+        <span class="card-desc">|missing flow| / max(|ΔAUM|, |recorded flow|) per ISIN — capped at movement size</span>
+      </div>
+      <div class="card-body">
+        <div class="chart-wrap"><canvas id="chartIsinTs"></canvas></div>
+      </div>
+    </div>
+  </div>
+
+  <div class="section-label">02 · By ISIN</div>
+
+  <div class="card">
+    <div class="card-header">
+      <span class="card-title">ISIN summary — most affected</span>
+    </div>
+    <div class="card-body" style="padding:0">
+      {'<div class="no-broken">No broken months detected.</div>' if n_broken_kpi == 0 else f"""
+      <table>
+        <thead><tr>
+          <th>ISIN</th><th>Broken months</th>
+          <th>Avg missing %</th><th>Total |missing| (shares)</th>
+        </tr></thead>
+        <tbody>{isin_rows}</tbody>
+      </table>"""}
+    </div>
+  </div>
+
+  <div class="section-label">03 · Detail log</div>
+
+  <div class="card">
+    <div class="card-header">
+      <span class="card-title">All broken (isin, month) pairs</span>
+      <span class="card-desc">
+        <span class="lag-badge">lag</span> = likely resolved by extending flow window ±3 days
+      </span>
+    </div>
+    <div class="alpha-note">Threshold α = {alpha:.1%} · showing up to 200 rows</div>
+    <div class="card-body" style="padding:0">
+      {'<div class="no-broken">✓ No broken months detected at this threshold.</div>' if n_broken_kpi == 0 else f"""
+      <table>
+        <thead><tr>
+          <th>Date</th><th>ISIN</th>
+          <th class="right">Q(t-1)</th><th class="right">Q(t)</th>
+          <th class="right">Net flow</th><th class="right">Missing</th>
+          <th class="right">Missing % of movement</th><th></th>
+        </tr></thead>
+        <tbody>{detail_rows}</tbody>
+      </table>"""}
+    </div>
+  </div>
+
+</div>
+<div class="footer">Generated by carmignac_diagnostics.py · Carmignac × ENSAE Data Challenge 2025</div>
+
+<script>
+Chart.defaults.color = '#64748b';
+Chart.defaults.borderColor = '#1e2535';
+Chart.defaults.font.family = "'IBM Plex Mono', monospace";
+Chart.defaults.font.size = 11;
+
+const DATES     = {dates_str};
+const N_BROKEN  = {n_broken_js};
+const N_LAG     = {n_lag_js};
+const TOT_MISS  = {total_miss_js};
+const ISIN_TS   = {isin_ts_json};
+const ALL_DATES = {all_dates_str};
+
+function tip() {{
+  return {{
+    backgroundColor:'#0d1117', borderColor:'#252a35', borderWidth:1,
+    titleFont:{{family:"'IBM Plex Mono'"}}, bodyFont:{{family:"'IBM Plex Mono'"}}, padding:10
+  }};
+}}
+function xAxis() {{
+  return {{ type:'category', ticks:{{maxTicksLimit:10,maxRotation:0}},
+            grid:{{color:'#1a2030'}} }};
+}}
+function yAxis(label) {{
+  return {{ grid:{{color:'#1a2030'}},
+            title:{{display:!!label,text:label,color:'#475569'}} }};
+}}
+
+// n_genuine per month = N_BROKEN - N_LAG
+const N_GENUINE = N_BROKEN.map((b,i) => b - (N_LAG[i]||0));
+
+new Chart(document.getElementById('chartTimeline'), {{
+  type:'bar',
+  data:{{
+    labels: DATES,
+    datasets:[
+      {{ label:'Genuine gaps', data:N_GENUINE,
+         backgroundColor:'#ef444488', borderColor:'#ef4444', borderWidth:1, borderRadius:2 }},
+      {{ label:'Likely lags',  data:N_LAG,
+         backgroundColor:'#f59e0b88', borderColor:'#f59e0b', borderWidth:1, borderRadius:2 }},
+    ]
+  }},
+  options:{{
+    responsive:true, maintainAspectRatio:false,
+    interaction:{{mode:'index',intersect:false}},
+    plugins:{{
+      legend:{{position:'top',labels:{{boxWidth:12,padding:16}}}},
+      tooltip:tip()
+    }},
+    scales:{{ x:xAxis(), y:{{...yAxis('# (isin, month) pairs'), stacked:true}} }},
+  }}
+}});
+
+new Chart(document.getElementById('chartMissing'), {{
+  type:'bar',
+  data:{{
+    labels: DATES,
+    datasets:[{{ label:'|Missing flow| (shares)', data:TOT_MISS,
+                 backgroundColor:'#dc262688', borderColor:'#dc2626',
+                 borderWidth:1, borderRadius:2 }}]
+  }},
+  options:{{
+    responsive:true, maintainAspectRatio:false,
+    plugins:{{legend:{{display:false}}, tooltip:tip()}},
+    scales:{{ x:xAxis(), y:yAxis('Shares') }}
+  }}
+}});
+
+new Chart(document.getElementById('chartIsinTs'), {{
+  type:'line',
+  data:{{ labels:ALL_DATES, datasets:ISIN_TS }},
+  options:{{
+    responsive:true, maintainAspectRatio:false,
+    interaction:{{mode:'index',intersect:false}},
+    plugins:{{
+      legend:{{position:'right',labels:{{boxWidth:10,padding:8,font:{{size:10}}}}}},
+      tooltip:tip()
+    }},
+    scales:{{ x:xAxis(), y:yAxis('Missing (%)') }}
+  }}
+}});
+
+// ── Aggregate charts ─────────────────────────────────────────
+const AGG_DATES   = {agg_dates_str};
+const AGG_DELTA   = {agg_delta_js};
+const AGG_FLOW    = {agg_flow_js};
+const AGG_MISSING = {agg_missing_js};
+const AGG_PCT     = {agg_pct_js};
+const ALPHA       = {alpha};
+
+// Color each bar: red if broken, amber if lag, else subtle blue
+const aggPctColors = AGG_PCT.map(v =>
+  Math.abs(v) > ALPHA * 100 ? '#ef444488' : '#3b82f622'
+);
+const aggPctBorders = AGG_PCT.map(v =>
+  Math.abs(v) > ALPHA * 100 ? '#ef4444' : '#3b82f655'
+);
+
+// Overlay: ΔAUM vs total flow
+new Chart(document.getElementById('chartAggOverlay'), {{
+  type: 'line',
+  data: {{
+    labels: AGG_DATES,
+    datasets: [
+      {{ label: 'ΔAUM (Σ Q(t) − Σ Q(t−1))',
+         data: AGG_DELTA, borderColor: '#3b82f6', backgroundColor: '#3b82f622',
+         borderWidth: 2, pointRadius: 0, tension: 0.3, fill: false }},
+      {{ label: 'Σ Net flows recorded',
+         data: AGG_FLOW,  borderColor: '#10b981', backgroundColor: '#10b98122',
+         borderWidth: 2, pointRadius: 0, tension: 0.3, fill: false }},
+    ]
+  }},
+  options: {{
+    responsive: true, maintainAspectRatio: false,
+    interaction: {{mode:'index', intersect:false}},
+    plugins: {{
+      legend: {{position:'top', labels:{{boxWidth:12, padding:16}}}},
+      tooltip: tip()
+    }},
+    scales: {{ x: xAxis(), y: yAxis('Shares') }}
+  }}
+}});
+
+// Missing flow bar
+new Chart(document.getElementById('chartAggMissing'), {{
+  type: 'bar',
+  data: {{
+    labels: AGG_DATES,
+    datasets: [{{ label: 'Missing flow (shares)', data: AGG_MISSING,
+                  backgroundColor: AGG_MISSING.map(v => v < 0 ? '#ef444488' : '#f59e0b88'),
+                  borderColor:     AGG_MISSING.map(v => v < 0 ? '#ef4444'   : '#f59e0b'),
+                  borderWidth: 1, borderRadius: 2 }}]
+  }},
+  options: {{
+    responsive: true, maintainAspectRatio: false,
+    plugins: {{legend:{{display:false}}, tooltip: tip()}},
+    scales: {{ x: xAxis(), y: yAxis('Shares') }}
+  }}
+}});
+
+// Missing % bar, coloured by threshold
+new Chart(document.getElementById('chartAggPct'), {{
+  type: 'bar',
+  data: {{
+    labels: AGG_DATES,
+    datasets: [{{ label: 'Missing % of movement', data: AGG_PCT,
+                  backgroundColor: aggPctColors, borderColor: aggPctBorders,
+                  borderWidth: 1, borderRadius: 2 }}]
+  }},
+  options: {{
+    responsive: true, maintainAspectRatio: false,
+    plugins: {{
+      legend: {{display:false}},
+      tooltip: tip(),
+      annotation: {{}} // threshold line handled via color
+    }},
+    scales: {{ x: xAxis(), y: {{...yAxis('Missing (%)'), min: 0}} }}
+  }}
+}});
+</script>
+</body>
+</html>"""
+    return html
+
+
+# ─────────────────────────────────────────────────────────────
+# 5. MAIN
+# ─────────────────────────────────────────────────────────────
+
+def main():
+    parser = argparse.ArgumentParser(
+        description="Detect broken months in Carmignac AUM/Flows data"
+    )
+    parser.add_argument("--aum",   default="AUM_head.csv")
+    parser.add_argument("--flows", default="flows_head.csv")
+    parser.add_argument("--out",   default="carmignac_broken_months.csv",
+                        help="Machine-readable output (loaded by carmignac_repair.py)")
+    parser.add_argument("--html",  default="carmignac_diagnostics.html")
+    parser.add_argument("--alpha", type=float, default=0.02,
+                        help="Tolerance threshold (default 0.02 = 2%%)")
+    parser.add_argument("--lag",   type=int, default=3,
+                        help="Boundary days to test for accounting lag (default 3)")
+    args = parser.parse_args()
+
+    def resolve(p):
+        if os.path.exists(p): return p
+        alt = os.path.join(os.path.dirname(os.path.abspath(__file__)), p)
+        if os.path.exists(alt): return alt
+        sys.exit(f"[ERROR] File not found: {p}")
+
+    print(f"[Load] AUM   : {args.aum}")
+    print(f"[Load] Flows : {args.flows}")
+    aum, flows = load_data(resolve(args.aum), resolve(args.flows))
+
+    print(f"\n[Detect] Running broken-month detection (α={args.alpha:.1%}, lag=±{args.lag}d)...")
+    df_broken, df_all = detect_broken_months(aum, flows, alpha=args.alpha, lag_days=args.lag)
+    df_agg = detect_aggregate_broken_months(aum, flows, alpha=args.alpha, lag_days=args.lag)
+
+    print_summary(df_broken, df_all, args.alpha)
+
+    n_agg_broken = int(df_agg["broken"].sum())
+    print(f"  Aggregate broken months      : {n_agg_broken} "
+          f"(of which lags: {int(df_agg['is_lag'].sum())})")
+
+    # CSV output — this is what carmignac_repair.py will load
+    if len(df_broken):
+        df_broken.to_csv(args.out, index=False)
+        print(f"[Export] Broken months CSV → {args.out}")
+    else:
+        pd.DataFrame(columns=["date","isin","missing_pct","is_lag"]).to_csv(args.out, index=False)
+        print(f"[Export] No broken months — empty CSV → {args.out}")
+
+    html = build_html(df_broken, df_all, df_agg, args.alpha)
+    with open(args.html, "w", encoding="utf-8") as f:
+        f.write(html)
+    print(f"[Export] HTML report       → {args.html}")
+
+
+if __name__ == "__main__":
+    main()
--- a/repair_challenge/carmignac_diagnostics.py
+++ b/repair_challenge/carmignac_diagnostics.py
@ -217,6 +217,107 @@ def detect_broken_months(aum, flows, alpha=0.02, lag_days=3):
    df_broken = df_all[df_all["broken"]].sort_values("missing_pct", ascending=False)
    return df_broken, df_all

+# ─────────────────────────────────────────────────────────────
+# 2b. AGGREGATE (CROSS-ISIN) BROKEN MONTHS
+# ─────────────────────────────────────────────────────────────
+
+def detect_aggregate_broken_months(aum, flows, alpha=0.02, lag_days=3):
+    """
+    Same stock-flow check as detect_broken_months, but aggregated
+    across ALL ISINs for each month:
+
+        Q_total(t) - Q_total(t-1)  !=  F_total(t)
+
+    where Q_total(t) = sum over all (reg_id, isin) of Q_{r,s}(t).
+
+    This catches months where the global portfolio is incoherent even
+    if every individual ISIN is fine (e.g. cross-ISIN netting errors),
+    and provides a cleaner high-level view.
+
+    Returns
+    -------
+    df_agg : DataFrame indexed by month with columns:
+        q_total_prev, q_total_curr, delta_aum, flow_total,
+        missing_flow, missing_pct, broken, is_lag
+    """
+    t_min = aum["Centralisation Date"].min()
+    t_max = aum["Centralisation Date"].max()
+    all_months = pd.date_range(t_min, t_max, freq="ME")
+
+    # ── Total AUM per month (all ISIN, all accounts) ─────────────
+    aum_monthly = (
+        aum.groupby("Centralisation Date")["Quantity - AUM"]
+           .sum()
+           .reindex(all_months)
+           .ffill()
+           .rename("q_total")
+    )
+
+    # ── Bucket flows helper (reuse same window logic) ─────────────
+    def bucket_total_flows(flows_df, months, lower_offset=0, upper_offset=0):
+        fc = flows_df.copy()
+        def assign_month(d):
+            for m in months:
+                eom_prev = m - pd.offsets.MonthEnd(1)
+                lo = eom_prev - pd.Timedelta(days=lower_offset)
+                hi = m + pd.Timedelta(days=upper_offset)
+                if lo < d <= hi:
+                    return m
+            return pd.NaT
+        fc["month_end"] = fc["Centralisation Date"].apply(assign_month)
+        fc = fc.dropna(subset=["month_end"])
+        return (fc.groupby("month_end")["Quantity - NetFlows"]
+                  .sum()
+                  .reindex(months)
+                  .fillna(0.0))
+
+    flow_std = bucket_total_flows(flows, all_months)
+    flow_lag = bucket_total_flows(flows, all_months,
+                                  lower_offset=lag_days, upper_offset=lag_days)
+
+    # ── Compute residuals ─────────────────────────────────────────
+    rows = []
+    min_abs_shares = 1.0
+
+    for i in range(1, len(all_months)):
+        t_curr = all_months[i]
+        t_prev = all_months[i - 1]
+
+        q_curr = aum_monthly.get(t_curr, np.nan)
+        q_prev = aum_monthly.get(t_prev, np.nan)
+        if pd.isna(q_curr) or pd.isna(q_prev):
+            continue
+
+        delta = q_curr - q_prev
+
+        f_std     = flow_std.get(t_curr, 0.0)
+        f_lag     = flow_lag.get(t_curr, 0.0)
+        miss_std  = delta - f_std
+        miss_lag  = delta - f_lag
+
+        movement_std = max(abs(delta), abs(f_std), min_abs_shares)
+        movement_lag = max(abs(delta), abs(f_lag), min_abs_shares)
+        pct_std = abs(miss_std) / movement_std
+        pct_lag = abs(miss_lag) / movement_lag
+
+        broken_std = pct_std > alpha
+        broken_lag = pct_lag > alpha
+        is_lag     = broken_std and (not broken_lag)
+
+        rows.append({
+            "date":          t_curr,
+            "q_total_prev":  round(q_prev, 3),
+            "q_total_curr":  round(q_curr, 3),
+            "delta_aum":     round(delta, 3),
+            "flow_total":    round(f_std, 3),
+            "missing_flow":  round(miss_std, 3),
+            "missing_pct":   round(pct_std, 6),
+            "broken":        broken_std,
+            "is_lag":        is_lag,
+        })
+
+    df_agg = pd.DataFrame(rows)
+    return df_agg

 # ─────────────────────────────────────────────────────────────
 # 3. PRINT SUMMARY
@ -257,7 +358,7 @@ def print_summary(df_broken, df_all, alpha):
 # 4. BUILD HTML REPORT
 # ─────────────────────────────────────────────────────────────

-def build_html(df_broken, df_all, alpha):
+def build_html(df_broken, df_all, df_agg, alpha):
    # ── JS-ready data ────────────────────────────────────────────
    # Timeline: n_broken and total_missing per month
    tl = (df_all[df_all["broken"]]
@ -277,6 +378,40 @@ def build_html(df_broken, df_all, alpha):
    total_miss_js   = jf(tl["total_missing"].values)
    n_lag_js        = jf(tl["n_lag"].values, 0)

+    # Aggregate (cross-ISIN) JS data
+    agg_dates_str  = json.dumps([d.strftime("%Y-%m-%d") for d in pd.to_datetime(df_agg["date"])])
+    agg_delta_js   = jf(df_agg["delta_aum"].values)
+    agg_flow_js    = jf(df_agg["flow_total"].values)
+    agg_missing_js = jf(df_agg["missing_flow"].values)
+    agg_pct_js     = jf((df_agg["missing_pct"] * 100).values)
+
+    # Aggregate KPIs
+    n_agg_broken = int(df_agg["broken"].sum())
+    n_agg_lag    = int(df_agg["is_lag"].sum())
+    n_agg_genuine = n_agg_broken - n_agg_lag
+    max_agg_pct  = float(df_agg["missing_pct"].max() * 100) if len(df_agg) else 0
+
+    # Aggregate detail table rows
+    agg_rows = []
+    for _, r in df_agg[df_agg["broken"]].iterrows():
+        lb  = '<span class="lag-badge">lag</span>' if r["is_lag"] else ""
+        pc  = "pct-high" if r["missing_pct"] > 0.1 else "pct-med"
+        ds  = r["date"].strftime("%Y-%m-%d") if hasattr(r["date"], "strftime") else str(r["date"])[:10]
+        mc  = "miss-neg" if r["missing_flow"] < 0 else "miss-pos"
+        agg_rows.append(
+            f'<tr><td>{ds}</td>'
+            f'<td class="mono right">{r["q_total_prev"]:,.1f}</td>'
+            f'<td class="mono right">{r["q_total_curr"]:,.1f}</td>'
+            f'<td class="mono right">{r["flow_total"]:,.1f}</td>'
+            f'<td class="mono right {mc}">{r["missing_flow"]:+,.1f}</td>'
+            f'<td class="mono right {pc}">{r["missing_pct"]*100:.2f}%</td>'
+            f'<td>{lb}</td></tr>'
+        )
+    agg_detail_rows = "".join(agg_rows) if agg_rows else (
+        '<tr><td colspan="7" style="padding:24px;text-align:center;'
+        'color:var(--success);font-family:var(--mono)">✓ No broken months at aggregate level</td></tr>'
+    )
+
    # Per-ISIN summary
    isin_sum = (df_broken.groupby("isin")
                         .agg(n_months=("date", "count"),
@ -483,7 +618,60 @@ def build_html(df_broken, df_all, alpha):

 <div class="main">

-  <div class="section-label">01 · Timeline</div>
+  <div class="section-label">00 · Aggregate view — all ISINs combined</div>
+
+  <div class="card">
+    <div class="card-header">
+      <span class="card-title">Stock-flow equation — total portfolio</span>
+      <span class="card-desc">
+        Σ Q(t) − Σ Q(t−1) vs Σ F(t) across all ISINs and accounts.
+        Detects months where the global portfolio is incoherent, independent of ISIN-level breakdown.
+      </span>
+    </div>
+    <div class="card-body">
+      <div class="chart-wrap-tall"><canvas id="chartAggOverlay"></canvas></div>
+    </div>
+  </div>
+
+  <div class="grid-2">
+    <div class="card">
+      <div class="card-header">
+        <span class="card-title">Aggregate missing flow over time</span>
+        <span class="card-desc">Σ Q(t) − Σ Q(t−1) − Σ F(t) — should be near zero every month</span>
+      </div>
+      <div class="card-body">
+        <div class="chart-wrap"><canvas id="chartAggMissing"></canvas></div>
+      </div>
+    </div>
+    <div class="card">
+      <div class="card-header">
+        <span class="card-title">Aggregate missing % of movement</span>
+        <span class="card-desc">|missing| / max(|ΔAUM|, |flow|) — months above α flagged in red</span>
+      </div>
+      <div class="card-body">
+        <div class="chart-wrap"><canvas id="chartAggPct"></canvas></div>
+      </div>
+    </div>
+  </div>
+
+  <div class="card">
+    <div class="card-header">
+      <span class="card-title">Aggregate broken months — detail</span>
+    </div>
+    <div class="card-body" style="padding:0">
+      <table>
+        <thead><tr>
+          <th>Date</th>
+          <th class="right">Σ Q(t−1)</th><th class="right">Σ Q(t)</th>
+          <th class="right">Σ Flow</th><th class="right">Missing</th>
+          <th class="right">Missing %</th><th></th>
+        </tr></thead>
+        <tbody>{agg_detail_rows}</tbody>
+      </table>
+    </div>
+  </div>
+
+  <div class="section-label">01 · Timeline — per ISIN</div>

  <div class="card">
    <div class="card-header">
@ -643,6 +831,84 @@ new Chart(document.getElementById('chartIsinTs'), {{
    scales:{{ x:xAxis(), y:yAxis('Missing (%)') }}
  }}
 }});
+
+// ── Aggregate charts ─────────────────────────────────────────
+const AGG_DATES   = {agg_dates_str};
+const AGG_DELTA   = {agg_delta_js};
+const AGG_FLOW    = {agg_flow_js};
+const AGG_MISSING = {agg_missing_js};
+const AGG_PCT     = {agg_pct_js};
+const ALPHA       = {alpha};
+
+// Color each bar: red if broken, amber if lag, else subtle blue
+const aggPctColors = AGG_PCT.map(v =>
+  Math.abs(v) > ALPHA * 100 ? '#ef444488' : '#3b82f622'
+);
+const aggPctBorders = AGG_PCT.map(v =>
+  Math.abs(v) > ALPHA * 100 ? '#ef4444' : '#3b82f655'
+);
+
+// Overlay: ΔAUM vs total flow
+new Chart(document.getElementById('chartAggOverlay'), {{
+  type: 'line',
+  data: {{
+    labels: AGG_DATES,
+    datasets: [
+      {{ label: 'ΔAUM (Σ Q(t) − Σ Q(t−1))',
+         data: AGG_DELTA, borderColor: '#3b82f6', backgroundColor: '#3b82f622',
+         borderWidth: 2, pointRadius: 0, tension: 0.3, fill: false }},
+      {{ label: 'Σ Net flows recorded',
+         data: AGG_FLOW,  borderColor: '#10b981', backgroundColor: '#10b98122',
+         borderWidth: 2, pointRadius: 0, tension: 0.3, fill: false }},
+    ]
+  }},
+  options: {{
+    responsive: true, maintainAspectRatio: false,
+    interaction: {{mode:'index', intersect:false}},
+    plugins: {{
+      legend: {{position:'top', labels:{{boxWidth:12, padding:16}}}},
+      tooltip: tip()
+    }},
+    scales: {{ x: xAxis(), y: yAxis('Shares') }}
+  }}
+}});
+
+// Missing flow bar
+new Chart(document.getElementById('chartAggMissing'), {{
+  type: 'bar',
+  data: {{
+    labels: AGG_DATES,
+    datasets: [{{ label: 'Missing flow (shares)', data: AGG_MISSING,
+                  backgroundColor: AGG_MISSING.map(v => v < 0 ? '#ef444488' : '#f59e0b88'),
+                  borderColor:     AGG_MISSING.map(v => v < 0 ? '#ef4444'   : '#f59e0b'),
+                  borderWidth: 1, borderRadius: 2 }}]
+  }},
+  options: {{
+    responsive: true, maintainAspectRatio: false,
+    plugins: {{legend:{{display:false}}, tooltip: tip()}},
+    scales: {{ x: xAxis(), y: yAxis('Shares') }}
+  }}
+}});
+
+// Missing % bar, coloured by threshold
+new Chart(document.getElementById('chartAggPct'), {{
+  type: 'bar',
+  data: {{
+    labels: AGG_DATES,
+    datasets: [{{ label: 'Missing % of movement', data: AGG_PCT,
+                  backgroundColor: aggPctColors, borderColor: aggPctBorders,
+                  borderWidth: 1, borderRadius: 2 }}]
+  }},
+  options: {{
+    responsive: true, maintainAspectRatio: false,
+    plugins: {{
+      legend: {{display:false}},
+      tooltip: tip(),
+      annotation: {{}} // threshold line handled via color
+    }},
+    scales: {{ x: xAxis(), y: {{...yAxis('Missing (%)'), min: 0}} }}
+  }}
+}});
 </script>
 </body>
 </html>"""
@ -680,19 +946,23 @@ def main():

    print(f"\n[Detect] Running broken-month detection (α={args.alpha:.1%}, lag=±{args.lag}d)...")
    df_broken, df_all = detect_broken_months(aum, flows, alpha=args.alpha, lag_days=args.lag)
+    df_agg = detect_aggregate_broken_months(aum, flows, alpha=args.alpha, lag_days=args.lag)

    print_summary(df_broken, df_all, args.alpha)

+    n_agg_broken = int(df_agg["broken"].sum())
+    print(f"  Aggregate broken months      : {n_agg_broken} "
+          f"(of which lags: {int(df_agg['is_lag'].sum())})")
+
    # CSV output — this is what carmignac_repair.py will load
    if len(df_broken):
        df_broken.to_csv(args.out, index=False)
        print(f"[Export] Broken months CSV → {args.out}")
    else:
-        # Write empty file so repair pipeline can always try to load it
        pd.DataFrame(columns=["date", "isin", "missing_pct", "is_lag"]).to_csv(args.out, index=False)
        print(f"[Export] No broken months — empty CSV → {args.out}")

-    html = build_html(df_broken, df_all, args.alpha)
+    html = build_html(df_broken, df_all, df_agg, args.alpha)
    with open(args.html, "w", encoding="utf-8") as f:
        f.write(html)
    print(f"[Export] HTML report       → {args.html}")
--- a/repair_challenge/carmignac_repair.py
+++ b/repair_challenge/carmignac_repair.py
@ -15,16 +15,54 @@ import s3fs
 # ─────────────────────────────────────────────
 # PARAMÈTRES
 # ─────────────────────────────────────────────
-ALPHA = 0.03          # tolérance réconciliation : 5% du stock à t
+ALPHA = 0.05          # tolérance réconciliation : 5% du stock à t
 MIN_AUM_EUR = 5e6       # seuil filtrage étape 1 — 0 pour les heads de test, 5e6 en prod
 MIN_JACCARD = 0.3      # seuil minimal similarité portefeuille pour chirurgie
-SCORE_DROP_THRESHOLD = 0.1  # si score chute de >10% → candidat chirurgie
+SCORE_DROP_THRESHOLD = 0.15  # si score chute de >15% → candidat chirurgie
+
+# ── Broken months ──────────────────────────────────────────────
+# Attenuation factor applied to reconciliation errors on months flagged
+# as "broken" by carmignac_diagnostics.py.  On a broken month the error
+# is multiplied by this factor before degrading the score, so a genuine
+# data-quality problem at market level does not unfairly penalise an
+# account.  Set to 1.0 to disable attenuation.
+BROKEN_MONTH_ATTENUATION = 0.2   # reduce error to 20% on broken months
+
+# ── Accounting lag window ──────────────────────────────────────
+# Transactions dated within this many days of a month-end boundary are
+# considered "boundary" flows.  When the standard-window reconciliation
+# fails but the lag-adjusted reconciliation passes, the error is
+# attenuated (same factor as broken months).
+LAG_ATTENUATION = 0.2   # reduce error to 20% on likely lag months

 EXCLUDE_REGISTRAR = ["Off Distribution", "Private Clients"]

 # ─────────────────────────────────────────────
 # 1. CHARGEMENT
 # ─────────────────────────────────────────────
+def load_broken_months(broken_months_path):
+    """
+    Loads the broken-months CSV produced by carmignac_diagnostics.py.
+    Returns a set of (date, isin) tuples flagged as broken, and a
+    separate set flagged as likely accounting lags.
+
+    If the file does not exist or is empty, returns two empty sets.
+    """
+    if not broken_months_path or not os.path.exists(broken_months_path):
+        return set(), set()
+    try:
+        df = pd.read_csv(broken_months_path, parse_dates=["date"])
+        broken = set(zip(pd.to_datetime(df["date"]), df["isin"].astype(str)))
+        lags = set(zip(pd.to_datetime(df.loc[df["is_lag"], "date"]),
+                         df.loc[df["is_lag"], "isin"].astype(str)))
+        print(f"[Broken months] Loaded {len(broken)} flagged (isin, month) pairs "
+              f"({len(lags)} likely lags)")
+        return broken, lags
+    except Exception as e:
+        print(f"[Broken months] Could not load '{broken_months_path}': {e}")
+        return set(), set()
+
+
 def load_data():
    fs = s3fs.S3FileSystem(
    client_kwargs={'endpoint_url': 'https://'+'minio-simple.lab.groupe-genes.fr'},
@ -147,28 +185,44 @@ def build_monthly_panel(aum, universe, t_ref):
 # ─────────────────────────────────────────────
 # 4. FLOWS AGRÉGÉS PAR MOIS
 # ─────────────────────────────────────────────
-def aggregate_flows_monthly(flows, all_months):
+def aggregate_flows_monthly(flows, all_months, lag_days=3):
    """
    Agrège les flows infra-mensuels sur chaque fenêtre ]fin_mois(t-1), fin_mois(t)].
-    Retourne un DataFrame indexé par (fin_mois, reg_id, isin).
+    Retourne deux DataFrames indexés par (fin_mois, reg_id, isin) :
+      - monthly_flows      : agrégation standard (fenêtre exacte)
+      - monthly_flows_lag  : agrégation avec fenêtre élargie de ±lag_days jours
+                             autour de chaque fin de mois.  Utilisé pour détecter
+                             les ruptures dues à un décalage comptable de fin de mois.
    """
    flows_f = flows[flows['date'] <= all_months[-1]].copy()

-    # Associer chaque transaction à la fin de mois correspondante
-    # = la première fin de mois >= date de transaction
-    flows_f['month_end'] = flows_f['date'].apply(
-        lambda d: all_months[all_months >= d][0] if any(all_months >= d) else pd.NaT
-    )
-    flows_f = flows_f.dropna(subset=['month_end'])
+    def assign_month(d, lower_offset=0, upper_offset=0):
+        for m in all_months:
+            eom_prev = m - pd.offsets.MonthEnd(1)
+            lo = eom_prev - pd.Timedelta(days=lower_offset)
+            hi = m + pd.Timedelta(days=upper_offset)
+            if lo < d <= hi:
+                return m
+        return pd.NaT

-    # Agrégation
-    monthly_flows = flows_f.groupby(['month_end', 'reg_id', 'isin'])['qty_net'].sum()
-    monthly_flows = monthly_flows.reset_index()
+    # Standard window
+    flows_f['month_end'] = flows_f['date'].apply(lambda d: assign_month(d))
+    flows_std = flows_f.dropna(subset=['month_end'])
+    monthly_flows = flows_std.groupby(['month_end', 'reg_id', 'isin'])['qty_net'].sum().reset_index()
    monthly_flows.columns = ['date', 'reg_id', 'isin', 'qty_net_month']

-    print(f"\n[Flows mensuels] {len(monthly_flows)} enregistrements (reg_id, isin, mois)")
+    # Lag window (±lag_days around each EOM)
+    flows_f2 = flows[flows['date'] <= all_months[-1]].copy()
+    flows_f2['month_end'] = flows_f2['date'].apply(
+        lambda d: assign_month(d, lower_offset=lag_days, upper_offset=lag_days))
+    flows_lag = flows_f2.dropna(subset=['month_end'])
+    monthly_flows_lag = flows_lag.groupby(['month_end', 'reg_id', 'isin'])['qty_net'].sum().reset_index()
+    monthly_flows_lag.columns = ['date', 'reg_id', 'isin', 'qty_net_month']

-    return monthly_flows
+    print(f"\n[Flows mensuels] {len(monthly_flows)} enregistrements (standard) | "
+          f"{len(monthly_flows_lag)} (lag window ±{lag_days}d)")
+
+    return monthly_flows, monthly_flows_lag

 # ─────────────────────────────────────────────
 # 5. ÉTAPE 2 — Score de cohérence temporelle
@ -189,19 +243,27 @@ def compute_reconciliation_error(qty_t_minus1, qty_t, net_flow, alpha=ALPHA):
    err_ratio = err / denom
    return err_ratio, err_ratio > alpha

-def score_propagation(panel, monthly_flows, weights, universe, all_months):
+def score_propagation(panel, monthly_flows, monthly_flows_lag, weights, universe,
+                      all_months, broken_months=None, lag_months=None):
    """
    Propage les scores de t_ref vers t=0 (passé).

    À chaque mois t (en remontant), pour chaque reg_id dans l'univers courant :
      - Calculer l'erreur de réconciliation pondérée par ISIN
      - Dégrader le score proportionnellement
+      - Atténuer l'erreur si le mois est flagué comme "broken" ou "lag"
+
+    broken_months : set of (date, isin) pairs flagged as broken by diagnostics
+    lag_months    : subset of broken_months identified as likely accounting lags

    Retourne :
      - scores_history : dict {date → {reg_id → score}}
      - errors_history : dict {date → {reg_id → err_pondérée}}
      - mapping        : dict {reg_id_original → reg_id_courant} (après chirurgie)
    """
+    broken_months = broken_months or set()
+    lag_months    = lag_months    or set()
+
    # Initialisation
    scores = dict(weights)  # scores à t_ref
    scores_history = {all_months[-1]: dict(scores)}
@ -211,7 +273,8 @@ def score_propagation(panel, monthly_flows, weights, universe, all_months):
    mapping = {r: r for r in universe}

    # Flows indexés pour accès rapide
-    flows_idx = monthly_flows.set_index(['date', 'reg_id', 'isin'])['qty_net_month']
+    flows_idx     = monthly_flows.set_index(['date', 'reg_id', 'isin'])['qty_net_month']
+    flows_idx_lag = monthly_flows_lag.set_index(['date', 'reg_id', 'isin'])['qty_net_month']

    # Remonter dans le temps
    for i in range(len(all_months) - 2, -1, -1):
@ -272,6 +335,28 @@ def score_propagation(panel, monthly_flows, weights, universe, all_months):
                    qty_t_prev, qty_t, net_flow, alpha=ALPHA
                )

+                # ── Attenuation on broken / lag months ──────────────
+                # If this (isin, month) is flagged as broken at market
+                # level, the error is not the account's fault — attenuate.
+                if err_ratio > 0:
+                    key = (t_curr, isin)
+                    if key in broken_months or key in lag_months:
+                        # Try lag-window flow to distinguish lag vs genuine gap
+                        try:
+                            net_flow_lag = flows_idx_lag.loc[(t_curr, reg_curr, isin)]
+                        except KeyError:
+                            net_flow_lag = net_flow
+                        err_lag, _ = compute_reconciliation_error(
+                            qty_t_prev, qty_t, net_flow_lag, alpha=ALPHA
+                        )
+                        # Use whichever flow window gives the smaller error,
+                        # then attenuate the result
+                        best_err = min(err_ratio, err_lag)
+                        attenuation = (BROKEN_MONTH_ATTENUATION
+                                       if key in broken_months
+                                       else LAG_ATTENUATION)
+                        err_ratio = best_err * attenuation
+
                # Pondération par AUM à t_curr
                weight_isin = abs(qty_t)
                weighted_err += err_ratio * weight_isin
@ -434,7 +519,8 @@ def _recompute_score_with_candidate(reg_orig, candidate, t_prev, t_curr,


 def run_surgery_pass(scores_history, errors_history, panel, monthly_flows,
-                     weights, universe, all_months):
+                     monthly_flows_lag, weights, universe, all_months,
+                     broken_months=None, lag_months=None):
    """
    Deuxième passe : pour chaque mois avec des ruptures fortes,
    tente une chirurgie de code et recalcule les scores.
@ -449,7 +535,8 @@ def run_surgery_pass(scores_history, errors_history, panel, monthly_flows,
      - surgery_log     : liste des opérations effectuées
      - scores_final    : scores au dernier mois
    """
-    flows_idx = monthly_flows.set_index(['date', 'reg_id', 'isin'])['qty_net_month']
+    flows_idx     = monthly_flows.set_index(['date', 'reg_id', 'isin'])['qty_net_month']
+    flows_idx_lag = monthly_flows_lag.set_index(['date', 'reg_id', 'isin'])['qty_net_month']

    # Tous les reg_ids présents dans le panel (univers + codes historiques)
    all_regs_in_panel = set(panel.columns.get_level_values(0))
@ -623,7 +710,7 @@ def export_results(scores_history, mapping_history, surgery_log, all_months, out
 # ─────────────────────────────────────────────
 # 8. PIPELINE PRINCIPAL
 # ─────────────────────────────────────────────
-def run_pipeline():
+def run_pipeline(broken_months_path=None):
    print("=" * 60)
    print("CARMIGNAC — Pipeline de réparation des Registrar IDs")
    print("=" * 60)
@ -631,6 +718,9 @@ def run_pipeline():
    # Chargement
    aum, flows = load_data()

+    # Broken months (optional — produced by carmignac_diagnostics.py)
+    broken_months, lag_months = load_broken_months(broken_months_path)
+
    # Étape 1 — Univers de référence
    aum_ref, weights, universe, t_ref = build_reference_universe(aum)

@ -641,20 +731,22 @@ def run_pipeline():
    # Panel mensuel
    panel, all_months = build_monthly_panel(aum, universe, t_ref)

-    # Flows mensuels agrégés
-    monthly_flows = aggregate_flows_monthly(flows, all_months)
+    # Flows mensuels agrégés (standard + lag window)
+    monthly_flows, monthly_flows_lag = aggregate_flows_monthly(flows, all_months)

    # Étape 2 — Score de cohérence (sans chirurgie)
    print("\n[Étape 2] Propagation des scores (sans chirurgie)...")
    scores_history, errors_history, _ = score_propagation(
-        panel, monthly_flows, weights, universe, all_months
+        panel, monthly_flows, monthly_flows_lag, weights, universe, all_months,
+        broken_months=broken_months, lag_months=lag_months
    )

    # Étape 3 — Chirurgie
    print("\n[Étape 3] Passe de chirurgie...")
    mapping_history, surgery_log, final_scores, scores_history_corrected = run_surgery_pass(
-        scores_history, errors_history, panel, monthly_flows,
-        weights, universe, all_months
+        scores_history, errors_history, panel, monthly_flows, monthly_flows_lag,
+        weights, universe, all_months,
+        broken_months=broken_months, lag_months=lag_months
    )

    # Export — on utilise les scores corrigés (post-chirurgie) comme référence
@ -663,6 +755,7 @@ def run_pipeline():
        scores_history_corrected, mapping_history, surgery_log, all_months
    )

+
    # Résumé final
    print("\n" + "=" * 60)
    print("RÉSUMÉ FINAL")
@ -679,4 +772,6 @@ def run_pipeline():


 if __name__ == "__main__":
-    df_scores, df_mapping, surgery_log, scores_history, mapping_history = run_pipeline()
+    df_scores, df_mapping, surgery_log, scores_history, mapping_history = run_pipeline(
+        broken_months_path="carmignac_broken_months.csv"  # optional
+    )
--- a/repair_challenge/carmignac_repair_.py
+++ b/repair_challenge/carmignac_repair_.py
@ -1,777 +0,0 @@
-"""
-Carmignac Data Challenge — Registrar ID Repair Pipeline
-=========================================================
-Étape 1 : Filtrage & univers de référence à t=31/10/2025
-Étape 2 : Score de cohérence temporelle (propagation vers le passé)
-Étape 3 : Chirurgie de code (matching 1-to-1)
-"""
-
-import pandas as pd
-import numpy as np
-from collections import defaultdict
-import os 
-import s3fs
-
-# ─────────────────────────────────────────────
-# PARAMÈTRES
-# ─────────────────────────────────────────────
-ALPHA = 0.05          # tolérance réconciliation : 5% du stock à t
-MIN_AUM_EUR = 5e6       # seuil filtrage étape 1 — 0 pour les heads de test, 5e6 en prod
-MIN_JACCARD = 0.3      # seuil minimal similarité portefeuille pour chirurgie
-SCORE_DROP_THRESHOLD = 0.15  # si score chute de >15% → candidat chirurgie
-
-# ── Broken months ──────────────────────────────────────────────
-# Attenuation factor applied to reconciliation errors on months flagged
-# as "broken" by carmignac_diagnostics.py.  On a broken month the error
-# is multiplied by this factor before degrading the score, so a genuine
-# data-quality problem at market level does not unfairly penalise an
-# account.  Set to 1.0 to disable attenuation.
-BROKEN_MONTH_ATTENUATION = 0.2   # reduce error to 20% on broken months
-
-# ── Accounting lag window ──────────────────────────────────────
-# Transactions dated within this many days of a month-end boundary are
-# considered "boundary" flows.  When the standard-window reconciliation
-# fails but the lag-adjusted reconciliation passes, the error is
-# attenuated (same factor as broken months).
-LAG_ATTENUATION = 0.2   # reduce error to 20% on likely lag months
-
-EXCLUDE_REGISTRAR = ["Off Distribution", "Private Clients"]
-
-# ─────────────────────────────────────────────
-# 1. CHARGEMENT
-# ─────────────────────────────────────────────
-def load_broken_months(broken_months_path):
-    """
-    Loads the broken-months CSV produced by carmignac_diagnostics.py.
-    Returns a set of (date, isin) tuples flagged as broken, and a
-    separate set flagged as likely accounting lags.
-
-    If the file does not exist or is empty, returns two empty sets.
-    """
-    if not broken_months_path or not os.path.exists(broken_months_path):
-        return set(), set()
-    try:
-        df = pd.read_csv(broken_months_path, parse_dates=["date"])
-        broken = set(zip(pd.to_datetime(df["date"]), df["isin"].astype(str)))
-        lags = set(zip(pd.to_datetime(df.loc[df["is_lag"], "date"]),
-                         df.loc[df["is_lag"], "isin"].astype(str)))
-        print(f"[Broken months] Loaded {len(broken)} flagged (isin, month) pairs "
-              f"({len(lags)} likely lags)")
-        return broken, lags
-    except Exception as e:
-        print(f"[Broken months] Could not load '{broken_months_path}': {e}")
-        return set(), set()
-
-
-def load_data():
-    fs = s3fs.S3FileSystem(
-    client_kwargs={'endpoint_url': 'https://'+'minio-simple.lab.groupe-genes.fr'},
-    key = os.environ["AWS_ACCESS_KEY_ID"], 
-    secret = os.environ["AWS_SECRET_ACCESS_KEY"], 
-    token = os.environ["AWS_SESSION_TOKEN"])
-
-    with fs.open('projet-bdc-data//carmignac/Flows ENSAE V2 -20251105.csv', 'rb') as f:
-        flows = pd.read_csv(f, sep=";")
-
-    with fs.open('projet-bdc-data//carmignac/AUM ENSAE V2 -20251105.csv', 'rb') as f:
-        aum = pd.read_csv(f, sep=";")
-
-    aum['Centralisation Date'] = pd.to_datetime(aum['Centralisation Date'])
-    flows['Centralisation Date'] = pd.to_datetime(flows['Centralisation Date'])
-
-    # Noms courts
-    aum = aum.rename(columns={
-        'Registrar Account - ID': 'reg_id',
-        'Product - Isin': 'isin',
-        'Centralisation Date': 'date',
-        'Quantity - AUM': 'qty_aum',
-        'Value - AUM €': 'val_eur',
-        'Registrar Account - Region': 'region',
-    })
-    flows = flows.rename(columns={
-        'Registrar Account - ID': 'reg_id',
-        'Product - Isin': 'isin',
-        'Centralisation Date': 'date',
-        'Quantity - NetFlows': 'qty_net',
-        'Value € - NetFlows': 'val_net_eur',
-    })
-
-    aum['reg_id'] = aum['reg_id'].astype(str)
-    flows['reg_id'] = flows['reg_id'].astype(str)
-
-    return aum, flows
-
-# ─────────────────────────────────────────────
-# 2. ÉTAPE 1 — Univers de référence à T_REF
-# ─────────────────────────────────────────────
-def build_reference_universe(aum, t_ref=None):
-    """
-    Construit l'univers de référence à t_ref (dernière date par défaut).
-    Retourne :
-      - aum_ref  : AUM à t_ref pour chaque (reg_id, isin)
-      - weights  : poids normalisé par reg_id
-      - universe : ensemble des reg_id retenus (>= MIN_AUM_EUR)
-    """
-    if t_ref is None:
-        t_ref = aum['date'].max()
-
-    print(f"\n[Étape 1] Date de référence : {t_ref.date()}")
-
-    # Exclure Off Distribution / Private Clients (sur région ou nom)
-    mask_excl = aum['reg_id'].isin(EXCLUDE_REGISTRAR)
-    if 'region' in aum.columns:
-        mask_excl |= aum['region'].isin(EXCLUDE_REGISTRAR)
-    aum_clean = aum[~mask_excl].copy()
-
-    # AUM à t_ref
-    aum_ref = aum_clean[aum_clean['date'] == t_ref][['reg_id', 'isin', 'qty_aum', 'val_eur']].copy()
-
-    # AUM total par reg_id à t_ref
-    aum_by_reg = aum_ref.groupby('reg_id')['val_eur'].sum().rename('total_eur')
-
-    # Filtrage >= MIN_AUM_EUR
-    universe = set(aum_by_reg[aum_by_reg >= MIN_AUM_EUR].index)
-
-    total_eur_universe = aum_by_reg[aum_by_reg.index.isin(universe)].sum()
-    total_eur_all = aum_by_reg.sum()
-    coverage = total_eur_universe / total_eur_all if total_eur_all > 0 else 0
-
-    print(f"  Registrar IDs à t_ref          : {len(aum_by_reg)}")
-    print(f"  Dont >= {MIN_AUM_EUR/1e6:.0f}M€                : {len(universe)}")
-    print(f"  Couverture encours              : {coverage:.1%}")
-
-    # Poids initiaux (scores à t_ref)
-    weights = (aum_by_reg[aum_by_reg.index.isin(universe)] / total_eur_universe).to_dict()
-
-    return aum_ref, weights, universe, t_ref
-
-# ─────────────────────────────────────────────
-# 3. PANEL AUM MENSUEL (forward-fill)
-# ─────────────────────────────────────────────
-def build_monthly_panel(aum, universe, t_ref):
-    """
-    Construit un panel mensuel complet (forward-fill des quantités AUM)
-    pour TOUS les reg_ids présents dans l'historique AUM — y compris les codes
-    historiques hors univers de référence, nécessaires pour la chirurgie.
-    """
-    # Toutes les fin de mois entre la première date et t_ref
-    date_min = aum['date'].min()
-    all_months = pd.date_range(start=date_min, end=t_ref, freq='ME')
-
-    # Pivot : (reg_id, isin) → série temporelle de qty_aum
-    aum_sorted = aum.sort_values(['reg_id', 'isin', 'date'])
-
-    # On ne garde que les lignes jusqu'à t_ref
-    aum_sorted = aum_sorted[aum_sorted['date'] <= t_ref]
-
-    # Multi-index pivot
-    panel = aum_sorted.pivot_table(
-        index='date', columns=['reg_id', 'isin'], values='qty_aum', aggfunc='last'
-    )
-
-    # Réindexer sur toutes les fins de mois
-    panel = panel.reindex(all_months)
-
-    # Forward-fill : si pas de mouvement, la quantité reste la même
-    panel = panel.ffill()
-
-    # Backward-fill initial pour les comptes qui démarrent après la première date
-    # (on ne remonte pas avant leur première apparition → on garde NaN)
-
-    print(f"\n[Panel mensuel] {len(all_months)} mois, {panel.shape[1]} (reg_id, isin) paires")
-
-    return panel, all_months
-
-# ─────────────────────────────────────────────
-# 4. FLOWS AGRÉGÉS PAR MOIS
-# ─────────────────────────────────────────────
-def aggregate_flows_monthly(flows, all_months, lag_days=3):
-    """
-    Agrège les flows infra-mensuels sur chaque fenêtre ]fin_mois(t-1), fin_mois(t)].
-    Retourne deux DataFrames indexés par (fin_mois, reg_id, isin) :
-      - monthly_flows      : agrégation standard (fenêtre exacte)
-      - monthly_flows_lag  : agrégation avec fenêtre élargie de ±lag_days jours
-                             autour de chaque fin de mois.  Utilisé pour détecter
-                             les ruptures dues à un décalage comptable de fin de mois.
-    """
-    flows_f = flows[flows['date'] <= all_months[-1]].copy()
-
-    def assign_month(d, lower_offset=0, upper_offset=0):
-        for m in all_months:
-            eom_prev = m - pd.offsets.MonthEnd(1)
-            lo = eom_prev - pd.Timedelta(days=lower_offset)
-            hi = m + pd.Timedelta(days=upper_offset)
-            if lo < d <= hi:
-                return m
-        return pd.NaT
-
-    # Standard window
-    flows_f['month_end'] = flows_f['date'].apply(lambda d: assign_month(d))
-    flows_std = flows_f.dropna(subset=['month_end'])
-    monthly_flows = flows_std.groupby(['month_end', 'reg_id', 'isin'])['qty_net'].sum().reset_index()
-    monthly_flows.columns = ['date', 'reg_id', 'isin', 'qty_net_month']
-
-    # Lag window (±lag_days around each EOM)
-    flows_f2 = flows[flows['date'] <= all_months[-1]].copy()
-    flows_f2['month_end'] = flows_f2['date'].apply(
-        lambda d: assign_month(d, lower_offset=lag_days, upper_offset=lag_days))
-    flows_lag = flows_f2.dropna(subset=['month_end'])
-    monthly_flows_lag = flows_lag.groupby(['month_end', 'reg_id', 'isin'])['qty_net'].sum().reset_index()
-    monthly_flows_lag.columns = ['date', 'reg_id', 'isin', 'qty_net_month']
-
-    print(f"\n[Flows mensuels] {len(monthly_flows)} enregistrements (standard) | "
-          f"{len(monthly_flows_lag)} (lag window ±{lag_days}d)")
-
-    return monthly_flows, monthly_flows_lag
-
-# ─────────────────────────────────────────────
-# 5. ÉTAPE 2 — Score de cohérence temporelle
-# ─────────────────────────────────────────────
-def compute_reconciliation_error(qty_t_minus1, qty_t, net_flow, alpha=ALPHA):
-    """
-    Calcule l'erreur de réconciliation normalisée pour un (reg_id, isin, mois).
-
-    Attendu : qty_t_minus1 + net_flow ≈ qty_t
-    Erreur   : |qty_t_minus1 + net_flow - qty_t| / max(|qty_t|, |qty_t_minus1|)
-
-    Retourne :
-      - err_ratio  : erreur relative (0 = parfait)
-      - is_break   : True si err_ratio > alpha
-    """
-    denom = max(abs(qty_t), abs(qty_t_minus1), 1e-9)
-    err = abs(qty_t_minus1 + net_flow - qty_t)
-    err_ratio = err / denom
-    return err_ratio, err_ratio > alpha
-
-def score_propagation(panel, monthly_flows, monthly_flows_lag, weights, universe,
-                      all_months, broken_months=None, lag_months=None):
-    """
-    Propage les scores de t_ref vers t=0 (passé).
-
-    À chaque mois t (en remontant), pour chaque reg_id dans l'univers courant :
-      - Calculer l'erreur de réconciliation pondérée par ISIN
-      - Dégrader le score proportionnellement
-      - Atténuer l'erreur si le mois est flagué comme "broken" ou "lag"
-
-    broken_months : set of (date, isin) pairs flagged as broken by diagnostics
-    lag_months    : subset of broken_months identified as likely accounting lags
-
-    Retourne :
-      - scores_history : dict {date → {reg_id → score}}
-      - errors_history : dict {date → {reg_id → err_pondérée}}
-      - mapping        : dict {reg_id_original → reg_id_courant} (après chirurgie)
-    """
-    broken_months = broken_months or set()
-    lag_months    = lag_months    or set()
-
-    # Initialisation
-    scores = dict(weights)  # scores à t_ref
-    scores_history = {all_months[-1]: dict(scores)}
-    errors_history = {}
-
-    # Mapping actuel (identité au départ)
-    mapping = {r: r for r in universe}
-
-    # Flows indexés pour accès rapide
-    flows_idx     = monthly_flows.set_index(['date', 'reg_id', 'isin'])['qty_net_month']
-    flows_idx_lag = monthly_flows_lag.set_index(['date', 'reg_id', 'isin'])['qty_net_month']
-
-    # Remonter dans le temps
-    for i in range(len(all_months) - 2, -1, -1):
-        t_prev = all_months[i]
-        t_curr = all_months[i + 1]
-
-        errors_at_t = {}
-        new_scores = {}
-
-        for reg_orig, reg_curr in mapping.items():
-            score_curr = scores.get(reg_orig, 0)
-            if score_curr == 0:
-                new_scores[reg_orig] = 0
-                continue
-
-            # ISIN détenus par ce reg à t_curr (après mapping)
-            if reg_curr in panel.columns.get_level_values(0):
-                isin_list = panel[reg_curr].columns.tolist()
-            else:
-                # reg_curr n'existe pas du tout dans le panel → rupture totale
-                new_scores[reg_orig] = 0
-                errors_at_t[reg_orig] = 1.0
-                continue
-
-            total_aum_t = 0
-            weighted_err = 0
-            valid_isin_count = 0
-            all_nan_at_prev = True  # détecte si le compte n'existait pas à t_prev
-
-            for isin in isin_list:
-                qty_t = panel[reg_curr][isin].get(t_curr, np.nan)
-                qty_t_prev = panel[reg_curr][isin].get(t_prev, np.nan)
-
-                if pd.isna(qty_t):
-                    continue
-
-                if not pd.isna(qty_t_prev):
-                    all_nan_at_prev = False
-
-                if pd.isna(qty_t_prev):
-                    # ISIN existait à t_curr mais pas à t_prev → rupture sur cet ISIN
-                    # On le traite comme une erreur maximale pondérée par son AUM
-                    weight_isin = abs(qty_t)
-                    weighted_err += 1.0 * weight_isin
-                    total_aum_t += weight_isin
-                    valid_isin_count += 1
-                    continue
-
-                if qty_t == 0 and qty_t_prev == 0:
-                    continue
-                # Flow agrégé sur ]t_prev, t_curr]
-                try:
-                    net_flow = flows_idx.loc[(t_curr, reg_curr, isin)]
-                except KeyError:
-                    net_flow = 0.0
-
-                err_ratio, is_break = compute_reconciliation_error(
-                    qty_t_prev, qty_t, net_flow, alpha=ALPHA
-                )
-
-                # ── Attenuation on broken / lag months ──────────────
-                # If this (isin, month) is flagged as broken at market
-                # level, the error is not the account's fault — attenuate.
-                if err_ratio > 0:
-                    key = (t_curr, isin)
-                    if key in broken_months or key in lag_months:
-                        # Try lag-window flow to distinguish lag vs genuine gap
-                        try:
-                            net_flow_lag = flows_idx_lag.loc[(t_curr, reg_curr, isin)]
-                        except KeyError:
-                            net_flow_lag = net_flow
-                        err_lag, _ = compute_reconciliation_error(
-                            qty_t_prev, qty_t, net_flow_lag, alpha=ALPHA
-                        )
-                        # Use whichever flow window gives the smaller error,
-                        # then attenuate the result
-                        best_err = min(err_ratio, err_lag)
-                        attenuation = (BROKEN_MONTH_ATTENUATION
-                                       if key in broken_months
-                                       else LAG_ATTENUATION)
-                        err_ratio = best_err * attenuation
-
-                # Pondération par AUM à t_curr
-                weight_isin = abs(qty_t)
-                weighted_err += err_ratio * weight_isin
-                total_aum_t += weight_isin
-                valid_isin_count += 1
-
-            if total_aum_t > 0 and valid_isin_count > 0:
-                avg_err = weighted_err / total_aum_t
-            else:
-                avg_err = 0.0
-
-            errors_at_t[reg_orig] = avg_err
-
-            # Dégradation du score : score(t-1) = score(t) * (1 - err_pondérée)
-            # Clippée entre 0 et score_curr
-            degradation = min(avg_err, 1.0)
-            new_scores[reg_orig] = score_curr * (1.0 - degradation)
-
-        scores = new_scores
-        scores_history[t_prev] = dict(scores)
-        errors_history[t_prev] = dict(errors_at_t)
-
-        total_score = sum(scores.values())
-        print(f"  {t_prev.date()} | Σ scores = {total_score:.4f} | "
-              f"Comptes actifs = {sum(1 for v in scores.values() if v > 0)}")
-
-    return scores_history, errors_history, mapping
-
-# ─────────────────────────────────────────────
-# 6. ÉTAPE 3 — Chirurgie de code
-# ─────────────────────────────────────────────
-def jaccard_isin(set_a, set_b):
-    """Coefficient de Jaccard entre deux ensembles d'ISIN."""
-    if not set_a or not set_b:
-        return 0.0
-    inter = len(set_a & set_b)
-    union = len(set_a | set_b)
-    return inter / union if union > 0 else 0.0
-
-def find_best_candidate(reg_orig, reg_curr, t_prev, t_curr,
-                        panel, flows_idx, all_regs_at_t_prev, mapping_inv):
-    """
-    Pour un reg_id dont le score a fortement chuté, cherche le meilleur
-    candidat j à t_prev tel que :
-      - j n'est pas déjà mappé à un autre compte original
-      - Le portefeuille ISIN de j à t_prev est similaire à celui de reg_curr à t_curr
-      - La réconciliation est bonne
-
-    Retourne (best_candidate, best_score_composite) ou (None, 0)
-    """
-    # ISIN du compte cible à t_curr
-    if reg_curr not in panel.columns.get_level_values(0):
-        return None, 0.0
-
-    isin_curr = set(panel[reg_curr].columns[
-        panel[reg_curr].loc[t_curr].notna() & (panel[reg_curr].loc[t_curr] != 0)
-    ].tolist())
-
-    if not isin_curr:
-        return None, 0.0
-
-    best_candidate = None
-    best_composite = 0.0
-
-    for j in all_regs_at_t_prev:
-        # Ne pas réutiliser un code déjà mappé
-        if j in mapping_inv:
-            continue
-        # Ne pas mapper sur soi-même si déjà présent
-        if j == reg_curr:
-            continue
-
-        if j not in panel.columns.get_level_values(0):
-            continue
-
-        # ISIN de j à t_prev
-        col_j = panel[j]
-        isin_j = set(col_j.columns[
-            col_j.loc[t_prev].notna() & (col_j.loc[t_prev] != 0)
-        ].tolist()) if t_prev in col_j.index else set()
-
-        if not isin_j:
-            continue
-
-        jac = jaccard_isin(isin_curr, isin_j)
-        if jac < MIN_JACCARD:
-            continue
-
-        # Erreur de réconciliation pour les ISIN communs
-        common_isin = isin_curr & isin_j
-        total_aum = 0
-        weighted_err = 0
-
-        for isin in common_isin:
-            qty_t = panel[reg_curr][isin].get(t_curr, np.nan) if isin in panel[reg_curr].columns else np.nan
-            qty_t_prev = panel[j][isin].get(t_prev, np.nan) if isin in panel[j].columns else np.nan
-
-            if pd.isna(qty_t) or pd.isna(qty_t_prev):
-                continue
-
-            try:
-                net_flow = flows_idx.loc[(t_curr, j, isin)]
-            except KeyError:
-                net_flow = 0.0
-
-            err_ratio, _ = compute_reconciliation_error(qty_t_prev, qty_t, net_flow)
-            weight_isin = abs(qty_t)
-            weighted_err += err_ratio * weight_isin
-            total_aum += weight_isin
-
-        avg_err = weighted_err / total_aum if total_aum > 0 else 1.0
-
-        composite = jac * (1.0 - min(avg_err, 1.0))
-
-        if composite > best_composite:
-            best_composite = composite
-            best_candidate = j
-
-    return best_candidate, best_composite
-
-
-def _recompute_score_with_candidate(reg_orig, candidate, t_prev, t_curr,
-                                     panel, flows_idx, score_curr):
-    """
-    Recalcule proprement l'erreur de réconciliation pour un candidat donné,
-    et retourne le score après chirurgie.
-    """
-    if candidate not in panel.columns.get_level_values(0):
-        return score_curr * 0  # candidat inexistant
-
-    isin_list_cand = panel[candidate].columns.tolist()
-    isin_list_curr = panel[reg_orig].columns.tolist() if reg_orig in panel.columns.get_level_values(0) else []
-
-    total_aum = 0
-    weighted_err = 0
-
-    for isin in isin_list_curr:
-        qty_t = panel[reg_orig][isin].get(t_curr, np.nan) if isin in panel[reg_orig].columns else np.nan
-        if pd.isna(qty_t) or qty_t == 0:
-            continue
-
-        qty_t_prev = panel[candidate][isin].get(t_prev, np.nan) if isin in panel[candidate].columns else np.nan
-
-        try:
-            net_flow = flows_idx.loc[(t_curr, candidate, isin)]
-        except KeyError:
-            net_flow = 0.0
-
-        if pd.isna(qty_t_prev):
-            err_ratio = 1.0
-        else:
-            err_ratio, _ = compute_reconciliation_error(qty_t_prev, qty_t, net_flow)
-
-        weight_isin = abs(qty_t)
-        weighted_err += err_ratio * weight_isin
-        total_aum += weight_isin
-
-    avg_err = weighted_err / total_aum if total_aum > 0 else 1.0
-    return score_curr * (1.0 - min(avg_err, 1.0))
-
-
-def run_surgery_pass(scores_history, errors_history, panel, monthly_flows,
-                     monthly_flows_lag, weights, universe, all_months,
-                     broken_months=None, lag_months=None):
-    """
-    Deuxième passe : pour chaque mois avec des ruptures fortes,
-    tente une chirurgie de code et recalcule les scores.
-
-    Corrections par rapport à la passe naïve :
-    - Après chirurgie, le score est recalculé proprement (pas juste composite)
-    - Le mapping propagé en arrière utilise le bon code à chaque étape
-    - Pré-filtre ISIN pour performance sur grand dataset
-
-    Retourne :
-      - mapping_history : {date → {reg_orig → reg_used}}
-      - surgery_log     : liste des opérations effectuées
-      - scores_final    : scores au dernier mois
-    """
-    flows_idx     = monthly_flows.set_index(['date', 'reg_id', 'isin'])['qty_net_month']
-    flows_idx_lag = monthly_flows_lag.set_index(['date', 'reg_id', 'isin'])['qty_net_month']
-
-    # Tous les reg_ids présents dans le panel (univers + codes historiques)
-    all_regs_in_panel = set(panel.columns.get_level_values(0))
-
-    # Pré-calcul : ensemble d'ISIN par reg_id à chaque date (pour pré-filtre rapide)
-    # {reg_id → {date → set(isin)}}
-    reg_isin_at_date = {}
-    for reg in all_regs_in_panel:
-        reg_isin_at_date[reg] = {}
-        col = panel[reg]
-        for date in col.index:
-            active = set(col.columns[(col.loc[date].notna()) & (col.loc[date] != 0)].tolist())
-            if active:
-                reg_isin_at_date[reg][date] = active
-
-    # Mapping courant : reg_orig → reg_used
-    mapping = {r: r for r in universe}
-    mapping_inv = {r: r for r in universe}
-
-    surgery_log = []
-    mapping_history = {all_months[-1]: dict(mapping)}
-    scores_history_corrected = {all_months[-1]: dict(weights)}
-
-    # Scores courants (initialisés à t_ref)
-    scores = dict(weights)
-
-    for i in range(len(all_months) - 2, -1, -1):
-        t_prev = all_months[i]
-        t_curr = all_months[i + 1]
-
-        new_scores = {}
-        new_mapping = {}
-
-        for reg_orig in list(mapping.keys()):
-            reg_curr = mapping[reg_orig]
-            score_curr = scores.get(reg_orig, 0)
-
-            if score_curr == 0:
-                new_scores[reg_orig] = 0
-                new_mapping[reg_orig] = reg_curr
-                continue
-
-            # Erreur sans chirurgie (depuis étape 2)
-            err = errors_history.get(t_prev, {}).get(reg_orig, 0.0)
-            score_prev_no_surgery = score_curr * (1.0 - min(err, 1.0))
-            drop_ratio = 1.0 - (score_prev_no_surgery / score_curr) if score_curr > 0 else 0
-
-            if drop_ratio > SCORE_DROP_THRESHOLD:
-                # ── ISIN du compte courant à t_curr (pour pré-filtre) ──
-                isin_curr = reg_isin_at_date.get(reg_curr, {}).get(t_curr, set())
-
-                # ── Candidats disponibles ──
-                # On exclut les codes déjà mappés à un autre compte,
-                # mais reg_curr lui-même est un candidat valide (self-mapping :
-                # le compte existait déjà sous ce code à t-1, dormant ou partiel).
-                available = (all_regs_in_panel - set(mapping_inv.keys())) | {reg_curr}
-
-                best_candidate = None
-                best_score_after = score_prev_no_surgery  # baseline = pas de chirurgie
-                best_composite = 0.0
-
-                for j in available:
-                    # Pré-filtre rapide : overlap ISIN minimal
-                    isin_j = reg_isin_at_date.get(j, {}).get(t_prev, set())
-                    if not isin_curr or not isin_j:
-                        continue
-                    inter = len(isin_curr & isin_j)
-                    if inter == 0:
-                        continue
-                    jac = inter / len(isin_curr | isin_j)
-                    if jac < MIN_JACCARD:
-                        continue
-
-                    # Score après chirurgie avec ce candidat
-                    score_after = _recompute_score_with_candidate(
-                        reg_curr, j, t_prev, t_curr, panel, flows_idx, score_curr
-                    )
-                    composite = jac * (score_after / score_curr) if score_curr > 0 else 0
-
-                    if score_after > best_score_after:
-                        best_score_after = score_after
-                        best_candidate = j
-                        best_composite = composite
-
-                if best_candidate:
-                    surgery_log.append({
-                        'date': t_prev,
-                        'reg_orig': reg_orig,
-                        'reg_from': reg_curr,
-                        'reg_to': best_candidate,
-                        'jaccard_composite': round(best_composite, 4),
-                        'score_before': round(score_curr, 6),
-                        'score_after': round(best_score_after, 6),
-                        'drop_without_surgery': round(drop_ratio, 4),
-                        'gain_vs_no_surgery': round(best_score_after - score_prev_no_surgery, 6),
-                    })
-                    print(f"  🔧 CHIRURGIE {t_prev.date()} | {reg_orig} : "
-                          f"{reg_curr} → {best_candidate} "
-                          f"(composite={best_composite:.3f}, "
-                          f"score {score_curr:.4f}→{best_score_after:.4f})")
-
-                    # Mise à jour mapping
-                    # Si self-mapping (best_candidate == reg_curr), on ne touche pas
-                    # mapping_inv car le code ne change pas — on met juste à jour le score.
-                    if best_candidate != reg_curr:
-                        if reg_curr in mapping_inv:
-                            del mapping_inv[reg_curr]
-                        mapping_inv[best_candidate] = reg_orig
-                    new_mapping[reg_orig] = best_candidate
-                    new_scores[reg_orig] = best_score_after
-                else:
-                    new_mapping[reg_orig] = reg_curr
-                    new_scores[reg_orig] = score_prev_no_surgery
-            else:
-                new_mapping[reg_orig] = reg_curr
-                new_scores[reg_orig] = score_prev_no_surgery
-
-        mapping = new_mapping
-        mapping_inv = {v: k for k, v in mapping.items()}
-        scores = new_scores
-        mapping_history[t_prev] = dict(mapping)
-        scores_history_corrected[t_prev] = dict(scores)
-
-        total_score = sum(s for s in scores.values() if not np.isnan(s))
-        n_surgeries = sum(1 for op in surgery_log if op['date'] == t_prev)
-        print(f"  {t_prev.date()} | Σ scores = {total_score:.4f} | "
-              f"Chirurgies = {n_surgeries}")
-
-    return mapping_history, surgery_log, scores, scores_history_corrected
-
-# ─────────────────────────────────────────────
-# 7. EXPORT RÉSULTATS
-# ─────────────────────────────────────────────
-def export_results(scores_history, mapping_history, surgery_log, all_months, out_prefix="carmignac"):
-    """Exporte les résultats clés en CSV."""
-
-    # Score history
-    rows = []
-    for date, sc in scores_history.items():
-        for reg, score in sc.items():
-            rows.append({'date': date, 'reg_id': reg, 'score': score})
-    df_scores = pd.DataFrame(rows) if rows else pd.DataFrame(columns=['date', 'reg_id', 'score'])
-    if not df_scores.empty:
-        df_scores = df_scores.sort_values(['date', 'score'], ascending=[True, False])
-    df_scores.to_csv(f"repair_results/{out_prefix}_scores.csv", index=False)
-
-    # Mapping history
-    rows_m = []
-    for date, mp in mapping_history.items():
-        for reg_orig, reg_used in mp.items():
-            rows_m.append({'date': date, 'reg_orig': reg_orig, 'reg_used': reg_used,
-                           'changed': reg_orig != reg_used})
-    df_mapping = pd.DataFrame(rows_m) if rows_m else pd.DataFrame(columns=['date', 'reg_orig', 'reg_used', 'changed'])
-    if not df_mapping.empty:
-        df_mapping = df_mapping.sort_values(['date', 'reg_orig'])
-    df_mapping.to_csv(f"repair_results/{out_prefix}_mapping.csv", index=False)
-
-    # Surgery log
-    if surgery_log:
-        df_surgery = pd.DataFrame(surgery_log).sort_values('date')
-        df_surgery.to_csv(f"repair_results/{out_prefix}_surgery_log.csv", index=False)
-        print(f"\n[Export] {len(surgery_log)} opérations de chirurgie sauvegardées.")
-    else:
-        print("\n[Export] Aucune chirurgie effectuée sur ce subset.")
-
-    print(f"[Export] Scores    → {out_prefix}_scores.csv")
-    print(f"[Export] Mapping   → {out_prefix}_mapping.csv")
-
-    return df_scores, df_mapping
-
-# ─────────────────────────────────────────────
-# 8. PIPELINE PRINCIPAL
-# ─────────────────────────────────────────────
-def run_pipeline(broken_months_path=None):
-    print("=" * 60)
-    print("CARMIGNAC — Pipeline de réparation des Registrar IDs")
-    print("=" * 60)
-
-    # Chargement
-    aum, flows = load_data()
-
-    # Broken months (optional — produced by carmignac_diagnostics.py)
-    broken_months, lag_months = load_broken_months(broken_months_path)
-
-    # Étape 1 — Univers de référence
-    aum_ref, weights, universe, t_ref = build_reference_universe(aum)
-
-    print(f"\n  Top 5 comptes par poids :")
-    for reg, w in sorted(weights.items(), key=lambda x: -x[1])[:5]:
-        print(f"    {reg} : {w:.4f} ({w*100:.2f}%)")
-
-    # Panel mensuel
-    panel, all_months = build_monthly_panel(aum, universe, t_ref)
-
-    # Flows mensuels agrégés (standard + lag window)
-    monthly_flows, monthly_flows_lag = aggregate_flows_monthly(flows, all_months)
-
-    # Étape 2 — Score de cohérence (sans chirurgie)
-    print("\n[Étape 2] Propagation des scores (sans chirurgie)...")
-    scores_history, errors_history, _ = score_propagation(
-        panel, monthly_flows, monthly_flows_lag, weights, universe, all_months,
-        broken_months=broken_months, lag_months=lag_months
-    )
-
-    # Étape 3 — Chirurgie
-    print("\n[Étape 3] Passe de chirurgie...")
-    mapping_history, surgery_log, final_scores, scores_history_corrected = run_surgery_pass(
-        scores_history, errors_history, panel, monthly_flows, monthly_flows_lag,
-        weights, universe, all_months,
-        broken_months=broken_months, lag_months=lag_months
-    )
-
-    # Export — on utilise les scores corrigés (post-chirurgie) comme référence
-    print("\n[Export des résultats...]")
-    df_scores, df_mapping = export_results(
-        scores_history_corrected, mapping_history, surgery_log, all_months
-    )
-
-
-    # Résumé final
-    print("\n" + "=" * 60)
-    print("RÉSUMÉ FINAL")
-    print("=" * 60)
-    print(f"  Dates couvertes        : {all_months[0].date()} → {all_months[-1].date()}")
-    print(f"  Comptes dans l'univers : {len(universe)}")
-    print(f"  Chirurgies effectuées  : {len(surgery_log)}")
-    score_by_date = {d: sum(s for s in sc.values() if s == s)
-                     for d, sc in scores_history_corrected.items()}
-    print(f"  Σ scores à t_ref       : {score_by_date[t_ref]:.4f}")
-    print(f"  Σ scores à t_min       : {score_by_date[all_months[0]]:.4f}")
-
-    return df_scores, df_mapping, surgery_log, scores_history_corrected, mapping_history
-
-
-if __name__ == "__main__":
-    df_scores, df_mapping, surgery_log, scores_history, mapping_history = run_pipeline(
-        broken_months_path="carmignac_broken_months.csv"  # optional
-    )