Project_Carmignac/carmignac_diagnostics.py

963 lines
39 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

"""
Carmignac Data Challenge — Broken Months Diagnostics
=====================================================
Detects months where the aggregate stock-flow equation is violated
at the ISIN level (across all accounts):
Σ_r Q_{r,s}(t) - Σ_r Q_{r,s}(t-1) ≠ Σ_r F_{r,s}(t-1→t)
The residual is the "missing flow":
missing_{s}(t) = [Q_agg(t) - Q_agg(t-1)] - F_agg(t)
This is a market-level check, independent of individual account identity.
It captures:
- Genuinely missing flow records
- End-of-month accounting lags (transactions dated at boundary)
- Corporate actions (dividends, splits) not reflected in flows
Outputs
-------
carmignac_broken_months.csv — machine-readable, loaded by carmignac_repair.py
carmignac_diagnostics.html — interactive HTML report
Usage
-----
python carmignac_diagnostics.py
python carmignac_diagnostics.py \\
--aum raw_AUM.csv \\
--flows raw_flows.csv \\
--out carmignac_broken_months.csv \\
--html carmignac_diagnostics.html \\
--alpha 0.02
"""
import argparse
import json
import os
import sys
import numpy as np
import pandas as pd
# ─────────────────────────────────────────────────────────────
# 1. LOAD
# ─────────────────────────────────────────────────────────────
def load_data(aum_path, flows_path):
    """
    Read the AUM and flows CSV files into two DataFrames.

    In both files the "Centralisation Date" column is parsed as dates and
    the "Product - Isin" column is cast to ``str``.

    Parameters
    ----------
    aum_path   : path (or file-like object) of the AUM CSV
    flows_path : path (or file-like object) of the flows CSV

    Returns
    -------
    (aum, flows) : the two DataFrames, in that order
    """
    date_col = "Centralisation Date"
    isin_col = "Product - Isin"
    # Read both files first, then normalise the ISIN column of each.
    aum, flows = (pd.read_csv(path, parse_dates=[date_col])
                  for path in (aum_path, flows_path))
    for frame in (aum, flows):
        frame[isin_col] = frame[isin_col].astype(str)
    return aum, flows
# ─────────────────────────────────────────────────────────────
# 2. AGGREGATE AND DETECT BROKEN MONTHS
# ─────────────────────────────────────────────────────────────
def detect_broken_months(aum, flows, alpha=0.02, lag_days=3):
    """
    Check the stock-flow equation per (isin, month-end) pair.

    For each ISIN and each month-end t (from the second month on), compute:
    - Q_agg(t)    : total shares held across all accounts
    - Q_agg(t-1)  : idem previous month (forward-filled when a month is absent)
    - F_agg(t)    : total net flows recorded in ]EOM(t-1), EOM(t)]
    - missing(t)  : [Q_agg(t) - Q_agg(t-1)] - F_agg(t)
    - missing_pct : |missing| / max(|delta AUM|, |F_agg|, 1 share)

    The denominator is the size of the *movement*, not the stock level, so
    a tiny position with a normal-sized missing flow does not produce an
    astronomically large percentage; 100% means the whole movement is
    unaccounted for. The 1-share floor suppresses rounding noise.

    A month is flagged as "broken" when missing_pct > alpha.

    A broken month is additionally flagged as a likely "lag" when it is no
    longer broken once flows are re-bucketed with windows widened by
    ``lag_days`` on both sides. Each flow goes to the FIRST month whose
    window contains it, so in practice the widened pass moves flows dated
    up to ``lag_days`` after a month-end back into that earlier month.

    Parameters
    ----------
    aum      : DataFrame with columns "Product - Isin",
               "Centralisation Date", "Quantity - AUM"
    flows    : DataFrame with columns "Product - Isin",
               "Centralisation Date", "Quantity - NetFlows"
    alpha    : tolerance threshold on missing_pct
    lag_days : number of boundary days to test for accounting lag

    Returns
    -------
    df_broken : rows of df_all where missing_pct > alpha, sorted by
                missing_pct descending
    df_all    : one row per examined (isin, month), with columns date,
                isin, q_agg_prev, q_agg_curr, delta_aum, flow_agg,
                missing_flow, missing_pct, broken, is_lag.
                Both frames keep these columns even when empty.
    """
    date_col = "Centralisation Date"
    isin_col = "Product - Isin"
    out_columns = ["date", "isin", "q_agg_prev", "q_agg_curr", "delta_aum",
                   "flow_agg", "missing_flow", "missing_pct", "broken", "is_lag"]
    min_abs_shares = 1.0  # denominator floor: ignore movements below 1 share

    # Monthly calendar. The offset object is used instead of the "ME" alias
    # because the alias only exists in recent pandas releases.
    t_min = aum[date_col].min()
    t_max = aum[date_col].max()
    if pd.isna(t_min) or pd.isna(t_max):
        all_months = pd.DatetimeIndex([])
    else:
        all_months = pd.date_range(t_min, t_max, freq=pd.offsets.MonthEnd())

    # Fewer than two month-ends: there is no (t-1, t) pair to compare.
    if len(all_months) < 2:
        empty = pd.DataFrame(columns=out_columns)
        return empty.copy(), empty

    # ── Aggregate AUM per (isin, month-end), forward-fill sparse panel ──
    aum_pivot = (
        aum.groupby([date_col, isin_col])["Quantity - AUM"]
        .sum()
        .unstack(isin_col)
        .reindex(all_months)
        .ffill()
    )

    # ── Aggregate flows per (isin, month-end) ────────────────────
    flow_dates = pd.DatetimeIndex(flows[date_col])
    flow_isins = flows[isin_col].to_numpy()
    flow_qty = flows["Quantity - NetFlows"].to_numpy()
    prev_month_ends = all_months - pd.offsets.MonthEnd(1)

    def bucket_flows(lower_offset=0, upper_offset=0):
        """Month x ISIN table of summed flows, window widened by the offsets (days)."""
        # Window of month m: ]EOM(m-1) - lower_offset, m + upper_offset]
        lo = prev_month_ends - pd.Timedelta(days=lower_offset)
        hi = all_months + pd.Timedelta(days=upper_offset)
        # First month whose upper bound is >= the date; both bounds are
        # increasing, so this is the first window that can contain it.
        pos = np.minimum(hi.searchsorted(flow_dates, side="left"),
                         len(all_months) - 1)
        # NaT and out-of-calendar dates compare False and are dropped.
        in_window = np.asarray((flow_dates > lo[pos]) & (flow_dates <= hi[pos]))
        bucketed = pd.DataFrame({
            "isin": flow_isins[in_window],
            "date": all_months[pos[in_window]],
            "flow_agg": flow_qty[in_window],
        })
        if bucketed.empty:
            return pd.DataFrame(index=all_months)
        return (bucketed.groupby(["date", "isin"])["flow_agg"]
                .sum()
                .unstack("isin")
                .reindex(all_months)
                .fillna(0.0))

    fpiv_std = bucket_flows()
    fpiv_lag = bucket_flows(lower_offset=lag_days, upper_offset=lag_days)
    std_isins = set(fpiv_std.columns)
    lag_isins = set(fpiv_lag.columns)

    # ── Compute residuals ─────────────────────────────────────────
    rows = []
    for t_prev, t_curr in zip(all_months[:-1], all_months[1:]):
        for isin in aum_pivot.columns:
            q_prev = aum_pivot.at[t_prev, isin]
            q_curr = aum_pivot.at[t_curr, isin]
            # ISIN not yet observed at one of the two dates: nothing to check.
            if pd.isna(q_curr) or pd.isna(q_prev):
                continue
            delta = q_curr - q_prev

            # ISINs with no recorded flow at all count as zero flow.
            f_std = fpiv_std.at[t_curr, isin] if isin in std_isins else 0.0
            f_lag = fpiv_lag.at[t_curr, isin] if isin in lag_isins else 0.0
            missing_std = delta - f_std
            missing_lag = delta - f_lag

            pct_std = abs(missing_std) / max(abs(delta), abs(f_std), min_abs_shares)
            pct_lag = abs(missing_lag) / max(abs(delta), abs(f_lag), min_abs_shares)
            broken_std = bool(pct_std > alpha)
            broken_lag = bool(pct_lag > alpha)

            rows.append({
                "date": t_curr,
                "isin": isin,
                "q_agg_prev": round(q_prev, 3),
                "q_agg_curr": round(q_curr, 3),
                "delta_aum": round(delta, 3),
                "flow_agg": round(f_std, 3),
                "missing_flow": round(missing_std, 3),
                "missing_pct": round(pct_std, 6),
                "broken": broken_std,
                # Lag: broken with the standard window, fine with the wide one.
                "is_lag": broken_std and not broken_lag,
            })

    # Passing the columns explicitly keeps the schema when no row was built.
    df_all = pd.DataFrame(rows, columns=out_columns)
    df_broken = (df_all[df_all["broken"].astype(bool)]
                 .sort_values("missing_pct", ascending=False))
    return df_broken, df_all
# ─────────────────────────────────────────────────────────────
# 2b. AGGREGATE (CROSS-ISIN) BROKEN MONTHS
# ─────────────────────────────────────────────────────────────
def detect_aggregate_broken_months(aum, flows, alpha=0.02, lag_days=3):
    """
    Same stock-flow check as detect_broken_months, but aggregated
    across ALL ISINs for each month:

        Q_total(t) - Q_total(t-1) != F_total(t)

    where Q_total(t) is the sum of "Quantity - AUM" over every row dated t
    (forward-filled when a month-end has no rows) and F_total(t) is the sum
    of "Quantity - NetFlows" dated in ]EOM(t-1), EOM(t)].

    missing_pct is |missing| / max(|delta AUM|, |F_total|, 1 share); a month
    is "broken" when missing_pct > alpha, and a likely "lag" when it stops
    being broken once the flow windows are widened by ``lag_days`` on both
    sides (each flow goes to the first month whose window contains it).

    Parameters
    ----------
    aum      : DataFrame with "Centralisation Date" and "Quantity - AUM"
    flows    : DataFrame with "Centralisation Date" and "Quantity - NetFlows"
    alpha    : tolerance threshold on missing_pct
    lag_days : number of boundary days to test for accounting lag

    Returns
    -------
    df_agg : DataFrame with one row per examined month and columns:
             date, q_total_prev, q_total_curr, delta_aum, flow_total,
             missing_flow, missing_pct, broken, is_lag.
             The columns are present even when no month could be examined.
    """
    date_col = "Centralisation Date"
    out_columns = ["date", "q_total_prev", "q_total_curr", "delta_aum",
                   "flow_total", "missing_flow", "missing_pct", "broken", "is_lag"]
    min_abs_shares = 1.0  # denominator floor: ignore movements below 1 share

    # Monthly calendar (offset object rather than the version-dependent alias).
    t_min = aum[date_col].min()
    t_max = aum[date_col].max()
    if pd.isna(t_min) or pd.isna(t_max):
        all_months = pd.DatetimeIndex([])
    else:
        all_months = pd.date_range(t_min, t_max, freq=pd.offsets.MonthEnd())

    # Fewer than two month-ends: there is no (t-1, t) pair to compare.
    if len(all_months) < 2:
        return pd.DataFrame(columns=out_columns)

    # ── Total AUM per month (all ISIN, all accounts) ─────────────
    aum_monthly = (
        aum.groupby(date_col)["Quantity - AUM"]
        .sum()
        .reindex(all_months)
        .ffill()
        .rename("q_total")
    )

    # ── Total flows per month ────────────────────────────────────
    flow_dates = pd.DatetimeIndex(flows[date_col])
    flow_qty = flows["Quantity - NetFlows"].to_numpy()
    prev_month_ends = all_months - pd.offsets.MonthEnd(1)

    def bucket_total_flows(lower_offset=0, upper_offset=0):
        """Summed flows per month-end, window widened by the offsets (days)."""
        # Window of month m: ]EOM(m-1) - lower_offset, m + upper_offset]
        lo = prev_month_ends - pd.Timedelta(days=lower_offset)
        hi = all_months + pd.Timedelta(days=upper_offset)
        # First month whose upper bound is >= the date; both bounds are
        # increasing, so this is the first window that can contain it.
        pos = np.minimum(hi.searchsorted(flow_dates, side="left"),
                         len(all_months) - 1)
        # NaT and out-of-calendar dates compare False and are dropped.
        in_window = np.asarray((flow_dates > lo[pos]) & (flow_dates <= hi[pos]))
        kept = pd.Series(np.asarray(flow_qty[in_window], dtype=float),
                         index=all_months[pos[in_window]])
        return (kept.groupby(level=0)
                .sum()
                .reindex(all_months)
                .fillna(0.0))

    flow_std = bucket_total_flows()
    flow_lag = bucket_total_flows(lower_offset=lag_days, upper_offset=lag_days)

    # ── Compute residuals ─────────────────────────────────────────
    rows = []
    for t_prev, t_curr in zip(all_months[:-1], all_months[1:]):
        q_prev = aum_monthly.at[t_prev]
        q_curr = aum_monthly.at[t_curr]
        # No AUM observed yet at one of the two dates: nothing to check.
        if pd.isna(q_curr) or pd.isna(q_prev):
            continue
        delta = q_curr - q_prev
        f_std = flow_std.at[t_curr]
        f_lag = flow_lag.at[t_curr]
        miss_std = delta - f_std
        miss_lag = delta - f_lag

        pct_std = abs(miss_std) / max(abs(delta), abs(f_std), min_abs_shares)
        pct_lag = abs(miss_lag) / max(abs(delta), abs(f_lag), min_abs_shares)
        broken_std = bool(pct_std > alpha)
        broken_lag = bool(pct_lag > alpha)

        rows.append({
            "date": t_curr,
            "q_total_prev": round(q_prev, 3),
            "q_total_curr": round(q_curr, 3),
            "delta_aum": round(delta, 3),
            "flow_total": round(f_std, 3),
            "missing_flow": round(miss_std, 3),
            "missing_pct": round(pct_std, 6),
            "broken": broken_std,
            # Lag: broken with the standard window, fine with the wide one.
            "is_lag": broken_std and not broken_lag,
        })

    # Passing the columns explicitly keeps the schema when no row was built.
    return pd.DataFrame(rows, columns=out_columns)
# ─────────────────────────────────────────────────────────────
# 3. PRINT SUMMARY
# ─────────────────────────────────────────────────────────────
def print_summary(df_broken, df_all, alpha):
    """
    Print a console summary of the per-ISIN broken-month detection.

    Shows the number of examined pairs, the broken count and share, the
    split between likely lags and genuine gaps and, when something is
    broken, the ten worst pairs plus the five months with the most broken
    ISINs.

    Parameters
    ----------
    df_broken : broken (isin, month) rows; needs the columns date, isin,
                missing_flow, missing_pct and is_lag when non-empty
    df_all    : every examined (isin, month) row (only its length is used)
    alpha     : threshold used for detection, shown in the header line
    """
    total = len(df_all)
    n_broken = len(df_broken)
    # Only read the column when there are rows: an empty frame may lack it.
    n_lag = int(df_broken["is_lag"].sum()) if n_broken else 0
    # Nothing examined -> report 0% rather than dividing by zero.
    broken_share = n_broken / total * 100 if total else 0.0

    print("\n" + "=" * 60)
    print(" CARMIGNAC — Broken Months Diagnostics")
    print("=" * 60)
    print(f" (isin, month) pairs examined : {total}")
    print(f" Broken (missing_pct > {alpha:.0%}) : {n_broken} "
          f"({broken_share:.1f}%)")
    print(f" Of which likely lag : {n_lag}")
    print(f" Of which genuine gap : {n_broken - n_lag}")

    if n_broken:
        print("\n Top 10 by missing_pct:")
        cols = ["date", "isin", "missing_flow", "missing_pct", "is_lag"]
        print(df_broken[cols].head(10).to_string(index=False))

        # Monthly breakdown: months with the most broken ISINs first.
        by_month = (df_broken.groupby("date")
                    .agg(n_broken=("isin", "count"),
                         total_missing=("missing_flow", lambda x: x.abs().sum()))
                    .sort_values("n_broken", ascending=False)
                    .head(5))
        if len(by_month):
            print("\n Most affected months:")
            print(by_month.to_string())
    print()
# ─────────────────────────────────────────────────────────────
# 4. BUILD HTML REPORT
# ─────────────────────────────────────────────────────────────
def build_html(df_broken, df_all, df_agg, alpha, lag_days=3):
    """
    Render the diagnostics as a single self-contained HTML page.

    The page loads Chart.js and the IBM Plex fonts from public CDNs and
    embeds all data as inline JavaScript constants.

    Parameters
    ----------
    df_broken : broken (isin, month) rows, as returned by
                detect_broken_months (at most the first 200 are listed)
    df_all    : every examined (isin, month) row
    df_agg    : per-month cross-ISIN rows, as returned by
                detect_aggregate_broken_months
    alpha     : detection threshold, shown in the page and used client-side
                to colour the aggregate missing-% bars
    lag_days  : lag window (days) that was used for detection; only used
                for the labels of the report

    Returns
    -------
    str : the complete HTML document
    """
    # Local import: only needed here, to escape ISIN text read from the CSV.
    from html import escape

    def jf(arr, dec=4):
        """Serialise a numeric array as a JSON list, NaN becoming null."""
        return json.dumps([round(float(v), dec) if not np.isnan(v) else None for v in arr])

    def fmt_date(d):
        """ISO date for Timestamp-like values, first 10 chars of str() otherwise."""
        return d.strftime("%Y-%m-%d") if hasattr(d, "strftime") else str(d)[:10]

    # ── JS-ready data ────────────────────────────────────────────
    # Timeline: n_broken and total_missing per month
    broken_rows = df_all[df_all["broken"].astype(bool)]
    tl = (broken_rows.assign(abs_missing=broken_rows["missing_flow"].abs())
          .groupby("date")
          .agg(n_broken=("isin", "count"),
               total_missing=("abs_missing", "sum"),
               n_lag=("is_lag", "sum"))
          .reindex(df_all["date"].sort_values().unique())
          .fillna(0))
    tl.index = pd.to_datetime(tl.index)
    dates_str = json.dumps([d.strftime("%Y-%m-%d") for d in tl.index])
    n_broken_js = jf(tl["n_broken"].values, 0)
    total_miss_js = jf(tl["total_missing"].values)
    n_lag_js = jf(tl["n_lag"].values, 0)

    # Aggregate (cross-ISIN) JS data
    agg_dates_str = json.dumps([d.strftime("%Y-%m-%d") for d in pd.to_datetime(df_agg["date"])])
    agg_delta_js = jf(df_agg["delta_aum"].values)
    agg_flow_js = jf(df_agg["flow_total"].values)
    agg_missing_js = jf(df_agg["missing_flow"].values)
    agg_pct_js = jf((df_agg["missing_pct"] * 100).values)

    # Aggregate detail table rows
    agg_rows = []
    for _, r in df_agg[df_agg["broken"].astype(bool)].iterrows():
        lb = '<span class="lag-badge">lag</span>' if r["is_lag"] else ""
        pc = "pct-high" if r["missing_pct"] > 0.1 else "pct-med"
        ds = fmt_date(r["date"])
        mc = "miss-neg" if r["missing_flow"] < 0 else "miss-pos"
        agg_rows.append(
            f'<tr><td>{ds}</td>'
            f'<td class="mono right">{r["q_total_prev"]:,.1f}</td>'
            f'<td class="mono right">{r["q_total_curr"]:,.1f}</td>'
            f'<td class="mono right">{r["flow_total"]:,.1f}</td>'
            f'<td class="mono right {mc}">{r["missing_flow"]:+,.1f}</td>'
            f'<td class="mono right {pc}">{r["missing_pct"]*100:.2f}%</td>'
            f'<td>{lb}</td></tr>'
        )
    agg_detail_rows = "".join(agg_rows) if agg_rows else (
        '<tr><td colspan="7" style="padding:24px;text-align:center;'
        'color:var(--success);font-family:var(--mono)">✓ No broken months at aggregate level</td></tr>'
    )

    # Per-ISIN summary
    isin_sum = (df_broken.assign(abs_missing=df_broken["missing_flow"].abs())
                .groupby("isin")
                .agg(n_months=("date", "count"),
                     avg_pct=("missing_pct", "mean"),
                     total_abs=("abs_missing", "sum"))
                .sort_values("total_abs", ascending=False))

    ISIN_COLORS = [
        "#2563eb", "#16a34a", "#dc2626", "#d97706", "#7c3aed",
        "#0891b2", "#db2777", "#65a30d", "#ea580c", "#6366f1",
    ]

    # Per-ISIN missing_pct timeseries for the top 5 ISINs
    top_isins = isin_sum.head(5).index.tolist()
    all_dates = sorted(df_all["date"].unique())
    isin_ts_datasets = []
    for idx, isin in enumerate(top_isins):
        sub = df_all[df_all["isin"] == isin].set_index("date")["missing_pct"].reindex(all_dates).fillna(0)
        color = ISIN_COLORS[idx % len(ISIN_COLORS)]
        isin_ts_datasets.append({
            "label": isin,
            "data": [round(float(v) * 100, 3) for v in sub.values],
            "borderColor": color,
            "backgroundColor": color + "22",
            "borderWidth": 2,
            "pointRadius": 0,
            "tension": 0.3,
            "fill": False,
        })
    isin_ts_json = json.dumps(isin_ts_datasets)
    all_dates_str = json.dumps([fmt_date(d) for d in all_dates])

    # Detail table rows (ISIN text is escaped: it comes straight from the CSV)
    detail_parts = []
    for _, r in df_broken.head(200).iterrows():
        lag_badge = '<span class="lag-badge">lag</span>' if r["is_lag"] else ""
        pct_class = "pct-high" if r["missing_pct"] > 0.1 else "pct-med"
        miss_class = "miss-neg" if r["missing_flow"] < 0 else "miss-pos"
        detail_parts.append(f"""
<tr>
<td>{fmt_date(r['date'])}</td>
<td class="mono">{escape(str(r['isin']))}</td>
<td class="mono right">{r['q_agg_prev']:,.1f}</td>
<td class="mono right">{r['q_agg_curr']:,.1f}</td>
<td class="mono right">{r['flow_agg']:,.1f}</td>
<td class="mono right {miss_class}">{r['missing_flow']:+,.1f}</td>
<td class="mono right {pct_class}">{r['missing_pct']*100:.2f}%</td>
<td>{lag_badge}</td>
</tr>""")
    detail_rows = "".join(detail_parts)

    # ISIN summary table
    isin_parts = []
    for isin, row in isin_sum.iterrows():
        isin_parts.append(f"""
<tr>
<td class="mono">{escape(str(isin))}</td>
<td class="mono right">{int(row['n_months'])}</td>
<td class="mono right">{row['avg_pct']*100:.2f}%</td>
<td class="mono right">{row['total_abs']:,.1f}</td>
</tr>""")
    isin_rows = "".join(isin_parts)

    # KPIs
    total = len(df_all)
    n_broken_kpi = len(df_broken)
    n_lag_kpi = int(df_broken["is_lag"].sum())
    n_genuine = n_broken_kpi - n_lag_kpi
    max_pct = df_broken["missing_pct"].max() * 100 if len(df_broken) else 0
    n_isins = df_broken["isin"].nunique()
    # Nothing examined -> show 0% rather than dividing by zero.
    broken_share = n_broken_kpi / total * 100 if total else 0.0

    # The two tables are built before the page template: nesting a
    # triple-quoted f-string inside another one only parses on Python 3.12+.
    if n_broken_kpi == 0:
        isin_section = '<div class="no-broken">No broken months detected.</div>'
        detail_section = '<div class="no-broken">✓ No broken months detected at this threshold.</div>'
    else:
        isin_section = f"""
<table>
<thead><tr>
<th>ISIN</th><th>Broken months</th>
<th>Avg missing %</th><th>Total |missing| (shares)</th>
</tr></thead>
<tbody>{isin_rows}</tbody>
</table>"""
        detail_section = f"""
<table>
<thead><tr>
<th>Date</th><th>ISIN</th>
<th class="right">Q(t-1)</th><th class="right">Q(t)</th>
<th class="right">Net flow</th><th class="right">Missing</th>
<th class="right">Missing % of movement</th><th></th>
</tr></thead>
<tbody>{detail_rows}</tbody>
</table>"""

    page = f"""<!DOCTYPE html>
<html lang="en">
<head>
<meta charset="UTF-8">
<meta name="viewport" content="width=device-width,initial-scale=1">
<title>Carmignac — Broken Months Diagnostics</title>
<script src="https://cdn.jsdelivr.net/npm/chart.js@4.4.0/dist/chart.umd.min.js"></script>
<style>
@import url('https://fonts.googleapis.com/css2?family=IBM+Plex+Mono:wght@400;600&family=IBM+Plex+Sans:wght@300;400;600;700&display=swap');
:root {{
--bg: #0d0f12; --surface: #151820; --border: #252a35;
--accent: #3b82f6; --warn: #f59e0b; --danger: #ef4444;
--success: #10b981; --text: #e2e8f0; --muted: #64748b;
--mono: 'IBM Plex Mono', monospace;
--sans: 'IBM Plex Sans', sans-serif;
}}
*, *::before, *::after {{ box-sizing: border-box; margin: 0; padding: 0; }}
body {{ font-family: var(--sans); background: var(--bg); color: var(--text);
padding: 0 0 60px; }}
.header {{ background: linear-gradient(135deg,#0d1117,#111827,#1a0a0a);
border-bottom: 1px solid var(--border); padding: 40px 48px 36px; }}
.header-eyebrow {{ font-family: var(--mono); font-size: 11px; letter-spacing:.15em;
color: var(--danger); text-transform: uppercase; margin-bottom:10px; }}
.header h1 {{ font-size: 2rem; font-weight: 700; letter-spacing:-.02em; margin-bottom:8px; }}
.header-sub {{ font-size:.85rem; color: var(--muted); font-family: var(--mono); }}
.kpi-strip {{ display: grid; grid-template-columns: repeat(auto-fit,minmax(160px,1fr));
gap: 1px; background: var(--border); border-bottom: 1px solid var(--border); }}
.kpi {{ background: var(--surface); padding: 22px 28px;
display: flex; flex-direction: column; gap: 4px; }}
.kpi-label {{ font-size:.7rem; letter-spacing:.1em; text-transform:uppercase;
color: var(--muted); font-family: var(--mono); }}
.kpi-value {{ font-size:1.6rem; font-weight:700; font-family: var(--mono); line-height:1; }}
.kpi-value.danger {{ color: var(--danger); }}
.kpi-value.warn {{ color: var(--warn); }}
.kpi-value.success {{ color: var(--success); }}
.kpi-sub {{ font-size:.7rem; color: var(--muted); font-family: var(--mono); }}
.main {{ max-width:1400px; margin:0 auto; padding:36px 48px;
display:flex; flex-direction:column; gap:32px; }}
.card {{ background: var(--surface); border: 1px solid var(--border);
border-radius:8px; overflow:hidden; }}
.card-header {{ padding:18px 24px 14px; border-bottom:1px solid var(--border);
display:flex; align-items:baseline; gap:12px; }}
.card-title {{ font-size:.8rem; font-weight:600; letter-spacing:.1em;
text-transform:uppercase; color: var(--muted); font-family: var(--mono); }}
.card-desc {{ font-size:.78rem; color: #475569; }}
.card-body {{ padding:24px; }}
.chart-wrap {{ position:relative; height:260px; }}
.chart-wrap-tall {{ position:relative; height:320px; }}
.grid-2 {{ display:grid; grid-template-columns:1fr 1fr; gap:24px; }}
@media(max-width:900px) {{ .grid-2 {{ grid-template-columns:1fr; }}
.main {{ padding:24px 20px; }} }}
.section-label {{ font-family: var(--mono); font-size:.68rem; letter-spacing:.15em;
text-transform:uppercase; color: var(--muted);
padding-left:10px; border-left:3px solid var(--danger);
margin-bottom:-8px; }}
table {{ width:100%; border-collapse:collapse; font-size:.82rem; }}
th {{ font-family: var(--mono); font-size:.68rem; letter-spacing:.08em;
text-transform:uppercase; color: var(--muted); padding:10px 14px;
text-align:left; border-bottom:1px solid var(--border); background:#0f1218; }}
td {{ padding:10px 14px; border-bottom:1px solid #1a1f2a; vertical-align:middle; }}
tr:last-child td {{ border-bottom:none; }}
tr:hover td {{ background:#181e2b; }}
.mono {{ font-family: var(--mono); font-size:.78rem; }}
.right {{ text-align:right; }}
.miss-pos {{ color: var(--warn); }}
.miss-neg {{ color: var(--danger); }}
.pct-high {{ color: var(--danger); font-weight:600; }}
.pct-med {{ color: var(--warn); }}
.lag-badge {{ font-family: var(--mono); font-size:.65rem; padding:2px 6px;
background:#f59e0b22; border:1px solid #f59e0b66; border-radius:3px;
color: var(--warn); }}
.no-broken {{ padding:40px; text-align:center; color: var(--success);
font-family: var(--mono); font-size:.9rem; }}
.footer {{ text-align:center; font-family: var(--mono); font-size:.68rem;
color:#334155; margin-top:16px; letter-spacing:.05em; }}
.alpha-note {{ font-family: var(--mono); font-size:.75rem; color: var(--muted);
padding:10px 24px 0; }}
</style>
</head>
<body>
<div class="header">
<div class="header-eyebrow">Carmignac × ENSAE · Data Challenge 2025</div>
<h1>Broken Months Diagnostics</h1>
<div class="header-sub">
Aggregate stock-flow equation check · ISIN level · threshold α = {alpha:.1%}<br>
<span style='font-size:.78rem'>Missing % = |missing flow| / max(|ΔAUM|, |recorded flow|, 1 share) — capped at movement size, not stock level</span>
</div>
</div>
<div class="kpi-strip">
<div class="kpi">
<span class="kpi-label">(ISIN, month) pairs</span>
<span class="kpi-value">{total:,}</span>
<span class="kpi-sub">examined</span>
</div>
<div class="kpi">
<span class="kpi-label">Broken months</span>
<span class="kpi-value {'danger' if n_broken_kpi > 0 else 'success'}">{n_broken_kpi:,}</span>
<span class="kpi-sub">{broken_share:.1f}% of pairs</span>
</div>
<div class="kpi">
<span class="kpi-label">Likely lags</span>
<span class="kpi-value warn">{n_lag_kpi}</span>
<span class="kpi-sub">resolved by ±{lag_days}d window</span>
</div>
<div class="kpi">
<span class="kpi-label">Genuine gaps</span>
<span class="kpi-value {'danger' if n_genuine > 0 else 'success'}">{n_genuine}</span>
<span class="kpi-sub">unresolved by lag fix</span>
</div>
<div class="kpi">
<span class="kpi-label">ISINs affected</span>
<span class="kpi-value">{n_isins}</span>
<span class="kpi-sub">distinct ISINs</span>
</div>
<div class="kpi">
<span class="kpi-label">Max missing %</span>
<span class="kpi-value {'danger' if max_pct > 10 else 'warn'}">{max_pct:.1f}%</span>
<span class="kpi-sub">worst single (isin, month)</span>
</div>
</div>
<div class="main">
<div class="section-label">00 · Aggregate view — all ISINs combined</div>
<div class="card">
<div class="card-header">
<span class="card-title">Stock-flow equation — total portfolio</span>
<span class="card-desc">
Σ Q(t) − Σ Q(t−1) vs Σ F(t) across all ISINs and accounts.
Detects months where the global portfolio is incoherent, independent of ISIN-level breakdown.
</span>
</div>
<div class="card-body">
<div class="chart-wrap-tall"><canvas id="chartAggOverlay"></canvas></div>
</div>
</div>
<div class="grid-2">
<div class="card">
<div class="card-header">
<span class="card-title">Aggregate missing flow over time</span>
<span class="card-desc">Σ Q(t) − Σ Q(t−1) − Σ F(t) — should be near zero every month</span>
</div>
<div class="card-body">
<div class="chart-wrap"><canvas id="chartAggMissing"></canvas></div>
</div>
</div>
<div class="card">
<div class="card-header">
<span class="card-title">Aggregate missing % of movement</span>
<span class="card-desc">|missing| / max(|ΔAUM|, |flow|) — months above α flagged in red</span>
</div>
<div class="card-body">
<div class="chart-wrap"><canvas id="chartAggPct"></canvas></div>
</div>
</div>
</div>
<div class="card">
<div class="card-header">
<span class="card-title">Aggregate broken months — detail</span>
</div>
<div class="card-body" style="padding:0">
<table>
<thead><tr>
<th>Date</th>
<th class="right">Σ Q(t−1)</th><th class="right">Σ Q(t)</th>
<th class="right">Σ Flow</th><th class="right">Missing</th>
<th class="right">Missing %</th><th></th>
</tr></thead>
<tbody>{agg_detail_rows}</tbody>
</table>
</div>
</div>
<div class="section-label">01 · Timeline — per ISIN</div>
<div class="card">
<div class="card-header">
<span class="card-title">Broken (isin, month) pairs per month</span>
<span class="card-desc">Stacked: genuine gaps (red) vs likely accounting lags (amber)</span>
</div>
<div class="card-body">
<div class="chart-wrap-tall"><canvas id="chartTimeline"></canvas></div>
</div>
</div>
<div class="grid-2">
<div class="card">
<div class="card-header">
<span class="card-title">Total absolute missing flow per month</span>
<span class="card-desc">Sum of |missing flow| across all broken ISINs</span>
</div>
<div class="card-body">
<div class="chart-wrap"><canvas id="chartMissing"></canvas></div>
</div>
</div>
<div class="card">
<div class="card-header">
<span class="card-title">Missing % — top 5 ISINs over time</span>
<span class="card-desc">|missing flow| / max(|ΔAUM|, |recorded flow|) per ISIN — capped at movement size</span>
</div>
<div class="card-body">
<div class="chart-wrap"><canvas id="chartIsinTs"></canvas></div>
</div>
</div>
</div>
<div class="section-label">02 · By ISIN</div>
<div class="card">
<div class="card-header">
<span class="card-title">ISIN summary — most affected</span>
</div>
<div class="card-body" style="padding:0">
{isin_section}
</div>
</div>
<div class="section-label">03 · Detail log</div>
<div class="card">
<div class="card-header">
<span class="card-title">All broken (isin, month) pairs</span>
<span class="card-desc">
<span class="lag-badge">lag</span> = likely resolved by extending flow window ±{lag_days} days
</span>
</div>
<div class="alpha-note">Threshold α = {alpha:.1%} · showing up to 200 rows</div>
<div class="card-body" style="padding:0">
{detail_section}
</div>
</div>
</div>
<div class="footer">Generated by carmignac_diagnostics.py · Carmignac × ENSAE Data Challenge 2025</div>
<script>
Chart.defaults.color = '#64748b';
Chart.defaults.borderColor = '#1e2535';
Chart.defaults.font.family = "'IBM Plex Mono', monospace";
Chart.defaults.font.size = 11;
const DATES = {dates_str};
const N_BROKEN = {n_broken_js};
const N_LAG = {n_lag_js};
const TOT_MISS = {total_miss_js};
const ISIN_TS = {isin_ts_json};
const ALL_DATES = {all_dates_str};
function tip() {{
return {{
backgroundColor:'#0d1117', borderColor:'#252a35', borderWidth:1,
titleFont:{{family:"'IBM Plex Mono'"}}, bodyFont:{{family:"'IBM Plex Mono'"}}, padding:10
}};
}}
function xAxis() {{
return {{ type:'category', ticks:{{maxTicksLimit:10,maxRotation:0}},
grid:{{color:'#1a2030'}} }};
}}
function yAxis(label) {{
return {{ grid:{{color:'#1a2030'}},
title:{{display:!!label,text:label,color:'#475569'}} }};
}}
// n_genuine per month = N_BROKEN - N_LAG
const N_GENUINE = N_BROKEN.map((b,i) => b - (N_LAG[i]||0));
new Chart(document.getElementById('chartTimeline'), {{
type:'bar',
data:{{
labels: DATES,
datasets:[
{{ label:'Genuine gaps', data:N_GENUINE,
backgroundColor:'#ef444488', borderColor:'#ef4444', borderWidth:1, borderRadius:2 }},
{{ label:'Likely lags', data:N_LAG,
backgroundColor:'#f59e0b88', borderColor:'#f59e0b', borderWidth:1, borderRadius:2 }},
]
}},
options:{{
responsive:true, maintainAspectRatio:false,
interaction:{{mode:'index',intersect:false}},
plugins:{{
legend:{{position:'top',labels:{{boxWidth:12,padding:16}}}},
tooltip:tip()
}},
scales:{{ x:xAxis(), y:{{...yAxis('# (isin, month) pairs'), stacked:true}} }},
}}
}});
new Chart(document.getElementById('chartMissing'), {{
type:'bar',
data:{{
labels: DATES,
datasets:[{{ label:'|Missing flow| (shares)', data:TOT_MISS,
backgroundColor:'#dc262688', borderColor:'#dc2626',
borderWidth:1, borderRadius:2 }}]
}},
options:{{
responsive:true, maintainAspectRatio:false,
plugins:{{legend:{{display:false}}, tooltip:tip()}},
scales:{{ x:xAxis(), y:yAxis('Shares') }}
}}
}});
new Chart(document.getElementById('chartIsinTs'), {{
type:'line',
data:{{ labels:ALL_DATES, datasets:ISIN_TS }},
options:{{
responsive:true, maintainAspectRatio:false,
interaction:{{mode:'index',intersect:false}},
plugins:{{
legend:{{position:'right',labels:{{boxWidth:10,padding:8,font:{{size:10}}}}}},
tooltip:tip()
}},
scales:{{ x:xAxis(), y:yAxis('Missing (%)') }}
}}
}});
// ── Aggregate charts ─────────────────────────────────────────
const AGG_DATES = {agg_dates_str};
const AGG_DELTA = {agg_delta_js};
const AGG_FLOW = {agg_flow_js};
const AGG_MISSING = {agg_missing_js};
const AGG_PCT = {agg_pct_js};
const ALPHA = {alpha};
// Color each bar: red above the threshold, else subtle blue
const aggPctColors = AGG_PCT.map(v =>
Math.abs(v) > ALPHA * 100 ? '#ef444488' : '#3b82f622'
);
const aggPctBorders = AGG_PCT.map(v =>
Math.abs(v) > ALPHA * 100 ? '#ef4444' : '#3b82f655'
);
// Overlay: ΔAUM vs total flow
new Chart(document.getElementById('chartAggOverlay'), {{
type: 'line',
data: {{
labels: AGG_DATES,
datasets: [
{{ label: 'ΔAUM (Σ Q(t) − Σ Q(t−1))',
data: AGG_DELTA, borderColor: '#3b82f6', backgroundColor: '#3b82f622',
borderWidth: 2, pointRadius: 0, tension: 0.3, fill: false }},
{{ label: 'Σ Net flows recorded',
data: AGG_FLOW, borderColor: '#10b981', backgroundColor: '#10b98122',
borderWidth: 2, pointRadius: 0, tension: 0.3, fill: false }},
]
}},
options: {{
responsive: true, maintainAspectRatio: false,
interaction: {{mode:'index', intersect:false}},
plugins: {{
legend: {{position:'top', labels:{{boxWidth:12, padding:16}}}},
tooltip: tip()
}},
scales: {{ x: xAxis(), y: yAxis('Shares') }}
}}
}});
// Missing flow bar
new Chart(document.getElementById('chartAggMissing'), {{
type: 'bar',
data: {{
labels: AGG_DATES,
datasets: [{{ label: 'Missing flow (shares)', data: AGG_MISSING,
backgroundColor: AGG_MISSING.map(v => v < 0 ? '#ef444488' : '#f59e0b88'),
borderColor: AGG_MISSING.map(v => v < 0 ? '#ef4444' : '#f59e0b'),
borderWidth: 1, borderRadius: 2 }}]
}},
options: {{
responsive: true, maintainAspectRatio: false,
plugins: {{legend:{{display:false}}, tooltip: tip()}},
scales: {{ x: xAxis(), y: yAxis('Shares') }}
}}
}});
// Missing % bar, coloured by threshold
new Chart(document.getElementById('chartAggPct'), {{
type: 'bar',
data: {{
labels: AGG_DATES,
datasets: [{{ label: 'Missing % of movement', data: AGG_PCT,
backgroundColor: aggPctColors, borderColor: aggPctBorders,
borderWidth: 1, borderRadius: 2 }}]
}},
options: {{
responsive: true, maintainAspectRatio: false,
plugins: {{
legend: {{display:false}},
tooltip: tip(),
annotation: {{}} // threshold line handled via color
}},
scales: {{ x: xAxis(), y: {{...yAxis('Missing (%)'), min: 0}} }}
}}
}});
</script>
</body>
</html>"""
    return page
# ─────────────────────────────────────────────────────────────
# 5. MAIN
# ─────────────────────────────────────────────────────────────
def main():
    """
    Command-line entry point.

    Parses the arguments, loads the two CSV files, runs the per-ISIN and
    the cross-ISIN detections, prints the console summary, then writes the
    broken-months CSV (the file carmignac_repair.py loads) and the HTML
    report. Exits with an error message when an input file is not found.
    """
    parser = argparse.ArgumentParser(
        description="Detect broken months in Carmignac AUM/Flows data"
    )
    parser.add_argument("--aum", default="AUM_head.csv")
    parser.add_argument("--flows", default="flows_head.csv")
    parser.add_argument("--out", default="carmignac_broken_months.csv",
                        help="Machine-readable output (loaded by carmignac_repair.py)")
    parser.add_argument("--html", default="carmignac_diagnostics.html")
    parser.add_argument("--alpha", type=float, default=0.02,
                        help="Tolerance threshold (default 0.02 = 2%%)")
    parser.add_argument("--lag", type=int, default=3,
                        help="Boundary days to test for accounting lag (default 3)")
    args = parser.parse_args()

    def resolve(p):
        """Return p if it exists, else the same name next to this script; exit if neither."""
        if os.path.exists(p):
            return p
        alt = os.path.join(os.path.dirname(os.path.abspath(__file__)), p)
        if os.path.exists(alt):
            return alt
        sys.exit(f"[ERROR] File not found: {p}")

    print(f"[Load] AUM : {args.aum}")
    print(f"[Load] Flows : {args.flows}")
    aum, flows = load_data(resolve(args.aum), resolve(args.flows))

    print(f"\n[Detect] Running broken-month detection (α={args.alpha:.1%}, lag=±{args.lag}d)...")
    df_broken, df_all = detect_broken_months(aum, flows, alpha=args.alpha, lag_days=args.lag)
    df_agg = detect_aggregate_broken_months(aum, flows, alpha=args.alpha, lag_days=args.lag)
    print_summary(df_broken, df_all, args.alpha)

    # An empty aggregate result may come without columns: report zeros then.
    n_agg_broken = int(df_agg["broken"].sum()) if len(df_agg) else 0
    n_agg_lag = int(df_agg["is_lag"].sum()) if len(df_agg) else 0
    print(f" Aggregate broken months : {n_agg_broken} "
          f"(of which lags: {n_agg_lag})")

    # CSV output — this is what carmignac_repair.py will load
    if len(df_broken):
        df_broken.to_csv(args.out, index=False)
        print(f"[Export] Broken months CSV → {args.out}")
    else:
        pd.DataFrame(columns=["date", "isin", "missing_pct", "is_lag"]).to_csv(args.out, index=False)
        print(f"[Export] No broken months — empty CSV → {args.out}")

    # Pass the lag actually used so the report labels match the detection.
    html = build_html(df_broken, df_all, df_agg, args.alpha, lag_days=args.lag)
    with open(args.html, "w", encoding="utf-8") as f:
        f.write(html)
    print(f"[Export] HTML report → {args.html}")
# Run the command-line entry point only when this file is executed as a
# script, not when it is imported as a module.
if __name__ == "__main__":
    main()