paco-dev #2

Merged
pgoze-ensae merged 39 commits from paco-dev into main 2026-04-13 21:51:04 +02:00
7 changed files with 836 additions and 12387 deletions
Showing only changes of commit a04985d4d4 - Show all commits

View File

@ -1,962 +0,0 @@
"""
Carmignac Data Challenge Broken Months Diagnostics
=====================================================
Detects months where the aggregate stock-flow equation is violated
at the ISIN level (across all accounts):
Σ_r Q_{r,s}(t) - Σ_r Q_{r,s}(t-1) ≠ Σ_r F_{r,s}(t-1 → t)
The residual is the "missing flow":
missing_{s}(t) = [Q_agg(t) - Q_agg(t-1)] - F_agg(t)
This is a market-level check, independent of individual account identity.
It captures:
- Genuinely missing flow records
- End-of-month accounting lags (transactions dated at boundary)
- Corporate actions (dividends, splits) not reflected in flows
Outputs
-------
carmignac_broken_months.csv machine-readable, loaded by carmignac_repair.py
carmignac_diagnostics.html interactive HTML report
Usage
-----
python carmignac_diagnostics.py
python carmignac_diagnostics.py \\
--aum raw_AUM.csv \\
--flows raw_flows.csv \\
--out carmignac_broken_months.csv \\
--html carmignac_diagnostics.html \\
--alpha 0.02
"""
import argparse
import json
import os
import sys
import numpy as np
import pandas as pd
# ─────────────────────────────────────────────────────────────
# 1. LOAD
# ─────────────────────────────────────────────────────────────
def load_data(aum_path, flows_path):
    """
    Read the AUM and flows CSV files into DataFrames.

    Both files have their "Centralisation Date" column parsed as dates
    and their "Product - Isin" column cast to ``str`` so that ISIN keys
    compare consistently between the two tables.

    Parameters
    ----------
    aum_path : path of the AUM CSV file
    flows_path : path of the flows CSV file

    Returns
    -------
    (aum, flows) : the two loaded DataFrames, in that order
    """
    date_col = "Centralisation Date"
    isin_col = "Product - Isin"

    # Read both files first, then normalise the ISIN column of each.
    tables = [pd.read_csv(csv_path, parse_dates=[date_col])
              for csv_path in (aum_path, flows_path)]
    for table in tables:
        table[isin_col] = table[isin_col].astype(str)

    aum, flows = tables
    return aum, flows
# ─────────────────────────────────────────────────────────────
# 2. AGGREGATE AND DETECT BROKEN MONTHS
# ─────────────────────────────────────────────────────────────
def detect_broken_months(aum, flows, alpha=0.02, lag_days=3):
    """
    Flag (isin, month-end) pairs that violate the aggregate stock-flow equation.

    For each ISIN and each pair of consecutive month-ends (t-1, t), compute:
    - Q_agg(t)    : total shares held across all accounts
    - Q_agg(t-1)  : idem previous month (forward-filled)
    - F_agg(t)    : total net flows recorded in ]EOM(t-1), EOM(t)]
    - missing(t)  : [Q_agg(t) - Q_agg(t-1)] - F_agg(t)
    - missing_pct : |missing| / max(|Q_agg(t) - Q_agg(t-1)|, |F_agg(t)|, 1 share)

    The denominator is the size of the *movement*, not the stock level, so a
    tiny position with a normal-sized missing flow does not produce an
    astronomically large percentage; 100% means the entire movement is
    unaccounted for. The 1-share floor suppresses rounding noise.

    A month is flagged as "broken" when missing_pct > alpha.
    It is additionally flagged as a potential "lag" when it is broken with the
    standard window but NOT broken once flows are re-bucketed with windows
    extended by lag_days on both sides.

    Each flow is assigned to the FIRST month whose (extended) window contains
    it. Extended windows overlap, so in practice a flow dated up to lag_days
    after EOM(t) is attributed to month t, while the lower extension only
    matters for the first month of the calendar.

    NOTE(review): AUM rows are aligned on a calendar of month-ends via
    ``reindex``; this presumably expects "Centralisation Date" in the AUM
    table to fall exactly on month-ends — confirm against the data.

    Parameters
    ----------
    aum : DataFrame with columns "Product - Isin", "Centralisation Date",
          "Quantity - AUM"
    flows : DataFrame with columns "Product - Isin", "Centralisation Date",
            "Quantity - NetFlows"
    alpha : tolerance threshold (same as ALPHA in carmignac_repair.py)
    lag_days : number of boundary days to test for accounting lag

    Returns
    -------
    df_broken : DataFrame with all (isin, date) pairs where missing_pct > alpha,
                sorted by decreasing missing_pct
    df_all : Full DataFrame including non-broken months (for plotting)

    Both frames always carry the full set of columns, even when empty (for
    instance when the AUM table spans fewer than two month-ends).
    """
    out_columns = [
        "date", "isin", "q_agg_prev", "q_agg_curr", "delta_aum",
        "flow_agg", "missing_flow", "missing_pct", "broken", "is_lag",
    ]
    min_abs_shares = 1.0  # ignore movements smaller than 1 share

    def empty_result():
        # Keep the schema stable so callers can index columns unconditionally.
        frame = pd.DataFrame(columns=out_columns).astype(
            {"broken": bool, "is_lag": bool})
        return frame.copy(), frame

    # Monthly calendar (offset object: equivalent to the "ME" alias, but
    # accepted by every pandas version).
    t_min = aum["Centralisation Date"].min()
    t_max = aum["Centralisation Date"].max()
    all_months = pd.date_range(t_min, t_max, freq=pd.offsets.MonthEnd())
    if len(all_months) < 2:
        # No consecutive pair of month-ends to compare.
        return empty_result()

    # ── Aggregate AUM per (isin, month-end) ──────────────────────
    aum_agg = (
        aum.groupby(["Product - Isin", "Centralisation Date"])["Quantity - AUM"]
        .sum()
        .reset_index()
        .rename(columns={"Product - Isin": "isin",
                         "Centralisation Date": "date",
                         "Quantity - AUM": "qty_agg"})
    )
    # Forward-fill sparse panel
    aum_pivot = aum_agg.pivot(index="date", columns="isin", values="qty_agg")
    aum_pivot = aum_pivot.reindex(all_months).ffill()

    # ── Aggregate flows per (isin, month-end) ────────────────────
    def bucket_flows(flows_df, months, lower_offset=0, upper_offset=0):
        """Aggregate flows with optional boundary extension (in days)."""
        # Rows without a date can never fall in a window.
        fc = flows_df[flows_df["Centralisation Date"].notna()].copy()
        stamps = pd.DatetimeIndex(fc["Centralisation Date"])

        # Window of month m: ]EOM(m-1) - lower_offset, EOM(m) + upper_offset]
        hi = months + pd.Timedelta(days=upper_offset)
        lo = pd.DatetimeIndex(
            [m - pd.offsets.MonthEnd(1) for m in months]
        ) - pd.Timedelta(days=lower_offset)

        # Upper bounds are increasing, so the first window that can contain
        # a date is the first one whose upper bound is >= that date.
        pos = np.asarray(hi.searchsorted(stamps, side="left"))
        clipped = np.minimum(pos, len(months) - 1)
        hit = (pos < len(months)) & np.asarray(stamps > lo[clipped])

        fc["month_end"] = months[clipped].where(hit).to_numpy()
        fc = fc.dropna(subset=["month_end"])
        return (
            fc.groupby(["Product - Isin", "month_end"])["Quantity - NetFlows"]
            .sum()
            .reset_index()
            .rename(columns={"Product - Isin": "isin",
                             "month_end": "date",
                             "Quantity - NetFlows": "flow_agg"})
        )

    def flows_to_pivot(df, months):
        piv = df.pivot(index="date", columns="isin", values="flow_agg")
        return piv.reindex(months).fillna(0.0)

    fpiv_std = flows_to_pivot(bucket_flows(flows, all_months), all_months)
    fpiv_lag = flows_to_pivot(
        bucket_flows(flows, all_months,
                     lower_offset=lag_days, upper_offset=lag_days),
        all_months,
    )

    def flow_at(piv, isin, when):
        # An ISIN with AUM but no recorded flow at all has zero flow.
        if isin not in piv.columns:
            return 0.0
        return piv[isin].get(when, 0.0)

    # ── Compute residuals ─────────────────────────────────────────
    rows = []
    isins = aum_pivot.columns.tolist()
    for t_prev, t_curr in zip(all_months[:-1], all_months[1:]):
        for isin in isins:
            q_curr = aum_pivot[isin].get(t_curr, np.nan)
            q_prev = aum_pivot[isin].get(t_prev, np.nan)
            if pd.isna(q_curr) or pd.isna(q_prev):
                # ISIN not yet observed: nothing to compare.
                continue
            delta = q_curr - q_prev

            # Standard window
            f_std = flow_at(fpiv_std, isin, t_curr)
            missing_std = delta - f_std
            # Extended lag window
            f_lag = flow_at(fpiv_lag, isin, t_curr)
            missing_lag = delta - f_lag

            # Normalise by the size of the movement (see docstring).
            pct_std = abs(missing_std) / max(abs(delta), abs(f_std), min_abs_shares)
            pct_lag = abs(missing_lag) / max(abs(delta), abs(f_lag), min_abs_shares)
            broken_std = bool(pct_std > alpha)
            broken_lag = bool(pct_lag > alpha)

            rows.append({
                "date": t_curr,
                "isin": isin,
                "q_agg_prev": round(q_prev, 3),
                "q_agg_curr": round(q_curr, 3),
                "delta_aum": round(delta, 3),
                "flow_agg": round(f_std, 3),
                "missing_flow": round(missing_std, 3),
                "missing_pct": round(pct_std, 6),
                "broken": broken_std,
                # A "lag" month: broken with standard, NOT broken with extended window
                "is_lag": broken_std and not broken_lag,
            })

    if not rows:
        return empty_result()

    df_all = pd.DataFrame(rows, columns=out_columns)
    df_broken = df_all[df_all["broken"]].sort_values("missing_pct", ascending=False)
    return df_broken, df_all
# ─────────────────────────────────────────────────────────────
# 2b. AGGREGATE (CROSS-ISIN) BROKEN MONTHS
# ─────────────────────────────────────────────────────────────
def detect_aggregate_broken_months(aum, flows, alpha=0.02, lag_days=3):
    """
    Same stock-flow check as detect_broken_months, but aggregated
    across ALL ISINs for each month:

        Q_total(t) - Q_total(t-1) != F_total(t)

    where Q_total(t) is the sum of "Quantity - AUM" over every row dated t.
    This gives a high-level view of months where the global portfolio is
    incoherent.

    The missing percentage is |missing| / max(|ΔAUM|, |flow|, 1 share), i.e.
    it is normalised by the movement size, not by the stock level. A month is
    "broken" when that percentage exceeds alpha, and "is_lag" when it is broken
    with the standard window ]EOM(t-1), EOM(t)] but not with the window
    extended by lag_days on both sides. Each flow goes to the FIRST month
    whose window contains it, so overlapping extended windows attribute flows
    dated shortly after EOM(t) to month t.

    NOTE(review): the monthly total is forward-filled only when a month has no
    AUM row at all, whereas detect_broken_months forward-fills each ISIN
    separately; on a sparse panel the two views may therefore disagree —
    confirm this is intended.

    Parameters
    ----------
    aum : DataFrame with columns "Centralisation Date", "Quantity - AUM"
    flows : DataFrame with columns "Centralisation Date", "Quantity - NetFlows"
    alpha : tolerance threshold
    lag_days : number of boundary days to test for accounting lag

    Returns
    -------
    df_agg : DataFrame with one row per month and columns:
             date, q_total_prev, q_total_curr, delta_aum, flow_total,
             missing_flow, missing_pct, broken, is_lag
             (the columns are present even when there is no row)
    """
    out_columns = [
        "date", "q_total_prev", "q_total_curr", "delta_aum", "flow_total",
        "missing_flow", "missing_pct", "broken", "is_lag",
    ]
    min_abs_shares = 1.0

    def empty_result():
        # Keep the schema stable so callers can index columns unconditionally.
        return pd.DataFrame(columns=out_columns).astype(
            {"broken": bool, "is_lag": bool})

    # Offset object: equivalent to the "ME" alias, accepted by every pandas.
    t_min = aum["Centralisation Date"].min()
    t_max = aum["Centralisation Date"].max()
    all_months = pd.date_range(t_min, t_max, freq=pd.offsets.MonthEnd())
    if len(all_months) < 2:
        # No consecutive pair of month-ends to compare.
        return empty_result()

    # ── Total AUM per month (all ISIN, all accounts) ─────────────
    aum_monthly = (
        aum.groupby("Centralisation Date")["Quantity - AUM"]
        .sum()
        .reindex(all_months)
        .ffill()
        .rename("q_total")
    )

    # ── Bucket flows helper (same window logic as the per-ISIN check) ──
    def bucket_total_flows(flows_df, months, lower_offset=0, upper_offset=0):
        # Rows without a date can never fall in a window.
        fc = flows_df[flows_df["Centralisation Date"].notna()].copy()
        stamps = pd.DatetimeIndex(fc["Centralisation Date"])

        # Window of month m: ]EOM(m-1) - lower_offset, EOM(m) + upper_offset]
        hi = months + pd.Timedelta(days=upper_offset)
        lo = pd.DatetimeIndex(
            [m - pd.offsets.MonthEnd(1) for m in months]
        ) - pd.Timedelta(days=lower_offset)

        # Upper bounds are increasing, so the first window that can contain
        # a date is the first one whose upper bound is >= that date.
        pos = np.asarray(hi.searchsorted(stamps, side="left"))
        clipped = np.minimum(pos, len(months) - 1)
        hit = (pos < len(months)) & np.asarray(stamps > lo[clipped])

        fc["month_end"] = months[clipped].where(hit).to_numpy()
        fc = fc.dropna(subset=["month_end"])
        return (fc.groupby("month_end")["Quantity - NetFlows"]
                .sum()
                .reindex(months)
                .fillna(0.0))

    flow_std = bucket_total_flows(flows, all_months)
    flow_lag = bucket_total_flows(flows, all_months,
                                  lower_offset=lag_days, upper_offset=lag_days)

    # ── Compute residuals ─────────────────────────────────────────
    rows = []
    for t_prev, t_curr in zip(all_months[:-1], all_months[1:]):
        q_curr = aum_monthly.get(t_curr, np.nan)
        q_prev = aum_monthly.get(t_prev, np.nan)
        if pd.isna(q_curr) or pd.isna(q_prev):
            continue
        delta = q_curr - q_prev
        f_std = flow_std.get(t_curr, 0.0)
        f_lag = flow_lag.get(t_curr, 0.0)
        miss_std = delta - f_std
        miss_lag = delta - f_lag

        pct_std = abs(miss_std) / max(abs(delta), abs(f_std), min_abs_shares)
        pct_lag = abs(miss_lag) / max(abs(delta), abs(f_lag), min_abs_shares)
        broken_std = bool(pct_std > alpha)
        broken_lag = bool(pct_lag > alpha)

        rows.append({
            "date": t_curr,
            "q_total_prev": round(q_prev, 3),
            "q_total_curr": round(q_curr, 3),
            "delta_aum": round(delta, 3),
            "flow_total": round(f_std, 3),
            "missing_flow": round(miss_std, 3),
            "missing_pct": round(pct_std, 6),
            "broken": broken_std,
            # Broken with the standard window, fine with the extended one.
            "is_lag": broken_std and not broken_lag,
        })

    if not rows:
        return empty_result()
    return pd.DataFrame(rows, columns=out_columns)
# ─────────────────────────────────────────────────────────────
# 3. PRINT SUMMARY
# ─────────────────────────────────────────────────────────────
def print_summary(df_broken, df_all, alpha):
    """
    Print a console summary of the per-ISIN broken-month detection.

    Shows the number of (isin, month) pairs examined, how many are broken,
    how many of those are likely lags versus genuine gaps, the ten worst
    pairs by missing_pct and the five months with the most broken pairs.

    Parameters
    ----------
    df_broken : broken (isin, month) pairs; reads the columns "date", "isin",
                "missing_flow", "missing_pct" and "is_lag"
    df_all : every examined (isin, month) pair (only its length is used)
    alpha : tolerance threshold, printed as a percentage
    """
    total = len(df_all)
    n_broken = len(df_broken)
    # Only touch the column when there are rows: an empty frame may have none.
    n_lag = int(df_broken["is_lag"].sum()) if n_broken else 0
    # Nothing examined -> report 0% instead of dividing by zero.
    broken_share = n_broken / total * 100 if total else 0.0

    print("\n" + "=" * 60)
    print(" CARMIGNAC — Broken Months Diagnostics")
    print("=" * 60)
    print(f" (isin, month) pairs examined : {total}")
    print(f" Broken (missing_pct > {alpha:.0%}) : {n_broken} "
          f"({broken_share:.1f}%)")
    print(f" Of which likely lag : {n_lag}")
    print(f" Of which genuine gap : {n_broken - n_lag}")

    if n_broken:
        print("\n Top 10 by missing_pct:")
        cols = ["date", "isin", "missing_flow", "missing_pct", "is_lag"]
        print(df_broken[cols].head(10).to_string(index=False))

        # Monthly breakdown
        by_month = (df_broken.groupby("date")
                    .agg(n_broken=("isin", "count"),
                         total_missing=("missing_flow", lambda x: x.abs().sum()))
                    .sort_values("n_broken", ascending=False)
                    .head(5))
        if len(by_month):
            print("\n Most affected months:")
            print(by_month.to_string())
    print()
# ─────────────────────────────────────────────────────────────
# 4. BUILD HTML REPORT
# ─────────────────────────────────────────────────────────────
def build_html(df_broken, df_all, df_agg, alpha, lag_days=3):
    """
    Render the diagnostics as a single self-contained HTML page.

    The page embeds the data as JavaScript constants and draws the charts
    with Chart.js loaded from a CDN.

    Parameters
    ----------
    df_broken : broken (isin, month) pairs; reads the columns "date", "isin",
                "q_agg_prev", "q_agg_curr", "flow_agg", "missing_flow",
                "missing_pct" and "is_lag". At most 200 rows are listed in
                the detail table.
    df_all : every examined (isin, month) pair; reads "date", "isin",
             "missing_flow", "missing_pct", "broken" and "is_lag"
    df_agg : cross-ISIN monthly check; reads "date", "q_total_prev",
             "q_total_curr", "delta_aum", "flow_total", "missing_flow",
             "missing_pct", "broken" and "is_lag"
    alpha : tolerance threshold shown in the report and used to colour bars
    lag_days : width (in days) of the lag window shown in the report; pass
               the value given to the detection functions

    Returns
    -------
    The HTML document as a string.
    """
    # Local import: the stdlib module name `html` is easy to shadow.
    from html import escape

    def fmt_date(d):
        return d.strftime("%Y-%m-%d") if hasattr(d, "strftime") else str(d)[:10]

    def jf(arr, dec=4):
        return json.dumps([round(float(v), dec) if not np.isnan(v) else None
                           for v in arr])

    # ── JS-ready data ────────────────────────────────────────────
    # Timeline: n_broken and total_missing per month
    tl = (df_all[df_all["broken"]]
          .groupby("date")
          .agg(n_broken=("isin", "count"),
               total_missing=("missing_flow", lambda x: x.abs().sum()),
               n_lag=("is_lag", "sum"))
          .reindex(df_all["date"].sort_values().unique())
          .fillna(0))
    tl.index = pd.to_datetime(tl.index)
    dates_str = json.dumps([d.strftime("%Y-%m-%d") for d in tl.index])
    n_broken_js = jf(tl["n_broken"].values, 0)
    total_miss_js = jf(tl["total_missing"].values)
    n_lag_js = jf(tl["n_lag"].values, 0)

    # Aggregate (cross-ISIN) JS data
    agg_dates_str = json.dumps(
        [d.strftime("%Y-%m-%d") for d in pd.to_datetime(df_agg["date"])])
    agg_delta_js = jf(df_agg["delta_aum"].values)
    agg_flow_js = jf(df_agg["flow_total"].values)
    agg_missing_js = jf(df_agg["missing_flow"].values)
    agg_pct_js = jf((df_agg["missing_pct"] * 100).values)

    # Aggregate detail table rows
    agg_rows = []
    for _, r in df_agg[df_agg["broken"]].iterrows():
        lb = '<span class="lag-badge">lag</span>' if r["is_lag"] else ""
        pc = "pct-high" if r["missing_pct"] > 0.1 else "pct-med"
        mc = "miss-neg" if r["missing_flow"] < 0 else "miss-pos"
        agg_rows.append(
            f'<tr><td>{fmt_date(r["date"])}</td>'
            f'<td class="mono right">{r["q_total_prev"]:,.1f}</td>'
            f'<td class="mono right">{r["q_total_curr"]:,.1f}</td>'
            f'<td class="mono right">{r["flow_total"]:,.1f}</td>'
            f'<td class="mono right {mc}">{r["missing_flow"]:+,.1f}</td>'
            f'<td class="mono right {pc}">{r["missing_pct"]*100:.2f}%</td>'
            f'<td>{lb}</td></tr>'
        )
    agg_detail_rows = "".join(agg_rows) if agg_rows else (
        '<tr><td colspan="7" style="padding:24px;text-align:center;'
        'color:var(--success);font-family:var(--mono)">✓ No broken months at aggregate level</td></tr>'
    )

    # Per-ISIN summary
    isin_sum = (df_broken.groupby("isin")
                .agg(n_months=("date", "count"),
                     avg_pct=("missing_pct", "mean"),
                     total_abs=("missing_flow", lambda x: x.abs().sum()))
                .sort_values("total_abs", ascending=False))
    ISIN_COLORS = [
        "#2563eb", "#16a34a", "#dc2626", "#d97706", "#7c3aed",
        "#0891b2", "#db2777", "#65a30d", "#ea580c", "#6366f1",
    ]

    # Per-ISIN missing_pct timeseries for the top 5 ISINs
    top_isins = isin_sum.head(5).index.tolist()
    all_dates = sorted(df_all["date"].unique())
    isin_ts_datasets = []
    for idx, isin in enumerate(top_isins):
        sub = (df_all[df_all["isin"] == isin]
               .set_index("date")["missing_pct"]
               .reindex(all_dates)
               .fillna(0))
        colour = ISIN_COLORS[idx % len(ISIN_COLORS)]
        isin_ts_datasets.append({
            "label": isin,
            "data": [round(float(v) * 100, 3) for v in sub.values],
            "borderColor": colour,
            "backgroundColor": colour + "22",
            "borderWidth": 2,
            "pointRadius": 0,
            "tension": 0.3,
            "fill": False,
        })
    # ISIN labels come from the input file: keep a stray "</script>" in a
    # label from closing the inline <script> element.
    isin_ts_json = json.dumps(isin_ts_datasets).replace("</", "<\\/")
    all_dates_str = json.dumps([fmt_date(d) for d in all_dates])

    # Detail table rows (ISIN text is escaped: it comes from the input file)
    detail_parts = []
    for _, r in df_broken.head(200).iterrows():
        lag_badge = '<span class="lag-badge">lag</span>' if r["is_lag"] else ""
        pct_class = "pct-high" if r["missing_pct"] > 0.1 else "pct-med"
        miss_class = "miss-neg" if r["missing_flow"] < 0 else "miss-pos"
        detail_parts.append(f"""
<tr>
<td>{fmt_date(r['date'])}</td>
<td class="mono">{escape(str(r['isin']))}</td>
<td class="mono right">{r['q_agg_prev']:,.1f}</td>
<td class="mono right">{r['q_agg_curr']:,.1f}</td>
<td class="mono right">{r['flow_agg']:,.1f}</td>
<td class="mono right {miss_class}">{r['missing_flow']:+,.1f}</td>
<td class="mono right {pct_class}">{r['missing_pct']*100:.2f}%</td>
<td>{lag_badge}</td>
</tr>""")
    detail_rows = "".join(detail_parts)

    # ISIN summary table
    isin_parts = []
    for isin, row in isin_sum.iterrows():
        isin_parts.append(f"""
<tr>
<td class="mono">{escape(str(isin))}</td>
<td class="mono right">{int(row['n_months'])}</td>
<td class="mono right">{row['avg_pct']*100:.2f}%</td>
<td class="mono right">{row['total_abs']:,.1f}</td>
</tr>""")
    isin_rows = "".join(isin_parts)

    # KPIs
    total = len(df_all)
    n_broken_kpi = len(df_broken)
    n_lag_kpi = int(df_broken["is_lag"].sum()) if n_broken_kpi else 0
    n_genuine = n_broken_kpi - n_lag_kpi
    max_pct = df_broken["missing_pct"].max() * 100 if n_broken_kpi else 0
    n_isins = df_broken["isin"].nunique() if n_broken_kpi else 0
    # Nothing examined -> show 0% instead of dividing by zero.
    broken_share = n_broken_kpi / total * 100 if total else 0.0
    broken_cls = "danger" if n_broken_kpi > 0 else "success"
    genuine_cls = "danger" if n_genuine > 0 else "success"
    max_cls = "danger" if max_pct > 10 else "warn"

    # Tables are rendered up front: nesting triple-quoted f-strings inside
    # the page template only parses on Python 3.12+.
    if n_broken_kpi == 0:
        isin_table = '<div class="no-broken">No broken months detected.</div>'
        detail_table = ('<div class="no-broken">✓ No broken months detected '
                        'at this threshold.</div>')
    else:
        isin_table = f"""
<table>
<thead><tr>
<th>ISIN</th><th>Broken months</th>
<th>Avg missing %</th><th>Total |missing| (shares)</th>
</tr></thead>
<tbody>{isin_rows}</tbody>
</table>"""
        detail_table = f"""
<table>
<thead><tr>
<th>Date</th><th>ISIN</th>
<th class="right">Q(t-1)</th><th class="right">Q(t)</th>
<th class="right">Net flow</th><th class="right">Missing</th>
<th class="right">Missing % of movement</th><th></th>
</tr></thead>
<tbody>{detail_rows}</tbody>
</table>"""

    page = f"""<!DOCTYPE html>
<html lang="en">
<head>
<meta charset="UTF-8">
<meta name="viewport" content="width=device-width,initial-scale=1">
<title>Carmignac Broken Months Diagnostics</title>
<script src="https://cdn.jsdelivr.net/npm/chart.js@4.4.0/dist/chart.umd.min.js"></script>
<style>
@import url('https://fonts.googleapis.com/css2?family=IBM+Plex+Mono:wght@400;600&family=IBM+Plex+Sans:wght@300;400;600;700&display=swap');
:root {{
--bg: #0d0f12; --surface: #151820; --border: #252a35;
--accent: #3b82f6; --warn: #f59e0b; --danger: #ef4444;
--success: #10b981; --text: #e2e8f0; --muted: #64748b;
--mono: 'IBM Plex Mono', monospace;
--sans: 'IBM Plex Sans', sans-serif;
}}
*, *::before, *::after {{ box-sizing: border-box; margin: 0; padding: 0; }}
body {{ font-family: var(--sans); background: var(--bg); color: var(--text);
padding: 0 0 60px; }}
.header {{ background: linear-gradient(135deg,#0d1117,#111827,#1a0a0a);
border-bottom: 1px solid var(--border); padding: 40px 48px 36px; }}
.header-eyebrow {{ font-family: var(--mono); font-size: 11px; letter-spacing:.15em;
color: var(--danger); text-transform: uppercase; margin-bottom:10px; }}
.header h1 {{ font-size: 2rem; font-weight: 700; letter-spacing:-.02em; margin-bottom:8px; }}
.header-sub {{ font-size:.85rem; color: var(--muted); font-family: var(--mono); }}
.kpi-strip {{ display: grid; grid-template-columns: repeat(auto-fit,minmax(160px,1fr));
gap: 1px; background: var(--border); border-bottom: 1px solid var(--border); }}
.kpi {{ background: var(--surface); padding: 22px 28px;
display: flex; flex-direction: column; gap: 4px; }}
.kpi-label {{ font-size:.7rem; letter-spacing:.1em; text-transform:uppercase;
color: var(--muted); font-family: var(--mono); }}
.kpi-value {{ font-size:1.6rem; font-weight:700; font-family: var(--mono); line-height:1; }}
.kpi-value.danger {{ color: var(--danger); }}
.kpi-value.warn {{ color: var(--warn); }}
.kpi-value.success {{ color: var(--success); }}
.kpi-sub {{ font-size:.7rem; color: var(--muted); font-family: var(--mono); }}
.main {{ max-width:1400px; margin:0 auto; padding:36px 48px;
display:flex; flex-direction:column; gap:32px; }}
.card {{ background: var(--surface); border: 1px solid var(--border);
border-radius:8px; overflow:hidden; }}
.card-header {{ padding:18px 24px 14px; border-bottom:1px solid var(--border);
display:flex; align-items:baseline; gap:12px; }}
.card-title {{ font-size:.8rem; font-weight:600; letter-spacing:.1em;
text-transform:uppercase; color: var(--muted); font-family: var(--mono); }}
.card-desc {{ font-size:.78rem; color: #475569; }}
.card-body {{ padding:24px; }}
.chart-wrap {{ position:relative; height:260px; }}
.chart-wrap-tall {{ position:relative; height:320px; }}
.grid-2 {{ display:grid; grid-template-columns:1fr 1fr; gap:24px; }}
@media(max-width:900px) {{ .grid-2 {{ grid-template-columns:1fr; }}
.main {{ padding:24px 20px; }} }}
.section-label {{ font-family: var(--mono); font-size:.68rem; letter-spacing:.15em;
text-transform:uppercase; color: var(--muted);
padding-left:10px; border-left:3px solid var(--danger);
margin-bottom:-8px; }}
table {{ width:100%; border-collapse:collapse; font-size:.82rem; }}
th {{ font-family: var(--mono); font-size:.68rem; letter-spacing:.08em;
text-transform:uppercase; color: var(--muted); padding:10px 14px;
text-align:left; border-bottom:1px solid var(--border); background:#0f1218; }}
td {{ padding:10px 14px; border-bottom:1px solid #1a1f2a; vertical-align:middle; }}
tr:last-child td {{ border-bottom:none; }}
tr:hover td {{ background:#181e2b; }}
.mono {{ font-family: var(--mono); font-size:.78rem; }}
.right {{ text-align:right; }}
.miss-pos {{ color: var(--warn); }}
.miss-neg {{ color: var(--danger); }}
.pct-high {{ color: var(--danger); font-weight:600; }}
.pct-med {{ color: var(--warn); }}
.lag-badge {{ font-family: var(--mono); font-size:.65rem; padding:2px 6px;
background:#f59e0b22; border:1px solid #f59e0b66; border-radius:3px;
color: var(--warn); }}
.no-broken {{ padding:40px; text-align:center; color: var(--success);
font-family: var(--mono); font-size:.9rem; }}
.footer {{ text-align:center; font-family: var(--mono); font-size:.68rem;
color:#334155; margin-top:16px; letter-spacing:.05em; }}
.alpha-note {{ font-family: var(--mono); font-size:.75rem; color: var(--muted);
padding:10px 24px 0; }}
</style>
</head>
<body>
<div class="header">
<div class="header-eyebrow">Carmignac × ENSAE · Data Challenge 2025</div>
<h1>Broken Months Diagnostics</h1>
<div class="header-sub">
Aggregate stock-flow equation check · ISIN level · threshold α = {alpha:.1%}<br>
<span style='font-size:.78rem'>Missing % = |missing flow| / max(|ΔAUM|, |recorded flow|, 1 share) — capped at movement size, not stock level</span>
</div>
</div>
<div class="kpi-strip">
<div class="kpi">
<span class="kpi-label">(ISIN, month) pairs</span>
<span class="kpi-value">{total:,}</span>
<span class="kpi-sub">examined</span>
</div>
<div class="kpi">
<span class="kpi-label">Broken months</span>
<span class="kpi-value {broken_cls}">{n_broken_kpi:,}</span>
<span class="kpi-sub">{broken_share:.1f}% of pairs</span>
</div>
<div class="kpi">
<span class="kpi-label">Likely lags</span>
<span class="kpi-value warn">{n_lag_kpi}</span>
<span class="kpi-sub">resolved by ±{lag_days}d window</span>
</div>
<div class="kpi">
<span class="kpi-label">Genuine gaps</span>
<span class="kpi-value {genuine_cls}">{n_genuine}</span>
<span class="kpi-sub">unresolved by lag fix</span>
</div>
<div class="kpi">
<span class="kpi-label">ISINs affected</span>
<span class="kpi-value">{n_isins}</span>
<span class="kpi-sub">distinct ISINs</span>
</div>
<div class="kpi">
<span class="kpi-label">Max missing %</span>
<span class="kpi-value {max_cls}">{max_pct:.1f}%</span>
<span class="kpi-sub">worst single (isin, month)</span>
</div>
</div>
<div class="main">
<div class="section-label">00 · Aggregate view — all ISINs combined</div>
<div class="card">
<div class="card-header">
<span class="card-title">Stock-flow equation — total portfolio</span>
<span class="card-desc">
Σ Q(t) − Σ Q(t−1) vs Σ F(t) across all ISINs and accounts.
Detects months where the global portfolio is incoherent, independent of ISIN-level breakdown.
</span>
</div>
<div class="card-body">
<div class="chart-wrap-tall"><canvas id="chartAggOverlay"></canvas></div>
</div>
</div>
<div class="grid-2">
<div class="card">
<div class="card-header">
<span class="card-title">Aggregate missing flow over time</span>
<span class="card-desc">Σ Q(t) − Σ Q(t−1) − Σ F(t) — should be near zero every month</span>
</div>
<div class="card-body">
<div class="chart-wrap"><canvas id="chartAggMissing"></canvas></div>
</div>
</div>
<div class="card">
<div class="card-header">
<span class="card-title">Aggregate missing % of movement</span>
<span class="card-desc">|missing| / max(|ΔAUM|, |flow|) — months above α flagged in red</span>
</div>
<div class="card-body">
<div class="chart-wrap"><canvas id="chartAggPct"></canvas></div>
</div>
</div>
</div>
<div class="card">
<div class="card-header">
<span class="card-title">Aggregate broken months detail</span>
</div>
<div class="card-body" style="padding:0">
<table>
<thead><tr>
<th>Date</th>
<th class="right">Σ Q(t−1)</th><th class="right">Σ Q(t)</th>
<th class="right">Σ Flow</th><th class="right">Missing</th>
<th class="right">Missing %</th><th></th>
</tr></thead>
<tbody>{agg_detail_rows}</tbody>
</table>
</div>
</div>
<div class="section-label">01 · Timeline per ISIN</div>
<div class="card">
<div class="card-header">
<span class="card-title">Broken (isin, month) pairs per month</span>
<span class="card-desc">Stacked: genuine gaps (red) vs likely accounting lags (amber)</span>
</div>
<div class="card-body">
<div class="chart-wrap-tall"><canvas id="chartTimeline"></canvas></div>
</div>
</div>
<div class="grid-2">
<div class="card">
<div class="card-header">
<span class="card-title">Total absolute missing flow per month</span>
<span class="card-desc">Sum of |missing flow| across all broken ISINs</span>
</div>
<div class="card-body">
<div class="chart-wrap"><canvas id="chartMissing"></canvas></div>
</div>
</div>
<div class="card">
<div class="card-header">
<span class="card-title">Missing % — top 5 ISINs over time</span>
<span class="card-desc">|missing flow| / max(|ΔAUM|, |recorded flow|) per ISIN, capped at movement size</span>
</div>
<div class="card-body">
<div class="chart-wrap"><canvas id="chartIsinTs"></canvas></div>
</div>
</div>
</div>
<div class="section-label">02 · By ISIN</div>
<div class="card">
<div class="card-header">
<span class="card-title">ISIN summary — most affected</span>
</div>
<div class="card-body" style="padding:0">
{isin_table}
</div>
</div>
<div class="section-label">03 · Detail log</div>
<div class="card">
<div class="card-header">
<span class="card-title">All broken (isin, month) pairs</span>
<span class="card-desc">
<span class="lag-badge">lag</span> = likely resolved by extending flow window ±{lag_days} days
</span>
</div>
<div class="alpha-note">Threshold α = {alpha:.1%} · showing up to 200 rows</div>
<div class="card-body" style="padding:0">
{detail_table}
</div>
</div>
</div>
<div class="footer">Generated by carmignac_diagnostics.py · Carmignac × ENSAE Data Challenge 2025</div>
<script>
Chart.defaults.color = '#64748b';
Chart.defaults.borderColor = '#1e2535';
Chart.defaults.font.family = "'IBM Plex Mono', monospace";
Chart.defaults.font.size = 11;
const DATES = {dates_str};
const N_BROKEN = {n_broken_js};
const N_LAG = {n_lag_js};
const TOT_MISS = {total_miss_js};
const ISIN_TS = {isin_ts_json};
const ALL_DATES = {all_dates_str};
function tip() {{
return {{
backgroundColor:'#0d1117', borderColor:'#252a35', borderWidth:1,
titleFont:{{family:"'IBM Plex Mono'"}}, bodyFont:{{family:"'IBM Plex Mono'"}}, padding:10
}};
}}
function xAxis() {{
return {{ type:'category', ticks:{{maxTicksLimit:10,maxRotation:0}},
grid:{{color:'#1a2030'}} }};
}}
function yAxis(label) {{
return {{ grid:{{color:'#1a2030'}},
title:{{display:!!label,text:label,color:'#475569'}} }};
}}
// n_genuine per month = N_BROKEN - N_LAG
const N_GENUINE = N_BROKEN.map((b,i) => b - (N_LAG[i]||0));
new Chart(document.getElementById('chartTimeline'), {{
type:'bar',
data:{{
labels: DATES,
datasets:[
{{ label:'Genuine gaps', data:N_GENUINE,
backgroundColor:'#ef444488', borderColor:'#ef4444', borderWidth:1, borderRadius:2 }},
{{ label:'Likely lags', data:N_LAG,
backgroundColor:'#f59e0b88', borderColor:'#f59e0b', borderWidth:1, borderRadius:2 }},
]
}},
options:{{
responsive:true, maintainAspectRatio:false,
interaction:{{mode:'index',intersect:false}},
plugins:{{
legend:{{position:'top',labels:{{boxWidth:12,padding:16}}}},
tooltip:tip()
}},
scales:{{ x:xAxis(), y:{{...yAxis('# (isin, month) pairs'), stacked:true}} }},
}}
}});
new Chart(document.getElementById('chartMissing'), {{
type:'bar',
data:{{
labels: DATES,
datasets:[{{ label:'|Missing flow| (shares)', data:TOT_MISS,
backgroundColor:'#dc262688', borderColor:'#dc2626',
borderWidth:1, borderRadius:2 }}]
}},
options:{{
responsive:true, maintainAspectRatio:false,
plugins:{{legend:{{display:false}}, tooltip:tip()}},
scales:{{ x:xAxis(), y:yAxis('Shares') }}
}}
}});
new Chart(document.getElementById('chartIsinTs'), {{
type:'line',
data:{{ labels:ALL_DATES, datasets:ISIN_TS }},
options:{{
responsive:true, maintainAspectRatio:false,
interaction:{{mode:'index',intersect:false}},
plugins:{{
legend:{{position:'right',labels:{{boxWidth:10,padding:8,font:{{size:10}}}}}},
tooltip:tip()
}},
scales:{{ x:xAxis(), y:yAxis('Missing (%)') }}
}}
}});
// Aggregate charts
const AGG_DATES = {agg_dates_str};
const AGG_DELTA = {agg_delta_js};
const AGG_FLOW = {agg_flow_js};
const AGG_MISSING = {agg_missing_js};
const AGG_PCT = {agg_pct_js};
const ALPHA = {alpha};
// Color each bar: red if above the threshold, else subtle blue
const aggPctColors = AGG_PCT.map(v =>
Math.abs(v) > ALPHA * 100 ? '#ef444488' : '#3b82f622'
);
const aggPctBorders = AGG_PCT.map(v =>
Math.abs(v) > ALPHA * 100 ? '#ef4444' : '#3b82f655'
);
// Overlay: ΔAUM vs total flow
new Chart(document.getElementById('chartAggOverlay'), {{
type: 'line',
data: {{
labels: AGG_DATES,
datasets: [
{{ label: 'ΔAUM (Σ Q(t) − Σ Q(t−1))',
data: AGG_DELTA, borderColor: '#3b82f6', backgroundColor: '#3b82f622',
borderWidth: 2, pointRadius: 0, tension: 0.3, fill: false }},
{{ label: 'Σ Net flows recorded',
data: AGG_FLOW, borderColor: '#10b981', backgroundColor: '#10b98122',
borderWidth: 2, pointRadius: 0, tension: 0.3, fill: false }},
]
}},
options: {{
responsive: true, maintainAspectRatio: false,
interaction: {{mode:'index', intersect:false}},
plugins: {{
legend: {{position:'top', labels:{{boxWidth:12, padding:16}}}},
tooltip: tip()
}},
scales: {{ x: xAxis(), y: yAxis('Shares') }}
}}
}});
// Missing flow bar
new Chart(document.getElementById('chartAggMissing'), {{
type: 'bar',
data: {{
labels: AGG_DATES,
datasets: [{{ label: 'Missing flow (shares)', data: AGG_MISSING,
backgroundColor: AGG_MISSING.map(v => v < 0 ? '#ef444488' : '#f59e0b88'),
borderColor: AGG_MISSING.map(v => v < 0 ? '#ef4444' : '#f59e0b'),
borderWidth: 1, borderRadius: 2 }}]
}},
options: {{
responsive: true, maintainAspectRatio: false,
plugins: {{legend:{{display:false}}, tooltip: tip()}},
scales: {{ x: xAxis(), y: yAxis('Shares') }}
}}
}});
// Missing % bar, coloured by threshold
new Chart(document.getElementById('chartAggPct'), {{
type: 'bar',
data: {{
labels: AGG_DATES,
datasets: [{{ label: 'Missing % of movement', data: AGG_PCT,
backgroundColor: aggPctColors, borderColor: aggPctBorders,
borderWidth: 1, borderRadius: 2 }}]
}},
options: {{
responsive: true, maintainAspectRatio: false,
plugins: {{
legend: {{display:false}},
tooltip: tip(),
annotation: {{}} // threshold line handled via color
}},
scales: {{ x: xAxis(), y: {{...yAxis('Missing (%)'), min: 0}} }}
}}
}});
</script>
</body>
</html>"""
    return page


# ─────────────────────────────────────────────────────────────
# 5. MAIN
# ─────────────────────────────────────────────────────────────
def main():
    """
    Command-line entry point.

    Loads the AUM and flows CSV files, runs the per-ISIN and the cross-ISIN
    detections, prints a console summary, then writes the broken-months CSV
    (``--out``) and the HTML report (``--html``). Exits with an error message
    when an input file cannot be found.
    """
    parser = argparse.ArgumentParser(
        description="Detect broken months in Carmignac AUM/Flows data"
    )
    parser.add_argument("--aum", default="AUM_head.csv")
    parser.add_argument("--flows", default="flows_head.csv")
    parser.add_argument("--out", default="carmignac_broken_months.csv",
                        help="Machine-readable output (loaded by carmignac_repair.py)")
    parser.add_argument("--html", default="carmignac_diagnostics.html")
    parser.add_argument("--alpha", type=float, default=0.02,
                        help="Tolerance threshold (default 0.02 = 2%%)")
    parser.add_argument("--lag", type=int, default=3,
                        help="Boundary days to test for accounting lag (default 3)")
    args = parser.parse_args()

    def resolve(p):
        # Try the path as given, then relative to this script's directory.
        if os.path.exists(p):
            return p
        alt = os.path.join(os.path.dirname(os.path.abspath(__file__)), p)
        if os.path.exists(alt):
            return alt
        sys.exit(f"[ERROR] File not found: {p}")

    # Resolve first so that the log shows the files actually read.
    aum_path = resolve(args.aum)
    flows_path = resolve(args.flows)
    print(f"[Load] AUM : {aum_path}")
    print(f"[Load] Flows : {flows_path}")
    aum, flows = load_data(aum_path, flows_path)

    print(f"\n[Detect] Running broken-month detection (α={args.alpha:.1%}, lag=±{args.lag}d)...")
    df_broken, df_all = detect_broken_months(aum, flows, alpha=args.alpha, lag_days=args.lag)
    df_agg = detect_aggregate_broken_months(aum, flows, alpha=args.alpha, lag_days=args.lag)
    print_summary(df_broken, df_all, args.alpha)
    n_agg_broken = int(df_agg["broken"].sum())
    print(f" Aggregate broken months : {n_agg_broken} "
          f"(of which lags: {int(df_agg['is_lag'].sum())})")

    # CSV output — this is what carmignac_repair.py will load.
    # The frame is written as-is even when empty, so the header is the same
    # whether or not broken months were found.
    df_broken.to_csv(args.out, index=False)
    if len(df_broken):
        print(f"[Export] Broken months CSV → {args.out}")
    else:
        print(f"[Export] No broken months — empty CSV → {args.out}")

    report = build_html(df_broken, df_all, df_agg, args.alpha, lag_days=args.lag)
    with open(args.html, "w", encoding="utf-8") as f:
        f.write(report)
    print(f"[Export] HTML report → {args.html}")


if __name__ == "__main__":
    main()

File diff suppressed because it is too large Load Diff

View File

@ -27,7 +27,8 @@ import pandas as pd
# 1. LOAD & VALIDATE # 1. LOAD & VALIDATE
# ───────────────────────────────────────────────────────────── # ─────────────────────────────────────────────────────────────
def load_outputs(scores_path, mapping_path, surgery_path): def load_outputs(scores_path, mapping_path, surgery_path,
err_isin_path=None, err_agg_path=None):
scores = pd.read_csv(scores_path, parse_dates=["date"]) scores = pd.read_csv(scores_path, parse_dates=["date"])
mapping = pd.read_csv(mapping_path, parse_dates=["date"]) mapping = pd.read_csv(mapping_path, parse_dates=["date"])
surgery = pd.read_csv(surgery_path, parse_dates=["date"]) surgery = pd.read_csv(surgery_path, parse_dates=["date"])
@ -40,9 +41,44 @@ def load_outputs(scores_path, mapping_path, surgery_path):
surgery["reg_orig"] = surgery["reg_orig"].astype(str) surgery["reg_orig"] = surgery["reg_orig"].astype(str)
surgery["reg_from"] = surgery["reg_from"].astype(str) surgery["reg_from"] = surgery["reg_from"].astype(str)
surgery["reg_to"] = surgery["reg_to"].astype(str) surgery["reg_to"] = surgery["reg_to"].astype(str)
if "lookback_months" not in surgery.columns:
surgery["lookback_months"] = 1 # backwards compat
return scores, mapping, surgery # Error account (optional)
err_isin = None
err_agg = None
if err_isin_path and os.path.exists(err_isin_path):
err_isin = pd.read_csv(err_isin_path, parse_dates=["date"])
err_isin["isin"] = err_isin["isin"].astype(str)
if err_agg_path and os.path.exists(err_agg_path):
err_agg = pd.read_csv(err_agg_path, parse_dates=["date"])
return scores, mapping, surgery, err_isin, err_agg
# ─────────────────────────────────────────────────────────────
# 1b. LOAD ERROR ACCOUNT (optional)
# ─────────────────────────────────────────────────────────────
def load_error_account(isin_path, agg_path):
    """
    Read the two error-account CSVs (per-ISIN and aggregate), expected to be
    the files written by carmignac_diagnostics.py.

    Both paths must be truthy, otherwise nothing is read. Each file is parsed
    with its "date" column as datetimes, and the per-ISIN "isin" column is
    cast to str. Any exception raised while reading or preparing the frames
    is reported on stdout as a [WARN] line instead of propagating.

    Returns (df_err_isin, df_err_agg), or (None, None) when a path is missing
    or loading failed.
    """
    nothing = (None, None)
    if not (isin_path and agg_path):
        return nothing
    try:
        per_isin = pd.read_csv(isin_path, parse_dates=["date"])
        aggregate = pd.read_csv(agg_path, parse_dates=["date"])
        per_isin["isin"] = per_isin["isin"].astype(str)
        n_isins = per_isin["isin"].nunique()
        print(f"[Load] error account (ISIN) : {len(per_isin)} rows, "
              f"{n_isins} ISINs")
        print(f"[Load] error account (agg) : {len(aggregate)} rows")
    except Exception as exc:
        # Optional input: degrade to "no error account" rather than abort.
        print(f"[WARN] Could not load error account: {exc}")
        return nothing
    return per_isin, aggregate
# ───────────────────────────────────────────────────────────── # ─────────────────────────────────────────────────────────────
# 2. COMPUTE ANALYTICS # 2. COMPUTE ANALYTICS
@ -195,7 +231,7 @@ def print_summary(analytics, surgery):
# 4. BUILD HTML REPORT # 4. BUILD HTML REPORT
# ───────────────────────────────────────────────────────────── # ─────────────────────────────────────────────────────────────
def build_html(analytics, surgery, scores, mapping): def build_html(analytics, surgery, scores, mapping, df_err_isin=None, df_err_agg=None):
tl = analytics["timeline"] tl = analytics["timeline"]
ss = analytics["surgery_stats"] ss = analytics["surgery_stats"]
piv = analytics["pivot"] piv = analytics["pivot"]
@ -257,14 +293,212 @@ def build_html(analytics, surgery, scores, mapping):
traj_json = json.dumps(traj_datasets) traj_json = json.dumps(traj_datasets)
# ── 4.2b Error account data (optional) ────────────────────
has_error = df_err_isin is not None and df_err_agg is not None
if has_error:
err_dates = [d.strftime("%Y-%m-%d") for d in pd.to_datetime(df_err_agg["date"])]
err_agg_stock = [round(float(v), 3) if not pd.isna(v) else None
for v in df_err_agg["stock_error_agg"].values]
err_agg_res = [round(float(v), 3) if not pd.isna(v) else None
for v in df_err_agg["residual_agg"].values]
err_agg_pct = [round(float(v), 4) if not pd.isna(v) else None
for v in df_err_agg["stock_error_agg_pct"].values]
# Top 5 ISINs by max |stock error|
top_err = (df_err_isin.groupby("isin")["stock_error"]
.apply(lambda x: x.abs().max())
.nlargest(5).index.tolist())
all_err_dates = sorted(df_err_isin["date"].unique())
ERR_COLORS = ["#ef4444","#f59e0b","#8b5cf6","#06b6d4","#10b981"]
err_isin_ds = []
for idx, isin in enumerate(top_err):
sub = (df_err_isin[df_err_isin["isin"] == isin]
.set_index("date")["stock_error"]
.reindex(all_err_dates))
err_isin_ds.append({
"label": isin,
"data": [round(float(v), 3) if not pd.isna(v) else None for v in sub.values],
"borderColor": ERR_COLORS[idx % len(ERR_COLORS)],
"backgroundColor": ERR_COLORS[idx % len(ERR_COLORS)] + "22",
"borderWidth": 1.5, "pointRadius": 0, "tension": 0.3, "fill": False,
})
max_err_stock = float(df_err_agg["stock_error_agg"].abs().max())
max_err_pct = float(df_err_agg["stock_error_agg_pct"].max())
agg_std = float(df_err_agg["stock_error_agg"].std())
agg_mean = float(df_err_agg["stock_error_agg"].abs().mean())
stationarity = round(agg_std / max(agg_mean, 1e-9), 3)
err_dates_js = json.dumps(err_dates)
err_agg_stock_js = json.dumps(err_agg_stock)
err_agg_res_js = json.dumps(err_agg_res)
err_agg_pct_js = json.dumps(err_agg_pct)
err_isin_ds_js = json.dumps(err_isin_ds)
err_isin_dates_js = json.dumps([d.strftime("%Y-%m-%d") if hasattr(d, "strftime")
else str(d)[:10] for d in all_err_dates])
# ISIN detail table (top 100 worst)
err_rows = []
for _, r in (df_err_isin.assign(abs_s=df_err_isin["stock_error"].abs())
.sort_values("abs_s", ascending=False)
.head(100).iterrows()):
ds = r["date"].strftime("%Y-%m-%d") if hasattr(r["date"], "strftime") else str(r["date"])[:10]
sc = "color:var(--danger)" if r["stock_error"] < 0 else "color:var(--warn)"
rc = "color:var(--danger)" if r["residual"] < 0 else "color:var(--warn)"
pch = "color:var(--danger);font-weight:600" if r["stock_error_pct"] > 5 else ("color:var(--warn)" if r["stock_error_pct"] > 1 else "")
err_rows.append(
f'<tr><td>{ds}</td>'
f'<td style="font-family:var(--mono)">{r["isin"]}</td>'
f'<td style="text-align:right;font-family:var(--mono);{rc}">{r["residual"]:+,.2f}</td>'
f'<td style="text-align:right;font-family:var(--mono);{sc}">{r["stock_error"]:+,.2f}</td>'
f'<td style="text-align:right;font-family:var(--mono);{pch}">{r["stock_error_pct"]:.3f}%</td>'
f'</tr>'
)
err_isin_detail = "".join(err_rows) if err_rows else (
'<tr><td colspan="5" style="padding:24px;text-align:center;color:var(--accent2)'
';font-family:var(--mono)">✓ Error account is flat</td></tr>'
)
# HTML block for error account section
err_section_html = f"""
<div class="section-label">06 · Error Account</div>
<div class="card">
<div class="card-header">
<span class="card-title">Aggregate error account stock</span>
<span class="card-desc">
Stock_error(t_ref) = 0. The stock absorbs unreconciled residuals going backwards.
A flat signal near zero = clean data. A drift = structural gap.
</span>
</div>
<div class="card-body" style="padding-bottom:8px">
<div style="display:grid;grid-template-columns:repeat(3,1fr);gap:1px;background:var(--border);margin-bottom:20px">
<div style="background:var(--surface);padding:14px 20px">
<div style="font-family:var(--mono);font-size:.68rem;letter-spacing:.1em;text-transform:uppercase;color:var(--muted)">Max |error stock|</div>
<div style="font-family:var(--mono);font-size:1.35rem;font-weight:700;color:var(--danger)">{max_err_stock:,.1f} shares</div>
</div>
<div style="background:var(--surface);padding:14px 20px">
<div style="font-family:var(--mono);font-size:.68rem;letter-spacing:.1em;text-transform:uppercase;color:var(--muted)">Max % of total AUM</div>
<div style="font-family:var(--mono);font-size:1.35rem;font-weight:700;color:{'var(--danger)' if max_err_pct > 5 else 'var(--warn)'}">{max_err_pct:.3f}%</div>
</div>
<div style="background:var(--surface);padding:14px 20px">
<div style="font-family:var(--mono);font-size:.68rem;letter-spacing:.1em;text-transform:uppercase;color:var(--muted)">Stationarity σ/μ</div>
<div style="font-family:var(--mono);font-size:1.35rem;font-weight:700;color:{'var(--accent2)' if stationarity < 1 else 'var(--warn)'}">{stationarity:.3f}</div>
<div style="font-size:.7rem;color:var(--muted);font-family:var(--mono)">lower = more stationary</div>
</div>
</div>
<div class="chart-wrap-tall"><canvas id="chartErrStock"></canvas></div>
</div>
</div>
<div class="grid-2">
<div class="card">
<div class="card-header">
<span class="card-title">Monthly aggregate residual</span>
<span class="card-desc">ΔQ_total F_total per month</span>
</div>
<div class="card-body"><div class="chart-wrap"><canvas id="chartErrRes"></canvas></div></div>
</div>
<div class="card">
<div class="card-header">
<span class="card-title">Error stock top 5 ISINs</span>
<span class="card-desc">Cumulative error stock per ISIN</span>
</div>
<div class="card-body"><div class="chart-wrap"><canvas id="chartErrIsin"></canvas></div></div>
</div>
</div>
<div class="card">
<div class="card-header">
<span class="card-title">Error account detail worst (ISIN, month) pairs</span>
</div>
<div class="card-body" style="padding:0">
<table>
<thead><tr>
<th>Date</th><th>ISIN</th>
<th style="text-align:right">Monthly residual</th>
<th style="text-align:right">Cumul. stock</th>
<th style="text-align:right">% of max AUM</th>
</tr></thead>
<tbody>{err_isin_detail}</tbody>
</table>
</div>
</div>"""
# JS block for error account charts
err_js_block = f"""
// 8. Error account charts
const ERR_DATES = {err_dates_js};
const ERR_AGG_STOCK = {err_agg_stock_js};
const ERR_AGG_RES = {err_agg_res_js};
const ERR_ISIN_TS = {err_isin_ds_js};
const ERR_ISIN_DATES = {err_isin_dates_js};
new Chart(document.getElementById('chartErrStock'), {{
type: 'line',
data: {{ labels: ERR_DATES, datasets: [{{
label: 'Aggregate error stock', data: ERR_AGG_STOCK,
borderColor: '#ef4444', backgroundColor: '#ef444415',
borderWidth: 2, pointRadius: 0, tension: 0.3, fill: true
}}] }},
options: {{
responsive: true, maintainAspectRatio: false,
interaction: {{mode:'index', intersect:false}},
plugins: {{ legend: {{display:false}}, tooltip: tooltip() }},
scales: {{ x: timeAxis(), y: {{
...yAxis('Shares'),
grid: {{ color: ctx => ctx.tick.value === 0 ? '#ffffff55' : '#1a2030',
lineWidth: ctx => ctx.tick.value === 0 ? 1.5 : 1 }}
}} }}
}}
}});
new Chart(document.getElementById('chartErrRes'), {{
type: 'bar',
data: {{ labels: ERR_DATES, datasets: [{{
label: 'Monthly residual', data: ERR_AGG_RES,
backgroundColor: ERR_AGG_RES.map(v => v != null && v < 0 ? '#ef444488' : '#f59e0b88'),
borderColor: ERR_AGG_RES.map(v => v != null && v < 0 ? '#ef4444' : '#f59e0b'),
borderWidth: 1, borderRadius: 2
}}] }},
options: {{
responsive: true, maintainAspectRatio: false,
plugins: {{ legend: {{display:false}}, tooltip: tooltip() }},
scales: {{ x: timeAxis(), y: yAxis('Shares') }}
}}
}});
new Chart(document.getElementById('chartErrIsin'), {{
type: 'line',
data: {{ labels: ERR_ISIN_DATES, datasets: ERR_ISIN_TS }},
options: {{
responsive: true, maintainAspectRatio: false,
interaction: {{mode:'index', intersect:false}},
plugins: {{
legend: {{position:'right', labels:{{boxWidth:10, padding:8, font:{{size:10}}}}}},
tooltip: tooltip()
}},
scales: {{ x: timeAxis(), y: yAxis('Error stock (shares)') }}
}}
}});"""
else:
err_section_html = ""
err_js_block = ""
# ── 4.3 Surgery detail table rows ────────────────────────── # ── 4.3 Surgery detail table rows ──────────────────────────
sd = analytics["surgery_detail"].sort_values("date") sd = analytics["surgery_detail"].sort_values("date")
surg_rows_html = "" surg_rows_html = ""
if len(sd) == 0: if len(sd) == 0:
surg_rows_html = "<tr><td colspan='8' style='text-align:center;color:#888'>No surgeries performed</td></tr>" surg_rows_html = "<tr><td colspan='9' style='text-align:center;color:#888'>No surgeries performed</td></tr>"
else: else:
for _, r in sd.iterrows(): for _, r in sd.iterrows():
gain_class = "gain-high" if r["gain_vs_no_surgery"] > 0.05 else "gain-low" gain_class = "gain-high" if r["gain_vs_no_surgery"] > 0.05 else "gain-low"
lb = int(r.get("lookback_months", 1))
lb_cell = (f'<span style="font-family:var(--mono);font-size:.65rem;padding:1px 5px;'
f'border-radius:3px;background:#7c3aed22;border:1px solid #7c3aed55;'
f'color:#a78bfa">{lb}m</span>' if lb > 1 else "")
surg_rows_html += f""" surg_rows_html += f"""
<tr> <tr>
<td>{r['date'].date()}</td> <td>{r['date'].date()}</td>
@ -275,6 +509,7 @@ def build_html(analytics, surgery, scores, mapping):
<td>{r['jaccard_composite']:.4f}</td> <td>{r['jaccard_composite']:.4f}</td>
<td class="{gain_class}">+{r['gain_vs_no_surgery']:.6f}</td> <td class="{gain_class}">+{r['gain_vs_no_surgery']:.6f}</td>
<td>{r['gain_pct_of_score']:.1f}%</td> <td>{r['gain_pct_of_score']:.1f}%</td>
<td>{lb_cell}</td>
</tr>""" </tr>"""
# ── 4.4 Top accounts table ────────────────────────────────── # ── 4.4 Top accounts table ──────────────────────────────────
@ -857,6 +1092,7 @@ def build_html(analytics, surgery, scores, mapping):
<th>Jaccard</th> <th>Jaccard</th>
<th>Score gain</th> <th>Score gain</th>
<th>% of score</th> <th>% of score</th>
<th>Lookback</th>
</tr> </tr>
</thead> </thead>
<tbody>{surg_rows_html}</tbody> <tbody>{surg_rows_html}</tbody>
@ -887,6 +1123,9 @@ def build_html(analytics, surgery, scores, mapping):
</div> </div>
</div> </div>
{err_section_html}
</div><!-- /main --> </div><!-- /main -->
<div class="footer">Generated by carmignac_analysis.py · Carmignac × ENSAE Data Challenge 2025</div> <div class="footer">Generated by carmignac_analysis.py · Carmignac × ENSAE Data Challenge 2025</div>
@ -1297,6 +1536,7 @@ new Chart(document.getElementById('chartJaccard'), {{
}}, }},
}}, }},
}}); }});
{err_js_block}
</script> </script>
</body> </body>
</html>""" </html>"""
@ -1314,32 +1554,49 @@ def main():
parser.add_argument("--mapping", default="repair_results/carmignac_mapping.csv") parser.add_argument("--mapping", default="repair_results/carmignac_mapping.csv")
parser.add_argument("--surgery", default="repair_results/carmignac_surgery_log.csv") parser.add_argument("--surgery", default="repair_results/carmignac_surgery_log.csv")
parser.add_argument("--out", default="repair_results/carmignac_report.html") parser.add_argument("--out", default="repair_results/carmignac_report.html")
parser.add_argument("--error-account-isin", default=None,
dest="error_isin",
help="Path to carmignac_error_account.csv (optional)")
parser.add_argument("--error-account-agg", default=None,
dest="error_agg",
help="Path to carmignac_error_account_agg.csv (optional)")
args = parser.parse_args() args = parser.parse_args()
# Resolve paths relative to this script's directory if files not found # Resolve paths relative to this script's directory if files not found
base = os.path.dirname(os.path.abspath(__file__)) base = os.path.dirname(os.path.abspath(__file__))
def resolve(p): def resolve(p, required=True):
if p is None:
return None
if os.path.exists(p): if os.path.exists(p):
return p return p
alt = os.path.join(base, p) alt = os.path.join(base, p)
if os.path.exists(alt): if os.path.exists(alt):
return alt return alt
if required:
sys.exit(f"[ERROR] File not found: {p}") sys.exit(f"[ERROR] File not found: {p}")
print(f"[WARN] Optional file not found: {p}")
return None
scores_path = resolve(args.scores) scores_path = resolve(args.scores)
mapping_path = resolve(args.mapping) mapping_path = resolve(args.mapping)
surgery_path = resolve(args.surgery) surgery_path = resolve(args.surgery)
error_isin_path = resolve(args.error_isin, required=False)
error_agg_path = resolve(args.error_agg, required=False)
print(f"[Load] scores : {scores_path}") print(f"[Load] scores : {scores_path}")
print(f"[Load] mapping : {mapping_path}") print(f"[Load] mapping : {mapping_path}")
print(f"[Load] surgery : {surgery_path}") print(f"[Load] surgery : {surgery_path}")
scores, mapping, surgery = load_outputs(scores_path, mapping_path, surgery_path) scores, mapping, surgery, df_err_isin, df_err_agg = load_outputs(
scores_path, mapping_path, surgery_path,
err_isin_path=error_isin_path, err_agg_path=error_agg_path
)
analytics = compute_analytics(scores, mapping, surgery) analytics = compute_analytics(scores, mapping, surgery)
print_summary(analytics, surgery) print_summary(analytics, surgery)
html = build_html(analytics, surgery, scores, mapping) html = build_html(analytics, surgery, scores, mapping,
df_err_isin=df_err_isin, df_err_agg=df_err_agg)
out_path = args.out out_path = args.out
with open(out_path, "w", encoding="utf-8") as f: with open(out_path, "w", encoding="utf-8") as f:

View File

@ -36,13 +36,14 @@ import json
import os import os
import sys import sys
from collections import defaultdict
import s3fs
import numpy as np import numpy as np
import pandas as pd import pandas as pd
from collections import defaultdict
import s3fs
# ───────────────────────────────────────────────────────────── # ─────────────────────────────────────────────────────────────
# 1. LOAD # 1. LOAD
# ───────────────────────────────────────────────────────────── # ─────────────────────────────────────────────────────────────
@ -217,6 +218,8 @@ def detect_broken_months(aum, flows, alpha=0.02, lag_days=3):
df_broken = df_all[df_all["broken"]].sort_values("missing_pct", ascending=False) df_broken = df_all[df_all["broken"]].sort_values("missing_pct", ascending=False)
return df_broken, df_all return df_broken, df_all
# ───────────────────────────────────────────────────────────── # ─────────────────────────────────────────────────────────────
# 2b. AGGREGATE (CROSS-ISIN) BROKEN MONTHS # 2b. AGGREGATE (CROSS-ISIN) BROKEN MONTHS
# ───────────────────────────────────────────────────────────── # ─────────────────────────────────────────────────────────────
@ -319,6 +322,165 @@ def detect_aggregate_broken_months(aum, flows, alpha=0.02, lag_days=3):
df_agg = pd.DataFrame(rows) df_agg = pd.DataFrame(rows)
return df_agg return df_agg
# ─────────────────────────────────────────────────────────────
# 2c. ERROR ACCOUNT
# ─────────────────────────────────────────────────────────────
def build_error_account(aum, flows, lag_days=3):
"""
Builds a synthetic "error account" that absorbs the stock-flow
residuals that cannot be explained by recorded flows.
Construction (backwards from t_ref):
Stock_error(t_ref) = 0 (by definition)
Stock_error(t-1) = Stock_error(t) - Residual(t)
where Residual(t) = [Σ_r Q_{r,s}(t) - Σ_r Q_{r,s}(t-1)] - Σ_r F_{r,s}(t)
for each ISIN s independently.
By construction, adding this error account to the AUM restores the
stock-flow equality at every (isin, month).
Also computes an aggregated error account (summed over all ISINs).
Returns
-------
df_err_isin : DataFrame with columns
(date, isin, residual, stock_error, stock_error_pct)
where stock_error_pct = stock_error / max(total_isin_aum, 1)
df_err_agg : DataFrame with columns
(date, residual_agg, stock_error_agg, stock_error_agg_pct)
"""
t_min = aum["Centralisation Date"].min()
t_max = aum["Centralisation Date"].max()
all_months = pd.date_range(t_min, t_max, freq="ME")
# ── ISIN-level AUM panel (forward-filled) ────────────────────
aum_agg = (
aum.groupby(["Product - Isin", "Centralisation Date"])["Quantity - AUM"]
.sum()
.reset_index()
.rename(columns={"Product - Isin": "isin",
"Centralisation Date": "date",
"Quantity - AUM": "qty"})
)
aum_pivot = aum_agg.pivot(index="date", columns="isin", values="qty")
aum_pivot = aum_pivot.reindex(all_months).ffill()
# ── ISIN-level flow aggregation (standard window) ─────────────
def bucket_isin_flows(flows_df, months):
fc = flows_df.copy()
def assign_month(d):
for m in months:
eom_prev = m - pd.offsets.MonthEnd(1)
if eom_prev < d <= m:
return m
return pd.NaT
fc["month_end"] = fc["Centralisation Date"].apply(assign_month)
fc = fc.dropna(subset=["month_end"])
return (fc.groupby(["Product - Isin", "month_end"])["Quantity - NetFlows"]
.sum()
.unstack("Product - Isin")
.reindex(months)
.fillna(0.0))
flow_pivot = bucket_isin_flows(flows, all_months)
# ── Compute residuals per (isin, month) ───────────────────────
isins = aum_pivot.columns.tolist()
# residual[t] = delta_AUM[t] - flow[t]
residuals = {} # {isin: Series indexed by month}
for isin in isins:
res_series = {}
for i in range(1, len(all_months)):
t_curr = all_months[i]
t_prev = all_months[i - 1]
q_curr = aum_pivot[isin].get(t_curr, np.nan)
q_prev = aum_pivot[isin].get(t_prev, np.nan)
if pd.isna(q_curr) or pd.isna(q_prev):
continue
delta = q_curr - q_prev
f = (flow_pivot[isin].get(t_curr, 0.0)
if isin in flow_pivot.columns else 0.0)
res_series[t_curr] = delta - f
residuals[isin] = pd.Series(res_series)
# ── Build error stock backwards from t_ref ────────────────────
t_ref = all_months[-1]
rows_isin = []
for isin in isins:
res = residuals[isin]
# Maximum AUM for this ISIN (for normalisation)
max_aum = aum_pivot[isin].max()
if pd.isna(max_aum) or max_aum < 1:
max_aum = 1.0
# Propagate backwards: stock(t_ref) = 0
stock = 0.0
# Build dict keyed by date
stock_by_date = {t_ref: 0.0}
for i in range(len(all_months) - 2, -1, -1):
t_curr = all_months[i + 1]
t_prev = all_months[i]
r = res.get(t_curr, 0.0)
stock = stock - r
stock_by_date[t_prev] = stock
for t in all_months:
s = stock_by_date.get(t, np.nan)
r = res.get(t, 0.0)
rows_isin.append({
"date": t,
"isin": isin,
"residual": round(r, 4),
"stock_error": round(s, 4) if not pd.isna(s) else np.nan,
"stock_error_pct": round(abs(s) / max_aum * 100, 4)
if not pd.isna(s) else np.nan,
})
df_err_isin = pd.DataFrame(rows_isin).sort_values(["date", "isin"])
# ── Aggregated error account ──────────────────────────────────
# Total AUM across all ISINs at each month
total_aum_by_month = aum_pivot.sum(axis=1)
max_total_aum = total_aum_by_month.max()
if pd.isna(max_total_aum) or max_total_aum < 1:
max_total_aum = 1.0
# Aggregate residual = sum of ISIN residuals
agg_res = {}
for i in range(1, len(all_months)):
t_curr = all_months[i]
total_r = sum(residuals[isin].get(t_curr, 0.0) for isin in isins)
agg_res[t_curr] = total_r
agg_stock = 0.0
agg_stock_by_date = {t_ref: 0.0}
for i in range(len(all_months) - 2, -1, -1):
t_curr = all_months[i + 1]
t_prev = all_months[i]
agg_stock = agg_stock - agg_res.get(t_curr, 0.0)
agg_stock_by_date[t_prev] = agg_stock
rows_agg = []
for t in all_months:
s = agg_stock_by_date.get(t, np.nan)
r = agg_res.get(t, 0.0)
rows_agg.append({
"date": t,
"residual_agg": round(r, 4),
"stock_error_agg": round(s, 4) if not pd.isna(s) else np.nan,
"stock_error_agg_pct": round(abs(s) / max_total_aum * 100, 4)
if not pd.isna(s) else np.nan,
})
df_err_agg = pd.DataFrame(rows_agg).sort_values("date")
return df_err_isin, df_err_agg
# ───────────────────────────────────────────────────────────── # ─────────────────────────────────────────────────────────────
# 3. PRINT SUMMARY # 3. PRINT SUMMARY
# ───────────────────────────────────────────────────────────── # ─────────────────────────────────────────────────────────────
@ -358,7 +520,7 @@ def print_summary(df_broken, df_all, alpha):
# 4. BUILD HTML REPORT # 4. BUILD HTML REPORT
# ───────────────────────────────────────────────────────────── # ─────────────────────────────────────────────────────────────
def build_html(df_broken, df_all, df_agg, alpha): def build_html(df_broken, df_all, df_agg, df_err_isin, df_err_agg, alpha):
# ── JS-ready data ──────────────────────────────────────────── # ── JS-ready data ────────────────────────────────────────────
# Timeline: n_broken and total_missing per month # Timeline: n_broken and total_missing per month
tl = (df_all[df_all["broken"]] tl = (df_all[df_all["broken"]]
@ -374,6 +536,11 @@ def build_html(df_broken, df_all, df_agg, alpha):
def jf(arr, dec=4): def jf(arr, dec=4):
return json.dumps([round(float(v), dec) if not np.isnan(v) else None for v in arr]) return json.dumps([round(float(v), dec) if not np.isnan(v) else None for v in arr])
ISIN_COLORS = [
"#2563eb","#16a34a","#dc2626","#d97706","#7c3aed",
"#0891b2","#db2777","#65a30d","#ea580c","#6366f1",
]
n_broken_js = jf(tl["n_broken"].values, 0) n_broken_js = jf(tl["n_broken"].values, 0)
total_miss_js = jf(tl["total_missing"].values) total_miss_js = jf(tl["total_missing"].values)
n_lag_js = jf(tl["n_lag"].values, 0) n_lag_js = jf(tl["n_lag"].values, 0)
@ -412,6 +579,65 @@ def build_html(df_broken, df_all, df_agg, alpha):
'color:var(--success);font-family:var(--mono)">✓ No broken months at aggregate level</td></tr>' 'color:var(--success);font-family:var(--mono)">✓ No broken months at aggregate level</td></tr>'
) )
# ── Error account JS data ────────────────────────────────────
err_dates_str = json.dumps([d.strftime("%Y-%m-%d") for d in pd.to_datetime(df_err_agg["date"])])
err_agg_stock_js = jf(df_err_agg["stock_error_agg"].values)
err_agg_res_js = jf(df_err_agg["residual_agg"].values)
err_agg_pct_js = jf(df_err_agg["stock_error_agg_pct"].values)
# Top 5 ISINs by max absolute stock error
top_err_isins = (
df_err_isin.groupby("isin")["stock_error"]
.apply(lambda x: x.abs().max())
.nlargest(5).index.tolist()
)
all_err_dates = sorted(df_err_isin["date"].unique())
err_isin_datasets = []
for idx, isin in enumerate(top_err_isins):
sub = (df_err_isin[df_err_isin["isin"] == isin]
.set_index("date")["stock_error"]
.reindex(all_err_dates))
err_isin_datasets.append({
"label": isin,
"data": [round(float(v), 3) if not pd.isna(v) else None for v in sub.values],
"borderColor": ISIN_COLORS[idx % len(ISIN_COLORS)],
"backgroundColor": ISIN_COLORS[idx % len(ISIN_COLORS)] + "22",
"borderWidth": 1.5, "pointRadius": 0, "tension": 0.3, "fill": False,
})
err_isin_ts_json = json.dumps(err_isin_datasets)
err_isin_dates_str = json.dumps([d.strftime("%Y-%m-%d") if hasattr(d, "strftime")
else str(d)[:10] for d in all_err_dates])
# Error account KPIs
max_agg_stock_err = float(df_err_agg["stock_error_agg"].abs().max())
max_agg_stock_pct = float(df_err_agg["stock_error_agg_pct"].max())
# Stationarity proxy: std / mean_abs (lower = more stationary)
agg_std = float(df_err_agg["stock_error_agg"].std())
agg_mean = float(df_err_agg["stock_error_agg"].abs().mean())
stationarity = round(agg_std / max(agg_mean, 1e-9), 3)
# Error account ISIN detail table (worst months per ISIN)
err_worst = (df_err_isin.assign(abs_stock=df_err_isin["stock_error"].abs())
.sort_values("abs_stock", ascending=False)
.head(200))
err_isin_rows = []
for _, r in err_worst.iterrows():
ds = r["date"].strftime("%Y-%m-%d") if hasattr(r["date"], "strftime") else str(r["date"])[:10]
sc = "miss-neg" if r["stock_error"] < 0 else "miss-pos"
rc = "miss-neg" if r["residual"] < 0 else "miss-pos"
pch = "pct-high" if r["stock_error_pct"] > 5 else ("pct-med" if r["stock_error_pct"] > 1 else "")
err_isin_rows.append(
f'<tr><td>{ds}</td>'
f'<td class="mono">{r["isin"]}</td>'
f'<td class="mono right {rc}">{r["residual"]:+,.2f}</td>'
f'<td class="mono right {sc}">{r["stock_error"]:+,.2f}</td>'
f'<td class="mono right {pch}">{r["stock_error_pct"]:.3f}%</td></tr>'
)
err_isin_detail = "".join(err_isin_rows) if err_isin_rows else (
'<tr><td colspan="5" style="padding:24px;text-align:center;'
'color:var(--success);font-family:var(--mono)">✓ Error account is flat (no residuals)</td></tr>'
)
# Per-ISIN summary # Per-ISIN summary
isin_sum = (df_broken.groupby("isin") isin_sum = (df_broken.groupby("isin")
.agg(n_months=("date", "count"), .agg(n_months=("date", "count"),
@ -419,11 +645,6 @@ def build_html(df_broken, df_all, df_agg, alpha):
total_abs=("missing_flow", lambda x: x.abs().sum())) total_abs=("missing_flow", lambda x: x.abs().sum()))
.sort_values("total_abs", ascending=False)) .sort_values("total_abs", ascending=False))
ISIN_COLORS = [
"#2563eb","#16a34a","#dc2626","#d97706","#7c3aed",
"#0891b2","#db2777","#65a30d","#ea580c","#6366f1",
]
# Per-ISIN missing_pct timeseries for the top 5 ISINs # Per-ISIN missing_pct timeseries for the top 5 ISINs
top_isins = isin_sum.head(5).index.tolist() top_isins = isin_sum.head(5).index.tolist()
all_dates = sorted(df_all["date"].unique()) all_dates = sorted(df_all["date"].unique())
@ -618,7 +839,77 @@ def build_html(df_broken, df_all, df_agg, alpha):
<div class="main"> <div class="main">
<div class="section-label">00 · Aggregate view all ISINs combined</div> <div class="section-label">00 · Error account cumulative residuals</div>
<div class="card">
<div class="card-header">
<span class="card-title">Aggregate error account stock over time</span>
<span class="card-desc">
Stock_error(t_ref) = 0 by definition. At each prior month, the stock absorbs the residual
[ΔQ_total F_total]. A stationary signal near zero = clean data.
A drifting signal = structural data quality problem.
</span>
</div>
<div class="card-body" style="padding-bottom:8px">
<div style="display:grid;grid-template-columns:repeat(3,1fr);gap:1px;background:var(--border);margin-bottom:20px">
<div style="background:var(--surface);padding:16px 20px">
<div style="font-family:var(--mono);font-size:.68rem;letter-spacing:.1em;text-transform:uppercase;color:var(--muted)">Max |stock error|</div>
<div style="font-family:var(--mono);font-size:1.4rem;font-weight:700;color:var(--danger)">{max_agg_stock_err:,.1f} shares</div>
</div>
<div style="background:var(--surface);padding:16px 20px">
<div style="font-family:var(--mono);font-size:.68rem;letter-spacing:.1em;text-transform:uppercase;color:var(--muted)">Max % of total AUM</div>
<div style="font-family:var(--mono);font-size:1.4rem;font-weight:700;color:{'var(--danger)' if max_agg_stock_pct > 5 else 'var(--warn)'}">{max_agg_stock_pct:.3f}%</div>
</div>
<div style="background:var(--surface);padding:16px 20px">
<div style="font-family:var(--mono);font-size:.68rem;letter-spacing:.1em;text-transform:uppercase;color:var(--muted)">Stationarity (σ/μ)</div>
<div style="font-family:var(--mono);font-size:1.4rem;font-weight:700;color:{'var(--success)' if stationarity < 1 else 'var(--warn)'}">{stationarity:.3f}</div>
<div style="font-size:.7rem;color:var(--muted);font-family:var(--mono)">lower = more stationary</div>
</div>
</div>
<div class="chart-wrap-tall"><canvas id="chartErrAggStock"></canvas></div>
</div>
</div>
<div class="grid-2">
<div class="card">
<div class="card-header">
<span class="card-title">Monthly aggregate residual</span>
<span class="card-desc">ΔQ_total F_total per month (should be near zero)</span>
</div>
<div class="card-body">
<div class="chart-wrap"><canvas id="chartErrAggRes"></canvas></div>
</div>
</div>
<div class="card">
<div class="card-header">
<span class="card-title">Error stock top 5 ISINs</span>
<span class="card-desc">Cumulative error stock per ISIN (most affected)</span>
</div>
<div class="card-body">
<div class="chart-wrap"><canvas id="chartErrIsinTs"></canvas></div>
</div>
</div>
</div>
<div class="card">
<div class="card-header">
<span class="card-title">Error account detail worst (ISIN, month) pairs</span>
<span class="card-desc">Sorted by absolute cumulative error stock. stock_error_pct = |stock| / max(ISIN AUM)</span>
</div>
<div class="card-body" style="padding:0">
<table>
<thead><tr>
<th>Date</th><th>ISIN</th>
<th class="right">Monthly residual</th>
<th class="right">Cumulative stock</th>
<th class="right">% of max AUM</th>
</tr></thead>
<tbody>{err_isin_detail}</tbody>
</table>
</div>
</div>
<div class="section-label">01 · Aggregate view all ISINs combined</div>
<div class="card"> <div class="card">
<div class="card-header"> <div class="card-header">
@ -832,6 +1123,75 @@ new Chart(document.getElementById('chartIsinTs'), {{
}} }}
}}); }});
// Error account charts
const ERR_DATES = {err_dates_str};
const ERR_AGG_STOCK = {err_agg_stock_js};
const ERR_AGG_RES = {err_agg_res_js};
const ERR_AGG_PCT = {err_agg_pct_js};
const ERR_ISIN_TS = {err_isin_ts_json};
const ERR_ISIN_DATES= {err_isin_dates_str};
// Aggregate error stock line with zero reference
new Chart(document.getElementById('chartErrAggStock'), {{
type: 'line',
data: {{
labels: ERR_DATES,
datasets: [
{{ label: 'Aggregate error stock (shares)',
data: ERR_AGG_STOCK,
borderColor: '#ef4444', backgroundColor: '#ef444418',
borderWidth: 2, pointRadius: 0, tension: 0.3, fill: true }},
]
}},
options: {{
responsive: true, maintainAspectRatio: false,
interaction: {{mode:'index', intersect:false}},
plugins: {{ legend:{{display:false}}, tooltip: tip() }},
scales: {{
x: xAxis(),
y: {{
...yAxis('Shares'),
grid: {{
color: ctx => ctx.tick.value === 0 ? '#ffffff55' : '#1a2030',
lineWidth: ctx => ctx.tick.value === 0 ? 1.5 : 1,
}}
}}
}}
}}
}});
// Monthly residual bar
new Chart(document.getElementById('chartErrAggRes'), {{
type: 'bar',
data: {{
labels: ERR_DATES,
datasets: [{{ label: 'Monthly residual (shares)', data: ERR_AGG_RES,
backgroundColor: ERR_AGG_RES.map(v => v < 0 ? '#ef444488' : '#f59e0b88'),
borderColor: ERR_AGG_RES.map(v => v < 0 ? '#ef4444' : '#f59e0b'),
borderWidth: 1, borderRadius: 2 }}]
}},
options: {{
responsive: true, maintainAspectRatio: false,
plugins: {{legend:{{display:false}}, tooltip: tip()}},
scales: {{ x: xAxis(), y: yAxis('Shares') }}
}}
}});
// Per-ISIN error stock timeseries
new Chart(document.getElementById('chartErrIsinTs'), {{
type: 'line',
data: {{ labels: ERR_ISIN_DATES, datasets: ERR_ISIN_TS }},
options: {{
responsive: true, maintainAspectRatio: false,
interaction: {{mode:'index', intersect:false}},
plugins: {{
legend:{{position:'right',labels:{{boxWidth:10,padding:8,font:{{size:10}}}}}},
tooltip: tip()
}},
scales: {{ x: xAxis(), y: yAxis('Error stock (shares)') }}
}}
}});
// Aggregate charts // Aggregate charts
const AGG_DATES = {agg_dates_str}; const AGG_DATES = {agg_dates_str};
const AGG_DELTA = {agg_delta_js}; const AGG_DELTA = {agg_delta_js};
@ -926,8 +1286,8 @@ def main():
parser.add_argument("--out", default="carmignac_broken_months.csv", parser.add_argument("--out", default="carmignac_broken_months.csv",
help="Machine-readable output (loaded by carmignac_repair.py)") help="Machine-readable output (loaded by carmignac_repair.py)")
parser.add_argument("--html", default="carmignac_diagnostics.html") parser.add_argument("--html", default="carmignac_diagnostics.html")
parser.add_argument("--alpha", type=float, default=0.15, parser.add_argument("--alpha", type=float, default=0.02,
help="Tolerance threshold (default 0.15 = 15%%)") help="Tolerance threshold (default 0.02 = 2%%)")
parser.add_argument("--lag", type=int, default=3, parser.add_argument("--lag", type=int, default=3,
help="Boundary days to test for accounting lag (default 3)") help="Boundary days to test for accounting lag (default 3)")
args = parser.parse_args() args = parser.parse_args()
@ -948,21 +1308,35 @@ def main():
df_broken, df_all = detect_broken_months(aum, flows, alpha=args.alpha, lag_days=args.lag) df_broken, df_all = detect_broken_months(aum, flows, alpha=args.alpha, lag_days=args.lag)
df_agg = detect_aggregate_broken_months(aum, flows, alpha=args.alpha, lag_days=args.lag) df_agg = detect_aggregate_broken_months(aum, flows, alpha=args.alpha, lag_days=args.lag)
print(f"\n[Error account] Building error account...")
df_err_isin, df_err_agg = build_error_account(aum, flows, lag_days=args.lag)
print_summary(df_broken, df_all, args.alpha) print_summary(df_broken, df_all, args.alpha)
n_agg_broken = int(df_agg["broken"].sum()) n_agg_broken = int(df_agg["broken"].sum())
print(f" Aggregate broken months : {n_agg_broken} " print(f" Aggregate broken months : {n_agg_broken} "
f"(of which lags: {int(df_agg['is_lag'].sum())})") f"(of which lags: {int(df_agg['is_lag'].sum())})")
max_err = float(df_err_agg["stock_error_agg"].abs().max())
print(f" Max aggregate error stock : {max_err:,.1f} shares "
f"({float(df_err_agg['stock_error_agg_pct'].max()):.3f}% of total AUM)")
# CSV output — this is what carmignac_repair.py will load # CSV output — this is what carmignac_repair.py will load
if len(df_broken): if len(df_broken):
df_broken.to_csv(args.out, index=False) df_broken.to_csv(args.out, index=False)
print(f"[Export] Broken months CSV → {args.out}") print(f"[Export] Broken months CSV → {args.out}")
else: else:
pd.DataFrame(columns=["date", "isin", "missing_pct", "is_lag"]).to_csv(args.out, index=False) pd.DataFrame(columns=["date","isin","missing_pct","is_lag"]).to_csv(args.out, index=False)
print(f"[Export] No broken months — empty CSV → {args.out}") print(f"[Export] No broken months — empty CSV → {args.out}")
html = build_html(df_broken, df_all, df_agg, args.alpha) # Error account CSV
# Derive the two error-account CSV paths from --out.
# A plain str.replace on the full path is a no-op when --out does not contain
# "broken_months"; both exports would then be written to args.out itself and
# overwrite the broken-months CSV. Work on the file stem only, and fall back
# to a suffix so the three outputs always get distinct names.
out_dir, out_name = os.path.split(args.out)
out_stem, out_ext = os.path.splitext(out_name)
if "broken_months" in out_stem:
    err_stem = out_stem.replace("broken_months", "error_account")
else:
    err_stem = f"{out_stem}_error_account"
err_out = os.path.join(out_dir, f"{err_stem}{out_ext}")
err_agg_out = os.path.join(out_dir, f"{err_stem}_agg{out_ext}")
df_err_isin.to_csv(err_out, index=False)
df_err_agg.to_csv(err_agg_out, index=False)
print(f"[Export] Error account (ISIN) → {err_out}")
print(f"[Export] Error account (agg) → {err_agg_out}")
html = build_html(df_broken, df_all, df_agg, df_err_isin, df_err_agg, args.alpha)
with open(args.html, "w", encoding="utf-8") as f: with open(args.html, "w", encoding="utf-8") as f:
f.write(html) f.write(html)
print(f"[Export] HTML report → {args.html}") print(f"[Export] HTML report → {args.html}")

View File

@ -6,6 +6,7 @@ Carmignac Data Challenge — Registrar ID Repair Pipeline
Étape 3 : Chirurgie de code (matching 1-to-1) Étape 3 : Chirurgie de code (matching 1-to-1)
""" """
import os
import pandas as pd import pandas as pd
import numpy as np import numpy as np
from collections import defaultdict from collections import defaultdict
@ -19,6 +20,8 @@ ALPHA = 0.05 # tolérance réconciliation : 5% du stock à t
MIN_AUM_EUR = 5e6 # seuil filtrage étape 1 — 0 pour les heads de test, 5e6 en prod MIN_AUM_EUR = 5e6 # seuil filtrage étape 1 — 0 pour les heads de test, 5e6 en prod
MIN_JACCARD = 0.3 # seuil minimal similarité portefeuille pour chirurgie MIN_JACCARD = 0.3 # seuil minimal similarité portefeuille pour chirurgie
SCORE_DROP_THRESHOLD = 0.15 # si score chute de >15% → candidat chirurgie SCORE_DROP_THRESHOLD = 0.15 # si score chute de >15% → candidat chirurgie
# NOTE: MAX_SURGERY_LOOKBACK is defined once, further down, in the
# extended-surgery-window section of these constants; it is not repeated here
# so the value has a single source of truth.
SYMMETRY_ATTENUATION = 0.05  # attenuation factor applied when a symmetric break is detected (cases 1/3)
# ── Broken months ────────────────────────────────────────────── # ── Broken months ──────────────────────────────────────────────
# Attenuation factor applied to reconciliation errors on months flagged # Attenuation factor applied to reconciliation errors on months flagged
@ -35,6 +38,15 @@ BROKEN_MONTH_ATTENUATION = 0.2 # reduce error to 20% on broken months
# attenuated (same factor as broken months). # attenuated (same factor as broken months).
LAG_ATTENUATION = 0.2 # reduce error to 20% on likely lag months LAG_ATTENUATION = 0.2 # reduce error to 20% on likely lag months
# ── Fenêtre de chirurgie étendue ───────────────────────────────
# Quand aucun bon candidat n'est trouvé à t-1, la chirurgie remonte
# jusqu'à MAX_SURGERY_LOOKBACK mois en arrière. Pour chaque mois k
# supplémentaire, le score composite est multiplié par un facteur de
# confiance décroissant : confidence(k) = 1 - (k-1)/MAX_SURGERY_LOOKBACK.
# Le client suggère 6 mois (délai maximal de résolution des transferts
# asymétriques, lié au cycle de paiement des rétrocessions trimestrielles).
MAX_SURGERY_LOOKBACK = 6
EXCLUDE_REGISTRAR = ["Off Distribution", "Private Clients"] EXCLUDE_REGISTRAR = ["Off Distribution", "Private Clients"]
# ───────────────────────────────────────────── # ─────────────────────────────────────────────
@ -276,11 +288,69 @@ def score_propagation(panel, monthly_flows, monthly_flows_lag, weights, universe
flows_idx = monthly_flows.set_index(['date', 'reg_id', 'isin'])['qty_net_month'] flows_idx = monthly_flows.set_index(['date', 'reg_id', 'isin'])['qty_net_month']
flows_idx_lag = monthly_flows_lag.set_index(['date', 'reg_id', 'isin'])['qty_net_month'] flows_idx_lag = monthly_flows_lag.set_index(['date', 'reg_id', 'isin'])['qty_net_month']
# ── Symmetric-break detection (computed per time step, not precomputed) ──
# For each (isin, t), the backward loop below looks at every account's stock
# variation net of its recorded flow (the residual).
# A symmetric break = one account loses X shares of an ISIN while another gains X.
# It is detected through the aggregated net residual per ISIN: if the net is
# small relative to the gross (absolute) residual, the breaks offset each other.
# Only per-ISIN sums are kept; a full {(t_curr, isin) → {reg_id → delta_qty}}
# map is not built up front (too much memory for ~400 accounts).
# Remonter dans le temps # Remonter dans le temps
for i in range(len(all_months) - 2, -1, -1): for i in range(len(all_months) - 2, -1, -1):
t_prev = all_months[i] t_prev = all_months[i]
t_curr = all_months[i + 1] t_curr = all_months[i + 1]
# ── Symmetric-break detection at this time step ──────────────
# A "break" is an (account, ISIN) pair whose stock variation between t_prev
# and t_curr is not explained by its recorded net flow, beyond the ALPHA
# relative tolerance. For every ISIN we accumulate the signed and the
# absolute residuals of its breaking accounts: when the signed sum is small
# compared with the absolute sum, the individual breaks largely cancel out
# (one account loses what another gains), so there is no net loss at ISIN
# level and the per-account errors are attenuated downstream.
isin_residuals = defaultdict(float)  # isin -> signed sum of breaking residuals
isin_total_abs = defaultdict(float)  # isin -> sum of |residual| over breaking accounts

# get_level_values(0) returns one label per column, i.e. each registrar is
# repeated once per ISIN it holds. Deduplicate so that every
# (registrar, ISIN) pair contributes exactly once to the sums; otherwise
# accounts holding many ISINs are over-weighted and the net/gross ratio
# below is distorted.
for reg in panel.columns.get_level_values(0).unique():
    reg_panel = panel[reg]
    for isin in reg_panel.columns:
        holdings = reg_panel[isin]
        q_t = holdings.get(t_curr, np.nan)
        q_prev = holdings.get(t_prev, np.nan)
        if pd.isna(q_t) or pd.isna(q_prev):
            continue  # stock unknown on one side: nothing to reconcile
        try:
            f = flows_idx.loc[(t_curr, reg, isin)]
        except KeyError:
            f = 0.0  # no flow recorded for this account/ISIN/month
        residual = (q_t - q_prev) - f
        denom = max(abs(q_t), abs(q_prev), 1e-9)  # floor avoids division by zero
        if abs(residual) / denom < ALPHA:
            continue  # within tolerance: not a break
        isin_residuals[isin] += residual
        isin_total_abs[isin] += abs(residual)

# An ISIN is "symmetric" when the net residual is below 20% of the gross
# residual, i.e. the individual errors mostly cancel each other out.
symmetric_isins = {
    isin
    for isin, net in isin_residuals.items()
    if isin_total_abs[isin] > 0 and abs(net) / isin_total_abs[isin] < 0.20
}
errors_at_t = {} errors_at_t = {}
new_scores = {} new_scores = {}
@ -335,12 +405,15 @@ def score_propagation(panel, monthly_flows, monthly_flows_lag, weights, universe
qty_t_prev, qty_t, net_flow, alpha=ALPHA qty_t_prev, qty_t, net_flow, alpha=ALPHA
) )
# ── Attenuation on broken / lag months ────────────── # ── Attenuation on broken / lag / symmetric months ───
# If this (isin, month) is flagged as broken at market # Priority: symmetric > broken > lag
# level, the error is not the account's fault — attenuate.
if err_ratio > 0: if err_ratio > 0:
key = (t_curr, isin) key = (t_curr, isin)
if key in broken_months or key in lag_months: if isin in symmetric_isins:
# Rupture compensée à l'agrégé → cas 1 ou 3,
# pas de perte nette de données → atténuation forte
err_ratio = err_ratio * SYMMETRY_ATTENUATION
elif key in broken_months or key in lag_months:
# Try lag-window flow to distinguish lag vs genuine gap # Try lag-window flow to distinguish lag vs genuine gap
try: try:
net_flow_lag = flows_idx_lag.loc[(t_curr, reg_curr, isin)] net_flow_lag = flows_idx_lag.loc[(t_curr, reg_curr, isin)]
@ -590,17 +663,26 @@ def run_surgery_pass(scores_history, errors_history, panel, monthly_flows,
# ── Candidats disponibles ── # ── Candidats disponibles ──
# On exclut les codes déjà mappés à un autre compte, # On exclut les codes déjà mappés à un autre compte,
# mais reg_curr lui-même est un candidat valide (self-mapping : # mais reg_curr lui-même est un candidat valide (self-mapping).
# le compte existait déjà sous ce code à t-1, dormant ou partiel).
available = (all_regs_in_panel - set(mapping_inv.keys())) | {reg_curr} available = (all_regs_in_panel - set(mapping_inv.keys())) | {reg_curr}
best_candidate = None best_candidate = None
best_score_after = score_prev_no_surgery # baseline = pas de chirurgie best_score_after = score_prev_no_surgery # baseline = pas de chirurgie
best_composite = 0.0 best_composite = 0.0
best_lookback = 0 # nombre de mois remontés pour trouver ce candidat
# ── Fenêtre de recherche étendue : jusqu'à MAX_SURGERY_LOOKBACK mois ──
# On cherche d'abord à t-1 (k=1), puis t-2 … t-MAX si rien trouvé.
# La confiance décroît avec la distance : confidence(k) = 1 - (k-1)/MAX
for k in range(1, MAX_SURGERY_LOOKBACK + 1):
if i - (k - 1) < 0:
break # on a atteint le début de l'historique
t_lookup = all_months[i - (k - 1)] # date candidate = t_prev - (k-1)
confidence = 1.0 - (k - 1) / MAX_SURGERY_LOOKBACK
for j in available: for j in available:
# Pré-filtre rapide : overlap ISIN minimal # Pré-filtre rapide : overlap ISIN minimal
isin_j = reg_isin_at_date.get(j, {}).get(t_prev, set()) isin_j = reg_isin_at_date.get(j, {}).get(t_lookup, set())
if not isin_curr or not isin_j: if not isin_curr or not isin_j:
continue continue
inter = len(isin_curr & isin_j) inter = len(isin_curr & isin_j)
@ -610,18 +692,27 @@ def run_surgery_pass(scores_history, errors_history, panel, monthly_flows,
if jac < MIN_JACCARD: if jac < MIN_JACCARD:
continue continue
# Score après chirurgie avec ce candidat # Score après chirurgie avec ce candidat à t_lookup
score_after = _recompute_score_with_candidate( # (on utilise t_curr comme référence de stock, t_lookup comme prior)
reg_curr, j, t_prev, t_curr, panel, flows_idx, score_curr score_after_raw = _recompute_score_with_candidate(
reg_curr, j, t_lookup, t_curr, panel, flows_idx, score_curr
) )
composite = jac * (score_after / score_curr) if score_curr > 0 else 0 # Appliquer le facteur de confiance lié à la distance temporelle
score_after = score_curr * confidence * (score_after_raw / score_curr) if score_curr > 0 else score_after_raw
composite = jac * confidence * (score_after_raw / score_curr) if score_curr > 0 else 0
if score_after > best_score_after: if score_after > best_score_after:
best_score_after = score_after best_score_after = score_after
best_candidate = j best_candidate = j
best_composite = composite best_composite = composite
best_lookback = k
# Si on a trouvé un bon candidat à cette distance, on s'arrête
if best_candidate is not None:
break
if best_candidate: if best_candidate:
lookback_note = f", lookback={best_lookback}m" if best_lookback > 1 else ""
surgery_log.append({ surgery_log.append({
'date': t_prev, 'date': t_prev,
'reg_orig': reg_orig, 'reg_orig': reg_orig,
@ -632,15 +723,15 @@ def run_surgery_pass(scores_history, errors_history, panel, monthly_flows,
'score_after': round(best_score_after, 6), 'score_after': round(best_score_after, 6),
'drop_without_surgery': round(drop_ratio, 4), 'drop_without_surgery': round(drop_ratio, 4),
'gain_vs_no_surgery': round(best_score_after - score_prev_no_surgery, 6), 'gain_vs_no_surgery': round(best_score_after - score_prev_no_surgery, 6),
'lookback_months': best_lookback,
}) })
print(f" 🔧 CHIRURGIE {t_prev.date()} | {reg_orig} : " print(f" 🔧 CHIRURGIE {t_prev.date()} | {reg_orig} : "
f"{reg_curr}{best_candidate} " f"{reg_curr}{best_candidate} "
f"(composite={best_composite:.3f}, " f"(composite={best_composite:.3f}, "
f"score {score_curr:.4f}{best_score_after:.4f})") f"score {score_curr:.4f}{best_score_after:.4f}"
f"{lookback_note})")
# Mise à jour mapping # Mise à jour mapping
# Si self-mapping (best_candidate == reg_curr), on ne touche pas
# mapping_inv car le code ne change pas — on met juste à jour le score.
if best_candidate != reg_curr: if best_candidate != reg_curr:
if reg_curr in mapping_inv: if reg_curr in mapping_inv:
del mapping_inv[reg_curr] del mapping_inv[reg_curr]
@ -681,7 +772,7 @@ def export_results(scores_history, mapping_history, surgery_log, all_months, out
df_scores = pd.DataFrame(rows) if rows else pd.DataFrame(columns=['date', 'reg_id', 'score']) df_scores = pd.DataFrame(rows) if rows else pd.DataFrame(columns=['date', 'reg_id', 'score'])
if not df_scores.empty: if not df_scores.empty:
df_scores = df_scores.sort_values(['date', 'score'], ascending=[True, False]) df_scores = df_scores.sort_values(['date', 'score'], ascending=[True, False])
df_scores.to_csv(f"repair_results/{out_prefix}_scores.csv", index=False) df_scores.to_csv(f"/mnt/user-data/outputs/{out_prefix}_scores.csv", index=False)
# Mapping history # Mapping history
rows_m = [] rows_m = []
@ -710,13 +801,13 @@ def export_results(scores_history, mapping_history, surgery_log, all_months, out
# ───────────────────────────────────────────── # ─────────────────────────────────────────────
# 8. PIPELINE PRINCIPAL # 8. PIPELINE PRINCIPAL
# ───────────────────────────────────────────── # ─────────────────────────────────────────────
def run_pipeline(broken_months_path=None): def run_pipeline(aum_path, flows_path, broken_months_path=None):
print("=" * 60) print("=" * 60)
print("CARMIGNAC — Pipeline de réparation des Registrar IDs") print("CARMIGNAC — Pipeline de réparation des Registrar IDs")
print("=" * 60) print("=" * 60)
# Chargement # Chargement
aum, flows = load_data() aum, flows = load_data(aum_path, flows_path)
# Broken months (optional — produced by carmignac_diagnostics.py) # Broken months (optional — produced by carmignac_diagnostics.py)
broken_months, lag_months = load_broken_months(broken_months_path) broken_months, lag_months = load_broken_months(broken_months_path)

View File

@ -0,0 +1,44 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": null,
"id": "5c8fc6c5",
"metadata": {},
"outputs": [],
"source": [
"import os\n",
"import s3fs\n",
"\n",
"def push_file(local_path, s3_path):\n",
" fs = s3fs.S3FileSystem(\n",
" client_kwargs={'endpoint_url': 'https://' + 'minio-simple.lab.groupe-genes.fr'},\n",
" key=os.environ[\"AWS_ACCESS_KEY_ID\"],\n",
" secret=os.environ[\"AWS_SECRET_ACCESS_KEY\"],\n",
" token=os.environ[\"AWS_SESSION_TOKEN\"]\n",
" )\n",
"\n",
" with open(local_path, 'rb') as local_f, fs.open(s3_path, 'wb') as s3_f:\n",
" s3_f.write(local_f.read())"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "d43b725e",
"metadata": {},
"outputs": [],
"source": [
"push_file('AUM_repaired.csv', 'projet-bdc-carmignac-g3//paco/AUM_repaired.csv')\n",
"push_file('AUM_paths.csv', 'projet-bdc-carmignac-g3//paco/AUM_paths.csv')"
]
}
],
"metadata": {
"language_info": {
"name": "python"
}
},
"nbformat": 4,
"nbformat_minor": 5
}

File diff suppressed because one or more lines are too long