Repairing
This commit is contained in:
parent
9f7aba7b2e
commit
f7ebe15534
4211
carmignac_diagnostics.html
Normal file
4211
carmignac_diagnostics.html
Normal file
File diff suppressed because one or more lines are too long
|
|
@ -1,702 +0,0 @@
|
|||
"""
|
||||
Carmignac Data Challenge — Broken Months Diagnostics
|
||||
=====================================================
|
||||
Detects months where the aggregate stock-flow equation is violated
|
||||
at the ISIN level (across all accounts):
|
||||
|
||||
Σ_r Q_{r,s}(t) - Σ_r Q_{r,s}(t-1) ≠ Σ_r F_{r,s}(t-1→t)
|
||||
|
||||
The residual is the "missing flow":
|
||||
missing_{s}(t) = [Q_agg(t) - Q_agg(t-1)] - F_agg(t)
|
||||
|
||||
This is a market-level check, independent of individual account identity.
|
||||
It captures:
|
||||
- Genuinely missing flow records
|
||||
- End-of-month accounting lags (transactions dated at boundary)
|
||||
- Corporate actions (dividends, splits) not reflected in flows
|
||||
|
||||
Outputs
|
||||
-------
|
||||
carmignac_broken_months.csv — machine-readable, loaded by carmignac_repair.py
|
||||
carmignac_diagnostics.html — interactive HTML report
|
||||
|
||||
Usage
|
||||
-----
|
||||
python carmignac_diagnostics.py
|
||||
    python carmignac_diagnostics.py \\
        --out   carmignac_broken_months.csv \\
        --html  carmignac_diagnostics.html \\
        --alpha 0.02 \\
        --lag   3

(The AUM and flows inputs are not command-line options: load_data() reads
them from the project's S3 bucket.)
|
||||
"""
|
||||
|
||||
import argparse
|
||||
import json
|
||||
import os
|
||||
import sys
|
||||
|
||||
from collections import defaultdict
|
||||
import s3fs
|
||||
|
||||
import numpy as np
|
||||
import pandas as pd
|
||||
|
||||
|
||||
# ─────────────────────────────────────────────────────────────
|
||||
# 1. LOAD
|
||||
# ─────────────────────────────────────────────────────────────
|
||||
def load_data():
    """
    Load the raw AUM and flows extracts from the project's S3 (MinIO) bucket.

    Credentials come from the standard AWS environment variables.
    ``AWS_ACCESS_KEY_ID`` and ``AWS_SECRET_ACCESS_KEY`` are required;
    ``AWS_SESSION_TOKEN`` is optional, because only temporary credentials
    carry one.

    Returns
    -------
    aum, flows : the two ';'-separated CSV files as DataFrames, with
                 'Centralisation Date' converted to datetime in both.

    Raises
    ------
    KeyError : if the access key or the secret key is not set.
    """
    fs = s3fs.S3FileSystem(
        client_kwargs={'endpoint_url': 'https://'+'minio-simple.lab.groupe-genes.fr'},
        key=os.environ["AWS_ACCESS_KEY_ID"],
        secret=os.environ["AWS_SECRET_ACCESS_KEY"],
        # Long-lived credentials have no session token; None is s3fs's default.
        token=os.environ.get("AWS_SESSION_TOKEN"),
    )

    def read_extract(path):
        """Read one ';'-separated extract and parse its date column."""
        with fs.open(path, 'rb') as f:
            df = pd.read_csv(f, sep=";")
        # NOTE(review): the date format is inferred by pandas — confirm the
        # extracts are not day-first, otherwise pass an explicit format.
        df['Centralisation Date'] = pd.to_datetime(df['Centralisation Date'])
        return df

    flows = read_extract('projet-bdc-data//carmignac/Flows ENSAE V2 -20251105.csv')
    aum = read_extract('projet-bdc-data//carmignac/AUM ENSAE V2 -20251105.csv')

    return aum, flows
|
||||
|
||||
# ─────────────────────────────────────────────────────────────
|
||||
# 2. AGGREGATE AND DETECT BROKEN MONTHS
|
||||
# ─────────────────────────────────────────────────────────────
|
||||
|
||||
# Column layout shared by both returned frames (also used when they are empty).
_RESULT_COLUMNS = [
    "date", "isin", "q_agg_prev", "q_agg_curr", "delta_aum",
    "flow_agg", "missing_flow", "missing_pct", "broken", "is_lag",
]


def _locate_month(dates, months, lower_offset=0, upper_offset=0):
    """
    Find, for every date, the first month-end m with lo(m) < date <= hi(m).

    lo(m) = EOM(m-1) - lower_offset days, hi(m) = m + upper_offset days.
    Returns ``(pos, found)``: ``pos[k]`` is the position in ``months`` of the
    matching month-end and ``found[k]`` says whether a match exists
    (NaT dates and dates outside every window are not found).
    """
    d = pd.Series(dates).to_numpy(dtype="datetime64[ns]")
    n_months = len(months)
    if n_months == 0:
        return np.zeros(len(d), dtype=np.intp), np.zeros(len(d), dtype=bool)

    hi = (months + pd.Timedelta(days=upper_offset)).to_numpy(dtype="datetime64[ns]")
    lo = ((months - pd.offsets.MonthEnd(1))
          - pd.Timedelta(days=lower_offset)).to_numpy(dtype="datetime64[ns]")

    # hi is sorted, so the first window whose upper bound reaches the date is
    # the only candidate: earlier windows end before it, and later windows
    # have an even larger lower bound.
    pos = np.searchsorted(hi, d, side="left")
    found = pos < n_months
    pos = np.where(found, pos, 0)
    found &= d > lo[pos]          # NaT compares False and is rejected here
    return pos, found


def detect_broken_months(aum, flows, alpha=0.02, lag_days=3, min_abs_shares=1.0):
    """
    For each (isin, month-end t), compute:
      - Q_agg(t)    : total shares held across all accounts
      - Q_agg(t-1)  : idem previous month (forward-filled)
      - F_agg(t)    : total net flows recorded in ]EOM(t-1), EOM(t)]
      - missing(t)  : [Q_agg(t) - Q_agg(t-1)] - F_agg(t)
      - missing_pct : |missing| / max(|ΔQ_agg|, |F_agg|, min_abs_shares)

    The percentage is normalised by the size of the *movement*, not by the
    stock level: "what fraction of the expected movement is unaccounted
    for?" (100% = the entire movement is missing).

    A month is flagged as "broken" when missing_pct > alpha.

    Additionally, a month is flagged as a potential "lag" when:
      - It is broken with the standard window
      - But is NOT broken when flows are re-bucketed with windows widened
        by lag_days on both sides.

    The widened windows overlap and the first matching month wins, so in
    effect a flow dated up to lag_days *after* a month-end is attributed to
    the month that just ended; the lower extension only matters for the
    first month of the calendar.
    NOTE(review): flows dated shortly *before* a month-end are therefore
    never tested against the following month — confirm this one-sided test
    is what is intended.

    Parameters
    ----------
    aum            : needs 'Product - Isin', 'Centralisation Date' (datetime)
                     and 'Quantity - AUM'
    flows          : needs 'Product - Isin', 'Centralisation Date' (datetime)
                     and 'Quantity - NetFlows'
    alpha          : tolerance threshold (same as ALPHA in carmignac_repair.py)
    lag_days       : number of boundary days to test for accounting lag
    min_abs_shares : floor of the denominator, suppresses noise from
                     residual micro-positions (rounding artefacts)

    Returns
    -------
    df_broken : DataFrame with all (isin, date) pairs where missing_pct > alpha,
                sorted by missing_pct descending
    df_all    : Full DataFrame including non-broken months (for plotting)

    Both frames always carry the same columns, even when empty.
    """
    if aum.empty:
        empty = pd.DataFrame(columns=_RESULT_COLUMNS)
        return empty, empty.copy()

    # Monthly calendar. The offset object is used instead of the "ME" alias,
    # which only exists on pandas >= 2.2.
    t_min = aum["Centralisation Date"].min()
    t_max = aum["Centralisation Date"].max()
    all_months = pd.date_range(t_min, t_max, freq=pd.offsets.MonthEnd())

    # ── Aggregate AUM per (isin, month-end) ──────────────────────
    aum_agg = (
        aum.groupby(["Product - Isin", "Centralisation Date"])["Quantity - AUM"]
        .sum()
        .reset_index()
        .rename(columns={"Product - Isin": "isin",
                         "Centralisation Date": "date",
                         "Quantity - AUM": "qty_agg"})
    )
    # Forward-fill sparse panel
    aum_pivot = aum_agg.pivot(index="date", columns="isin", values="qty_agg")
    aum_pivot = aum_pivot.reindex(all_months).ffill()
    isins = aum_pivot.columns.tolist()

    # ── Aggregate flows per (isin, month-end) ────────────────────
    def bucket_flows(flows_df, months, lower_offset=0, upper_offset=0):
        """Aggregate flows with optional boundary extension (in days)."""
        pos, found = _locate_month(flows_df["Centralisation Date"], months,
                                   lower_offset, upper_offset)
        if not found.any():
            return pd.DataFrame(columns=["isin", "date", "flow_agg"])
        kept = flows_df.loc[found, ["Product - Isin", "Quantity - NetFlows"]]
        kept = kept.assign(date=months[pos[found]])
        return (kept.groupby(["Product - Isin", "date"])["Quantity - NetFlows"]
                    .sum()
                    .reset_index()
                    .rename(columns={"Product - Isin": "isin",
                                     "Quantity - NetFlows": "flow_agg"}))

    def flows_matrix(df, months):
        """(month × isin) array of bucketed flows, 0.0 where nothing was recorded."""
        if df.empty:
            piv = pd.DataFrame(index=months)
        else:
            piv = (df.pivot(index="date", columns="isin", values="flow_agg")
                     .reindex(months)
                     .fillna(0.0))
        # ISINs without any flow get a zero column; flow-only ISINs are dropped.
        return piv.reindex(columns=isins, fill_value=0.0).to_numpy(dtype=float)

    f_std_all = flows_matrix(bucket_flows(flows, all_months), all_months)
    f_lag_all = flows_matrix(
        bucket_flows(flows, all_months,
                     lower_offset=lag_days, upper_offset=lag_days),
        all_months,
    )
    q_all = aum_pivot.to_numpy(dtype=float)

    # ── Compute residuals ─────────────────────────────────────────
    rows = []
    for i in range(1, len(all_months)):
        t_curr = all_months[i]

        for j, isin in enumerate(isins):
            q_prev = q_all[i - 1, j]
            q_curr = q_all[i, j]
            # No position observed yet for this ISIN: nothing to check.
            if np.isnan(q_curr) or np.isnan(q_prev):
                continue

            delta = q_curr - q_prev

            # Standard window
            f_std = f_std_all[i, j]
            missing_std = delta - f_std

            # Extended lag window
            f_lag = f_lag_all[i, j]
            missing_lag = delta - f_lag

            # Normalise by the movement, floored at min_abs_shares so that
            # tiny positions do not yield astronomically large percentages.
            pct_std = abs(missing_std) / max(abs(delta), abs(f_std), min_abs_shares)
            pct_lag = abs(missing_lag) / max(abs(delta), abs(f_lag), min_abs_shares)

            broken_std = bool(pct_std > alpha)
            broken_lag = bool(pct_lag > alpha)

            rows.append({
                "date":         t_curr,
                "isin":         isin,
                "q_agg_prev":   round(q_prev, 3),
                "q_agg_curr":   round(q_curr, 3),
                "delta_aum":    round(delta, 3),
                "flow_agg":     round(f_std, 3),
                "missing_flow": round(missing_std, 3),
                "missing_pct":  round(pct_std, 6),
                "broken":       broken_std,
                # A "lag" month: broken with standard, NOT broken with extended window
                "is_lag":       broken_std and not broken_lag,
            })

    # Explicit columns keep the schema stable when no pair could be examined.
    df_all = pd.DataFrame(rows, columns=_RESULT_COLUMNS)
    df_broken = (df_all.loc[df_all["broken"].astype(bool)]
                       .sort_values("missing_pct", ascending=False))
    return df_broken, df_all
|
||||
|
||||
|
||||
# ─────────────────────────────────────────────────────────────
|
||||
# 3. PRINT SUMMARY
|
||||
# ─────────────────────────────────────────────────────────────
|
||||
|
||||
def print_summary(df_broken, df_all, alpha):
    """
    Print a console summary of the detection results.

    Parameters
    ----------
    df_broken : broken (isin, date) rows; needs 'date', 'isin',
                'missing_flow', 'missing_pct' and 'is_lag'
    df_all    : every examined (isin, date) row (only its length is used)
    alpha     : tolerance threshold, echoed in the output
    """
    total = len(df_all)
    n_broken = len(df_broken)
    n_lag = int(df_broken["is_lag"].sum()) if n_broken else 0
    # Nothing examined (e.g. a single-month panel) must not divide by zero.
    broken_share = n_broken / total * 100 if total else 0.0

    print("\n" + "=" * 60)
    print(" CARMIGNAC — Broken Months Diagnostics")
    print("=" * 60)
    print(f" (isin, month) pairs examined : {total}")
    print(f" Broken (missing_pct > {alpha:.0%}) : {n_broken} "
          f"({broken_share:.1f}%)")
    print(f" Of which likely lag : {n_lag}")
    print(f" Of which genuine gap : {n_broken - n_lag}")

    if n_broken:
        print("\n Top 10 by missing_pct:")
        cols = ["date", "isin", "missing_flow", "missing_pct", "is_lag"]
        print(df_broken[cols].head(10).to_string(index=False))

        # Monthly breakdown: the five months with the most broken ISINs
        by_month = (df_broken.assign(abs_missing=df_broken["missing_flow"].abs())
                    .groupby("date")
                    .agg(n_broken=("isin", "count"),
                         total_missing=("abs_missing", "sum"))
                    .sort_values("n_broken", ascending=False)
                    .head(5))
        if len(by_month):
            print("\n Most affected months:")
            print(by_month.to_string())
    print()
|
||||
|
||||
|
||||
# ─────────────────────────────────────────────────────────────
|
||||
# 4. BUILD HTML REPORT
|
||||
# ─────────────────────────────────────────────────────────────
|
||||
|
||||
def build_html(df_broken, df_all, alpha, lag_days=3):
    """
    Render the diagnostics as a single HTML page (Chart.js loaded from a CDN).

    Parameters
    ----------
    df_broken : broken (isin, date) rows, as returned by detect_broken_months
    df_all    : every examined (isin, date) row
    alpha     : tolerance threshold, shown in the header
    lag_days  : width of the lag window quoted in the report text; pass the
                value used for detection when it differs from the default

    Returns
    -------
    str : the complete HTML document

    Both frames may be empty as long as they carry the expected columns.
    """
    # Imported locally: the module does not import `html` at file level.
    from html import escape

    def js_numbers(values, dec=4):
        """JSON array of rounded floats, NaN mapped to null."""
        return json.dumps([None if np.isnan(v) else round(float(v), dec)
                           for v in values])

    def js_embed(payload):
        """JSON for inlining in <script>; '</' would close the element early."""
        return json.dumps(payload).replace("</", "<\\/")

    def ymd(value):
        return pd.Timestamp(value).strftime("%Y-%m-%d")

    df_all = df_all.assign(date=pd.to_datetime(df_all["date"]))
    df_broken = df_broken.assign(date=pd.to_datetime(df_broken["date"]))

    # ── JS-ready data ────────────────────────────────────────────
    # Timeline: n_broken, n_lag and total |missing| per month
    month_index = pd.DatetimeIndex(df_all["date"].unique()).sort_values()
    flagged = df_all.loc[df_all["broken"].astype(bool)]
    if flagged.empty:
        tl = pd.DataFrame(0.0, index=month_index,
                          columns=["n_broken", "total_missing", "n_lag"])
    else:
        tl = (flagged.assign(abs_missing=flagged["missing_flow"].abs(),
                             lag_flag=flagged["is_lag"].astype(int))
              .groupby("date")
              .agg(n_broken=("isin", "count"),
                   total_missing=("abs_missing", "sum"),
                   n_lag=("lag_flag", "sum"))
              .reindex(month_index)
              .fillna(0))

    dates_str = json.dumps([ymd(d) for d in month_index])
    all_dates_str = dates_str
    n_broken_js = js_numbers(tl["n_broken"].values, 0)
    total_miss_js = js_numbers(tl["total_missing"].values)
    n_lag_js = js_numbers(tl["n_lag"].values, 0)

    # Per-ISIN summary, most affected first
    if df_broken.empty:
        isin_sum = pd.DataFrame(columns=["n_months", "avg_pct", "total_abs"])
    else:
        isin_sum = (df_broken.assign(abs_missing=df_broken["missing_flow"].abs())
                    .groupby("isin")
                    .agg(n_months=("date", "count"),
                         avg_pct=("missing_pct", "mean"),
                         total_abs=("abs_missing", "sum"))
                    .sort_values("total_abs", ascending=False))

    ISIN_COLORS = [
        "#2563eb", "#16a34a", "#dc2626", "#d97706", "#7c3aed",
        "#0891b2", "#db2777", "#65a30d", "#ea580c", "#6366f1",
    ]

    # Per-ISIN missing_pct timeseries for the top 5 ISINs
    isin_ts_datasets = []
    for idx, isin in enumerate(isin_sum.head(5).index.tolist()):
        series = (df_all.loc[df_all["isin"] == isin]
                  .set_index("date")["missing_pct"]
                  .reindex(month_index)
                  .fillna(0))
        colour = ISIN_COLORS[idx % len(ISIN_COLORS)]
        isin_ts_datasets.append({
            "label": isin,
            "data": [round(float(v) * 100, 3) for v in series.values],
            "borderColor": colour,
            "backgroundColor": colour + "22",
            "borderWidth": 2,
            "pointRadius": 0,
            "tension": 0.3,
            "fill": False,
        })
    isin_ts_json = js_embed(isin_ts_datasets)

    # Detail table rows (first 200 broken pairs, worst first)
    detail_parts = []
    for _, r in df_broken.head(200).iterrows():
        lag_badge = '<span class="lag-badge">lag</span>' if r["is_lag"] else ""
        pct_class = "pct-high" if r["missing_pct"] > 0.1 else "pct-med"
        miss_class = "miss-neg" if r["missing_flow"] < 0 else "miss-pos"
        detail_parts.append(f"""
<tr>
<td>{ymd(r['date'])}</td>
<td class="mono">{escape(str(r['isin']))}</td>
<td class="mono right">{r['q_agg_prev']:,.1f}</td>
<td class="mono right">{r['q_agg_curr']:,.1f}</td>
<td class="mono right">{r['flow_agg']:,.1f}</td>
<td class="mono right {miss_class}">{r['missing_flow']:+,.1f}</td>
<td class="mono right {pct_class}">{r['missing_pct']*100:.2f}%</td>
<td>{lag_badge}</td>
</tr>""")
    detail_rows = "".join(detail_parts)

    # ISIN summary table rows
    isin_rows = "".join(f"""
<tr>
<td class="mono">{escape(str(isin))}</td>
<td class="mono right">{int(row['n_months'])}</td>
<td class="mono right">{row['avg_pct']*100:.2f}%</td>
<td class="mono right">{row['total_abs']:,.1f}</td>
</tr>""" for isin, row in isin_sum.iterrows())

    # KPIs
    total = len(df_all)
    n_broken_kpi = len(df_broken)
    n_lag_kpi = int(df_broken["is_lag"].sum()) if n_broken_kpi else 0
    n_genuine = n_broken_kpi - n_lag_kpi
    max_pct = df_broken["missing_pct"].max() * 100 if n_broken_kpi else 0
    n_isins = df_broken["isin"].nunique()
    broken_share = n_broken_kpi / total * 100 if total else 0.0

    broken_class = "danger" if n_broken_kpi > 0 else "success"
    genuine_class = "danger" if n_genuine > 0 else "success"
    max_class = "danger" if max_pct > 10 else "warn"

    # The two tables are rendered up front: nesting a triple-quoted f-string
    # inside the page template only parses on Python >= 3.12.
    if n_broken_kpi == 0:
        isin_table = '<div class="no-broken">No broken months detected.</div>'
        detail_table = ('<div class="no-broken">✓ No broken months detected '
                        'at this threshold.</div>')
    else:
        isin_table = f"""
<table>
<thead><tr>
<th>ISIN</th><th>Broken months</th>
<th>Avg missing %</th><th>Total |missing| (shares)</th>
</tr></thead>
<tbody>{isin_rows}</tbody>
</table>"""
        detail_table = f"""
<table>
<thead><tr>
<th>Date</th><th>ISIN</th>
<th class="right">Q(t-1)</th><th class="right">Q(t)</th>
<th class="right">Net flow</th><th class="right">Missing</th>
<th class="right">Missing % of movement</th><th></th>
</tr></thead>
<tbody>{detail_rows}</tbody>
</table>"""

    page = f"""<!DOCTYPE html>
<html lang="en">
<head>
<meta charset="UTF-8">
<meta name="viewport" content="width=device-width,initial-scale=1">
<title>Carmignac — Broken Months Diagnostics</title>
<script src="https://cdn.jsdelivr.net/npm/chart.js@4.4.0/dist/chart.umd.min.js"></script>
<style>
@import url('https://fonts.googleapis.com/css2?family=IBM+Plex+Mono:wght@400;600&family=IBM+Plex+Sans:wght@300;400;600;700&display=swap');

:root {{
  --bg: #0d0f12; --surface: #151820; --border: #252a35;
  --accent: #3b82f6; --warn: #f59e0b; --danger: #ef4444;
  --success: #10b981; --text: #e2e8f0; --muted: #64748b;
  --mono: 'IBM Plex Mono', monospace;
  --sans: 'IBM Plex Sans', sans-serif;
}}
*, *::before, *::after {{ box-sizing: border-box; margin: 0; padding: 0; }}
body {{ font-family: var(--sans); background: var(--bg); color: var(--text);
  padding: 0 0 60px; }}

.header {{ background: linear-gradient(135deg,#0d1117,#111827,#1a0a0a);
  border-bottom: 1px solid var(--border); padding: 40px 48px 36px; }}
.header-eyebrow {{ font-family: var(--mono); font-size: 11px; letter-spacing:.15em;
  color: var(--danger); text-transform: uppercase; margin-bottom:10px; }}
.header h1 {{ font-size: 2rem; font-weight: 700; letter-spacing:-.02em; margin-bottom:8px; }}
.header-sub {{ font-size:.85rem; color: var(--muted); font-family: var(--mono); }}

.kpi-strip {{ display: grid; grid-template-columns: repeat(auto-fit,minmax(160px,1fr));
  gap: 1px; background: var(--border); border-bottom: 1px solid var(--border); }}
.kpi {{ background: var(--surface); padding: 22px 28px;
  display: flex; flex-direction: column; gap: 4px; }}
.kpi-label {{ font-size:.7rem; letter-spacing:.1em; text-transform:uppercase;
  color: var(--muted); font-family: var(--mono); }}
.kpi-value {{ font-size:1.6rem; font-weight:700; font-family: var(--mono); line-height:1; }}
.kpi-value.danger {{ color: var(--danger); }}
.kpi-value.warn {{ color: var(--warn); }}
.kpi-value.success {{ color: var(--success); }}
.kpi-sub {{ font-size:.7rem; color: var(--muted); font-family: var(--mono); }}

.main {{ max-width:1400px; margin:0 auto; padding:36px 48px;
  display:flex; flex-direction:column; gap:32px; }}

.card {{ background: var(--surface); border: 1px solid var(--border);
  border-radius:8px; overflow:hidden; }}
.card-header {{ padding:18px 24px 14px; border-bottom:1px solid var(--border);
  display:flex; align-items:baseline; gap:12px; }}
.card-title {{ font-size:.8rem; font-weight:600; letter-spacing:.1em;
  text-transform:uppercase; color: var(--muted); font-family: var(--mono); }}
.card-desc {{ font-size:.78rem; color: #475569; }}
.card-body {{ padding:24px; }}
.chart-wrap {{ position:relative; height:260px; }}
.chart-wrap-tall {{ position:relative; height:320px; }}

.grid-2 {{ display:grid; grid-template-columns:1fr 1fr; gap:24px; }}
@media(max-width:900px) {{ .grid-2 {{ grid-template-columns:1fr; }}
  .main {{ padding:24px 20px; }} }}

.section-label {{ font-family: var(--mono); font-size:.68rem; letter-spacing:.15em;
  text-transform:uppercase; color: var(--muted);
  padding-left:10px; border-left:3px solid var(--danger);
  margin-bottom:-8px; }}

table {{ width:100%; border-collapse:collapse; font-size:.82rem; }}
th {{ font-family: var(--mono); font-size:.68rem; letter-spacing:.08em;
  text-transform:uppercase; color: var(--muted); padding:10px 14px;
  text-align:left; border-bottom:1px solid var(--border); background:#0f1218; }}
td {{ padding:10px 14px; border-bottom:1px solid #1a1f2a; vertical-align:middle; }}
tr:last-child td {{ border-bottom:none; }}
tr:hover td {{ background:#181e2b; }}
.mono {{ font-family: var(--mono); font-size:.78rem; }}
.right {{ text-align:right; }}
.miss-pos {{ color: var(--warn); }}
.miss-neg {{ color: var(--danger); }}
.pct-high {{ color: var(--danger); font-weight:600; }}
.pct-med {{ color: var(--warn); }}
.lag-badge {{ font-family: var(--mono); font-size:.65rem; padding:2px 6px;
  background:#f59e0b22; border:1px solid #f59e0b66; border-radius:3px;
  color: var(--warn); }}
.no-broken {{ padding:40px; text-align:center; color: var(--success);
  font-family: var(--mono); font-size:.9rem; }}

.footer {{ text-align:center; font-family: var(--mono); font-size:.68rem;
  color:#334155; margin-top:16px; letter-spacing:.05em; }}
.alpha-note {{ font-family: var(--mono); font-size:.75rem; color: var(--muted);
  padding:10px 24px 0; }}
</style>
</head>
<body>

<div class="header">
  <div class="header-eyebrow">Carmignac × ENSAE · Data Challenge 2025</div>
  <h1>Broken Months Diagnostics</h1>
  <div class="header-sub">
    Aggregate stock-flow equation check · ISIN level · threshold α = {alpha:.1%}<br>
    <span style='font-size:.78rem'>Missing % = |missing flow| / max(|ΔAUM|, |recorded flow|, 1 share) — capped at movement size, not stock level</span>
  </div>
</div>

<div class="kpi-strip">
  <div class="kpi">
    <span class="kpi-label">(ISIN, month) pairs</span>
    <span class="kpi-value">{total:,}</span>
    <span class="kpi-sub">examined</span>
  </div>
  <div class="kpi">
    <span class="kpi-label">Broken months</span>
    <span class="kpi-value {broken_class}">{n_broken_kpi:,}</span>
    <span class="kpi-sub">{broken_share:.1f}% of pairs</span>
  </div>
  <div class="kpi">
    <span class="kpi-label">Likely lags</span>
    <span class="kpi-value warn">{n_lag_kpi}</span>
    <span class="kpi-sub">resolved by ±{lag_days}d window</span>
  </div>
  <div class="kpi">
    <span class="kpi-label">Genuine gaps</span>
    <span class="kpi-value {genuine_class}">{n_genuine}</span>
    <span class="kpi-sub">unresolved by lag fix</span>
  </div>
  <div class="kpi">
    <span class="kpi-label">ISINs affected</span>
    <span class="kpi-value">{n_isins}</span>
    <span class="kpi-sub">distinct ISINs</span>
  </div>
  <div class="kpi">
    <span class="kpi-label">Max missing %</span>
    <span class="kpi-value {max_class}">{max_pct:.1f}%</span>
    <span class="kpi-sub">worst single (isin, month)</span>
  </div>
</div>

<div class="main">

  <div class="section-label">01 · Timeline</div>

  <div class="card">
    <div class="card-header">
      <span class="card-title">Broken (isin, month) pairs per month</span>
      <span class="card-desc">Stacked: genuine gaps (red) vs likely accounting lags (amber)</span>
    </div>
    <div class="card-body">
      <div class="chart-wrap-tall"><canvas id="chartTimeline"></canvas></div>
    </div>
  </div>

  <div class="grid-2">
    <div class="card">
      <div class="card-header">
        <span class="card-title">Total absolute missing flow per month</span>
        <span class="card-desc">Sum of |missing flow| across all broken ISINs</span>
      </div>
      <div class="card-body">
        <div class="chart-wrap"><canvas id="chartMissing"></canvas></div>
      </div>
    </div>

    <div class="card">
      <div class="card-header">
        <span class="card-title">Missing % — top 5 ISINs over time</span>
        <span class="card-desc">|missing flow| / max(|ΔAUM|, |recorded flow|) per ISIN — capped at movement size</span>
      </div>
      <div class="card-body">
        <div class="chart-wrap"><canvas id="chartIsinTs"></canvas></div>
      </div>
    </div>
  </div>

  <div class="section-label">02 · By ISIN</div>

  <div class="card">
    <div class="card-header">
      <span class="card-title">ISIN summary — most affected</span>
    </div>
    <div class="card-body" style="padding:0">
      {isin_table}
    </div>
  </div>

  <div class="section-label">03 · Detail log</div>

  <div class="card">
    <div class="card-header">
      <span class="card-title">All broken (isin, month) pairs</span>
      <span class="card-desc">
        <span class="lag-badge">lag</span> = likely resolved by extending flow window ±{lag_days} days
      </span>
    </div>
    <div class="alpha-note">Threshold α = {alpha:.1%} · showing up to 200 rows</div>
    <div class="card-body" style="padding:0">
      {detail_table}
    </div>
  </div>

</div>
<div class="footer">Generated by carmignac_diagnostics.py · Carmignac × ENSAE Data Challenge 2025</div>

<script>
Chart.defaults.color = '#64748b';
Chart.defaults.borderColor = '#1e2535';
Chart.defaults.font.family = "'IBM Plex Mono', monospace";
Chart.defaults.font.size = 11;

const DATES = {dates_str};
const N_BROKEN = {n_broken_js};
const N_LAG = {n_lag_js};
const TOT_MISS = {total_miss_js};
const ISIN_TS = {isin_ts_json};
const ALL_DATES = {all_dates_str};

function tip() {{
  return {{
    backgroundColor:'#0d1117', borderColor:'#252a35', borderWidth:1,
    titleFont:{{family:"'IBM Plex Mono'"}}, bodyFont:{{family:"'IBM Plex Mono'"}}, padding:10
  }};
}}
function xAxis() {{
  return {{ type:'category', ticks:{{maxTicksLimit:10,maxRotation:0}},
    grid:{{color:'#1a2030'}} }};
}}
function yAxis(label) {{
  return {{ grid:{{color:'#1a2030'}},
    title:{{display:!!label,text:label,color:'#475569'}} }};
}}

// n_genuine per month = N_BROKEN - N_LAG
const N_GENUINE = N_BROKEN.map((b,i) => b - (N_LAG[i]||0));

new Chart(document.getElementById('chartTimeline'), {{
  type:'bar',
  data:{{
    labels: DATES,
    datasets:[
      {{ label:'Genuine gaps', data:N_GENUINE,
         backgroundColor:'#ef444488', borderColor:'#ef4444', borderWidth:1, borderRadius:2 }},
      {{ label:'Likely lags', data:N_LAG,
         backgroundColor:'#f59e0b88', borderColor:'#f59e0b', borderWidth:1, borderRadius:2 }},
    ]
  }},
  options:{{
    responsive:true, maintainAspectRatio:false,
    interaction:{{mode:'index',intersect:false}},
    plugins:{{
      legend:{{position:'top',labels:{{boxWidth:12,padding:16}}}},
      tooltip:tip()
    }},
    // Both axes must be stacked for the two series to share one bar.
    scales:{{ x:{{...xAxis(), stacked:true}}, y:{{...yAxis('# (isin, month) pairs'), stacked:true}} }},
  }}
}});

new Chart(document.getElementById('chartMissing'), {{
  type:'bar',
  data:{{
    labels: DATES,
    datasets:[{{ label:'|Missing flow| (shares)', data:TOT_MISS,
      backgroundColor:'#dc262688', borderColor:'#dc2626',
      borderWidth:1, borderRadius:2 }}]
  }},
  options:{{
    responsive:true, maintainAspectRatio:false,
    plugins:{{legend:{{display:false}}, tooltip:tip()}},
    scales:{{ x:xAxis(), y:yAxis('Shares') }}
  }}
}});

new Chart(document.getElementById('chartIsinTs'), {{
  type:'line',
  data:{{ labels:ALL_DATES, datasets:ISIN_TS }},
  options:{{
    responsive:true, maintainAspectRatio:false,
    interaction:{{mode:'index',intersect:false}},
    plugins:{{
      legend:{{position:'right',labels:{{boxWidth:10,padding:8,font:{{size:10}}}}}},
      tooltip:tip()
    }},
    scales:{{ x:xAxis(), y:yAxis('Missing (%)') }}
  }}
}});
</script>
</body>
</html>"""
    return page
|
||||
|
||||
|
||||
# ─────────────────────────────────────────────────────────────
|
||||
# 5. MAIN
|
||||
# ─────────────────────────────────────────────────────────────
|
||||
|
||||
def main():
    """
    Command-line entry point: load the data, detect broken months, print the
    summary, then write the CSV (consumed by carmignac_repair.py) and the
    HTML report.

    Options: --out, --html, --alpha, --lag (see the parser below).
    """
    parser = argparse.ArgumentParser(
        description="Detect broken months in Carmignac AUM/Flows data"
    )
    parser.add_argument("--out", default="carmignac_broken_months.csv",
                        help="Machine-readable output (loaded by carmignac_repair.py)")
    parser.add_argument("--html", default="carmignac_diagnostics.html")
    parser.add_argument("--alpha", type=float, default=0.15,
                        help="Tolerance threshold (default 0.15 = 15%%)")
    parser.add_argument("--lag", type=int, default=3,
                        help="Boundary days to test for accounting lag (default 3)")
    args = parser.parse_args()

    print("[Load] AUM")
    print("[Load] Flows")
    aum, flows = load_data()

    print(f"\n[Detect] Running broken-month detection (α={args.alpha:.1%}, lag=±{args.lag}d)...")
    df_broken, df_all = detect_broken_months(aum, flows, alpha=args.alpha, lag_days=args.lag)

    print_summary(df_broken, df_all, args.alpha)

    # CSV output — this is what carmignac_repair.py will load.
    # The file is always written, and with the same header whether or not
    # anything is broken, so the repair pipeline can load it unconditionally.
    required = ["date", "isin", "missing_pct", "is_lag"]
    if set(required) <= set(df_broken.columns):
        export = df_broken
    else:
        export = pd.DataFrame(columns=required)
    export.to_csv(args.out, index=False)
    if len(df_broken):
        print(f"[Export] Broken months CSV → {args.out}")
    else:
        print(f"[Export] No broken months — empty CSV → {args.out}")

    html = build_html(df_broken, df_all, args.alpha)
    with open(args.html, "w", encoding="utf-8") as f:
        f.write(html)
    print(f"[Export] HTML report → {args.html}")
|
||||
|
||||
|
||||
# Run the diagnostics only when the file is executed as a script,
# not when it is imported.
if __name__ == "__main__":
    main()
|
||||
|
|
@ -68,7 +68,7 @@ def load_data():
|
|||
# 2. AGGREGATE AND DETECT BROKEN MONTHS
|
||||
# ─────────────────────────────────────────────────────────────
|
||||
|
||||
def detect_broken_months(aum, flows, alpha=0.1, lag_days=3):
|
||||
def detect_broken_months(aum, flows, alpha=0.02, lag_days=3):
|
||||
"""
|
||||
For each (isin, month-end t), compute:
|
||||
- Q_agg(t) : total shares held across all accounts
|
||||
|
|
@ -116,7 +116,6 @@ def detect_broken_months(aum, flows, alpha=0.1, lag_days=3):
|
|||
def bucket_flows(flows_df, months, lower_offset=0, upper_offset=0):
|
||||
"""Aggregate flows with optional boundary extension (in days)."""
|
||||
fc = flows_df.copy()
|
||||
|
||||
def assign_month(d):
|
||||
# Extended window: ]EOM(t-1) - lower_offset, EOM(t) + upper_offset]
|
||||
for m in months:
|
||||
|
|
@ -165,17 +164,35 @@ def detect_broken_months(aum, flows, alpha=0.1, lag_days=3):
|
|||
continue
|
||||
|
||||
delta = q_curr - q_prev
|
||||
denom = max(abs(q_curr), abs(q_prev), 1e-9)
|
||||
|
||||
# Standard window
|
||||
f_std = fpiv_std[isin].get(t_curr, 0.0) if isin in fpiv_std.columns else 0.0
|
||||
missing_std = delta - f_std
|
||||
pct_std = abs(missing_std) / denom
|
||||
|
||||
# Extended lag window
|
||||
f_lag = fpiv_lag[isin].get(t_curr, 0.0) if isin in fpiv_lag.columns else 0.0
|
||||
missing_lag = delta - f_lag
|
||||
pct_lag = abs(missing_lag) / denom
|
||||
|
||||
# ── Denominator choice ────────────────────────────────
|
||||
# Normalise by the size of the *movement* (max of delta_AUM
|
||||
# and recorded flow), not by the stock level. This avoids
|
||||
# astronomically large percentages when a position is tiny
|
||||
# but the missing flow is a normal-sized number.
|
||||
#
|
||||
# Interpretation: "what fraction of the expected movement
|
||||
# is unaccounted for?" 100% = the entire movement is missing.
|
||||
#
|
||||
# A minimum absolute threshold (min_abs_shares) suppresses
|
||||
# noise from residual micro-positions (rounding artefacts).
|
||||
min_abs_shares = 1.0 # ignore positions smaller than 1 share
|
||||
movement = max(abs(delta), abs(f_std), min_abs_shares)
|
||||
denom_std = movement
|
||||
|
||||
movement_lag = max(abs(delta), abs(f_lag), min_abs_shares)
|
||||
denom_lag = movement_lag
|
||||
|
||||
pct_std = abs(missing_std) / denom_std
|
||||
pct_lag = abs(missing_lag) / denom_lag
|
||||
|
||||
broken_std = pct_std > alpha
|
||||
broken_lag = pct_lag > alpha
|
||||
|
|
@ -196,7 +213,7 @@ def detect_broken_months(aum, flows, alpha=0.1, lag_days=3):
|
|||
"is_lag": is_lag,
|
||||
})
|
||||
|
||||
df_all = pd.DataFrame(rows)
|
||||
df_all = pd.DataFrame(rows)
|
||||
df_broken = df_all[df_all["broken"]].sort_values("missing_pct", ascending=False)
|
||||
return df_broken, df_all
|
||||
|
||||
|
|
@ -208,7 +225,7 @@ def detect_broken_months(aum, flows, alpha=0.1, lag_days=3):
|
|||
def print_summary(df_broken, df_all, alpha):
|
||||
total = len(df_all)
|
||||
n_broken = len(df_broken)
|
||||
n_lag = df_broken["is_lag"].sum()
|
||||
n_lag = df_broken["is_lag"].sum()
|
||||
|
||||
print("\n" + "=" * 60)
|
||||
print(" CARMIGNAC — Broken Months Diagnostics")
|
||||
|
|
@ -288,7 +305,7 @@ def build_html(df_broken, df_all, alpha):
|
|||
"tension": 0.3,
|
||||
"fill": False,
|
||||
})
|
||||
isin_ts_json = json.dumps(isin_ts_datasets)
|
||||
isin_ts_json = json.dumps(isin_ts_datasets)
|
||||
all_dates_str = json.dumps([d.strftime("%Y-%m-%d") if hasattr(d, 'strftime')
|
||||
else str(d)[:10] for d in all_dates])
|
||||
|
||||
|
|
@ -321,12 +338,12 @@ def build_html(df_broken, df_all, alpha):
|
|||
</tr>"""
|
||||
|
||||
# KPIs
|
||||
total = len(df_all)
|
||||
total = len(df_all)
|
||||
n_broken_kpi = len(df_broken)
|
||||
n_lag_kpi = int(df_broken["is_lag"].sum())
|
||||
n_genuine = n_broken_kpi - n_lag_kpi
|
||||
max_pct = df_broken["missing_pct"].max() * 100 if len(df_broken) else 0
|
||||
n_isins = df_broken["isin"].nunique()
|
||||
n_lag_kpi = int(df_broken["is_lag"].sum())
|
||||
n_genuine = n_broken_kpi - n_lag_kpi
|
||||
max_pct = df_broken["missing_pct"].max() * 100 if len(df_broken) else 0
|
||||
n_isins = df_broken["isin"].nunique()
|
||||
|
||||
no_broken_msg = ""
|
||||
if n_broken_kpi == 0:
|
||||
|
|
@ -426,7 +443,8 @@ def build_html(df_broken, df_all, alpha):
|
|||
<div class="header-eyebrow">Carmignac × ENSAE · Data Challenge 2025</div>
|
||||
<h1>Broken Months Diagnostics</h1>
|
||||
<div class="header-sub">
|
||||
Aggregate stock-flow equation check · ISIN level · threshold α = {alpha:.1%}
|
||||
Aggregate stock-flow equation check · ISIN level · threshold α = {alpha:.1%}<br>
|
||||
<span style='font-size:.78rem'>Missing % = |missing flow| / max(|ΔAUM|, |recorded flow|, 1 share) — capped at movement size, not stock level</span>
|
||||
</div>
|
||||
</div>
|
||||
|
||||
|
|
@ -491,7 +509,7 @@ def build_html(df_broken, df_all, alpha):
|
|||
<div class="card">
|
||||
<div class="card-header">
|
||||
<span class="card-title">Missing % — top 5 ISINs over time</span>
|
||||
<span class="card-desc">|missing| / max(Q(t), Q(t-1)) per ISIN</span>
|
||||
<span class="card-desc">|missing flow| / max(|ΔAUM|, |recorded flow|) per ISIN — capped at movement size</span>
|
||||
</div>
|
||||
<div class="card-body">
|
||||
<div class="chart-wrap"><canvas id="chartIsinTs"></canvas></div>
|
||||
|
|
@ -534,7 +552,7 @@ def build_html(df_broken, df_all, alpha):
|
|||
<th>Date</th><th>ISIN</th>
|
||||
<th class="right">Q(t-1)</th><th class="right">Q(t)</th>
|
||||
<th class="right">Net flow</th><th class="right">Missing</th>
|
||||
<th class="right">Missing %</th><th></th>
|
||||
<th class="right">Missing % of movement</th><th></th>
|
||||
</tr></thead>
|
||||
<tbody>{detail_rows}</tbody>
|
||||
</table>"""}
|
||||
|
|
|
|||
Loading…
Reference in New Issue
Block a user