Project_Carmignac/src/repair_challenge/carmignac_diagnostics.py

633 lines
23 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

"""
Broken Months Diagnostics
=====================================================
Detects months where the aggregate stock-flow equation is violated at the ISIN level (across all accounts)
The residual is the "missing flow":
missing_{s}(t) = [Q_agg(t) - Q_agg(t-1)] - F_agg(t)
This is a market-level check, independent of individual account identity.
It captures:
- Genuinely missing flow records
- End-of-month accounting lags (transactions dated at boundary)
- Corporate actions (dividends, splits) not reflected in flows
Outputs
-------
carmignac_broken_months.csv — machine-readable, loaded by carmignac_repair.py
carmignac_diagnostics.html — interactive HTML report
Usage
-----
python carmignac_diagnostics.py
python carmignac_diagnostics.py \\
--out carmignac_broken_months.csv \\
--html carmignac_diagnostics.html \\
--alpha 0.02 \\
--lag 3
"""
import argparse
import os
import sys
import numpy as np
import pandas as pd
from helpers import build_html_diagnostics, load_data_diagnostics
# ─────────────────────────────────────────────────────────────
# AGGREGATE AND DETECT BROKEN MONTHS
# ─────────────────────────────────────────────────────────────
def detect_broken_months(aum, flows, alpha=0.02, lag_days=3):
"""
For each (isin, month-end t), compute:
- Q_agg(t) : total shares held across all accounts
- Q_agg(t-1) : idem previous month (forward-filled)
- F_agg(t) : total net flows recorded in ]EOM(t-1), EOM(t)]
- missing(t) : [Q_agg(t) - Q_agg(t-1)] - F_agg(t)
- missing_pct : |missing| / max(Q_agg(t), Q_agg(t-1))
A month is flagged as "broken" when missing_pct > alpha.
Additionally, a month is flagged as a potential "lag" when:
- It is broken with the standard window
- But would NOT be broken if flows dated within lag_days of EOM
are shifted to the adjacent month
Parameters :
alpha : tolerance threshold (same as ALPHA in carmignac_repair.py)
lag_days : number of boundary days to test for accounting lag
Returns :
df_broken : DataFrame with all (isin, date) pairs where missing_pct > alpha
df_all : Full DataFrame including non-broken months (for plotting)
"""
# Monthly calendar
t_min = aum["Centralisation Date"].min()
t_max = aum["Centralisation Date"].max()
all_months = pd.date_range(t_min, t_max, freq="ME")
# ── Aggregate AUM per (isin, month-end) ──────────────────────
aum_agg = (
aum.groupby(["Product - Isin", "Centralisation Date"])["Quantity - AUM"]
.sum()
.reset_index()
.rename(
columns={
"Product - Isin": "isin",
"Centralisation Date": "date",
"Quantity - AUM": "qty_agg",
}
)
)
# Forward-fill sparse panel
aum_pivot = aum_agg.pivot(index="date", columns="isin", values="qty_agg")
aum_pivot = aum_pivot.reindex(all_months).ffill()
# ── Aggregate flows per (isin, month-end) — standard window ──
def bucket_flows(flows_df, months, lower_offset=0, upper_offset=0):
"""Aggregate flows with optional boundary extension (in days)."""
fc = flows_df.copy()
def assign_month(d):
# Extended window: ]EOM(t-1) - lower_offset, EOM(t) + upper_offset]
for m in months:
eom_prev = m - pd.offsets.MonthEnd(1)
lo = eom_prev - pd.Timedelta(days=lower_offset)
hi = m + pd.Timedelta(days=upper_offset)
if lo < d <= hi:
return m
return pd.NaT
fc["month_end"] = fc["Centralisation Date"].apply(assign_month)
fc = fc.dropna(subset=["month_end"])
agg = (
fc.groupby(["Product - Isin", "month_end"])["Quantity - NetFlows"]
.sum()
.reset_index()
.rename(
columns={
"Product - Isin": "isin",
"month_end": "date",
"Quantity - NetFlows": "flow_agg",
}
)
)
return agg
flows_std = bucket_flows(flows, all_months)
flows_lag = bucket_flows(
flows, all_months, lower_offset=lag_days, upper_offset=lag_days
)
def flows_to_pivot(df, months):
piv = df.pivot(index="date", columns="isin", values="flow_agg")
return piv.reindex(months).fillna(0.0)
fpiv_std = flows_to_pivot(flows_std, all_months)
fpiv_lag = flows_to_pivot(flows_lag, all_months)
# ── Compute residuals ─────────────────────────────────────────
rows = []
isins = aum_pivot.columns.tolist()
for i in range(1, len(all_months)):
t_curr = all_months[i]
t_prev = all_months[i - 1]
for isin in isins:
q_curr = (
aum_pivot[isin].get(t_curr, np.nan)
if isin in aum_pivot.columns
else np.nan
)
q_prev = (
aum_pivot[isin].get(t_prev, np.nan)
if isin in aum_pivot.columns
else np.nan
)
if pd.isna(q_curr) or pd.isna(q_prev):
continue
delta = q_curr - q_prev
# Standard window
f_std = fpiv_std[isin].get(t_curr, 0.0) if isin in fpiv_std.columns else 0.0
missing_std = delta - f_std
# Extended lag window
f_lag = fpiv_lag[isin].get(t_curr, 0.0) if isin in fpiv_lag.columns else 0.0
missing_lag = delta - f_lag
# ── Denominator choice ────────────────────────────────
# Normalise by the size of the *movement* (max of delta_AUM
# and recorded flow), not by the stock level. This avoids
# astronomically large percentages when a position is tiny
# but the missing flow is a normal-sized number.
#
# Interpretation: "what fraction of the expected movement
# is unaccounted for?"
#
# A minimum absolute threshold (min_abs_shares) suppresses
# noise from residual micro-positions (rounding artefacts).
min_abs_shares = 1.0 # ignore positions smaller than 1 share
movement = max(abs(delta), abs(f_std), min_abs_shares)
denom_std = movement
movement_lag = max(abs(delta), abs(f_lag), min_abs_shares)
denom_lag = movement_lag
pct_std = abs(missing_std) / denom_std
pct_lag = abs(missing_lag) / denom_lag
broken_std = pct_std > alpha
broken_lag = pct_lag > alpha
# A "lag" month: broken with standard, NOT broken with extended window
is_lag = broken_std and (not broken_lag)
rows.append(
{
"date": t_curr,
"isin": isin,
"q_agg_prev": round(q_prev, 3),
"q_agg_curr": round(q_curr, 3),
"delta_aum": round(delta, 3),
"flow_agg": round(f_std, 3),
"missing_flow": round(missing_std, 3),
"missing_pct": round(pct_std, 6),
"broken": broken_std,
"is_lag": is_lag,
}
)
df_all = pd.DataFrame(rows)
df_broken = df_all[df_all["broken"]].sort_values("missing_pct", ascending=False)
return df_broken, df_all
# ─────────────────────────────────────────────────────────────
# AGGREGATE (CROSS-ISIN) BROKEN MONTHS
# ─────────────────────────────────────────────────────────────
def detect_aggregate_broken_months(aum, flows, alpha=0.02, lag_days=3):
"""
Same stock-flow check as detect_broken_months, but aggregated
across ALL ISINs for each month:
Q_total(t) - Q_total(t-1) != F_total(t)
where Q_total(t) = sum over all (reg_id, isin) of Q_{r,s}(t).
This catches months where the global portfolio is incoherent even
if every individual ISIN is fine (e.g. cross-ISIN netting errors),
and provides a cleaner high-level view.
Returns :
df_agg : DataFrame indexed by month with columns:
q_total_prev, q_total_curr, delta_aum,
flow_total, missing_flow, missing_pct, broken, is_lag
"""
t_min = aum["Centralisation Date"].min()
t_max = aum["Centralisation Date"].max()
all_months = pd.date_range(t_min, t_max, freq="ME")
# ── Total AUM per month (all ISIN, all accounts) ─────────────
aum_monthly = (
aum.groupby("Centralisation Date")["Quantity - AUM"]
.sum()
.reindex(all_months)
.ffill()
.rename("q_total")
)
# ── Bucket flows helper (reuse same window logic) ─────────────
def bucket_total_flows(flows_df, months, lower_offset=0, upper_offset=0):
fc = flows_df.copy()
def assign_month(d):
for m in months:
eom_prev = m - pd.offsets.MonthEnd(1)
lo = eom_prev - pd.Timedelta(days=lower_offset)
hi = m + pd.Timedelta(days=upper_offset)
if lo < d <= hi:
return m
return pd.NaT
fc["month_end"] = fc["Centralisation Date"].apply(assign_month)
fc = fc.dropna(subset=["month_end"])
return (
fc.groupby("month_end")["Quantity - NetFlows"]
.sum()
.reindex(months)
.fillna(0.0)
)
flow_std = bucket_total_flows(flows, all_months)
flow_lag = bucket_total_flows(
flows, all_months, lower_offset=lag_days, upper_offset=lag_days
)
# ── Compute residuals ─────────────────────────────────────────
rows = []
min_abs_shares = 1.0
for i in range(1, len(all_months)):
t_curr = all_months[i]
t_prev = all_months[i - 1]
q_curr = aum_monthly.get(t_curr, np.nan)
q_prev = aum_monthly.get(t_prev, np.nan)
if pd.isna(q_curr) or pd.isna(q_prev):
continue
delta = q_curr - q_prev
f_std = flow_std.get(t_curr, 0.0)
f_lag = flow_lag.get(t_curr, 0.0)
miss_std = delta - f_std
miss_lag = delta - f_lag
movement_std = max(abs(delta), abs(f_std), min_abs_shares)
movement_lag = max(abs(delta), abs(f_lag), min_abs_shares)
pct_std = abs(miss_std) / movement_std
pct_lag = abs(miss_lag) / movement_lag
broken_std = pct_std > alpha
broken_lag = pct_lag > alpha
is_lag = broken_std and (not broken_lag)
rows.append(
{
"date": t_curr,
"q_total_prev": round(q_prev, 3),
"q_total_curr": round(q_curr, 3),
"delta_aum": round(delta, 3),
"flow_total": round(f_std, 3),
"missing_flow": round(miss_std, 3),
"missing_pct": round(pct_std, 6),
"broken": broken_std,
"is_lag": is_lag,
}
)
df_agg = pd.DataFrame(rows)
return df_agg
# ─────────────────────────────────────────────────────────────
# ERROR ACCOUNT
# ─────────────────────────────────────────────────────────────
def build_error_account(aum, flows, lag_days=3):
"""
Builds a synthetic "error account" that absorbs the stock-flow
residuals that cannot be explained by recorded flows.
Construction (backwards from t_ref):
Stock_error(t_ref) = 0 (by definition)
Stock_error(t-1) = Stock_error(t) - Residual(t)
where Residual(t) = [Σ_r Q_{r,s}(t) - Σ_r Q_{r,s}(t-1)] - Σ_r F_{r,s}(t)
for each ISIN s independently.
By construction, adding this error account to the AUM restores the
stock-flow equality at every (isin, month).
Also computes an aggregated error account (summed over all ISINs).
Returns
-------
df_err_isin : DataFrame with columns
(date, isin, residual, stock_error, stock_error_pct)
where stock_error_pct = stock_error / max(total_isin_aum, 1)
df_err_agg : DataFrame with columns
(date, residual_agg, stock_error_agg, stock_error_agg_pct)
"""
t_min = aum["Centralisation Date"].min()
t_max = aum["Centralisation Date"].max()
all_months = pd.date_range(t_min, t_max, freq="ME")
# ── ISIN-level AUM panel (forward-filled) ────────────────────
aum_agg = (
aum.groupby(["Product - Isin", "Centralisation Date"])["Quantity - AUM"]
.sum()
.reset_index()
.rename(
columns={
"Product - Isin": "isin",
"Centralisation Date": "date",
"Quantity - AUM": "qty",
}
)
)
aum_pivot = aum_agg.pivot(index="date", columns="isin", values="qty")
aum_pivot = aum_pivot.reindex(all_months).ffill()
# ── ISIN-level flow aggregation (standard window) ─────────────
def bucket_isin_flows(flows_df, months):
fc = flows_df.copy()
def assign_month(d):
for m in months:
eom_prev = m - pd.offsets.MonthEnd(1)
if eom_prev < d <= m:
return m
return pd.NaT
fc["month_end"] = fc["Centralisation Date"].apply(assign_month)
fc = fc.dropna(subset=["month_end"])
return (
fc.groupby(["Product - Isin", "month_end"])["Quantity - NetFlows"]
.sum()
.unstack("Product - Isin")
.reindex(months)
.fillna(0.0)
)
flow_pivot = bucket_isin_flows(flows, all_months)
# ── Compute residuals per (isin, month) ───────────────────────
isins = aum_pivot.columns.tolist()
# residual[t] = delta_AUM[t] - flow[t]
residuals = {} # {isin: Series indexed by month}
for isin in isins:
res_series = {}
for i in range(1, len(all_months)):
t_curr = all_months[i]
t_prev = all_months[i - 1]
q_curr = aum_pivot[isin].get(t_curr, np.nan)
q_prev = aum_pivot[isin].get(t_prev, np.nan)
if pd.isna(q_curr) or pd.isna(q_prev):
continue
delta = q_curr - q_prev
f = flow_pivot[isin].get(t_curr, 0.0) if isin in flow_pivot.columns else 0.0
res_series[t_curr] = delta - f
residuals[isin] = pd.Series(res_series)
# ── Build error stock backwards from t_ref ────────────────────
t_ref = all_months[-1]
rows_isin = []
for isin in isins:
res = residuals[isin]
# Maximum AUM for this ISIN (for normalisation)
max_aum = aum_pivot[isin].max()
if pd.isna(max_aum) or max_aum < 1:
max_aum = 1.0
# Propagate backwards: stock(t_ref) = 0
stock = 0.0
# Build dict keyed by date
stock_by_date = {t_ref: 0.0}
for i in range(len(all_months) - 2, -1, -1):
t_curr = all_months[i + 1]
t_prev = all_months[i]
r = res.get(t_curr, 0.0)
stock = stock - r
stock_by_date[t_prev] = stock
for t in all_months:
s = stock_by_date.get(t, np.nan)
r = res.get(t, 0.0)
rows_isin.append(
{
"date": t,
"isin": isin,
"residual": round(r, 4),
"stock_error": round(s, 4) if not pd.isna(s) else np.nan,
"stock_error_pct": round(abs(s) / max_aum * 100, 4)
if not pd.isna(s)
else np.nan,
}
)
df_err_isin = pd.DataFrame(rows_isin).sort_values(["date", "isin"])
# ── Aggregated error account ──────────────────────────────────
# Total AUM across all ISINs at each month
total_aum_by_month = aum_pivot.sum(axis=1)
max_total_aum = total_aum_by_month.max()
if pd.isna(max_total_aum) or max_total_aum < 1:
max_total_aum = 1.0
# Aggregate residual = sum of ISIN residuals
agg_res = {}
for i in range(1, len(all_months)):
t_curr = all_months[i]
total_r = sum(residuals[isin].get(t_curr, 0.0) for isin in isins)
agg_res[t_curr] = total_r
agg_stock = 0.0
agg_stock_by_date = {t_ref: 0.0}
for i in range(len(all_months) - 2, -1, -1):
t_curr = all_months[i + 1]
t_prev = all_months[i]
agg_stock = agg_stock - agg_res.get(t_curr, 0.0)
agg_stock_by_date[t_prev] = agg_stock
rows_agg = []
for t in all_months:
s = agg_stock_by_date.get(t, np.nan)
r = agg_res.get(t, 0.0)
rows_agg.append(
{
"date": t,
"residual_agg": round(r, 4),
"stock_error_agg": round(s, 4) if not pd.isna(s) else np.nan,
"stock_error_agg_pct": round(abs(s) / max_total_aum * 100, 4)
if not pd.isna(s)
else np.nan,
}
)
df_err_agg = pd.DataFrame(rows_agg).sort_values("date")
return df_err_isin, df_err_agg
# ─────────────────────────────────────────────────────────────
# PRINT SUMMARY
# ─────────────────────────────────────────────────────────────
def print_summary(df_broken, df_all, alpha):
    """
    Print a console summary of the broken-month detection.

    Parameters :
        df_broken : broken (isin, month) rows; reads the columns date, isin,
                    missing_flow, missing_pct and is_lag
        df_all    : every examined (isin, month) row (only its length is used)
        alpha     : tolerance threshold, shown as a percentage

    Empty inputs are reported as zero counts instead of raising.
    """
    total = len(df_all)
    n_broken = len(df_broken)
    # An empty frame may come without columns; count zero lags in that case.
    n_lag = int(df_broken["is_lag"].sum()) if "is_lag" in df_broken.columns else 0
    # Guard the share against an empty examination set (total == 0).
    broken_share = n_broken / total * 100 if total else 0.0

    print("\n" + "=" * 60)
    print(" CARMIGNAC — Broken Months Diagnostics")
    print("=" * 60)
    print(f" (isin, month) pairs examined : {total}")
    print(
        f" Broken (missing_pct > {alpha:.0%}) : {n_broken} "
        f"({broken_share:.1f}%)"
    )
    print(f" Of which likely lag : {n_lag}")
    print(f" Of which genuine gap : {n_broken - n_lag}")

    if n_broken:
        print("\n Top 10 by missing_pct:")
        cols = ["date", "isin", "missing_flow", "missing_pct", "is_lag"]
        print(df_broken[cols].head(10).to_string(index=False))

        # Monthly breakdown: the five months with the most broken ISINs
        by_month = (
            df_broken.groupby("date")
            .agg(
                n_broken=("isin", "count"),
                total_missing=("missing_flow", lambda x: x.abs().sum()),
            )
            .sort_values("n_broken", ascending=False)
            .head(5)
        )
        if len(by_month):
            print("\n Most affected months:")
            print(by_month.to_string())
    print()
# ─────────────────────────────────────────────────────────────
# MAIN
# ─────────────────────────────────────────────────────────────
def _derive_output_path(path, old, new):
    """Return *path* with *old* replaced by *new*, never equal to *path*.

    When *old* does not occur in *path*, "_<new>" is inserted before the
    file extension instead, so the derived file cannot overwrite *path*.
    """
    derived = path.replace(old, new)
    if derived != path:
        return derived
    root, ext = os.path.splitext(path)
    return f"{root}_{new}{ext}"


def main():
    """
    Command-line entry point: run all diagnostics and write the outputs.

    Loads AUM and flows through load_data_diagnostics(), runs the ISIN-level
    and aggregate broken-month detection, builds the error account, prints a
    summary, and writes:
    - the broken-months CSV (--out)
    - two error-account CSVs whose names are derived from --out
    - the HTML report (--html)
    """
    parser = argparse.ArgumentParser(
        description="Detect broken months in Carmignac AUM/Flows data"
    )
    parser.add_argument(
        "--out",
        default="carmignac_broken_months.csv",
        help="Machine-readable output (loaded by carmignac_repair.py)",
    )
    parser.add_argument("--html", default="carmignac_diagnostics.html")
    parser.add_argument(
        "--alpha",
        type=float,
        default=0.05,
        help="Tolerance threshold (default 0.05 = 5%%)",
    )
    parser.add_argument(
        "--lag",
        type=int,
        default=3,
        help="Boundary days to test for accounting lag (default 3)",
    )
    args = parser.parse_args()

    print("[Load] AUM")
    print("[Load] Flows")
    aum, flows = load_data_diagnostics()

    print(
        f"\n[Detect] Running broken-month detection (α={args.alpha:.1%}, lag=±{args.lag}d)..."
    )
    df_broken, df_all = detect_broken_months(
        aum, flows, alpha=args.alpha, lag_days=args.lag
    )
    df_agg = detect_aggregate_broken_months(
        aum, flows, alpha=args.alpha, lag_days=args.lag
    )

    print("\n[Error account] Building error account...")
    df_err_isin, df_err_agg = build_error_account(aum, flows, lag_days=args.lag)

    print_summary(df_broken, df_all, args.alpha)
    n_agg_broken = int(df_agg["broken"].sum())
    print(
        f" Aggregate broken months : {n_agg_broken} "
        f"(of which lags: {int(df_agg['is_lag'].sum())})"
    )
    max_err = float(df_err_agg["stock_error_agg"].abs().max())
    print(
        f" Max aggregate error stock : {max_err:,.1f} shares "
        f"({float(df_err_agg['stock_error_agg_pct'].max()):.3f}% of total AUM)"
    )

    # CSV output — this is what carmignac_repair.py loads
    if len(df_broken):
        df_broken.to_csv(args.out, index=False)
        print(f"[Export] Broken months CSV → {args.out}")
    else:
        pd.DataFrame(columns=["date", "isin", "missing_pct", "is_lag"]).to_csv(
            args.out, index=False
        )
        print(f"[Export] No broken months — empty CSV → {args.out}")

    # Error account CSVs. A plain str.replace would hand back args.out
    # unchanged for a custom --out lacking "broken_months", and the error
    # account would then overwrite the broken-months CSV just written.
    err_out = _derive_output_path(args.out, "broken_months", "error_account")
    df_err_isin.to_csv(err_out, index=False)
    err_agg_out = _derive_output_path(err_out, "error_account", "error_account_agg")
    df_err_agg.to_csv(err_agg_out, index=False)
    print(f"[Export] Error account (ISIN) → {err_out}")
    print(f"[Export] Error account (agg) → {err_agg_out}")

    html = build_html_diagnostics(
        df_broken, df_all, df_agg, df_err_isin, df_err_agg, args.alpha
    )
    with open(args.html, "w", encoding="utf-8") as f:
        f.write(html)
    print(f"[Export] HTML report → {args.html}")
# Run the diagnostics only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()