paco-dev #2
|
|
@ -1,962 +0,0 @@
|
||||||
"""
|
|
||||||
Carmignac Data Challenge — Broken Months Diagnostics
|
|
||||||
=====================================================
|
|
||||||
Detects months where the aggregate stock-flow equation is violated
|
|
||||||
at the ISIN level (across all accounts):
|
|
||||||
|
|
||||||
Σ_r Q_{r,s}(t) - Σ_r Q_{r,s}(t-1) ≠ Σ_r F_{r,s}(t-1→t)
|
|
||||||
|
|
||||||
The residual is the "missing flow":
|
|
||||||
missing_{s}(t) = [Q_agg(t) - Q_agg(t-1)] - F_agg(t)
|
|
||||||
|
|
||||||
This is a market-level check, independent of individual account identity.
|
|
||||||
It captures:
|
|
||||||
- Genuinely missing flow records
|
|
||||||
- End-of-month accounting lags (transactions dated at boundary)
|
|
||||||
- Corporate actions (dividends, splits) not reflected in flows
|
|
||||||
|
|
||||||
Outputs
|
|
||||||
-------
|
|
||||||
carmignac_broken_months.csv — machine-readable, loaded by carmignac_repair.py
|
|
||||||
carmignac_diagnostics.html — interactive HTML report
|
|
||||||
|
|
||||||
Usage
|
|
||||||
-----
|
|
||||||
python carmignac_diagnostics.py
|
|
||||||
python carmignac_diagnostics.py \\
|
|
||||||
--aum raw_AUM.csv \\
|
|
||||||
--flows raw_flows.csv \\
|
|
||||||
--out carmignac_broken_months.csv \\
|
|
||||||
--html carmignac_diagnostics.html \\
|
|
||||||
--alpha 0.02
|
|
||||||
"""
|
|
||||||
|
|
||||||
import argparse
|
|
||||||
import json
|
|
||||||
import os
|
|
||||||
import sys
|
|
||||||
|
|
||||||
import numpy as np
|
|
||||||
import pandas as pd
|
|
||||||
|
|
||||||
|
|
||||||
# ─────────────────────────────────────────────────────────────
|
|
||||||
# 1. LOAD
|
|
||||||
# ─────────────────────────────────────────────────────────────
|
|
||||||
|
|
||||||
def load_data(aum_path, flows_path):
    """
    Read the raw AUM and flows CSV exports.

    Both files get their "Centralisation Date" column parsed as datetimes
    and their "Product - Isin" column cast to ``str``.

    Parameters
    ----------
    aum_path   : path (or buffer) of the AUM CSV
    flows_path : path (or buffer) of the flows CSV

    Returns
    -------
    (aum, flows) : tuple of DataFrames, in that order
    """
    date_col = "Centralisation Date"
    isin_col = "Product - Isin"

    # Read both files first, then normalise the identifier column of each.
    tables = tuple(
        pd.read_csv(csv_path, parse_dates=[date_col])
        for csv_path in (aum_path, flows_path)
    )
    for table in tables:
        table[isin_col] = table[isin_col].astype(str)

    aum, flows = tables
    return aum, flows
|
|
||||||
|
|
||||||
|
|
||||||
# ─────────────────────────────────────────────────────────────
|
|
||||||
# 2. AGGREGATE AND DETECT BROKEN MONTHS
|
|
||||||
# ─────────────────────────────────────────────────────────────
|
|
||||||
|
|
||||||
def detect_broken_months(aum, flows, alpha=0.02, lag_days=3):
    """
    Flag (isin, month) pairs whose aggregate stock-flow equation is violated.

    For each (isin, month-end t), compute:
      - Q_agg(t)    : total shares held across all accounts
      - Q_agg(t-1)  : idem previous month (forward-filled)
      - F_agg(t)    : total net flows recorded in ]EOM(t-1), EOM(t)]
      - missing(t)  : [Q_agg(t) - Q_agg(t-1)] - F_agg(t)
      - missing_pct : |missing| / max(|Q_agg(t) - Q_agg(t-1)|, |F_agg(t)|, 1)

    The ratio is normalised by the size of the *movement*, not by the stock
    level: "what fraction of the expected movement is unaccounted for?"
    (100% = the entire movement is missing).  The floor of one share keeps
    rounding artefacts on micro-positions from producing huge ratios.

    A month is flagged as "broken" when missing_pct > alpha.

    Additionally, a broken month is flagged as a potential "lag" when it is
    NOT broken once flows are re-bucketed with windows extended by
    ``lag_days`` on both sides.  Consecutive extended windows overlap, and a
    flow is attributed to the earliest month whose window contains it, so in
    practice a flow dated up to ``lag_days`` after a month-end is pulled back
    into the month that just closed.

    Parameters
    ----------
    aum      : DataFrame with columns "Centralisation Date", "Product - Isin",
               "Quantity - AUM"
    flows    : DataFrame with columns "Centralisation Date", "Product - Isin",
               "Quantity - NetFlows"
    alpha    : tolerance threshold (same as ALPHA in carmignac_repair.py)
    lag_days : number of boundary days to test for accounting lag

    Returns
    -------
    df_broken : DataFrame with all (isin, date) pairs where missing_pct > alpha,
                sorted by missing_pct descending
    df_all    : Full DataFrame including non-broken months (for plotting)

    Both frames always carry the full set of output columns, even when no
    (isin, month) pair could be evaluated.
    """
    date_col = "Centralisation Date"
    isin_col = "Product - Isin"
    out_columns = [
        "date", "isin", "q_agg_prev", "q_agg_curr", "delta_aum",
        "flow_agg", "missing_flow", "missing_pct", "broken", "is_lag",
    ]
    min_abs_shares = 1.0  # ignore movements smaller than 1 share

    def build_frames(rows):
        """Turn the collected rows into (df_broken, df_all)."""
        df_all = pd.DataFrame(rows, columns=out_columns)
        # Explicit bool dtype so masking also works on an empty frame.
        df_all = df_all.astype({"broken": bool, "is_lag": bool})
        df_broken = df_all[df_all["broken"]].sort_values(
            "missing_pct", ascending=False)
        return df_broken, df_all

    # ── Monthly calendar ─────────────────────────────────────────
    t_min = aum[date_col].min()
    t_max = aum[date_col].max()
    if pd.isna(t_min) or pd.isna(t_max):
        return build_frames([])
    try:
        all_months = pd.date_range(t_min, t_max, freq="ME")
    except ValueError:
        # pandas < 2.2 only knows the legacy "M" alias for month-end.
        all_months = pd.date_range(t_min, t_max, freq="M")
    n_months = len(all_months)
    if n_months < 2:
        # A residual needs two consecutive month-ends.
        return build_frames([])

    # ── Aggregate AUM per (isin, month-end), forward-fill sparse panel ──
    aum_pivot = (
        aum.groupby([isin_col, date_col])["Quantity - AUM"]
        .sum()
        .unstack(isin_col)
        .reindex(all_months)
        .ffill()
    )
    isins = aum_pivot.columns
    q = aum_pivot.to_numpy(dtype=float)

    # ── Aggregate flows per (isin, month-end) ────────────────────
    flow_dates = pd.to_datetime(flows[date_col]).to_numpy(dtype="datetime64[ns]")
    flow_qty = flows["Quantity - NetFlows"].to_numpy(dtype=float)
    flow_qty = np.where(np.isnan(flow_qty), 0.0, flow_qty)  # NaN adds nothing
    # -1 marks flows on ISINs that never appear in the AUM panel.
    flow_isin_pos = isins.get_indexer(flows[isin_col].to_numpy())

    def flow_matrix(lower_offset=0, upper_offset=0):
        """
        Sum flows into a (month, isin) matrix.

        Month t owns the window ]EOM(t-1) - lower_offset, EOM(t) + upper_offset]
        (offsets in days); a flow goes to the first month whose window
        contains it, and flows matching no window are dropped.
        """
        hi = (all_months + pd.Timedelta(days=upper_offset)).to_numpy(
            dtype="datetime64[ns]")
        lo = ((all_months - pd.offsets.MonthEnd(1))
              - pd.Timedelta(days=lower_offset)).to_numpy(dtype="datetime64[ns]")
        # Both bounds increase with the month, so the first month whose upper
        # bound is >= the flow date is the only possible first match.
        pos = np.searchsorted(hi, flow_dates, side="left")
        in_range = pos < n_months
        first_hit = np.where(in_range, pos, 0)
        keep = in_range & (flow_dates > lo[first_hit]) & (flow_isin_pos >= 0)

        totals = np.zeros((n_months, len(isins)), dtype=float)
        np.add.at(totals, (first_hit[keep], flow_isin_pos[keep]), flow_qty[keep])
        return totals

    f_std_all = flow_matrix()
    f_lag_all = flow_matrix(lower_offset=lag_days, upper_offset=lag_days)

    def unexplained_share(missing, delta, flow):
        """|missing| relative to the larger of |ΔAUM|, |flow| and the floor."""
        return abs(missing) / max(abs(delta), abs(flow), min_abs_shares)

    # ── Compute residuals ────────────────────────────────────────
    rows = []
    for i in range(1, n_months):
        t_curr = all_months[i]
        for j, isin in enumerate(isins):
            q_prev = q[i - 1, j]
            q_curr = q[i, j]
            if np.isnan(q_prev) or np.isnan(q_curr):
                continue

            delta = q_curr - q_prev
            f_std = f_std_all[i, j]
            f_lag = f_lag_all[i, j]
            missing_std = delta - f_std
            missing_lag = delta - f_lag

            pct_std = unexplained_share(missing_std, delta, f_std)
            pct_lag = unexplained_share(missing_lag, delta, f_lag)
            broken_std = bool(pct_std > alpha)
            broken_lag = bool(pct_lag > alpha)

            rows.append({
                "date": t_curr,
                "isin": isin,
                "q_agg_prev": round(q_prev, 3),
                "q_agg_curr": round(q_curr, 3),
                "delta_aum": round(delta, 3),
                "flow_agg": round(f_std, 3),
                "missing_flow": round(missing_std, 3),
                "missing_pct": round(pct_std, 6),
                "broken": broken_std,
                # "lag": broken with standard window, fine with extended one
                "is_lag": broken_std and not broken_lag,
            })

    return build_frames(rows)
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
# ─────────────────────────────────────────────────────────────
|
|
||||||
# 2b. AGGREGATE (CROSS-ISIN) BROKEN MONTHS
|
|
||||||
# ─────────────────────────────────────────────────────────────
|
|
||||||
|
|
||||||
def detect_aggregate_broken_months(aum, flows, alpha=0.02, lag_days=3):
    """
    Same stock-flow check as detect_broken_months, but aggregated
    across ALL ISINs for each month:

        Q_total(t) - Q_total(t-1) != F_total(t)

    where Q_total(t) is the sum of "Quantity - AUM" over every row dated t
    (months with no AUM rows at all reuse the previous month's total).

    The residual is normalised by max(|ΔQ_total|, |F_total|, 1 share) and a
    month is "broken" when that ratio exceeds ``alpha``.  A broken month is
    tagged "is_lag" when it is no longer broken once flows are re-bucketed
    with windows extended by ``lag_days`` on both sides; overlapping windows
    resolve to the earliest month, so a flow dated up to ``lag_days`` after a
    month-end is pulled back into the month that just closed.

    Parameters
    ----------
    aum      : DataFrame with columns "Centralisation Date", "Quantity - AUM"
    flows    : DataFrame with columns "Centralisation Date",
               "Quantity - NetFlows"
    alpha    : tolerance threshold on missing_pct
    lag_days : number of boundary days to test for accounting lag

    Returns
    -------
    df_agg : DataFrame with one row per evaluated month and columns:
             date, q_total_prev, q_total_curr, delta_aum, flow_total,
             missing_flow, missing_pct, broken, is_lag.
             The columns are present even when no month could be evaluated.
    """
    date_col = "Centralisation Date"
    out_columns = [
        "date", "q_total_prev", "q_total_curr", "delta_aum", "flow_total",
        "missing_flow", "missing_pct", "broken", "is_lag",
    ]
    min_abs_shares = 1.0

    def build_frame(rows):
        """Build the result with a stable schema, including when empty."""
        df = pd.DataFrame(rows, columns=out_columns)
        return df.astype({"broken": bool, "is_lag": bool})

    t_min = aum[date_col].min()
    t_max = aum[date_col].max()
    if pd.isna(t_min) or pd.isna(t_max):
        return build_frame([])
    try:
        all_months = pd.date_range(t_min, t_max, freq="ME")
    except ValueError:
        # pandas < 2.2 only knows the legacy "M" alias for month-end.
        all_months = pd.date_range(t_min, t_max, freq="M")
    n_months = len(all_months)
    if n_months < 2:
        # A residual needs two consecutive month-ends.
        return build_frame([])

    # ── Total AUM per month (all ISIN, all accounts) ─────────────
    q_total = (
        aum.groupby(date_col)["Quantity - AUM"]
        .sum()
        .reindex(all_months)
        .ffill()
        .to_numpy(dtype=float)
    )

    # ── Total flows per month ────────────────────────────────────
    flow_dates = pd.to_datetime(flows[date_col]).to_numpy(dtype="datetime64[ns]")
    flow_qty = flows["Quantity - NetFlows"].to_numpy(dtype=float)
    flow_qty = np.where(np.isnan(flow_qty), 0.0, flow_qty)  # NaN adds nothing

    def total_flows(lower_offset=0, upper_offset=0):
        """
        Sum flows per month.

        Month t owns the window ]EOM(t-1) - lower_offset, EOM(t) + upper_offset]
        (offsets in days); a flow goes to the first month whose window
        contains it, and flows matching no window are dropped.
        """
        hi = (all_months + pd.Timedelta(days=upper_offset)).to_numpy(
            dtype="datetime64[ns]")
        lo = ((all_months - pd.offsets.MonthEnd(1))
              - pd.Timedelta(days=lower_offset)).to_numpy(dtype="datetime64[ns]")
        # Both bounds increase with the month, so the first month whose upper
        # bound is >= the flow date is the only possible first match.
        pos = np.searchsorted(hi, flow_dates, side="left")
        in_range = pos < n_months
        first_hit = np.where(in_range, pos, 0)
        keep = in_range & (flow_dates > lo[first_hit])
        return np.bincount(first_hit[keep], weights=flow_qty[keep],
                           minlength=n_months)

    flow_std = total_flows()
    flow_lag = total_flows(lower_offset=lag_days, upper_offset=lag_days)

    def unexplained_share(missing, delta, flow):
        """|missing| relative to the larger of |ΔAUM|, |flow| and the floor."""
        return abs(missing) / max(abs(delta), abs(flow), min_abs_shares)

    # ── Compute residuals ────────────────────────────────────────
    rows = []
    for i in range(1, n_months):
        q_prev = q_total[i - 1]
        q_curr = q_total[i]
        if np.isnan(q_prev) or np.isnan(q_curr):
            continue

        delta = q_curr - q_prev
        f_std = flow_std[i]
        f_lag = flow_lag[i]
        miss_std = delta - f_std
        miss_lag = delta - f_lag

        pct_std = unexplained_share(miss_std, delta, f_std)
        pct_lag = unexplained_share(miss_lag, delta, f_lag)
        broken_std = bool(pct_std > alpha)
        broken_lag = bool(pct_lag > alpha)

        rows.append({
            "date": all_months[i],
            "q_total_prev": round(q_prev, 3),
            "q_total_curr": round(q_curr, 3),
            "delta_aum": round(delta, 3),
            "flow_total": round(f_std, 3),
            "missing_flow": round(miss_std, 3),
            "missing_pct": round(pct_std, 6),
            "broken": broken_std,
            # "lag": broken with standard window, fine with extended one
            "is_lag": broken_std and not broken_lag,
        })

    return build_frame(rows)
|
|
||||||
|
|
||||||
# ─────────────────────────────────────────────────────────────
|
|
||||||
# 3. PRINT SUMMARY
|
|
||||||
# ─────────────────────────────────────────────────────────────
|
|
||||||
|
|
||||||
def print_summary(df_broken, df_all, alpha):
    """
    Print a console summary of the per-ISIN diagnostics.

    Parameters
    ----------
    df_broken : broken (isin, month) pairs; read columns are "date", "isin",
                "missing_flow", "missing_pct" and "is_lag"
    df_all    : every examined (isin, month) pair (only its length is used)
    alpha     : threshold that was applied to missing_pct

    Empty inputs are reported as zero counts instead of raising.
    """
    total = len(df_all)
    n_broken = len(df_broken)
    # A column-less empty frame has no "is_lag" to sum.
    n_lag = int(df_broken["is_lag"].sum()) if "is_lag" in df_broken.columns else 0
    # Nothing examined -> report 0% rather than dividing by zero.
    broken_share = n_broken / total * 100 if total else 0.0

    print("\n" + "=" * 60)
    print(" CARMIGNAC — Broken Months Diagnostics")
    print("=" * 60)
    print(f" (isin, month) pairs examined : {total}")
    print(f" Broken (missing_pct > {alpha:.0%}) : {n_broken} "
          f"({broken_share:.1f}%)")
    print(f" Of which likely lag : {n_lag}")
    print(f" Of which genuine gap : {n_broken - n_lag}")

    if n_broken:
        print("\n Top 10 by missing_pct:")
        cols = ["date", "isin", "missing_flow", "missing_pct", "is_lag"]
        print(df_broken[cols].head(10).to_string(index=False))

        # Monthly breakdown: the five months with the most broken ISINs
        by_month = (df_broken.groupby("date")
                    .agg(n_broken=("isin", "count"),
                         total_missing=("missing_flow", lambda x: x.abs().sum()))
                    .sort_values("n_broken", ascending=False)
                    .head(5))
        if len(by_month):
            print("\n Most affected months:")
            print(by_month.to_string())
    print()
|
|
||||||
|
|
||||||
|
|
||||||
# ─────────────────────────────────────────────────────────────
|
|
||||||
# 4. BUILD HTML REPORT
|
|
||||||
# ─────────────────────────────────────────────────────────────
|
|
||||||
|
|
||||||
def build_html(df_broken, df_all, df_agg, alpha):
|
|
||||||
# ── JS-ready data ────────────────────────────────────────────
|
|
||||||
# Timeline: n_broken and total_missing per month
|
|
||||||
tl = (df_all[df_all["broken"]]
|
|
||||||
.groupby("date")
|
|
||||||
.agg(n_broken=("isin", "count"),
|
|
||||||
total_missing=("missing_flow", lambda x: x.abs().sum()),
|
|
||||||
n_lag=("is_lag", "sum"))
|
|
||||||
.reindex(df_all["date"].sort_values().unique())
|
|
||||||
.fillna(0))
|
|
||||||
tl.index = pd.to_datetime(tl.index)
|
|
||||||
dates_str = json.dumps([d.strftime("%Y-%m-%d") for d in tl.index])
|
|
||||||
|
|
||||||
def jf(arr, dec=4):
|
|
||||||
return json.dumps([round(float(v), dec) if not np.isnan(v) else None for v in arr])
|
|
||||||
|
|
||||||
n_broken_js = jf(tl["n_broken"].values, 0)
|
|
||||||
total_miss_js = jf(tl["total_missing"].values)
|
|
||||||
n_lag_js = jf(tl["n_lag"].values, 0)
|
|
||||||
|
|
||||||
# Aggregate (cross-ISIN) JS data
|
|
||||||
agg_dates_str = json.dumps([d.strftime("%Y-%m-%d") for d in pd.to_datetime(df_agg["date"])])
|
|
||||||
agg_delta_js = jf(df_agg["delta_aum"].values)
|
|
||||||
agg_flow_js = jf(df_agg["flow_total"].values)
|
|
||||||
agg_missing_js = jf(df_agg["missing_flow"].values)
|
|
||||||
agg_pct_js = jf((df_agg["missing_pct"] * 100).values)
|
|
||||||
|
|
||||||
# Aggregate KPIs
|
|
||||||
n_agg_broken = int(df_agg["broken"].sum())
|
|
||||||
n_agg_lag = int(df_agg["is_lag"].sum())
|
|
||||||
n_agg_genuine = n_agg_broken - n_agg_lag
|
|
||||||
max_agg_pct = float(df_agg["missing_pct"].max() * 100) if len(df_agg) else 0
|
|
||||||
|
|
||||||
# Aggregate detail table rows
|
|
||||||
agg_rows = []
|
|
||||||
for _, r in df_agg[df_agg["broken"]].iterrows():
|
|
||||||
lb = '<span class="lag-badge">lag</span>' if r["is_lag"] else ""
|
|
||||||
pc = "pct-high" if r["missing_pct"] > 0.1 else "pct-med"
|
|
||||||
ds = r["date"].strftime("%Y-%m-%d") if hasattr(r["date"], "strftime") else str(r["date"])[:10]
|
|
||||||
mc = "miss-neg" if r["missing_flow"] < 0 else "miss-pos"
|
|
||||||
agg_rows.append(
|
|
||||||
f'<tr><td>{ds}</td>'
|
|
||||||
f'<td class="mono right">{r["q_total_prev"]:,.1f}</td>'
|
|
||||||
f'<td class="mono right">{r["q_total_curr"]:,.1f}</td>'
|
|
||||||
f'<td class="mono right">{r["flow_total"]:,.1f}</td>'
|
|
||||||
f'<td class="mono right {mc}">{r["missing_flow"]:+,.1f}</td>'
|
|
||||||
f'<td class="mono right {pc}">{r["missing_pct"]*100:.2f}%</td>'
|
|
||||||
f'<td>{lb}</td></tr>'
|
|
||||||
)
|
|
||||||
agg_detail_rows = "".join(agg_rows) if agg_rows else (
|
|
||||||
'<tr><td colspan="7" style="padding:24px;text-align:center;'
|
|
||||||
'color:var(--success);font-family:var(--mono)">✓ No broken months at aggregate level</td></tr>'
|
|
||||||
)
|
|
||||||
|
|
||||||
# Per-ISIN summary
|
|
||||||
isin_sum = (df_broken.groupby("isin")
|
|
||||||
.agg(n_months=("date", "count"),
|
|
||||||
avg_pct=("missing_pct", "mean"),
|
|
||||||
total_abs=("missing_flow", lambda x: x.abs().sum()))
|
|
||||||
.sort_values("total_abs", ascending=False))
|
|
||||||
|
|
||||||
ISIN_COLORS = [
|
|
||||||
"#2563eb","#16a34a","#dc2626","#d97706","#7c3aed",
|
|
||||||
"#0891b2","#db2777","#65a30d","#ea580c","#6366f1",
|
|
||||||
]
|
|
||||||
|
|
||||||
# Per-ISIN missing_pct timeseries for the top 5 ISINs
|
|
||||||
top_isins = isin_sum.head(5).index.tolist()
|
|
||||||
all_dates = sorted(df_all["date"].unique())
|
|
||||||
isin_ts_datasets = []
|
|
||||||
for idx, isin in enumerate(top_isins):
|
|
||||||
sub = df_all[df_all["isin"] == isin].set_index("date")["missing_pct"].reindex(all_dates).fillna(0)
|
|
||||||
isin_ts_datasets.append({
|
|
||||||
"label": isin,
|
|
||||||
"data": [round(float(v) * 100, 3) for v in sub.values],
|
|
||||||
"borderColor": ISIN_COLORS[idx % len(ISIN_COLORS)],
|
|
||||||
"backgroundColor": ISIN_COLORS[idx % len(ISIN_COLORS)] + "22",
|
|
||||||
"borderWidth": 2,
|
|
||||||
"pointRadius": 0,
|
|
||||||
"tension": 0.3,
|
|
||||||
"fill": False,
|
|
||||||
})
|
|
||||||
isin_ts_json = json.dumps(isin_ts_datasets)
|
|
||||||
all_dates_str = json.dumps([d.strftime("%Y-%m-%d") if hasattr(d, 'strftime')
|
|
||||||
else str(d)[:10] for d in all_dates])
|
|
||||||
|
|
||||||
# Detail table rows
|
|
||||||
detail_rows = ""
|
|
||||||
for _, r in df_broken.head(200).iterrows():
|
|
||||||
lag_badge = '<span class="lag-badge">lag</span>' if r["is_lag"] else ""
|
|
||||||
pct_class = "pct-high" if r["missing_pct"] > 0.1 else "pct-med"
|
|
||||||
detail_rows += f"""
|
|
||||||
<tr>
|
|
||||||
<td>{r['date'].strftime('%Y-%m-%d') if hasattr(r['date'], 'strftime') else str(r['date'])[:10]}</td>
|
|
||||||
<td class="mono">{r['isin']}</td>
|
|
||||||
<td class="mono right">{r['q_agg_prev']:,.1f}</td>
|
|
||||||
<td class="mono right">{r['q_agg_curr']:,.1f}</td>
|
|
||||||
<td class="mono right">{r['flow_agg']:,.1f}</td>
|
|
||||||
<td class="mono right {'miss-neg' if r['missing_flow'] < 0 else 'miss-pos'}">{r['missing_flow']:+,.1f}</td>
|
|
||||||
<td class="mono right {pct_class}">{r['missing_pct']*100:.2f}%</td>
|
|
||||||
<td>{lag_badge}</td>
|
|
||||||
</tr>"""
|
|
||||||
|
|
||||||
# ISIN summary table
|
|
||||||
isin_rows = ""
|
|
||||||
for isin, row in isin_sum.iterrows():
|
|
||||||
isin_rows += f"""
|
|
||||||
<tr>
|
|
||||||
<td class="mono">{isin}</td>
|
|
||||||
<td class="mono right">{int(row['n_months'])}</td>
|
|
||||||
<td class="mono right">{row['avg_pct']*100:.2f}%</td>
|
|
||||||
<td class="mono right">{row['total_abs']:,.1f}</td>
|
|
||||||
</tr>"""
|
|
||||||
|
|
||||||
# KPIs
|
|
||||||
total = len(df_all)
|
|
||||||
n_broken_kpi = len(df_broken)
|
|
||||||
n_lag_kpi = int(df_broken["is_lag"].sum())
|
|
||||||
n_genuine = n_broken_kpi - n_lag_kpi
|
|
||||||
max_pct = df_broken["missing_pct"].max() * 100 if len(df_broken) else 0
|
|
||||||
n_isins = df_broken["isin"].nunique()
|
|
||||||
|
|
||||||
no_broken_msg = ""
|
|
||||||
if n_broken_kpi == 0:
|
|
||||||
no_broken_msg = '<div class="no-broken">✓ No broken months detected at this threshold.</div>'
|
|
||||||
|
|
||||||
html = f"""<!DOCTYPE html>
|
|
||||||
<html lang="en">
|
|
||||||
<head>
|
|
||||||
<meta charset="UTF-8">
|
|
||||||
<meta name="viewport" content="width=device-width,initial-scale=1">
|
|
||||||
<title>Carmignac — Broken Months Diagnostics</title>
|
|
||||||
<script src="https://cdn.jsdelivr.net/npm/chart.js@4.4.0/dist/chart.umd.min.js"></script>
|
|
||||||
<style>
|
|
||||||
@import url('https://fonts.googleapis.com/css2?family=IBM+Plex+Mono:wght@400;600&family=IBM+Plex+Sans:wght@300;400;600;700&display=swap');
|
|
||||||
|
|
||||||
:root {{
|
|
||||||
--bg: #0d0f12; --surface: #151820; --border: #252a35;
|
|
||||||
--accent: #3b82f6; --warn: #f59e0b; --danger: #ef4444;
|
|
||||||
--success: #10b981; --text: #e2e8f0; --muted: #64748b;
|
|
||||||
--mono: 'IBM Plex Mono', monospace;
|
|
||||||
--sans: 'IBM Plex Sans', sans-serif;
|
|
||||||
}}
|
|
||||||
*, *::before, *::after {{ box-sizing: border-box; margin: 0; padding: 0; }}
|
|
||||||
body {{ font-family: var(--sans); background: var(--bg); color: var(--text);
|
|
||||||
padding: 0 0 60px; }}
|
|
||||||
|
|
||||||
.header {{ background: linear-gradient(135deg,#0d1117,#111827,#1a0a0a);
|
|
||||||
border-bottom: 1px solid var(--border); padding: 40px 48px 36px; }}
|
|
||||||
.header-eyebrow {{ font-family: var(--mono); font-size: 11px; letter-spacing:.15em;
|
|
||||||
color: var(--danger); text-transform: uppercase; margin-bottom:10px; }}
|
|
||||||
.header h1 {{ font-size: 2rem; font-weight: 700; letter-spacing:-.02em; margin-bottom:8px; }}
|
|
||||||
.header-sub {{ font-size:.85rem; color: var(--muted); font-family: var(--mono); }}
|
|
||||||
|
|
||||||
.kpi-strip {{ display: grid; grid-template-columns: repeat(auto-fit,minmax(160px,1fr));
|
|
||||||
gap: 1px; background: var(--border); border-bottom: 1px solid var(--border); }}
|
|
||||||
.kpi {{ background: var(--surface); padding: 22px 28px;
|
|
||||||
display: flex; flex-direction: column; gap: 4px; }}
|
|
||||||
.kpi-label {{ font-size:.7rem; letter-spacing:.1em; text-transform:uppercase;
|
|
||||||
color: var(--muted); font-family: var(--mono); }}
|
|
||||||
.kpi-value {{ font-size:1.6rem; font-weight:700; font-family: var(--mono); line-height:1; }}
|
|
||||||
.kpi-value.danger {{ color: var(--danger); }}
|
|
||||||
.kpi-value.warn {{ color: var(--warn); }}
|
|
||||||
.kpi-value.success {{ color: var(--success); }}
|
|
||||||
.kpi-sub {{ font-size:.7rem; color: var(--muted); font-family: var(--mono); }}
|
|
||||||
|
|
||||||
.main {{ max-width:1400px; margin:0 auto; padding:36px 48px;
|
|
||||||
display:flex; flex-direction:column; gap:32px; }}
|
|
||||||
|
|
||||||
.card {{ background: var(--surface); border: 1px solid var(--border);
|
|
||||||
border-radius:8px; overflow:hidden; }}
|
|
||||||
.card-header {{ padding:18px 24px 14px; border-bottom:1px solid var(--border);
|
|
||||||
display:flex; align-items:baseline; gap:12px; }}
|
|
||||||
.card-title {{ font-size:.8rem; font-weight:600; letter-spacing:.1em;
|
|
||||||
text-transform:uppercase; color: var(--muted); font-family: var(--mono); }}
|
|
||||||
.card-desc {{ font-size:.78rem; color: #475569; }}
|
|
||||||
.card-body {{ padding:24px; }}
|
|
||||||
.chart-wrap {{ position:relative; height:260px; }}
|
|
||||||
.chart-wrap-tall {{ position:relative; height:320px; }}
|
|
||||||
|
|
||||||
.grid-2 {{ display:grid; grid-template-columns:1fr 1fr; gap:24px; }}
|
|
||||||
@media(max-width:900px) {{ .grid-2 {{ grid-template-columns:1fr; }}
|
|
||||||
.main {{ padding:24px 20px; }} }}
|
|
||||||
|
|
||||||
.section-label {{ font-family: var(--mono); font-size:.68rem; letter-spacing:.15em;
|
|
||||||
text-transform:uppercase; color: var(--muted);
|
|
||||||
padding-left:10px; border-left:3px solid var(--danger);
|
|
||||||
margin-bottom:-8px; }}
|
|
||||||
|
|
||||||
table {{ width:100%; border-collapse:collapse; font-size:.82rem; }}
|
|
||||||
th {{ font-family: var(--mono); font-size:.68rem; letter-spacing:.08em;
|
|
||||||
text-transform:uppercase; color: var(--muted); padding:10px 14px;
|
|
||||||
text-align:left; border-bottom:1px solid var(--border); background:#0f1218; }}
|
|
||||||
td {{ padding:10px 14px; border-bottom:1px solid #1a1f2a; vertical-align:middle; }}
|
|
||||||
tr:last-child td {{ border-bottom:none; }}
|
|
||||||
tr:hover td {{ background:#181e2b; }}
|
|
||||||
.mono {{ font-family: var(--mono); font-size:.78rem; }}
|
|
||||||
.right {{ text-align:right; }}
|
|
||||||
.miss-pos {{ color: var(--warn); }}
|
|
||||||
.miss-neg {{ color: var(--danger); }}
|
|
||||||
.pct-high {{ color: var(--danger); font-weight:600; }}
|
|
||||||
.pct-med {{ color: var(--warn); }}
|
|
||||||
.lag-badge {{ font-family: var(--mono); font-size:.65rem; padding:2px 6px;
|
|
||||||
background:#f59e0b22; border:1px solid #f59e0b66; border-radius:3px;
|
|
||||||
color: var(--warn); }}
|
|
||||||
.no-broken {{ padding:40px; text-align:center; color: var(--success);
|
|
||||||
font-family: var(--mono); font-size:.9rem; }}
|
|
||||||
|
|
||||||
.footer {{ text-align:center; font-family: var(--mono); font-size:.68rem;
|
|
||||||
color:#334155; margin-top:16px; letter-spacing:.05em; }}
|
|
||||||
.alpha-note {{ font-family: var(--mono); font-size:.75rem; color: var(--muted);
|
|
||||||
padding:10px 24px 0; }}
|
|
||||||
</style>
|
|
||||||
</head>
|
|
||||||
<body>
|
|
||||||
|
|
||||||
<div class="header">
|
|
||||||
<div class="header-eyebrow">Carmignac × ENSAE · Data Challenge 2025</div>
|
|
||||||
<h1>Broken Months Diagnostics</h1>
|
|
||||||
<div class="header-sub">
|
|
||||||
Aggregate stock-flow equation check · ISIN level · threshold α = {alpha:.1%}<br>
|
|
||||||
<span style='font-size:.78rem'>Missing % = |missing flow| / max(|ΔAUM|, |recorded flow|, 1 share) — capped at movement size, not stock level</span>
|
|
||||||
</div>
|
|
||||||
</div>
|
|
||||||
|
|
||||||
<div class="kpi-strip">
|
|
||||||
<div class="kpi">
|
|
||||||
<span class="kpi-label">(ISIN, month) pairs</span>
|
|
||||||
<span class="kpi-value">{total:,}</span>
|
|
||||||
<span class="kpi-sub">examined</span>
|
|
||||||
</div>
|
|
||||||
<div class="kpi">
|
|
||||||
<span class="kpi-label">Broken months</span>
|
|
||||||
<span class="kpi-value {'danger' if n_broken_kpi > 0 else 'success'}">{n_broken_kpi:,}</span>
|
|
||||||
<span class="kpi-sub">{n_broken_kpi/total*100:.1f}% of pairs</span>
|
|
||||||
</div>
|
|
||||||
<div class="kpi">
|
|
||||||
<span class="kpi-label">Likely lags</span>
|
|
||||||
<span class="kpi-value warn">{n_lag_kpi}</span>
|
|
||||||
<span class="kpi-sub">resolved by ±{3}d window</span>
|
|
||||||
</div>
|
|
||||||
<div class="kpi">
|
|
||||||
<span class="kpi-label">Genuine gaps</span>
|
|
||||||
<span class="kpi-value {'danger' if n_genuine > 0 else 'success'}">{n_genuine}</span>
|
|
||||||
<span class="kpi-sub">unresolved by lag fix</span>
|
|
||||||
</div>
|
|
||||||
<div class="kpi">
|
|
||||||
<span class="kpi-label">ISINs affected</span>
|
|
||||||
<span class="kpi-value">{n_isins}</span>
|
|
||||||
<span class="kpi-sub">distinct ISINs</span>
|
|
||||||
</div>
|
|
||||||
<div class="kpi">
|
|
||||||
<span class="kpi-label">Max missing %</span>
|
|
||||||
<span class="kpi-value {'danger' if max_pct > 10 else 'warn'}">{max_pct:.1f}%</span>
|
|
||||||
<span class="kpi-sub">worst single (isin, month)</span>
|
|
||||||
</div>
|
|
||||||
</div>
|
|
||||||
|
|
||||||
<div class="main">
|
|
||||||
|
|
||||||
<div class="section-label">00 · Aggregate view — all ISINs combined</div>
|
|
||||||
|
|
||||||
<div class="card">
|
|
||||||
<div class="card-header">
|
|
||||||
<span class="card-title">Stock-flow equation — total portfolio</span>
|
|
||||||
<span class="card-desc">
|
|
||||||
Σ Q(t) − Σ Q(t−1) vs Σ F(t) across all ISINs and accounts.
|
|
||||||
Detects months where the global portfolio is incoherent, independent of ISIN-level breakdown.
|
|
||||||
</span>
|
|
||||||
</div>
|
|
||||||
<div class="card-body">
|
|
||||||
<div class="chart-wrap-tall"><canvas id="chartAggOverlay"></canvas></div>
|
|
||||||
</div>
|
|
||||||
</div>
|
|
||||||
|
|
||||||
<div class="grid-2">
|
|
||||||
<div class="card">
|
|
||||||
<div class="card-header">
|
|
||||||
<span class="card-title">Aggregate missing flow over time</span>
|
|
||||||
<span class="card-desc">Σ Q(t) − Σ Q(t−1) − Σ F(t) — should be near zero every month</span>
|
|
||||||
</div>
|
|
||||||
<div class="card-body">
|
|
||||||
<div class="chart-wrap"><canvas id="chartAggMissing"></canvas></div>
|
|
||||||
</div>
|
|
||||||
</div>
|
|
||||||
<div class="card">
|
|
||||||
<div class="card-header">
|
|
||||||
<span class="card-title">Aggregate missing % of movement</span>
|
|
||||||
<span class="card-desc">|missing| / max(|ΔAUM|, |flow|) — months above α flagged in red</span>
|
|
||||||
</div>
|
|
||||||
<div class="card-body">
|
|
||||||
<div class="chart-wrap"><canvas id="chartAggPct"></canvas></div>
|
|
||||||
</div>
|
|
||||||
</div>
|
|
||||||
</div>
|
|
||||||
|
|
||||||
<div class="card">
|
|
||||||
<div class="card-header">
|
|
||||||
<span class="card-title">Aggregate broken months — detail</span>
|
|
||||||
</div>
|
|
||||||
<div class="card-body" style="padding:0">
|
|
||||||
<table>
|
|
||||||
<thead><tr>
|
|
||||||
<th>Date</th>
|
|
||||||
<th class="right">Σ Q(t−1)</th><th class="right">Σ Q(t)</th>
|
|
||||||
<th class="right">Σ Flow</th><th class="right">Missing</th>
|
|
||||||
<th class="right">Missing %</th><th></th>
|
|
||||||
</tr></thead>
|
|
||||||
<tbody>{agg_detail_rows}</tbody>
|
|
||||||
</table>
|
|
||||||
</div>
|
|
||||||
</div>
|
|
||||||
|
|
||||||
<div class="section-label">01 · Timeline — per ISIN</div>
|
|
||||||
|
|
||||||
<div class="card">
|
|
||||||
<div class="card-header">
|
|
||||||
<span class="card-title">Broken (isin, month) pairs per month</span>
|
|
||||||
<span class="card-desc">Stacked: genuine gaps (red) vs likely accounting lags (amber)</span>
|
|
||||||
</div>
|
|
||||||
<div class="card-body">
|
|
||||||
<div class="chart-wrap-tall"><canvas id="chartTimeline"></canvas></div>
|
|
||||||
</div>
|
|
||||||
</div>
|
|
||||||
|
|
||||||
<div class="grid-2">
|
|
||||||
<div class="card">
|
|
||||||
<div class="card-header">
|
|
||||||
<span class="card-title">Total absolute missing flow per month</span>
|
|
||||||
<span class="card-desc">Sum of |missing flow| across all broken ISINs</span>
|
|
||||||
</div>
|
|
||||||
<div class="card-body">
|
|
||||||
<div class="chart-wrap"><canvas id="chartMissing"></canvas></div>
|
|
||||||
</div>
|
|
||||||
</div>
|
|
||||||
|
|
||||||
<div class="card">
|
|
||||||
<div class="card-header">
|
|
||||||
<span class="card-title">Missing % — top 5 ISINs over time</span>
|
|
||||||
<span class="card-desc">|missing flow| / max(|ΔAUM|, |recorded flow|) per ISIN — capped at movement size</span>
|
|
||||||
</div>
|
|
||||||
<div class="card-body">
|
|
||||||
<div class="chart-wrap"><canvas id="chartIsinTs"></canvas></div>
|
|
||||||
</div>
|
|
||||||
</div>
|
|
||||||
</div>
|
|
||||||
|
|
||||||
<div class="section-label">02 · By ISIN</div>
|
|
||||||
|
|
||||||
<div class="card">
|
|
||||||
<div class="card-header">
|
|
||||||
<span class="card-title">ISIN summary — most affected</span>
|
|
||||||
</div>
|
|
||||||
<div class="card-body" style="padding:0">
|
|
||||||
{'<div class="no-broken">No broken months detected.</div>' if n_broken_kpi == 0 else f"""
|
|
||||||
<table>
|
|
||||||
<thead><tr>
|
|
||||||
<th>ISIN</th><th>Broken months</th>
|
|
||||||
<th>Avg missing %</th><th>Total |missing| (shares)</th>
|
|
||||||
</tr></thead>
|
|
||||||
<tbody>{isin_rows}</tbody>
|
|
||||||
</table>"""}
|
|
||||||
</div>
|
|
||||||
</div>
|
|
||||||
|
|
||||||
<div class="section-label">03 · Detail log</div>
|
|
||||||
|
|
||||||
<div class="card">
|
|
||||||
<div class="card-header">
|
|
||||||
<span class="card-title">All broken (isin, month) pairs</span>
|
|
||||||
<span class="card-desc">
|
|
||||||
<span class="lag-badge">lag</span> = likely resolved by extending flow window ±3 days
|
|
||||||
</span>
|
|
||||||
</div>
|
|
||||||
<div class="alpha-note">Threshold α = {alpha:.1%} · showing up to 200 rows</div>
|
|
||||||
<div class="card-body" style="padding:0">
|
|
||||||
{'<div class="no-broken">✓ No broken months detected at this threshold.</div>' if n_broken_kpi == 0 else f"""
|
|
||||||
<table>
|
|
||||||
<thead><tr>
|
|
||||||
<th>Date</th><th>ISIN</th>
|
|
||||||
<th class="right">Q(t-1)</th><th class="right">Q(t)</th>
|
|
||||||
<th class="right">Net flow</th><th class="right">Missing</th>
|
|
||||||
<th class="right">Missing % of movement</th><th></th>
|
|
||||||
</tr></thead>
|
|
||||||
<tbody>{detail_rows}</tbody>
|
|
||||||
</table>"""}
|
|
||||||
</div>
|
|
||||||
</div>
|
|
||||||
|
|
||||||
</div>
|
|
||||||
<div class="footer">Generated by carmignac_diagnostics.py · Carmignac × ENSAE Data Challenge 2025</div>
|
|
||||||
|
|
||||||
<script>
|
|
||||||
Chart.defaults.color = '#64748b';
|
|
||||||
Chart.defaults.borderColor = '#1e2535';
|
|
||||||
Chart.defaults.font.family = "'IBM Plex Mono', monospace";
|
|
||||||
Chart.defaults.font.size = 11;
|
|
||||||
|
|
||||||
const DATES = {dates_str};
|
|
||||||
const N_BROKEN = {n_broken_js};
|
|
||||||
const N_LAG = {n_lag_js};
|
|
||||||
const TOT_MISS = {total_miss_js};
|
|
||||||
const ISIN_TS = {isin_ts_json};
|
|
||||||
const ALL_DATES = {all_dates_str};
|
|
||||||
|
|
||||||
function tip() {{
|
|
||||||
return {{
|
|
||||||
backgroundColor:'#0d1117', borderColor:'#252a35', borderWidth:1,
|
|
||||||
titleFont:{{family:"'IBM Plex Mono'"}}, bodyFont:{{family:"'IBM Plex Mono'"}}, padding:10
|
|
||||||
}};
|
|
||||||
}}
|
|
||||||
function xAxis() {{
|
|
||||||
return {{ type:'category', ticks:{{maxTicksLimit:10,maxRotation:0}},
|
|
||||||
grid:{{color:'#1a2030'}} }};
|
|
||||||
}}
|
|
||||||
function yAxis(label) {{
|
|
||||||
return {{ grid:{{color:'#1a2030'}},
|
|
||||||
title:{{display:!!label,text:label,color:'#475569'}} }};
|
|
||||||
}}
|
|
||||||
|
|
||||||
// n_genuine per month = N_BROKEN - N_LAG
|
|
||||||
const N_GENUINE = N_BROKEN.map((b,i) => b - (N_LAG[i]||0));
|
|
||||||
|
|
||||||
new Chart(document.getElementById('chartTimeline'), {{
|
|
||||||
type:'bar',
|
|
||||||
data:{{
|
|
||||||
labels: DATES,
|
|
||||||
datasets:[
|
|
||||||
{{ label:'Genuine gaps', data:N_GENUINE,
|
|
||||||
backgroundColor:'#ef444488', borderColor:'#ef4444', borderWidth:1, borderRadius:2 }},
|
|
||||||
{{ label:'Likely lags', data:N_LAG,
|
|
||||||
backgroundColor:'#f59e0b88', borderColor:'#f59e0b', borderWidth:1, borderRadius:2 }},
|
|
||||||
]
|
|
||||||
}},
|
|
||||||
options:{{
|
|
||||||
responsive:true, maintainAspectRatio:false,
|
|
||||||
interaction:{{mode:'index',intersect:false}},
|
|
||||||
plugins:{{
|
|
||||||
legend:{{position:'top',labels:{{boxWidth:12,padding:16}}}},
|
|
||||||
tooltip:tip()
|
|
||||||
}},
|
|
||||||
scales:{{ x:xAxis(), y:{{...yAxis('# (isin, month) pairs'), stacked:true}} }},
|
|
||||||
}}
|
|
||||||
}});
|
|
||||||
|
|
||||||
new Chart(document.getElementById('chartMissing'), {{
|
|
||||||
type:'bar',
|
|
||||||
data:{{
|
|
||||||
labels: DATES,
|
|
||||||
datasets:[{{ label:'|Missing flow| (shares)', data:TOT_MISS,
|
|
||||||
backgroundColor:'#dc262688', borderColor:'#dc2626',
|
|
||||||
borderWidth:1, borderRadius:2 }}]
|
|
||||||
}},
|
|
||||||
options:{{
|
|
||||||
responsive:true, maintainAspectRatio:false,
|
|
||||||
plugins:{{legend:{{display:false}}, tooltip:tip()}},
|
|
||||||
scales:{{ x:xAxis(), y:yAxis('Shares') }}
|
|
||||||
}}
|
|
||||||
}});
|
|
||||||
|
|
||||||
new Chart(document.getElementById('chartIsinTs'), {{
|
|
||||||
type:'line',
|
|
||||||
data:{{ labels:ALL_DATES, datasets:ISIN_TS }},
|
|
||||||
options:{{
|
|
||||||
responsive:true, maintainAspectRatio:false,
|
|
||||||
interaction:{{mode:'index',intersect:false}},
|
|
||||||
plugins:{{
|
|
||||||
legend:{{position:'right',labels:{{boxWidth:10,padding:8,font:{{size:10}}}}}},
|
|
||||||
tooltip:tip()
|
|
||||||
}},
|
|
||||||
scales:{{ x:xAxis(), y:yAxis('Missing (%)') }}
|
|
||||||
}}
|
|
||||||
}});
|
|
||||||
|
|
||||||
// ── Aggregate charts ─────────────────────────────────────────
|
|
||||||
const AGG_DATES = {agg_dates_str};
|
|
||||||
const AGG_DELTA = {agg_delta_js};
|
|
||||||
const AGG_FLOW = {agg_flow_js};
|
|
||||||
const AGG_MISSING = {agg_missing_js};
|
|
||||||
const AGG_PCT = {agg_pct_js};
|
|
||||||
const ALPHA = {alpha};
|
|
||||||
|
|
||||||
// Color each bar: red if broken, amber if lag, else subtle blue
|
|
||||||
const aggPctColors = AGG_PCT.map(v =>
|
|
||||||
Math.abs(v) > ALPHA * 100 ? '#ef444488' : '#3b82f622'
|
|
||||||
);
|
|
||||||
const aggPctBorders = AGG_PCT.map(v =>
|
|
||||||
Math.abs(v) > ALPHA * 100 ? '#ef4444' : '#3b82f655'
|
|
||||||
);
|
|
||||||
|
|
||||||
// Overlay: ΔAUM vs total flow
|
|
||||||
new Chart(document.getElementById('chartAggOverlay'), {{
|
|
||||||
type: 'line',
|
|
||||||
data: {{
|
|
||||||
labels: AGG_DATES,
|
|
||||||
datasets: [
|
|
||||||
{{ label: 'ΔAUM (Σ Q(t) − Σ Q(t−1))',
|
|
||||||
data: AGG_DELTA, borderColor: '#3b82f6', backgroundColor: '#3b82f622',
|
|
||||||
borderWidth: 2, pointRadius: 0, tension: 0.3, fill: false }},
|
|
||||||
{{ label: 'Σ Net flows recorded',
|
|
||||||
data: AGG_FLOW, borderColor: '#10b981', backgroundColor: '#10b98122',
|
|
||||||
borderWidth: 2, pointRadius: 0, tension: 0.3, fill: false }},
|
|
||||||
]
|
|
||||||
}},
|
|
||||||
options: {{
|
|
||||||
responsive: true, maintainAspectRatio: false,
|
|
||||||
interaction: {{mode:'index', intersect:false}},
|
|
||||||
plugins: {{
|
|
||||||
legend: {{position:'top', labels:{{boxWidth:12, padding:16}}}},
|
|
||||||
tooltip: tip()
|
|
||||||
}},
|
|
||||||
scales: {{ x: xAxis(), y: yAxis('Shares') }}
|
|
||||||
}}
|
|
||||||
}});
|
|
||||||
|
|
||||||
// Missing flow bar
|
|
||||||
new Chart(document.getElementById('chartAggMissing'), {{
|
|
||||||
type: 'bar',
|
|
||||||
data: {{
|
|
||||||
labels: AGG_DATES,
|
|
||||||
datasets: [{{ label: 'Missing flow (shares)', data: AGG_MISSING,
|
|
||||||
backgroundColor: AGG_MISSING.map(v => v < 0 ? '#ef444488' : '#f59e0b88'),
|
|
||||||
borderColor: AGG_MISSING.map(v => v < 0 ? '#ef4444' : '#f59e0b'),
|
|
||||||
borderWidth: 1, borderRadius: 2 }}]
|
|
||||||
}},
|
|
||||||
options: {{
|
|
||||||
responsive: true, maintainAspectRatio: false,
|
|
||||||
plugins: {{legend:{{display:false}}, tooltip: tip()}},
|
|
||||||
scales: {{ x: xAxis(), y: yAxis('Shares') }}
|
|
||||||
}}
|
|
||||||
}});
|
|
||||||
|
|
||||||
// Missing % bar, coloured by threshold
|
|
||||||
new Chart(document.getElementById('chartAggPct'), {{
|
|
||||||
type: 'bar',
|
|
||||||
data: {{
|
|
||||||
labels: AGG_DATES,
|
|
||||||
datasets: [{{ label: 'Missing % of movement', data: AGG_PCT,
|
|
||||||
backgroundColor: aggPctColors, borderColor: aggPctBorders,
|
|
||||||
borderWidth: 1, borderRadius: 2 }}]
|
|
||||||
}},
|
|
||||||
options: {{
|
|
||||||
responsive: true, maintainAspectRatio: false,
|
|
||||||
plugins: {{
|
|
||||||
legend: {{display:false}},
|
|
||||||
tooltip: tip(),
|
|
||||||
annotation: {{}} // threshold line handled via color
|
|
||||||
}},
|
|
||||||
scales: {{ x: xAxis(), y: {{...yAxis('Missing (%)'), min: 0}} }}
|
|
||||||
}}
|
|
||||||
}});
|
|
||||||
</script>
|
|
||||||
</body>
|
|
||||||
</html>"""
|
|
||||||
return html
|
|
||||||
|
|
||||||
|
|
||||||
# ─────────────────────────────────────────────────────────────
|
|
||||||
# 5. MAIN
|
|
||||||
# ─────────────────────────────────────────────────────────────
|
|
||||||
|
|
||||||
def main():
    """
    Command-line entry point for the broken-months diagnostics.

    Parses the CLI arguments, loads the AUM and flows CSVs, runs the
    ISIN-level and aggregate broken-month detection, prints a summary,
    then writes the machine-readable CSV (``--out``) and the HTML
    report (``--html``).

    Exits via ``sys.exit`` with an error message when an input file can
    be found neither at the given path nor next to this script.
    """
    parser = argparse.ArgumentParser(
        description="Detect broken months in Carmignac AUM/Flows data"
    )
    parser.add_argument("--aum", default="AUM_head.csv")
    parser.add_argument("--flows", default="flows_head.csv")
    parser.add_argument("--out", default="carmignac_broken_months.csv",
                        help="Machine-readable output (loaded by carmignac_repair.py)")
    parser.add_argument("--html", default="carmignac_diagnostics.html")
    parser.add_argument("--alpha", type=float, default=0.02,
                        help="Tolerance threshold (default 0.02 = 2%%)")
    parser.add_argument("--lag", type=int, default=3,
                        help="Boundary days to test for accounting lag (default 3)")
    args = parser.parse_args()

    script_dir = os.path.dirname(os.path.abspath(__file__))

    def resolve(p):
        # Try the path as given first, then fall back to the script's folder.
        if os.path.exists(p):
            return p
        alt = os.path.join(script_dir, p)
        if os.path.exists(alt):
            return alt
        sys.exit(f"[ERROR] File not found: {p}")

    # Resolve before logging so the log shows the files that are really read
    # (the fallback may pick a different location than the CLI argument).
    aum_path = resolve(args.aum)
    flows_path = resolve(args.flows)
    print(f"[Load] AUM : {aum_path}")
    print(f"[Load] Flows : {flows_path}")
    aum, flows = load_data(aum_path, flows_path)

    print(f"\n[Detect] Running broken-month detection (α={args.alpha:.1%}, lag=±{args.lag}d)...")
    df_broken, df_all = detect_broken_months(aum, flows, alpha=args.alpha, lag_days=args.lag)
    df_agg = detect_aggregate_broken_months(aum, flows, alpha=args.alpha, lag_days=args.lag)

    print_summary(df_broken, df_all, args.alpha)

    # The aggregate frame is built from a list of rows; when that list is
    # empty the frame has no columns, so guard the lookups instead of
    # raising KeyError on the summary line.
    n_agg_broken = int(df_agg["broken"].sum()) if "broken" in df_agg.columns else 0
    n_agg_lag = int(df_agg["is_lag"].sum()) if "is_lag" in df_agg.columns else 0
    print(f" Aggregate broken months : {n_agg_broken} "
          f"(of which lags: {n_agg_lag})")

    # CSV output — this is what carmignac_repair.py will load
    if len(df_broken):
        df_broken.to_csv(args.out, index=False)
        print(f"[Export] Broken months CSV → {args.out}")
    else:
        # Still emit a header-only file so the downstream loader finds it.
        pd.DataFrame(columns=["date","isin","missing_pct","is_lag"]).to_csv(args.out, index=False)
        print(f"[Export] No broken months — empty CSV → {args.out}")

    html = build_html(df_broken, df_all, df_agg, args.alpha)
    with open(args.html, "w", encoding="utf-8") as f:
        f.write(html)
    print(f"[Export] HTML report → {args.html}")
|
|
||||||
|
|
||||||
|
|
||||||
# Script entry point: run the diagnostics only when executed directly, not on import.
if __name__ == "__main__":
|
|
||||||
main()
|
|
||||||
1346
data/explore.ipynb
1346
data/explore.ipynb
File diff suppressed because it is too large
Load Diff
|
|
@ -27,7 +27,8 @@ import pandas as pd
|
||||||
# 1. LOAD & VALIDATE
|
# 1. LOAD & VALIDATE
|
||||||
# ─────────────────────────────────────────────────────────────
|
# ─────────────────────────────────────────────────────────────
|
||||||
|
|
||||||
def load_outputs(scores_path, mapping_path, surgery_path):
|
def load_outputs(scores_path, mapping_path, surgery_path,
|
||||||
|
err_isin_path=None, err_agg_path=None):
|
||||||
scores = pd.read_csv(scores_path, parse_dates=["date"])
|
scores = pd.read_csv(scores_path, parse_dates=["date"])
|
||||||
mapping = pd.read_csv(mapping_path, parse_dates=["date"])
|
mapping = pd.read_csv(mapping_path, parse_dates=["date"])
|
||||||
surgery = pd.read_csv(surgery_path, parse_dates=["date"])
|
surgery = pd.read_csv(surgery_path, parse_dates=["date"])
|
||||||
|
|
@ -40,9 +41,44 @@ def load_outputs(scores_path, mapping_path, surgery_path):
|
||||||
surgery["reg_orig"] = surgery["reg_orig"].astype(str)
|
surgery["reg_orig"] = surgery["reg_orig"].astype(str)
|
||||||
surgery["reg_from"] = surgery["reg_from"].astype(str)
|
surgery["reg_from"] = surgery["reg_from"].astype(str)
|
||||||
surgery["reg_to"] = surgery["reg_to"].astype(str)
|
surgery["reg_to"] = surgery["reg_to"].astype(str)
|
||||||
|
if "lookback_months" not in surgery.columns:
|
||||||
|
surgery["lookback_months"] = 1 # backwards compat
|
||||||
|
|
||||||
return scores, mapping, surgery
|
# Error account (optional)
|
||||||
|
err_isin = None
|
||||||
|
err_agg = None
|
||||||
|
if err_isin_path and os.path.exists(err_isin_path):
|
||||||
|
err_isin = pd.read_csv(err_isin_path, parse_dates=["date"])
|
||||||
|
err_isin["isin"] = err_isin["isin"].astype(str)
|
||||||
|
if err_agg_path and os.path.exists(err_agg_path):
|
||||||
|
err_agg = pd.read_csv(err_agg_path, parse_dates=["date"])
|
||||||
|
|
||||||
|
return scores, mapping, surgery, err_isin, err_agg
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
# ─────────────────────────────────────────────────────────────
|
||||||
|
# 1b. LOAD ERROR ACCOUNT (optional)
|
||||||
|
# ─────────────────────────────────────────────────────────────
|
||||||
|
|
||||||
|
def load_error_account(isin_path, agg_path):
    """
    Loads the error account CSVs produced by carmignac_diagnostics.py.

    Both files are read with their ``date`` column parsed as datetimes;
    the ``isin`` column of the per-ISIN file is cast to ``str``.

    Parameters
    ----------
    isin_path : str or None
        Path to the per-ISIN error account CSV.
    agg_path : str or None
        Path to the aggregate error account CSV.

    Returns
    -------
    (df_err_isin, df_err_agg), or (None, None) when either path is
    missing/empty or either file cannot be read (absent file, unparsable
    CSV, missing ``date``/``isin`` column). Load failures are reported
    with a ``[WARN]`` line rather than raised, since the error account
    is optional.
    """
    if not isin_path or not agg_path:
        return None, None

    # Keep the try body to the operations that can actually fail, and catch
    # only the failures they produce: OSError (file missing/unreadable),
    # ValueError (pandas parser/empty-data errors, missing parse_dates
    # column) and KeyError (no "isin" column).
    try:
        ei = pd.read_csv(isin_path, parse_dates=["date"])
        ea = pd.read_csv(agg_path, parse_dates=["date"])
        ei["isin"] = ei["isin"].astype(str)
    except (OSError, ValueError, KeyError) as e:
        print(f"[WARN] Could not load error account: {e}")
        return None, None

    print(f"[Load] error account (ISIN) : {len(ei)} rows, "
          f"{ei['isin'].nunique()} ISINs")
    print(f"[Load] error account (agg) : {len(ea)} rows")
    return ei, ea
|
||||||
|
|
||||||
# ─────────────────────────────────────────────────────────────
|
# ─────────────────────────────────────────────────────────────
|
||||||
# 2. COMPUTE ANALYTICS
|
# 2. COMPUTE ANALYTICS
|
||||||
|
|
@ -195,7 +231,7 @@ def print_summary(analytics, surgery):
|
||||||
# 4. BUILD HTML REPORT
|
# 4. BUILD HTML REPORT
|
||||||
# ─────────────────────────────────────────────────────────────
|
# ─────────────────────────────────────────────────────────────
|
||||||
|
|
||||||
def build_html(analytics, surgery, scores, mapping):
|
def build_html(analytics, surgery, scores, mapping, df_err_isin=None, df_err_agg=None):
|
||||||
tl = analytics["timeline"]
|
tl = analytics["timeline"]
|
||||||
ss = analytics["surgery_stats"]
|
ss = analytics["surgery_stats"]
|
||||||
piv = analytics["pivot"]
|
piv = analytics["pivot"]
|
||||||
|
|
@ -257,14 +293,212 @@ def build_html(analytics, surgery, scores, mapping):
|
||||||
|
|
||||||
traj_json = json.dumps(traj_datasets)
|
traj_json = json.dumps(traj_datasets)
|
||||||
|
|
||||||
|
# ── 4.2b Error account data (optional) ────────────────────
|
||||||
|
has_error = df_err_isin is not None and df_err_agg is not None
|
||||||
|
|
||||||
|
if has_error:
|
||||||
|
err_dates = [d.strftime("%Y-%m-%d") for d in pd.to_datetime(df_err_agg["date"])]
|
||||||
|
err_agg_stock = [round(float(v), 3) if not pd.isna(v) else None
|
||||||
|
for v in df_err_agg["stock_error_agg"].values]
|
||||||
|
err_agg_res = [round(float(v), 3) if not pd.isna(v) else None
|
||||||
|
for v in df_err_agg["residual_agg"].values]
|
||||||
|
err_agg_pct = [round(float(v), 4) if not pd.isna(v) else None
|
||||||
|
for v in df_err_agg["stock_error_agg_pct"].values]
|
||||||
|
|
||||||
|
# Top 5 ISINs by max |stock error|
|
||||||
|
top_err = (df_err_isin.groupby("isin")["stock_error"]
|
||||||
|
.apply(lambda x: x.abs().max())
|
||||||
|
.nlargest(5).index.tolist())
|
||||||
|
all_err_dates = sorted(df_err_isin["date"].unique())
|
||||||
|
ERR_COLORS = ["#ef4444","#f59e0b","#8b5cf6","#06b6d4","#10b981"]
|
||||||
|
err_isin_ds = []
|
||||||
|
for idx, isin in enumerate(top_err):
|
||||||
|
sub = (df_err_isin[df_err_isin["isin"] == isin]
|
||||||
|
.set_index("date")["stock_error"]
|
||||||
|
.reindex(all_err_dates))
|
||||||
|
err_isin_ds.append({
|
||||||
|
"label": isin,
|
||||||
|
"data": [round(float(v), 3) if not pd.isna(v) else None for v in sub.values],
|
||||||
|
"borderColor": ERR_COLORS[idx % len(ERR_COLORS)],
|
||||||
|
"backgroundColor": ERR_COLORS[idx % len(ERR_COLORS)] + "22",
|
||||||
|
"borderWidth": 1.5, "pointRadius": 0, "tension": 0.3, "fill": False,
|
||||||
|
})
|
||||||
|
|
||||||
|
max_err_stock = float(df_err_agg["stock_error_agg"].abs().max())
|
||||||
|
max_err_pct = float(df_err_agg["stock_error_agg_pct"].max())
|
||||||
|
agg_std = float(df_err_agg["stock_error_agg"].std())
|
||||||
|
agg_mean = float(df_err_agg["stock_error_agg"].abs().mean())
|
||||||
|
stationarity = round(agg_std / max(agg_mean, 1e-9), 3)
|
||||||
|
|
||||||
|
err_dates_js = json.dumps(err_dates)
|
||||||
|
err_agg_stock_js = json.dumps(err_agg_stock)
|
||||||
|
err_agg_res_js = json.dumps(err_agg_res)
|
||||||
|
err_agg_pct_js = json.dumps(err_agg_pct)
|
||||||
|
err_isin_ds_js = json.dumps(err_isin_ds)
|
||||||
|
err_isin_dates_js = json.dumps([d.strftime("%Y-%m-%d") if hasattr(d, "strftime")
|
||||||
|
else str(d)[:10] for d in all_err_dates])
|
||||||
|
|
||||||
|
# ISIN detail table (top 100 worst)
|
||||||
|
err_rows = []
|
||||||
|
for _, r in (df_err_isin.assign(abs_s=df_err_isin["stock_error"].abs())
|
||||||
|
.sort_values("abs_s", ascending=False)
|
||||||
|
.head(100).iterrows()):
|
||||||
|
ds = r["date"].strftime("%Y-%m-%d") if hasattr(r["date"], "strftime") else str(r["date"])[:10]
|
||||||
|
sc = "color:var(--danger)" if r["stock_error"] < 0 else "color:var(--warn)"
|
||||||
|
rc = "color:var(--danger)" if r["residual"] < 0 else "color:var(--warn)"
|
||||||
|
pch = "color:var(--danger);font-weight:600" if r["stock_error_pct"] > 5 else ("color:var(--warn)" if r["stock_error_pct"] > 1 else "")
|
||||||
|
err_rows.append(
|
||||||
|
f'<tr><td>{ds}</td>'
|
||||||
|
f'<td style="font-family:var(--mono)">{r["isin"]}</td>'
|
||||||
|
f'<td style="text-align:right;font-family:var(--mono);{rc}">{r["residual"]:+,.2f}</td>'
|
||||||
|
f'<td style="text-align:right;font-family:var(--mono);{sc}">{r["stock_error"]:+,.2f}</td>'
|
||||||
|
f'<td style="text-align:right;font-family:var(--mono);{pch}">{r["stock_error_pct"]:.3f}%</td>'
|
||||||
|
f'</tr>'
|
||||||
|
)
|
||||||
|
err_isin_detail = "".join(err_rows) if err_rows else (
|
||||||
|
'<tr><td colspan="5" style="padding:24px;text-align:center;color:var(--accent2)'
|
||||||
|
';font-family:var(--mono)">✓ Error account is flat</td></tr>'
|
||||||
|
)
|
||||||
|
|
||||||
|
# HTML block for error account section
|
||||||
|
err_section_html = f"""
|
||||||
|
<div class="section-label">06 · Error Account</div>
|
||||||
|
|
||||||
|
<div class="card">
|
||||||
|
<div class="card-header">
|
||||||
|
<span class="card-title">Aggregate error account stock</span>
|
||||||
|
<span class="card-desc">
|
||||||
|
Stock_error(t_ref) = 0. The stock absorbs unreconciled residuals going backwards.
|
||||||
|
A flat signal near zero = clean data. A drift = structural gap.
|
||||||
|
</span>
|
||||||
|
</div>
|
||||||
|
<div class="card-body" style="padding-bottom:8px">
|
||||||
|
<div style="display:grid;grid-template-columns:repeat(3,1fr);gap:1px;background:var(--border);margin-bottom:20px">
|
||||||
|
<div style="background:var(--surface);padding:14px 20px">
|
||||||
|
<div style="font-family:var(--mono);font-size:.68rem;letter-spacing:.1em;text-transform:uppercase;color:var(--muted)">Max |error stock|</div>
|
||||||
|
<div style="font-family:var(--mono);font-size:1.35rem;font-weight:700;color:var(--danger)">{max_err_stock:,.1f} shares</div>
|
||||||
|
</div>
|
||||||
|
<div style="background:var(--surface);padding:14px 20px">
|
||||||
|
<div style="font-family:var(--mono);font-size:.68rem;letter-spacing:.1em;text-transform:uppercase;color:var(--muted)">Max % of total AUM</div>
|
||||||
|
<div style="font-family:var(--mono);font-size:1.35rem;font-weight:700;color:{'var(--danger)' if max_err_pct > 5 else 'var(--warn)'}">{max_err_pct:.3f}%</div>
|
||||||
|
</div>
|
||||||
|
<div style="background:var(--surface);padding:14px 20px">
|
||||||
|
<div style="font-family:var(--mono);font-size:.68rem;letter-spacing:.1em;text-transform:uppercase;color:var(--muted)">Stationarity σ/μ</div>
|
||||||
|
<div style="font-family:var(--mono);font-size:1.35rem;font-weight:700;color:{'var(--accent2)' if stationarity < 1 else 'var(--warn)'}">{stationarity:.3f}</div>
|
||||||
|
<div style="font-size:.7rem;color:var(--muted);font-family:var(--mono)">lower = more stationary</div>
|
||||||
|
</div>
|
||||||
|
</div>
|
||||||
|
<div class="chart-wrap-tall"><canvas id="chartErrStock"></canvas></div>
|
||||||
|
</div>
|
||||||
|
</div>
|
||||||
|
|
||||||
|
<div class="grid-2">
|
||||||
|
<div class="card">
|
||||||
|
<div class="card-header">
|
||||||
|
<span class="card-title">Monthly aggregate residual</span>
|
||||||
|
<span class="card-desc">ΔQ_total − F_total per month</span>
|
||||||
|
</div>
|
||||||
|
<div class="card-body"><div class="chart-wrap"><canvas id="chartErrRes"></canvas></div></div>
|
||||||
|
</div>
|
||||||
|
<div class="card">
|
||||||
|
<div class="card-header">
|
||||||
|
<span class="card-title">Error stock — top 5 ISINs</span>
|
||||||
|
<span class="card-desc">Cumulative error stock per ISIN</span>
|
||||||
|
</div>
|
||||||
|
<div class="card-body"><div class="chart-wrap"><canvas id="chartErrIsin"></canvas></div></div>
|
||||||
|
</div>
|
||||||
|
</div>
|
||||||
|
|
||||||
|
<div class="card">
|
||||||
|
<div class="card-header">
|
||||||
|
<span class="card-title">Error account detail — worst (ISIN, month) pairs</span>
|
||||||
|
</div>
|
||||||
|
<div class="card-body" style="padding:0">
|
||||||
|
<table>
|
||||||
|
<thead><tr>
|
||||||
|
<th>Date</th><th>ISIN</th>
|
||||||
|
<th style="text-align:right">Monthly residual</th>
|
||||||
|
<th style="text-align:right">Cumul. stock</th>
|
||||||
|
<th style="text-align:right">% of max AUM</th>
|
||||||
|
</tr></thead>
|
||||||
|
<tbody>{err_isin_detail}</tbody>
|
||||||
|
</table>
|
||||||
|
</div>
|
||||||
|
</div>"""
|
||||||
|
|
||||||
|
# JS block for error account charts
|
||||||
|
err_js_block = f"""
|
||||||
|
// ── 8. Error account charts ──────────────────────────────────
|
||||||
|
const ERR_DATES = {err_dates_js};
|
||||||
|
const ERR_AGG_STOCK = {err_agg_stock_js};
|
||||||
|
const ERR_AGG_RES = {err_agg_res_js};
|
||||||
|
const ERR_ISIN_TS = {err_isin_ds_js};
|
||||||
|
const ERR_ISIN_DATES = {err_isin_dates_js};
|
||||||
|
|
||||||
|
new Chart(document.getElementById('chartErrStock'), {{
|
||||||
|
type: 'line',
|
||||||
|
data: {{ labels: ERR_DATES, datasets: [{{
|
||||||
|
label: 'Aggregate error stock', data: ERR_AGG_STOCK,
|
||||||
|
borderColor: '#ef4444', backgroundColor: '#ef444415',
|
||||||
|
borderWidth: 2, pointRadius: 0, tension: 0.3, fill: true
|
||||||
|
}}] }},
|
||||||
|
options: {{
|
||||||
|
responsive: true, maintainAspectRatio: false,
|
||||||
|
interaction: {{mode:'index', intersect:false}},
|
||||||
|
plugins: {{ legend: {{display:false}}, tooltip: tooltip() }},
|
||||||
|
scales: {{ x: timeAxis(), y: {{
|
||||||
|
...yAxis('Shares'),
|
||||||
|
grid: {{ color: ctx => ctx.tick.value === 0 ? '#ffffff55' : '#1a2030',
|
||||||
|
lineWidth: ctx => ctx.tick.value === 0 ? 1.5 : 1 }}
|
||||||
|
}} }}
|
||||||
|
}}
|
||||||
|
}});
|
||||||
|
|
||||||
|
new Chart(document.getElementById('chartErrRes'), {{
|
||||||
|
type: 'bar',
|
||||||
|
data: {{ labels: ERR_DATES, datasets: [{{
|
||||||
|
label: 'Monthly residual', data: ERR_AGG_RES,
|
||||||
|
backgroundColor: ERR_AGG_RES.map(v => v != null && v < 0 ? '#ef444488' : '#f59e0b88'),
|
||||||
|
borderColor: ERR_AGG_RES.map(v => v != null && v < 0 ? '#ef4444' : '#f59e0b'),
|
||||||
|
borderWidth: 1, borderRadius: 2
|
||||||
|
}}] }},
|
||||||
|
options: {{
|
||||||
|
responsive: true, maintainAspectRatio: false,
|
||||||
|
plugins: {{ legend: {{display:false}}, tooltip: tooltip() }},
|
||||||
|
scales: {{ x: timeAxis(), y: yAxis('Shares') }}
|
||||||
|
}}
|
||||||
|
}});
|
||||||
|
|
||||||
|
new Chart(document.getElementById('chartErrIsin'), {{
|
||||||
|
type: 'line',
|
||||||
|
data: {{ labels: ERR_ISIN_DATES, datasets: ERR_ISIN_TS }},
|
||||||
|
options: {{
|
||||||
|
responsive: true, maintainAspectRatio: false,
|
||||||
|
interaction: {{mode:'index', intersect:false}},
|
||||||
|
plugins: {{
|
||||||
|
legend: {{position:'right', labels:{{boxWidth:10, padding:8, font:{{size:10}}}}}},
|
||||||
|
tooltip: tooltip()
|
||||||
|
}},
|
||||||
|
scales: {{ x: timeAxis(), y: yAxis('Error stock (shares)') }}
|
||||||
|
}}
|
||||||
|
}});"""
|
||||||
|
|
||||||
|
else:
|
||||||
|
err_section_html = ""
|
||||||
|
err_js_block = ""
|
||||||
|
|
||||||
# ── 4.3 Surgery detail table rows ──────────────────────────
|
# ── 4.3 Surgery detail table rows ──────────────────────────
|
||||||
sd = analytics["surgery_detail"].sort_values("date")
|
sd = analytics["surgery_detail"].sort_values("date")
|
||||||
surg_rows_html = ""
|
surg_rows_html = ""
|
||||||
if len(sd) == 0:
|
if len(sd) == 0:
|
||||||
surg_rows_html = "<tr><td colspan='8' style='text-align:center;color:#888'>No surgeries performed</td></tr>"
|
surg_rows_html = "<tr><td colspan='9' style='text-align:center;color:#888'>No surgeries performed</td></tr>"
|
||||||
else:
|
else:
|
||||||
for _, r in sd.iterrows():
|
for _, r in sd.iterrows():
|
||||||
gain_class = "gain-high" if r["gain_vs_no_surgery"] > 0.05 else "gain-low"
|
gain_class = "gain-high" if r["gain_vs_no_surgery"] > 0.05 else "gain-low"
|
||||||
|
lb = int(r.get("lookback_months", 1))
|
||||||
|
lb_cell = (f'<span style="font-family:var(--mono);font-size:.65rem;padding:1px 5px;'
|
||||||
|
f'border-radius:3px;background:#7c3aed22;border:1px solid #7c3aed55;'
|
||||||
|
f'color:#a78bfa">{lb}m</span>' if lb > 1 else "—")
|
||||||
surg_rows_html += f"""
|
surg_rows_html += f"""
|
||||||
<tr>
|
<tr>
|
||||||
<td>{r['date'].date()}</td>
|
<td>{r['date'].date()}</td>
|
||||||
|
|
@ -275,6 +509,7 @@ def build_html(analytics, surgery, scores, mapping):
|
||||||
<td>{r['jaccard_composite']:.4f}</td>
|
<td>{r['jaccard_composite']:.4f}</td>
|
||||||
<td class="{gain_class}">+{r['gain_vs_no_surgery']:.6f}</td>
|
<td class="{gain_class}">+{r['gain_vs_no_surgery']:.6f}</td>
|
||||||
<td>{r['gain_pct_of_score']:.1f}%</td>
|
<td>{r['gain_pct_of_score']:.1f}%</td>
|
||||||
|
<td>{lb_cell}</td>
|
||||||
</tr>"""
|
</tr>"""
|
||||||
|
|
||||||
# ── 4.4 Top accounts table ──────────────────────────────────
|
# ── 4.4 Top accounts table ──────────────────────────────────
|
||||||
|
|
@ -857,6 +1092,7 @@ def build_html(analytics, surgery, scores, mapping):
|
||||||
<th>Jaccard</th>
|
<th>Jaccard</th>
|
||||||
<th>Score gain</th>
|
<th>Score gain</th>
|
||||||
<th>% of score</th>
|
<th>% of score</th>
|
||||||
|
<th>Lookback</th>
|
||||||
</tr>
|
</tr>
|
||||||
</thead>
|
</thead>
|
||||||
<tbody>{surg_rows_html}</tbody>
|
<tbody>{surg_rows_html}</tbody>
|
||||||
|
|
@ -887,6 +1123,9 @@ def build_html(analytics, surgery, scores, mapping):
|
||||||
</div>
|
</div>
|
||||||
</div>
|
</div>
|
||||||
|
|
||||||
|
{err_section_html}
|
||||||
|
|
||||||
|
|
||||||
</div><!-- /main -->
|
</div><!-- /main -->
|
||||||
|
|
||||||
<div class="footer">Generated by carmignac_analysis.py · Carmignac × ENSAE Data Challenge 2025</div>
|
<div class="footer">Generated by carmignac_analysis.py · Carmignac × ENSAE Data Challenge 2025</div>
|
||||||
|
|
@ -1297,6 +1536,7 @@ new Chart(document.getElementById('chartJaccard'), {{
|
||||||
}},
|
}},
|
||||||
}},
|
}},
|
||||||
}});
|
}});
|
||||||
|
{err_js_block}
|
||||||
</script>
|
</script>
|
||||||
</body>
|
</body>
|
||||||
</html>"""
|
</html>"""
|
||||||
|
|
@ -1314,32 +1554,49 @@ def main():
|
||||||
parser.add_argument("--mapping", default="repair_results/carmignac_mapping.csv")
|
parser.add_argument("--mapping", default="repair_results/carmignac_mapping.csv")
|
||||||
parser.add_argument("--surgery", default="repair_results/carmignac_surgery_log.csv")
|
parser.add_argument("--surgery", default="repair_results/carmignac_surgery_log.csv")
|
||||||
parser.add_argument("--out", default="repair_results/carmignac_report.html")
|
parser.add_argument("--out", default="repair_results/carmignac_report.html")
|
||||||
|
parser.add_argument("--error-account-isin", default=None,
|
||||||
|
dest="error_isin",
|
||||||
|
help="Path to carmignac_error_account.csv (optional)")
|
||||||
|
parser.add_argument("--error-account-agg", default=None,
|
||||||
|
dest="error_agg",
|
||||||
|
help="Path to carmignac_error_account_agg.csv (optional)")
|
||||||
args = parser.parse_args()
|
args = parser.parse_args()
|
||||||
|
|
||||||
# Resolve paths relative to this script's directory if files not found
|
# Resolve paths relative to this script's directory if files not found
|
||||||
base = os.path.dirname(os.path.abspath(__file__))
|
base = os.path.dirname(os.path.abspath(__file__))
|
||||||
def resolve(p):
|
def resolve(p, required=True):
|
||||||
|
if p is None:
|
||||||
|
return None
|
||||||
if os.path.exists(p):
|
if os.path.exists(p):
|
||||||
return p
|
return p
|
||||||
alt = os.path.join(base, p)
|
alt = os.path.join(base, p)
|
||||||
if os.path.exists(alt):
|
if os.path.exists(alt):
|
||||||
return alt
|
return alt
|
||||||
|
if required:
|
||||||
sys.exit(f"[ERROR] File not found: {p}")
|
sys.exit(f"[ERROR] File not found: {p}")
|
||||||
|
print(f"[WARN] Optional file not found: {p}")
|
||||||
|
return None
|
||||||
|
|
||||||
scores_path = resolve(args.scores)
|
scores_path = resolve(args.scores)
|
||||||
mapping_path = resolve(args.mapping)
|
mapping_path = resolve(args.mapping)
|
||||||
surgery_path = resolve(args.surgery)
|
surgery_path = resolve(args.surgery)
|
||||||
|
error_isin_path = resolve(args.error_isin, required=False)
|
||||||
|
error_agg_path = resolve(args.error_agg, required=False)
|
||||||
|
|
||||||
print(f"[Load] scores : {scores_path}")
|
print(f"[Load] scores : {scores_path}")
|
||||||
print(f"[Load] mapping : {mapping_path}")
|
print(f"[Load] mapping : {mapping_path}")
|
||||||
print(f"[Load] surgery : {surgery_path}")
|
print(f"[Load] surgery : {surgery_path}")
|
||||||
|
|
||||||
scores, mapping, surgery = load_outputs(scores_path, mapping_path, surgery_path)
|
scores, mapping, surgery, df_err_isin, df_err_agg = load_outputs(
|
||||||
|
scores_path, mapping_path, surgery_path,
|
||||||
|
err_isin_path=error_isin_path, err_agg_path=error_agg_path
|
||||||
|
)
|
||||||
analytics = compute_analytics(scores, mapping, surgery)
|
analytics = compute_analytics(scores, mapping, surgery)
|
||||||
|
|
||||||
print_summary(analytics, surgery)
|
print_summary(analytics, surgery)
|
||||||
|
|
||||||
html = build_html(analytics, surgery, scores, mapping)
|
html = build_html(analytics, surgery, scores, mapping,
|
||||||
|
df_err_isin=df_err_isin, df_err_agg=df_err_agg)
|
||||||
|
|
||||||
out_path = args.out
|
out_path = args.out
|
||||||
with open(out_path, "w", encoding="utf-8") as f:
|
with open(out_path, "w", encoding="utf-8") as f:
|
||||||
|
|
|
||||||
|
|
@ -36,13 +36,14 @@ import json
|
||||||
import os
|
import os
|
||||||
import sys
|
import sys
|
||||||
|
|
||||||
from collections import defaultdict
|
|
||||||
import s3fs
|
|
||||||
|
|
||||||
import numpy as np
|
import numpy as np
|
||||||
import pandas as pd
|
import pandas as pd
|
||||||
|
|
||||||
|
|
||||||
|
from collections import defaultdict
|
||||||
|
import s3fs
|
||||||
|
|
||||||
|
|
||||||
# ─────────────────────────────────────────────────────────────
|
# ─────────────────────────────────────────────────────────────
|
||||||
# 1. LOAD
|
# 1. LOAD
|
||||||
# ─────────────────────────────────────────────────────────────
|
# ─────────────────────────────────────────────────────────────
|
||||||
|
|
@ -217,6 +218,8 @@ def detect_broken_months(aum, flows, alpha=0.02, lag_days=3):
|
||||||
df_broken = df_all[df_all["broken"]].sort_values("missing_pct", ascending=False)
|
df_broken = df_all[df_all["broken"]].sort_values("missing_pct", ascending=False)
|
||||||
return df_broken, df_all
|
return df_broken, df_all
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
# ─────────────────────────────────────────────────────────────
|
# ─────────────────────────────────────────────────────────────
|
||||||
# 2b. AGGREGATE (CROSS-ISIN) BROKEN MONTHS
|
# 2b. AGGREGATE (CROSS-ISIN) BROKEN MONTHS
|
||||||
# ─────────────────────────────────────────────────────────────
|
# ─────────────────────────────────────────────────────────────
|
||||||
|
|
@ -319,6 +322,165 @@ def detect_aggregate_broken_months(aum, flows, alpha=0.02, lag_days=3):
|
||||||
df_agg = pd.DataFrame(rows)
|
df_agg = pd.DataFrame(rows)
|
||||||
return df_agg
|
return df_agg
|
||||||
|
|
||||||
|
# ─────────────────────────────────────────────────────────────
|
||||||
|
# 2c. ERROR ACCOUNT
|
||||||
|
# ─────────────────────────────────────────────────────────────
|
||||||
|
|
||||||
|
def build_error_account(aum, flows, lag_days=3):
    """
    Build a synthetic "error account" that absorbs the stock-flow
    residuals that cannot be explained by recorded flows.

    Construction (backwards from t_ref, the last month-end in the AUM range):
        Stock_error(t_ref) = 0                       (by definition)
        Stock_error(t-1)   = Stock_error(t) - Residual(t)

    where Residual(t) = [Σ_r Q_{r,s}(t) - Σ_r Q_{r,s}(t-1)] - Σ_r F_{r,s}(t)
    for each ISIN s independently. A residual is only computed when the
    ISIN has a (forward-filled) quantity at both t-1 and t; otherwise it
    counts as 0.

    Flows are bucketed into the month-end m such that
    (m - 1 month-end) < flow date <= m; flows outside the AUM month range
    are ignored.

    Also computes an aggregated error account (residuals summed over all
    ISINs, then accumulated backwards the same way).

    Parameters
    ----------
    aum : DataFrame
        Must contain "Centralisation Date", "Product - Isin" and
        "Quantity - AUM".
    flows : DataFrame
        Must contain "Centralisation Date", "Product - Isin" and
        "Quantity - NetFlows".
    lag_days : int
        Accepted for signature parity with the other detectors; it is not
        used by this function (only the standard monthly window is applied).

    Returns
    -------
    df_err_isin : DataFrame with columns
        (date, isin, residual, stock_error, stock_error_pct)
        where stock_error_pct = |stock_error| / M * 100 and M is the
        maximum over time of the ISIN's aggregated quantity (M is replaced
        by 1 when it is missing or below 1). Sorted by (date, isin).

    df_err_agg : DataFrame with columns
        (date, residual_agg, stock_error_agg, stock_error_agg_pct)
        normalised the same way by the maximum over time of the total
        quantity across ISINs. Sorted by date.

    Both frames are empty (with the columns above) when the AUM date range
    contains no month-end.
    """
    isin_columns = ["date", "isin", "residual", "stock_error", "stock_error_pct"]
    agg_columns = ["date", "residual_agg", "stock_error_agg", "stock_error_agg_pct"]

    t_min = aum["Centralisation Date"].min()
    t_max = aum["Centralisation Date"].max()
    if pd.isna(t_min) or pd.isna(t_max):
        # No usable AUM date at all: nothing to reconcile.
        return pd.DataFrame(columns=isin_columns), pd.DataFrame(columns=agg_columns)

    # An explicit offset object is equivalent to the "ME" alias but does not
    # depend on the pandas version (the alias only exists in pandas >= 2.2).
    all_months = pd.date_range(t_min, t_max, freq=pd.offsets.MonthEnd())
    if len(all_months) == 0:
        # The AUM range does not cover a single month-end, so there is no
        # reference month t_ref to anchor the error stock on.
        return pd.DataFrame(columns=isin_columns), pd.DataFrame(columns=agg_columns)

    # ── ISIN-level AUM panel (forward-filled) ────────────────────
    aum_pivot = (
        aum.groupby(["Centralisation Date", "Product - Isin"])["Quantity - AUM"]
           .sum()
           .unstack("Product - Isin")
           .reindex(all_months)
           .ffill()
    )

    # ── ISIN-level flow aggregation (standard window) ─────────────
    # ISINs without any recorded flow get a zero-flow column.
    flow_pivot = _bucket_isin_flows(flows, all_months).reindex(
        columns=aum_pivot.columns, fill_value=0.0
    )

    # ── Residuals per (isin, month) ───────────────────────────────
    # diff() is NaN on the first month and wherever the quantity is unknown
    # at t or t-1; those months carry no residual, i.e. contribute 0.
    residual = (aum_pivot.diff() - flow_pivot).fillna(0.0)

    # ── Error stock, built backwards from t_ref ───────────────────
    stock = _backward_error_stock(residual)

    # Per-ISIN normaliser: maximum quantity over time, floored at 1.
    max_aum = aum_pivot.max()
    max_aum = max_aum.where(max_aum >= 1, 1.0)
    stock_pct = stock.abs().div(max_aum, axis="columns") * 100

    n_months, n_isins = residual.shape
    # Column-major flattening yields one contiguous run of months per ISIN.
    df_err_isin = pd.DataFrame({
        "date": np.tile(all_months.to_numpy(), n_isins),
        "isin": np.repeat(np.asarray(aum_pivot.columns, dtype=object), n_months),
        "residual": residual.to_numpy(dtype=float).ravel(order="F").round(4),
        "stock_error": stock.to_numpy(dtype=float).ravel(order="F").round(4),
        "stock_error_pct": stock_pct.to_numpy(dtype=float).ravel(order="F").round(4),
    }).sort_values(["date", "isin"])

    # ── Aggregated error account ──────────────────────────────────
    max_total_aum = aum_pivot.sum(axis=1).max()
    if pd.isna(max_total_aum) or max_total_aum < 1:
        max_total_aum = 1.0

    agg_res = residual.sum(axis=1)
    agg_stock = _backward_error_stock(agg_res)

    df_err_agg = pd.DataFrame({
        "date": all_months,
        "residual_agg": agg_res.to_numpy(dtype=float).round(4),
        "stock_error_agg": agg_stock.to_numpy(dtype=float).round(4),
        "stock_error_agg_pct": (agg_stock.abs() / max_total_aum * 100)
                               .to_numpy(dtype=float).round(4),
    }).sort_values("date")

    return df_err_isin, df_err_agg


def _bucket_isin_flows(flows, months):
    """Sum net flows per (month-end, ISIN) into a frame indexed by `months`.

    A flow dated d goes to the month-end m with (m - 1 month-end) < d <= m.
    `months` must be a non-empty, sorted DatetimeIndex of consecutive
    month-ends; flows falling outside it (or with a missing date) are dropped.
    """
    dates = pd.DatetimeIndex(flows["Centralisation Date"])
    # Position of the first month-end >= d (binary search instead of a
    # per-row scan over every month).
    slot = np.searchsorted(months.to_numpy(), dates.to_numpy(), side="left")
    window_start = months[0] - pd.offsets.MonthEnd(1)
    keep = (
        np.asarray(dates.notna() & (dates > window_start))
        & (slot < len(months))
    )
    bucketed = pd.DataFrame({
        "Product - Isin": flows["Product - Isin"].to_numpy()[keep],
        "month_end": months[slot[keep]],
        "Quantity - NetFlows": flows["Quantity - NetFlows"].to_numpy()[keep],
    })
    return (bucketed.groupby(["Product - Isin", "month_end"])["Quantity - NetFlows"]
                    .sum()
                    .unstack("Product - Isin")
                    .reindex(months)
                    .fillna(0.0))


def _backward_error_stock(residual):
    """Return stock(t) = -Σ_{u > t} residual(u), with stock(last row) = 0.

    Works on a Series or column-wise on a DataFrame indexed by month.
    """
    upcoming = residual.shift(-1).fillna(0.0)   # residual of the following month
    return (0.0 - upcoming).iloc[::-1].cumsum().iloc[::-1]
|
||||||
|
|
||||||
|
|
||||||
# ─────────────────────────────────────────────────────────────
|
# ─────────────────────────────────────────────────────────────
|
||||||
# 3. PRINT SUMMARY
|
# 3. PRINT SUMMARY
|
||||||
# ─────────────────────────────────────────────────────────────
|
# ─────────────────────────────────────────────────────────────
|
||||||
|
|
@ -358,7 +520,7 @@ def print_summary(df_broken, df_all, alpha):
|
||||||
# 4. BUILD HTML REPORT
|
# 4. BUILD HTML REPORT
|
||||||
# ─────────────────────────────────────────────────────────────
|
# ─────────────────────────────────────────────────────────────
|
||||||
|
|
||||||
def build_html(df_broken, df_all, df_agg, alpha):
|
def build_html(df_broken, df_all, df_agg, df_err_isin, df_err_agg, alpha):
|
||||||
# ── JS-ready data ────────────────────────────────────────────
|
# ── JS-ready data ────────────────────────────────────────────
|
||||||
# Timeline: n_broken and total_missing per month
|
# Timeline: n_broken and total_missing per month
|
||||||
tl = (df_all[df_all["broken"]]
|
tl = (df_all[df_all["broken"]]
|
||||||
|
|
@ -374,6 +536,11 @@ def build_html(df_broken, df_all, df_agg, alpha):
|
||||||
def jf(arr, dec=4):
|
def jf(arr, dec=4):
|
||||||
return json.dumps([round(float(v), dec) if not np.isnan(v) else None for v in arr])
|
return json.dumps([round(float(v), dec) if not np.isnan(v) else None for v in arr])
|
||||||
|
|
||||||
|
ISIN_COLORS = [
|
||||||
|
"#2563eb","#16a34a","#dc2626","#d97706","#7c3aed",
|
||||||
|
"#0891b2","#db2777","#65a30d","#ea580c","#6366f1",
|
||||||
|
]
|
||||||
|
|
||||||
n_broken_js = jf(tl["n_broken"].values, 0)
|
n_broken_js = jf(tl["n_broken"].values, 0)
|
||||||
total_miss_js = jf(tl["total_missing"].values)
|
total_miss_js = jf(tl["total_missing"].values)
|
||||||
n_lag_js = jf(tl["n_lag"].values, 0)
|
n_lag_js = jf(tl["n_lag"].values, 0)
|
||||||
|
|
@ -412,6 +579,65 @@ def build_html(df_broken, df_all, df_agg, alpha):
|
||||||
'color:var(--success);font-family:var(--mono)">✓ No broken months at aggregate level</td></tr>'
|
'color:var(--success);font-family:var(--mono)">✓ No broken months at aggregate level</td></tr>'
|
||||||
)
|
)
|
||||||
|
|
||||||
|
# ── Error account JS data ────────────────────────────────────
|
||||||
|
err_dates_str = json.dumps([d.strftime("%Y-%m-%d") for d in pd.to_datetime(df_err_agg["date"])])
|
||||||
|
err_agg_stock_js = jf(df_err_agg["stock_error_agg"].values)
|
||||||
|
err_agg_res_js = jf(df_err_agg["residual_agg"].values)
|
||||||
|
err_agg_pct_js = jf(df_err_agg["stock_error_agg_pct"].values)
|
||||||
|
|
||||||
|
# Top 5 ISINs by max absolute stock error
|
||||||
|
top_err_isins = (
|
||||||
|
df_err_isin.groupby("isin")["stock_error"]
|
||||||
|
.apply(lambda x: x.abs().max())
|
||||||
|
.nlargest(5).index.tolist()
|
||||||
|
)
|
||||||
|
all_err_dates = sorted(df_err_isin["date"].unique())
|
||||||
|
err_isin_datasets = []
|
||||||
|
for idx, isin in enumerate(top_err_isins):
|
||||||
|
sub = (df_err_isin[df_err_isin["isin"] == isin]
|
||||||
|
.set_index("date")["stock_error"]
|
||||||
|
.reindex(all_err_dates))
|
||||||
|
err_isin_datasets.append({
|
||||||
|
"label": isin,
|
||||||
|
"data": [round(float(v), 3) if not pd.isna(v) else None for v in sub.values],
|
||||||
|
"borderColor": ISIN_COLORS[idx % len(ISIN_COLORS)],
|
||||||
|
"backgroundColor": ISIN_COLORS[idx % len(ISIN_COLORS)] + "22",
|
||||||
|
"borderWidth": 1.5, "pointRadius": 0, "tension": 0.3, "fill": False,
|
||||||
|
})
|
||||||
|
err_isin_ts_json = json.dumps(err_isin_datasets)
|
||||||
|
err_isin_dates_str = json.dumps([d.strftime("%Y-%m-%d") if hasattr(d, "strftime")
|
||||||
|
else str(d)[:10] for d in all_err_dates])
|
||||||
|
|
||||||
|
# Error account KPIs
|
||||||
|
max_agg_stock_err = float(df_err_agg["stock_error_agg"].abs().max())
|
||||||
|
max_agg_stock_pct = float(df_err_agg["stock_error_agg_pct"].max())
|
||||||
|
# Stationarity proxy: std / mean_abs (lower = more stationary)
|
||||||
|
agg_std = float(df_err_agg["stock_error_agg"].std())
|
||||||
|
agg_mean = float(df_err_agg["stock_error_agg"].abs().mean())
|
||||||
|
stationarity = round(agg_std / max(agg_mean, 1e-9), 3)
|
||||||
|
|
||||||
|
# Error account ISIN detail table (worst months per ISIN)
|
||||||
|
err_worst = (df_err_isin.assign(abs_stock=df_err_isin["stock_error"].abs())
|
||||||
|
.sort_values("abs_stock", ascending=False)
|
||||||
|
.head(200))
|
||||||
|
err_isin_rows = []
|
||||||
|
for _, r in err_worst.iterrows():
|
||||||
|
ds = r["date"].strftime("%Y-%m-%d") if hasattr(r["date"], "strftime") else str(r["date"])[:10]
|
||||||
|
sc = "miss-neg" if r["stock_error"] < 0 else "miss-pos"
|
||||||
|
rc = "miss-neg" if r["residual"] < 0 else "miss-pos"
|
||||||
|
pch = "pct-high" if r["stock_error_pct"] > 5 else ("pct-med" if r["stock_error_pct"] > 1 else "")
|
||||||
|
err_isin_rows.append(
|
||||||
|
f'<tr><td>{ds}</td>'
|
||||||
|
f'<td class="mono">{r["isin"]}</td>'
|
||||||
|
f'<td class="mono right {rc}">{r["residual"]:+,.2f}</td>'
|
||||||
|
f'<td class="mono right {sc}">{r["stock_error"]:+,.2f}</td>'
|
||||||
|
f'<td class="mono right {pch}">{r["stock_error_pct"]:.3f}%</td></tr>'
|
||||||
|
)
|
||||||
|
err_isin_detail = "".join(err_isin_rows) if err_isin_rows else (
|
||||||
|
'<tr><td colspan="5" style="padding:24px;text-align:center;'
|
||||||
|
'color:var(--success);font-family:var(--mono)">✓ Error account is flat (no residuals)</td></tr>'
|
||||||
|
)
|
||||||
|
|
||||||
# Per-ISIN summary
|
# Per-ISIN summary
|
||||||
isin_sum = (df_broken.groupby("isin")
|
isin_sum = (df_broken.groupby("isin")
|
||||||
.agg(n_months=("date", "count"),
|
.agg(n_months=("date", "count"),
|
||||||
|
|
@ -419,11 +645,6 @@ def build_html(df_broken, df_all, df_agg, alpha):
|
||||||
total_abs=("missing_flow", lambda x: x.abs().sum()))
|
total_abs=("missing_flow", lambda x: x.abs().sum()))
|
||||||
.sort_values("total_abs", ascending=False))
|
.sort_values("total_abs", ascending=False))
|
||||||
|
|
||||||
ISIN_COLORS = [
|
|
||||||
"#2563eb","#16a34a","#dc2626","#d97706","#7c3aed",
|
|
||||||
"#0891b2","#db2777","#65a30d","#ea580c","#6366f1",
|
|
||||||
]
|
|
||||||
|
|
||||||
# Per-ISIN missing_pct timeseries for the top 5 ISINs
|
# Per-ISIN missing_pct timeseries for the top 5 ISINs
|
||||||
top_isins = isin_sum.head(5).index.tolist()
|
top_isins = isin_sum.head(5).index.tolist()
|
||||||
all_dates = sorted(df_all["date"].unique())
|
all_dates = sorted(df_all["date"].unique())
|
||||||
|
|
@ -618,7 +839,77 @@ def build_html(df_broken, df_all, df_agg, alpha):
|
||||||
|
|
||||||
<div class="main">
|
<div class="main">
|
||||||
|
|
||||||
<div class="section-label">00 · Aggregate view — all ISINs combined</div>
|
<div class="section-label">00 · Error account — cumulative residuals</div>
|
||||||
|
|
||||||
|
<div class="card">
|
||||||
|
<div class="card-header">
|
||||||
|
<span class="card-title">Aggregate error account stock over time</span>
|
||||||
|
<span class="card-desc">
|
||||||
|
Stock_error(t_ref) = 0 by definition. At each prior month, the stock absorbs the residual
|
||||||
|
[ΔQ_total − F_total]. A stationary signal near zero = clean data.
|
||||||
|
A drifting signal = structural data quality problem.
|
||||||
|
</span>
|
||||||
|
</div>
|
||||||
|
<div class="card-body" style="padding-bottom:8px">
|
||||||
|
<div style="display:grid;grid-template-columns:repeat(3,1fr);gap:1px;background:var(--border);margin-bottom:20px">
|
||||||
|
<div style="background:var(--surface);padding:16px 20px">
|
||||||
|
<div style="font-family:var(--mono);font-size:.68rem;letter-spacing:.1em;text-transform:uppercase;color:var(--muted)">Max |stock error|</div>
|
||||||
|
<div style="font-family:var(--mono);font-size:1.4rem;font-weight:700;color:var(--danger)">{max_agg_stock_err:,.1f} shares</div>
|
||||||
|
</div>
|
||||||
|
<div style="background:var(--surface);padding:16px 20px">
|
||||||
|
<div style="font-family:var(--mono);font-size:.68rem;letter-spacing:.1em;text-transform:uppercase;color:var(--muted)">Max % of total AUM</div>
|
||||||
|
<div style="font-family:var(--mono);font-size:1.4rem;font-weight:700;color:{'var(--danger)' if max_agg_stock_pct > 5 else 'var(--warn)'}">{max_agg_stock_pct:.3f}%</div>
|
||||||
|
</div>
|
||||||
|
<div style="background:var(--surface);padding:16px 20px">
|
||||||
|
<div style="font-family:var(--mono);font-size:.68rem;letter-spacing:.1em;text-transform:uppercase;color:var(--muted)">Stationarity (σ/μ)</div>
|
||||||
|
<div style="font-family:var(--mono);font-size:1.4rem;font-weight:700;color:{'var(--success)' if stationarity < 1 else 'var(--warn)'}">{stationarity:.3f}</div>
|
||||||
|
<div style="font-size:.7rem;color:var(--muted);font-family:var(--mono)">lower = more stationary</div>
|
||||||
|
</div>
|
||||||
|
</div>
|
||||||
|
<div class="chart-wrap-tall"><canvas id="chartErrAggStock"></canvas></div>
|
||||||
|
</div>
|
||||||
|
</div>
|
||||||
|
|
||||||
|
<div class="grid-2">
|
||||||
|
<div class="card">
|
||||||
|
<div class="card-header">
|
||||||
|
<span class="card-title">Monthly aggregate residual</span>
|
||||||
|
<span class="card-desc">ΔQ_total − F_total per month (should be near zero)</span>
|
||||||
|
</div>
|
||||||
|
<div class="card-body">
|
||||||
|
<div class="chart-wrap"><canvas id="chartErrAggRes"></canvas></div>
|
||||||
|
</div>
|
||||||
|
</div>
|
||||||
|
<div class="card">
|
||||||
|
<div class="card-header">
|
||||||
|
<span class="card-title">Error stock — top 5 ISINs</span>
|
||||||
|
<span class="card-desc">Cumulative error stock per ISIN (most affected)</span>
|
||||||
|
</div>
|
||||||
|
<div class="card-body">
|
||||||
|
<div class="chart-wrap"><canvas id="chartErrIsinTs"></canvas></div>
|
||||||
|
</div>
|
||||||
|
</div>
|
||||||
|
</div>
|
||||||
|
|
||||||
|
<div class="card">
|
||||||
|
<div class="card-header">
|
||||||
|
<span class="card-title">Error account detail — worst (ISIN, month) pairs</span>
|
||||||
|
<span class="card-desc">Sorted by absolute cumulative error stock. stock_error_pct = |stock| / max(ISIN AUM)</span>
|
||||||
|
</div>
|
||||||
|
<div class="card-body" style="padding:0">
|
||||||
|
<table>
|
||||||
|
<thead><tr>
|
||||||
|
<th>Date</th><th>ISIN</th>
|
||||||
|
<th class="right">Monthly residual</th>
|
||||||
|
<th class="right">Cumulative stock</th>
|
||||||
|
<th class="right">% of max AUM</th>
|
||||||
|
</tr></thead>
|
||||||
|
<tbody>{err_isin_detail}</tbody>
|
||||||
|
</table>
|
||||||
|
</div>
|
||||||
|
</div>
|
||||||
|
|
||||||
|
<div class="section-label">01 · Aggregate view — all ISINs combined</div>
|
||||||
|
|
||||||
<div class="card">
|
<div class="card">
|
||||||
<div class="card-header">
|
<div class="card-header">
|
||||||
|
|
@ -832,6 +1123,75 @@ new Chart(document.getElementById('chartIsinTs'), {{
|
||||||
}}
|
}}
|
||||||
}});
|
}});
|
||||||
|
|
||||||
|
// ── Error account charts ─────────────────────────────────────
|
||||||
|
const ERR_DATES = {err_dates_str};
|
||||||
|
const ERR_AGG_STOCK = {err_agg_stock_js};
|
||||||
|
const ERR_AGG_RES = {err_agg_res_js};
|
||||||
|
const ERR_AGG_PCT = {err_agg_pct_js};
|
||||||
|
const ERR_ISIN_TS = {err_isin_ts_json};
|
||||||
|
const ERR_ISIN_DATES= {err_isin_dates_str};
|
||||||
|
|
||||||
|
// Aggregate error stock — line with zero reference
|
||||||
|
new Chart(document.getElementById('chartErrAggStock'), {{
|
||||||
|
type: 'line',
|
||||||
|
data: {{
|
||||||
|
labels: ERR_DATES,
|
||||||
|
datasets: [
|
||||||
|
{{ label: 'Aggregate error stock (shares)',
|
||||||
|
data: ERR_AGG_STOCK,
|
||||||
|
borderColor: '#ef4444', backgroundColor: '#ef444418',
|
||||||
|
borderWidth: 2, pointRadius: 0, tension: 0.3, fill: true }},
|
||||||
|
]
|
||||||
|
}},
|
||||||
|
options: {{
|
||||||
|
responsive: true, maintainAspectRatio: false,
|
||||||
|
interaction: {{mode:'index', intersect:false}},
|
||||||
|
plugins: {{ legend:{{display:false}}, tooltip: tip() }},
|
||||||
|
scales: {{
|
||||||
|
x: xAxis(),
|
||||||
|
y: {{
|
||||||
|
...yAxis('Shares'),
|
||||||
|
grid: {{
|
||||||
|
color: ctx => ctx.tick.value === 0 ? '#ffffff55' : '#1a2030',
|
||||||
|
lineWidth: ctx => ctx.tick.value === 0 ? 1.5 : 1,
|
||||||
|
}}
|
||||||
|
}}
|
||||||
|
}}
|
||||||
|
}}
|
||||||
|
}});
|
||||||
|
|
||||||
|
// Monthly residual bar
|
||||||
|
new Chart(document.getElementById('chartErrAggRes'), {{
|
||||||
|
type: 'bar',
|
||||||
|
data: {{
|
||||||
|
labels: ERR_DATES,
|
||||||
|
datasets: [{{ label: 'Monthly residual (shares)', data: ERR_AGG_RES,
|
||||||
|
backgroundColor: ERR_AGG_RES.map(v => v < 0 ? '#ef444488' : '#f59e0b88'),
|
||||||
|
borderColor: ERR_AGG_RES.map(v => v < 0 ? '#ef4444' : '#f59e0b'),
|
||||||
|
borderWidth: 1, borderRadius: 2 }}]
|
||||||
|
}},
|
||||||
|
options: {{
|
||||||
|
responsive: true, maintainAspectRatio: false,
|
||||||
|
plugins: {{legend:{{display:false}}, tooltip: tip()}},
|
||||||
|
scales: {{ x: xAxis(), y: yAxis('Shares') }}
|
||||||
|
}}
|
||||||
|
}});
|
||||||
|
|
||||||
|
// Per-ISIN error stock timeseries
|
||||||
|
new Chart(document.getElementById('chartErrIsinTs'), {{
|
||||||
|
type: 'line',
|
||||||
|
data: {{ labels: ERR_ISIN_DATES, datasets: ERR_ISIN_TS }},
|
||||||
|
options: {{
|
||||||
|
responsive: true, maintainAspectRatio: false,
|
||||||
|
interaction: {{mode:'index', intersect:false}},
|
||||||
|
plugins: {{
|
||||||
|
legend:{{position:'right',labels:{{boxWidth:10,padding:8,font:{{size:10}}}}}},
|
||||||
|
tooltip: tip()
|
||||||
|
}},
|
||||||
|
scales: {{ x: xAxis(), y: yAxis('Error stock (shares)') }}
|
||||||
|
}}
|
||||||
|
}});
|
||||||
|
|
||||||
// ── Aggregate charts ─────────────────────────────────────────
|
// ── Aggregate charts ─────────────────────────────────────────
|
||||||
const AGG_DATES = {agg_dates_str};
|
const AGG_DATES = {agg_dates_str};
|
||||||
const AGG_DELTA = {agg_delta_js};
|
const AGG_DELTA = {agg_delta_js};
|
||||||
|
|
@ -926,8 +1286,8 @@ def main():
|
||||||
parser.add_argument("--out", default="carmignac_broken_months.csv",
|
parser.add_argument("--out", default="carmignac_broken_months.csv",
|
||||||
help="Machine-readable output (loaded by carmignac_repair.py)")
|
help="Machine-readable output (loaded by carmignac_repair.py)")
|
||||||
parser.add_argument("--html", default="carmignac_diagnostics.html")
|
parser.add_argument("--html", default="carmignac_diagnostics.html")
|
||||||
parser.add_argument("--alpha", type=float, default=0.15,
|
parser.add_argument("--alpha", type=float, default=0.02,
|
||||||
help="Tolerance threshold (default 0.15 = 15%%)")
|
help="Tolerance threshold (default 0.02 = 2%%)")
|
||||||
parser.add_argument("--lag", type=int, default=3,
|
parser.add_argument("--lag", type=int, default=3,
|
||||||
help="Boundary days to test for accounting lag (default 3)")
|
help="Boundary days to test for accounting lag (default 3)")
|
||||||
args = parser.parse_args()
|
args = parser.parse_args()
|
||||||
|
|
@ -948,11 +1308,17 @@ def main():
|
||||||
df_broken, df_all = detect_broken_months(aum, flows, alpha=args.alpha, lag_days=args.lag)
|
df_broken, df_all = detect_broken_months(aum, flows, alpha=args.alpha, lag_days=args.lag)
|
||||||
df_agg = detect_aggregate_broken_months(aum, flows, alpha=args.alpha, lag_days=args.lag)
|
df_agg = detect_aggregate_broken_months(aum, flows, alpha=args.alpha, lag_days=args.lag)
|
||||||
|
|
||||||
|
print(f"\n[Error account] Building error account...")
|
||||||
|
df_err_isin, df_err_agg = build_error_account(aum, flows, lag_days=args.lag)
|
||||||
|
|
||||||
print_summary(df_broken, df_all, args.alpha)
|
print_summary(df_broken, df_all, args.alpha)
|
||||||
|
|
||||||
n_agg_broken = int(df_agg["broken"].sum())
|
n_agg_broken = int(df_agg["broken"].sum())
|
||||||
print(f" Aggregate broken months : {n_agg_broken} "
|
print(f" Aggregate broken months : {n_agg_broken} "
|
||||||
f"(of which lags: {int(df_agg['is_lag'].sum())})")
|
f"(of which lags: {int(df_agg['is_lag'].sum())})")
|
||||||
|
max_err = float(df_err_agg["stock_error_agg"].abs().max())
|
||||||
|
print(f" Max aggregate error stock : {max_err:,.1f} shares "
|
||||||
|
f"({float(df_err_agg['stock_error_agg_pct'].max()):.3f}% of total AUM)")
|
||||||
|
|
||||||
# CSV output — this is what carmignac_repair.py will load
|
# CSV output — this is what carmignac_repair.py will load
|
||||||
if len(df_broken):
|
if len(df_broken):
|
||||||
|
|
@ -962,7 +1328,15 @@ def main():
|
||||||
pd.DataFrame(columns=["date","isin","missing_pct","is_lag"]).to_csv(args.out, index=False)
|
pd.DataFrame(columns=["date","isin","missing_pct","is_lag"]).to_csv(args.out, index=False)
|
||||||
print(f"[Export] No broken months — empty CSV → {args.out}")
|
print(f"[Export] No broken months — empty CSV → {args.out}")
|
||||||
|
|
||||||
html = build_html(df_broken, df_all, df_agg, args.alpha)
|
# Error account CSV
|
||||||
|
err_out = args.out.replace("broken_months", "error_account")
|
||||||
|
df_err_isin.to_csv(err_out, index=False)
|
||||||
|
err_agg_out = err_out.replace("error_account", "error_account_agg")
|
||||||
|
df_err_agg.to_csv(err_agg_out, index=False)
|
||||||
|
print(f"[Export] Error account (ISIN) → {err_out}")
|
||||||
|
print(f"[Export] Error account (agg) → {err_agg_out}")
|
||||||
|
|
||||||
|
html = build_html(df_broken, df_all, df_agg, df_err_isin, df_err_agg, args.alpha)
|
||||||
with open(args.html, "w", encoding="utf-8") as f:
|
with open(args.html, "w", encoding="utf-8") as f:
|
||||||
f.write(html)
|
f.write(html)
|
||||||
print(f"[Export] HTML report → {args.html}")
|
print(f"[Export] HTML report → {args.html}")
|
||||||
|
|
|
||||||
|
|
@ -6,6 +6,7 @@ Carmignac Data Challenge — Registrar ID Repair Pipeline
|
||||||
Étape 3 : Chirurgie de code (matching 1-to-1)
|
Étape 3 : Chirurgie de code (matching 1-to-1)
|
||||||
"""
|
"""
|
||||||
|
|
||||||
|
import os
|
||||||
import pandas as pd
|
import pandas as pd
|
||||||
import numpy as np
|
import numpy as np
|
||||||
from collections import defaultdict
|
from collections import defaultdict
|
||||||
|
|
@ -19,6 +20,8 @@ ALPHA = 0.05 # tolérance réconciliation : 5% du stock à t
|
||||||
MIN_AUM_EUR = 5e6 # seuil filtrage étape 1 — 0 pour les heads de test, 5e6 en prod
|
MIN_AUM_EUR = 5e6 # seuil filtrage étape 1 — 0 pour les heads de test, 5e6 en prod
|
||||||
MIN_JACCARD = 0.3 # seuil minimal similarité portefeuille pour chirurgie
|
MIN_JACCARD = 0.3 # seuil minimal similarité portefeuille pour chirurgie
|
||||||
SCORE_DROP_THRESHOLD = 0.15 # si score chute de >15% → candidat chirurgie
|
SCORE_DROP_THRESHOLD = 0.15 # si score chute de >15% → candidat chirurgie
|
||||||
|
MAX_SURGERY_LOOKBACK = 6 # remonter jusqu'à 6 mois en arrière pour trouver un candidat
|
||||||
|
SYMMETRY_ATTENUATION = 0.05 # facteur d'atténuation si rupture symétrique détectée (cas 1/3)
|
||||||
|
|
||||||
# ── Broken months ──────────────────────────────────────────────
|
# ── Broken months ──────────────────────────────────────────────
|
||||||
# Attenuation factor applied to reconciliation errors on months flagged
|
# Attenuation factor applied to reconciliation errors on months flagged
|
||||||
|
|
@ -35,6 +38,15 @@ BROKEN_MONTH_ATTENUATION = 0.2 # reduce error to 20% on broken months
|
||||||
# attenuated (same factor as broken months).
|
# attenuated (same factor as broken months).
|
||||||
LAG_ATTENUATION = 0.2 # reduce error to 20% on likely lag months
|
LAG_ATTENUATION = 0.2 # reduce error to 20% on likely lag months
|
||||||
|
|
||||||
|
# ── Fenêtre de chirurgie étendue ───────────────────────────────
|
||||||
|
# Quand aucun bon candidat n'est trouvé à t-1, la chirurgie remonte
|
||||||
|
# jusqu'à MAX_SURGERY_LOOKBACK mois en arrière. Pour chaque mois k
|
||||||
|
# supplémentaire, le score composite est multiplié par un facteur de
|
||||||
|
# confiance décroissant : confidence(k) = 1 - (k-1)/MAX_SURGERY_LOOKBACK.
|
||||||
|
# Le client suggère 6 mois (délai maximal de résolution des transferts
|
||||||
|
# asymétriques, lié au cycle de paiement des rétrocessions trimestrielles).
|
||||||
|
MAX_SURGERY_LOOKBACK = 6
|
||||||
|
|
||||||
EXCLUDE_REGISTRAR = ["Off Distribution", "Private Clients"]
|
EXCLUDE_REGISTRAR = ["Off Distribution", "Private Clients"]
|
||||||
|
|
||||||
# ─────────────────────────────────────────────
|
# ─────────────────────────────────────────────
|
||||||
|
|
@ -276,11 +288,69 @@ def score_propagation(panel, monthly_flows, monthly_flows_lag, weights, universe
|
||||||
flows_idx = monthly_flows.set_index(['date', 'reg_id', 'isin'])['qty_net_month']
|
flows_idx = monthly_flows.set_index(['date', 'reg_id', 'isin'])['qty_net_month']
|
||||||
flows_idx_lag = monthly_flows_lag.set_index(['date', 'reg_id', 'isin'])['qty_net_month']
|
flows_idx_lag = monthly_flows_lag.set_index(['date', 'reg_id', 'isin'])['qty_net_month']
|
||||||
|
|
||||||
|
# ── Pré-calcul des AUM agrégés par (isin, mois) pour détection de symétrie ──
|
||||||
|
# Pour chaque (isin, t), on calcule la somme des variations de stock par compte.
|
||||||
|
# Une rupture symétrique = un compte perd X parts sur un ISIN, un autre en gagne X.
|
||||||
|
# On détecte cela via le résidu net agrégé : si faible → symétrie probable.
|
||||||
|
# Structure : {(t_curr, isin) → {reg_id → delta_qty}}
|
||||||
|
# Calculé à la volée dans la boucle, pas en pré-calcul (trop mémoire pour 400 comptes).
|
||||||
|
|
||||||
# Remonter dans le temps
|
# Remonter dans le temps
|
||||||
for i in range(len(all_months) - 2, -1, -1):
|
for i in range(len(all_months) - 2, -1, -1):
|
||||||
t_prev = all_months[i]
|
t_prev = all_months[i]
|
||||||
t_curr = all_months[i + 1]
|
t_curr = all_months[i + 1]
|
||||||
|
|
||||||
|
# ── Détection de ruptures symétriques à ce pas de temps ──────
|
||||||
|
# Pour chaque ISIN, calculer la variation de stock par compte.
|
||||||
|
# Si la somme des variations positives ≈ somme des variations négatives
|
||||||
|
# → il y a probablement compensation (cas 1 ou 3, pas de perte nette).
|
||||||
|
# On stocke pour chaque (reg_id, isin) si sa rupture est symétrique.
|
||||||
|
symmetric_breaks = set() # ensemble de (reg_id, isin) à atténuer
|
||||||
|
|
||||||
|
for reg in panel.columns.get_level_values(0):
|
||||||
|
for isin in panel[reg].columns:
|
||||||
|
q_t = panel[reg][isin].get(t_curr, np.nan)
|
||||||
|
q_prev = panel[reg][isin].get(t_prev, np.nan)
|
||||||
|
if pd.isna(q_t) or pd.isna(q_prev):
|
||||||
|
continue
|
||||||
|
try:
|
||||||
|
f = flows_idx.loc[(t_curr, reg, isin)]
|
||||||
|
except KeyError:
|
||||||
|
f = 0.0
|
||||||
|
residual = (q_t - q_prev) - f
|
||||||
|
if abs(residual) < ALPHA * max(abs(q_t), abs(q_prev), 1e-9):
|
||||||
|
continue # pas de rupture sur ce compte/ISIN
|
||||||
|
|
||||||
|
# Agrégation par ISIN : si le résidu net agrégé est petit,
|
||||||
|
# les ruptures individuelles se compensent → symétrie.
|
||||||
|
isin_residuals = {}
|
||||||
|
isin_total_abs = {}
|
||||||
|
for reg in panel.columns.get_level_values(0):
|
||||||
|
for isin in panel[reg].columns:
|
||||||
|
q_t = panel[reg][isin].get(t_curr, np.nan)
|
||||||
|
q_prev = panel[reg][isin].get(t_prev, np.nan)
|
||||||
|
if pd.isna(q_t) or pd.isna(q_prev):
|
||||||
|
continue
|
||||||
|
try:
|
||||||
|
f = flows_idx.loc[(t_curr, reg, isin)]
|
||||||
|
except KeyError:
|
||||||
|
f = 0.0
|
||||||
|
residual = (q_t - q_prev) - f
|
||||||
|
denom = max(abs(q_t), abs(q_prev), 1e-9)
|
||||||
|
err = abs(residual) / denom
|
||||||
|
if err < ALPHA:
|
||||||
|
continue
|
||||||
|
isin_residuals[isin] = isin_residuals.get(isin, 0.0) + residual
|
||||||
|
isin_total_abs[isin] = isin_total_abs.get(isin, 0.0) + abs(residual)
|
||||||
|
|
||||||
|
# Un ISIN est "symétrique" si le résidu net < 20% du résidu brut total
|
||||||
|
# (les erreurs individuelles s'annulent en grande partie)
|
||||||
|
symmetric_isins = set()
|
||||||
|
for isin, net in isin_residuals.items():
|
||||||
|
total = isin_total_abs.get(isin, 0.0)
|
||||||
|
if total > 0 and abs(net) / total < 0.20:
|
||||||
|
symmetric_isins.add(isin)
|
||||||
|
|
||||||
errors_at_t = {}
|
errors_at_t = {}
|
||||||
new_scores = {}
|
new_scores = {}
|
||||||
|
|
||||||
|
|
@ -335,12 +405,15 @@ def score_propagation(panel, monthly_flows, monthly_flows_lag, weights, universe
|
||||||
qty_t_prev, qty_t, net_flow, alpha=ALPHA
|
qty_t_prev, qty_t, net_flow, alpha=ALPHA
|
||||||
)
|
)
|
||||||
|
|
||||||
# ── Attenuation on broken / lag months ──────────────
|
# ── Attenuation on broken / lag / symmetric months ───
|
||||||
# If this (isin, month) is flagged as broken at market
|
# Priority: symmetric > broken > lag
|
||||||
# level, the error is not the account's fault — attenuate.
|
|
||||||
if err_ratio > 0:
|
if err_ratio > 0:
|
||||||
key = (t_curr, isin)
|
key = (t_curr, isin)
|
||||||
if key in broken_months or key in lag_months:
|
if isin in symmetric_isins:
|
||||||
|
# Rupture compensée à l'agrégé → cas 1 ou 3,
|
||||||
|
# pas de perte nette de données → atténuation forte
|
||||||
|
err_ratio = err_ratio * SYMMETRY_ATTENUATION
|
||||||
|
elif key in broken_months or key in lag_months:
|
||||||
# Try lag-window flow to distinguish lag vs genuine gap
|
# Try lag-window flow to distinguish lag vs genuine gap
|
||||||
try:
|
try:
|
||||||
net_flow_lag = flows_idx_lag.loc[(t_curr, reg_curr, isin)]
|
net_flow_lag = flows_idx_lag.loc[(t_curr, reg_curr, isin)]
|
||||||
|
|
@ -590,17 +663,26 @@ def run_surgery_pass(scores_history, errors_history, panel, monthly_flows,
|
||||||
|
|
||||||
# ── Candidats disponibles ──
|
# ── Candidats disponibles ──
|
||||||
# On exclut les codes déjà mappés à un autre compte,
|
# On exclut les codes déjà mappés à un autre compte,
|
||||||
# mais reg_curr lui-même est un candidat valide (self-mapping :
|
# mais reg_curr lui-même est un candidat valide (self-mapping).
|
||||||
# le compte existait déjà sous ce code à t-1, dormant ou partiel).
|
|
||||||
available = (all_regs_in_panel - set(mapping_inv.keys())) | {reg_curr}
|
available = (all_regs_in_panel - set(mapping_inv.keys())) | {reg_curr}
|
||||||
|
|
||||||
best_candidate = None
|
best_candidate = None
|
||||||
best_score_after = score_prev_no_surgery # baseline = pas de chirurgie
|
best_score_after = score_prev_no_surgery # baseline = pas de chirurgie
|
||||||
best_composite = 0.0
|
best_composite = 0.0
|
||||||
|
best_lookback = 0 # nombre de mois remontés pour trouver ce candidat
|
||||||
|
|
||||||
|
# ── Fenêtre de recherche étendue : jusqu'à MAX_SURGERY_LOOKBACK mois ──
|
||||||
|
# On cherche d'abord à t-1 (k=1), puis t-2 … t-MAX si rien trouvé.
|
||||||
|
# La confiance décroît avec la distance : confidence(k) = 1 - (k-1)/MAX
|
||||||
|
for k in range(1, MAX_SURGERY_LOOKBACK + 1):
|
||||||
|
if i - (k - 1) < 0:
|
||||||
|
break # on a atteint le début de l'historique
|
||||||
|
t_lookup = all_months[i - (k - 1)] # date candidate = t_prev - (k-1)
|
||||||
|
confidence = 1.0 - (k - 1) / MAX_SURGERY_LOOKBACK
|
||||||
|
|
||||||
for j in available:
|
for j in available:
|
||||||
# Pré-filtre rapide : overlap ISIN minimal
|
# Pré-filtre rapide : overlap ISIN minimal
|
||||||
isin_j = reg_isin_at_date.get(j, {}).get(t_prev, set())
|
isin_j = reg_isin_at_date.get(j, {}).get(t_lookup, set())
|
||||||
if not isin_curr or not isin_j:
|
if not isin_curr or not isin_j:
|
||||||
continue
|
continue
|
||||||
inter = len(isin_curr & isin_j)
|
inter = len(isin_curr & isin_j)
|
||||||
|
|
@ -610,18 +692,27 @@ def run_surgery_pass(scores_history, errors_history, panel, monthly_flows,
|
||||||
if jac < MIN_JACCARD:
|
if jac < MIN_JACCARD:
|
||||||
continue
|
continue
|
||||||
|
|
||||||
# Score après chirurgie avec ce candidat
|
# Score après chirurgie avec ce candidat à t_lookup
|
||||||
score_after = _recompute_score_with_candidate(
|
# (on utilise t_curr comme référence de stock, t_lookup comme prior)
|
||||||
reg_curr, j, t_prev, t_curr, panel, flows_idx, score_curr
|
score_after_raw = _recompute_score_with_candidate(
|
||||||
|
reg_curr, j, t_lookup, t_curr, panel, flows_idx, score_curr
|
||||||
)
|
)
|
||||||
composite = jac * (score_after / score_curr) if score_curr > 0 else 0
|
# Appliquer le facteur de confiance lié à la distance temporelle
|
||||||
|
score_after = score_curr * confidence * (score_after_raw / score_curr) if score_curr > 0 else score_after_raw
|
||||||
|
composite = jac * confidence * (score_after_raw / score_curr) if score_curr > 0 else 0
|
||||||
|
|
||||||
if score_after > best_score_after:
|
if score_after > best_score_after:
|
||||||
best_score_after = score_after
|
best_score_after = score_after
|
||||||
best_candidate = j
|
best_candidate = j
|
||||||
best_composite = composite
|
best_composite = composite
|
||||||
|
best_lookback = k
|
||||||
|
|
||||||
|
# Si on a trouvé un bon candidat à cette distance, on s'arrête
|
||||||
|
if best_candidate is not None:
|
||||||
|
break
|
||||||
|
|
||||||
if best_candidate:
|
if best_candidate:
|
||||||
|
lookback_note = f", lookback={best_lookback}m" if best_lookback > 1 else ""
|
||||||
surgery_log.append({
|
surgery_log.append({
|
||||||
'date': t_prev,
|
'date': t_prev,
|
||||||
'reg_orig': reg_orig,
|
'reg_orig': reg_orig,
|
||||||
|
|
@ -632,15 +723,15 @@ def run_surgery_pass(scores_history, errors_history, panel, monthly_flows,
|
||||||
'score_after': round(best_score_after, 6),
|
'score_after': round(best_score_after, 6),
|
||||||
'drop_without_surgery': round(drop_ratio, 4),
|
'drop_without_surgery': round(drop_ratio, 4),
|
||||||
'gain_vs_no_surgery': round(best_score_after - score_prev_no_surgery, 6),
|
'gain_vs_no_surgery': round(best_score_after - score_prev_no_surgery, 6),
|
||||||
|
'lookback_months': best_lookback,
|
||||||
})
|
})
|
||||||
print(f" 🔧 CHIRURGIE {t_prev.date()} | {reg_orig} : "
|
print(f" 🔧 CHIRURGIE {t_prev.date()} | {reg_orig} : "
|
||||||
f"{reg_curr} → {best_candidate} "
|
f"{reg_curr} → {best_candidate} "
|
||||||
f"(composite={best_composite:.3f}, "
|
f"(composite={best_composite:.3f}, "
|
||||||
f"score {score_curr:.4f}→{best_score_after:.4f})")
|
f"score {score_curr:.4f}→{best_score_after:.4f}"
|
||||||
|
f"{lookback_note})")
|
||||||
|
|
||||||
# Mise à jour mapping
|
# Mise à jour mapping
|
||||||
# Si self-mapping (best_candidate == reg_curr), on ne touche pas
|
|
||||||
# mapping_inv car le code ne change pas — on met juste à jour le score.
|
|
||||||
if best_candidate != reg_curr:
|
if best_candidate != reg_curr:
|
||||||
if reg_curr in mapping_inv:
|
if reg_curr in mapping_inv:
|
||||||
del mapping_inv[reg_curr]
|
del mapping_inv[reg_curr]
|
||||||
|
|
@ -681,7 +772,7 @@ def export_results(scores_history, mapping_history, surgery_log, all_months, out
|
||||||
df_scores = pd.DataFrame(rows) if rows else pd.DataFrame(columns=['date', 'reg_id', 'score'])
|
df_scores = pd.DataFrame(rows) if rows else pd.DataFrame(columns=['date', 'reg_id', 'score'])
|
||||||
if not df_scores.empty:
|
if not df_scores.empty:
|
||||||
df_scores = df_scores.sort_values(['date', 'score'], ascending=[True, False])
|
df_scores = df_scores.sort_values(['date', 'score'], ascending=[True, False])
|
||||||
df_scores.to_csv(f"repair_results/{out_prefix}_scores.csv", index=False)
|
df_scores.to_csv(f"/mnt/user-data/outputs/{out_prefix}_scores.csv", index=False)
|
||||||
|
|
||||||
# Mapping history
|
# Mapping history
|
||||||
rows_m = []
|
rows_m = []
|
||||||
|
|
@ -710,13 +801,13 @@ def export_results(scores_history, mapping_history, surgery_log, all_months, out
|
||||||
# ─────────────────────────────────────────────
|
# ─────────────────────────────────────────────
|
||||||
# 8. PIPELINE PRINCIPAL
|
# 8. PIPELINE PRINCIPAL
|
||||||
# ─────────────────────────────────────────────
|
# ─────────────────────────────────────────────
|
||||||
def run_pipeline(broken_months_path=None):
|
def run_pipeline(aum_path, flows_path, broken_months_path=None):
|
||||||
print("=" * 60)
|
print("=" * 60)
|
||||||
print("CARMIGNAC — Pipeline de réparation des Registrar IDs")
|
print("CARMIGNAC — Pipeline de réparation des Registrar IDs")
|
||||||
print("=" * 60)
|
print("=" * 60)
|
||||||
|
|
||||||
# Chargement
|
# Chargement
|
||||||
aum, flows = load_data()
|
aum, flows = load_data(aum_path, flows_path)
|
||||||
|
|
||||||
# Broken months (optional — produced by carmignac_diagnostics.py)
|
# Broken months (optional — produced by carmignac_diagnostics.py)
|
||||||
broken_months, lag_months = load_broken_months(broken_months_path)
|
broken_months, lag_months = load_broken_months(broken_months_path)
|
||||||
|
|
|
||||||
44
repair_challenge/push_s3.ipynb
Normal file
44
repair_challenge/push_s3.ipynb
Normal file
|
|
@ -0,0 +1,44 @@
|
||||||
|
{
|
||||||
|
"cells": [
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": null,
|
||||||
|
"id": "5c8fc6c5",
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"import os\n",
|
||||||
|
"import s3fs\n",
|
||||||
|
"\n",
|
||||||
|
"def push_file(local_path, s3_path):\n",
|
||||||
|
" fs = s3fs.S3FileSystem(\n",
|
||||||
|
" client_kwargs={'endpoint_url': 'https://' + 'minio-simple.lab.groupe-genes.fr'},\n",
|
||||||
|
" key=os.environ[\"AWS_ACCESS_KEY_ID\"],\n",
|
||||||
|
" secret=os.environ[\"AWS_SECRET_ACCESS_KEY\"],\n",
|
||||||
|
" token=os.environ[\"AWS_SESSION_TOKEN\"]\n",
|
||||||
|
" )\n",
|
||||||
|
"\n",
|
||||||
|
" with open(local_path, 'rb') as local_f, fs.open(s3_path, 'wb') as s3_f:\n",
|
||||||
|
" s3_f.write(local_f.read())"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": null,
|
||||||
|
"id": "d43b725e",
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"push_file('AUM_repaired.csv', 'projet-bdc-carmignac-g3//paco/AUM_repaired.csv')\n",
|
||||||
|
"push_file('AUM_paths.csv', 'projet-bdc-carmignac-g3//paco/AUM_paths.csv')"
|
||||||
|
]
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"metadata": {
|
||||||
|
"language_info": {
|
||||||
|
"name": "python"
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"nbformat": 4,
|
||||||
|
"nbformat_minor": 5
|
||||||
|
}
|
||||||
File diff suppressed because one or more lines are too long
Loading…
Reference in New Issue
Block a user