"""
Carmignac Data Challenge — Pipeline Results Analysis
=====================================================
Analyses the CSV outputs produced by carmignac_repair.py:
- carmignac_scores.csv (post-surgery score history)
- carmignac_mapping.csv (reg_id mapping history)
- carmignac_surgery_log.csv (surgery operations)
Produces a self-contained HTML report with interactive charts.
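Usage:
    python carmignac_analysis.py
    python carmignac_analysis.py --scores path/to/scores.csv \
                                 --mapping path/to/mapping.csv \
                                 --surgery path/to/surgery_log.csv \
                                 --out report.html
Optional error-account inputs (CSVs produced by carmignac_diagnostics.py);
the error-account section is only rendered when both files are supplied:
    python carmignac_analysis.py \
        --error-account-isin path/to/carmignac_error_account.csv \
        --error-account-agg path/to/carmignac_error_account_agg.csv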
"""
import argparse
import json
import os
import sys
import numpy as np
import pandas as pd
# ─────────────────────────────────────────────────────────────
# 1. LOAD & VALIDATE
# ─────────────────────────────────────────────────────────────
def load_outputs(scores_path, mapping_path, surgery_path,
                 err_isin_path=None, err_agg_path=None):
    """Read the pipeline CSV outputs and normalise their identifier columns.

    The three mandatory files (scores, mapping, surgery log) are always read;
    the two error-account files are read only when a path is given and the
    file exists on disk.

    Returns:
        ``(scores, mapping, surgery, err_isin, err_agg)`` where the last two
        entries are ``None`` when the corresponding file was not loaded.
    """
    def _read_dated(path):
        # Every CSV handled here carries a "date" column parsed as datetime.
        return pd.read_csv(path, parse_dates=["date"])

    def _read_if_present(path):
        # Optional inputs: silently skip a missing path or missing file.
        if not path or not os.path.exists(path):
            return None
        return _read_dated(path)

    scores, mapping, surgery = (
        _read_dated(path) for path in (scores_path, mapping_path, surgery_path)
    )

    # Identifiers are compared as text downstream, so force them to str.
    scores["reg_id"] = scores["reg_id"].astype(str)
    for column in ("reg_orig", "reg_used"):
        mapping[column] = mapping[column].astype(str)
    mapping["changed"] = mapping["changed"].astype(bool)
    for column in ("reg_orig", "reg_from", "reg_to"):
        surgery[column] = surgery[column].astype(str)

    # Older surgery logs lack this column; default it for backwards compat.
    if "lookback_months" not in surgery.columns:
        surgery["lookback_months"] = 1

    err_isin = _read_if_present(err_isin_path)
    if err_isin is not None:
        err_isin["isin"] = err_isin["isin"].astype(str)
    err_agg = _read_if_present(err_agg_path)

    return scores, mapping, surgery, err_isin, err_agg
# ─────────────────────────────────────────────────────────────
# 1b. LOAD ERROR ACCOUNT (optional)
# ─────────────────────────────────────────────────────────────
def load_error_account(isin_path, agg_path):
    """Load the error account CSVs produced by carmignac_diagnostics.py.

    Args:
        isin_path: Path to the per-ISIN error account CSV.
        agg_path: Path to the aggregated error account CSV.

    Returns:
        ``(df_err_isin, df_err_agg)``, or ``(None, None)`` when either path is
        missing/empty or when the files cannot be read or lack the expected
        ``date`` / ``isin`` columns. Loading is best-effort: a warning is
        printed instead of raising.
    """
    if not isin_path or not agg_path:
        return None, None
    try:
        ei = pd.read_csv(isin_path, parse_dates=["date"])
        ea = pd.read_csv(agg_path, parse_dates=["date"])
        ei["isin"] = ei["isin"].astype(str)
    except (OSError, ValueError, KeyError) as e:
        # OSError: missing/unreadable file; ValueError: parse errors, empty
        # file or absent "date" column; KeyError: absent "isin" column.
        # Anything else is a programming error and should surface.
        print(f"[WARN] Could not load error account: {e}")
        return None, None
    print(f"[Load] error account (ISIN) : {len(ei)} rows, "
          f"{ei['isin'].nunique()} ISINs")
    print(f"[Load] error account (agg) : {len(ea)} rows")
    return ei, ea
# ─────────────────────────────────────────────────────────────
# 2. COMPUTE ANALYTICS
# ─────────────────────────────────────────────────────────────
def compute_analytics(scores, mapping, surgery):
    """Derive the time series and tables consumed by the summary and report.

    Args:
        scores: Frame with ``date``, ``reg_id`` and ``score`` columns.
        mapping: Frame with ``date``, ``reg_orig`` and boolean ``changed``.
        surgery: Frame with ``date``, ``reg_orig``, ``gain_vs_no_surgery``,
            ``jaccard_composite``, ``score_before`` and ``score_after``.

    Returns:
        dict with keys ``timeline`` (sum_post, sum_pre, recovery_pct, entropy
        per date), ``surgery_stats`` (per-date aggregates), ``pivot``
        (dates x reg_ids scores), ``churn`` (remapped count per date),
        ``ever_remapped`` (set of reg_orig), ``surgery_detail`` (surgery copy
        plus ``gain_pct_of_score``) and ``dates`` (``YYYY-MM-DD`` strings).
    """
    # Normalise to a sorted DatetimeIndex. Depending on the pandas version,
    # Series.unique() may return raw numpy.datetime64 values, which have no
    # .strftime(); a DatetimeIndex always iterates as pandas Timestamps.
    dates = pd.DatetimeIndex(scores["date"].unique()).sort_values()

    # ── 2.1 Sum of scores per date (post-surgery) ──────────────
    sum_post = (scores.groupby("date")["score"]
                .sum()
                .reindex(dates)
                .rename("sum_post"))

    # ── 2.2 Reconstruct pre-surgery (counterfactual) ───────────
    # Without surgery, every reg_id that had a hard break would score 0
    # from that date backwards. We propagate the surgery "gain" as a
    # cumulative deficit going back in time.
    gain_by_date = surgery.groupby("date")["gain_vs_no_surgery"].sum()
    if gain_by_date.empty:
        # No surgeries: nothing to subtract, and this avoids comparing a
        # possibly non-datetime empty index against Timestamps.
        cumulative_deficit = pd.Series(0.0, index=dates)
    else:
        # cumulative deficit = sum of gains for all surgeries at or after t
        cumulative_deficit = pd.Series(
            [gain_by_date[gain_by_date.index >= d].sum() for d in dates],
            index=dates,
            dtype=float,
        )
    sum_pre = (sum_post - cumulative_deficit).clip(lower=0).rename("sum_pre")

    timeline = pd.DataFrame({"sum_post": sum_post, "sum_pre": sum_pre})
    timeline.index = pd.to_datetime(timeline.index)
    # Share of the post-surgery total that exists only thanks to surgery;
    # the clip guards against division by zero.
    timeline["recovery_pct"] = np.where(
        sum_pre < sum_post,
        (sum_post - sum_pre) / sum_post.clip(lower=1e-9) * 100,
        0.0,
    )

    # ── 2.3 Per-date surgery stats ─────────────────────────────
    surgery_stats = (
        surgery.groupby("date")
        .agg(
            n_surgeries=("reg_orig", "count"),
            total_gain=("gain_vs_no_surgery", "sum"),
            avg_gain=("gain_vs_no_surgery", "mean"),
            avg_jaccard=("jaccard_composite", "mean"),
            avg_score_before=("score_before", "mean"),
            avg_score_after=("score_after", "mean"),
        )
        .reindex(dates, fill_value=0)
    )

    # ── 2.4 Score distribution over time ───────────────────────
    # Wide format: rows=dates, cols=reg_ids
    pivot = scores.pivot_table(index="date", columns="reg_id",
                               values="score", aggfunc="last")
    pivot = pivot.reindex(dates)
    pivot.index = pd.to_datetime(pivot.index)

    # ── 2.5 Mapping churn ──────────────────────────────────────
    # For each date, how many reg_ids are remapped (not using their
    # original code)?
    churn = (mapping.groupby("date")["changed"]
             .sum()
             .reindex(dates, fill_value=0)
             .rename("n_remapped"))

    # ── 2.6 Score entropy (distribution spread) ────────────────
    def entropy(row):
        # Shannon entropy (natural log) of the positive scores on one date,
        # normalised to sum to 1; NaN when no positive score exists.
        p = row.dropna()
        p = p[p > 0]
        if len(p) == 0:
            return np.nan
        p = p / p.sum()
        return -(p * np.log(p)).sum()

    timeline["entropy"] = pivot.apply(entropy, axis=1).values

    # ── 2.7 Individual score trajectories ──────────────────────
    # Identify which reg_ids were ever remapped
    ever_remapped = set(mapping.loc[mapping["changed"], "reg_orig"].unique())

    # ── 2.8 Surgery detail table ───────────────────────────────
    surgery_detail = surgery.copy()
    surgery_detail["gain_pct_of_score"] = (
        surgery_detail["gain_vs_no_surgery"]
        / surgery_detail["score_before"].clip(lower=1e-9) * 100
    ).round(2)

    return {
        "timeline": timeline,
        "surgery_stats": surgery_stats,
        "pivot": pivot,
        "churn": churn,
        "ever_remapped": ever_remapped,
        "surgery_detail": surgery_detail,
        "dates": [d.strftime("%Y-%m-%d") for d in dates],
    }
# ─────────────────────────────────────────────────────────────
# 3. PRINT CONSOLE SUMMARY
# ─────────────────────────────────────────────────────────────
def print_summary(analytics, surgery):
    """Print a human-readable summary of the analytics to stdout.

    Args:
        analytics: dict as returned by ``compute_analytics``; the keys read
            here are ``timeline``, ``pivot``, ``churn`` and ``ever_remapped``.
        surgery: surgery log frame; one table row is printed per operation.
    """
    tl = analytics["timeline"]
    print("\n" + "=" * 65)
    print(" CARMIGNAC PIPELINE — RESULTS SUMMARY")
    print("=" * 65)
    print(f"\n Date range : {tl.index.min().date()} → {tl.index.max().date()}")
    print(f" Total months : {len(tl)}")
    print(f" Reg IDs : {analytics['pivot'].shape[1]}")

    print("\n ── Score (Σ) ──────────────────────────────────────────")
    print(f" At t_ref (latest) : {tl['sum_post'].iloc[-1]:.6f}")
    print(f" At t_min (earliest): {tl['sum_post'].iloc[0]:.6f}")
    print(f" Min (post-surgery) : {tl['sum_post'].min():.6f} "
          f"({tl['sum_post'].idxmin().date()})")
    print(f" Min (pre-surgery) : {tl['sum_pre'].min():.6f} "
          f"({tl['sum_pre'].idxmin().date()})")
    print(f" Max recovery (pct) : {tl['recovery_pct'].max():.2f}%")

    print("\n ── Surgeries ─────────────────────────────────────────")
    if len(surgery) == 0:
        print(" No surgeries performed.")
    else:
        print(f" Total operations : {len(surgery)}")
        print(f" Total score gained : {surgery['gain_vs_no_surgery'].sum():.6f}")
        print(f" Avg Jaccard : {surgery['jaccard_composite'].mean():.4f}")
        print(f" Avg gain / surgery : {surgery['gain_vs_no_surgery'].mean():.6f}")
        print()
        print(f" {'Date':12s} {'Reg orig':12s} {'From':15s} {'To':15s} "
              f"{'Jaccard':>8s} {'Gain':>10s}")
        print(" " + "-" * 78)
        for _, row in surgery.sort_values("date").iterrows():
            # str() keeps the "s" format spec valid even when the registrar
            # columns were not normalised to text (e.g. integer IDs).
            print(f" {str(row['date'].date()):12s} {str(row['reg_orig']):12s} "
                  f"{str(row['reg_from']):15s} {str(row['reg_to']):15s} "
                  f"{row['jaccard_composite']:8.4f} {row['gain_vs_no_surgery']:10.6f}")

    print("\n ── Mapping churn ─────────────────────────────────────")
    ch = analytics["churn"]
    peak = ch.max()
    # The date is only meaningful when at least one reg_id was remapped.
    peak_date = ch.idxmax().date() if peak > 0 else 'N/A'
    print(f" Max remapped at one date : {int(peak)} ({peak_date})")
    print(f" Reg IDs ever remapped : {len(analytics['ever_remapped'])}")

    print("\n ── Score entropy (distribution spread) ───────────────")
    ent = tl["entropy"]
    print(f" Mean entropy : {ent.mean():.4f}")
    print(f" Std entropy : {ent.std():.4f}")
    print()
# ─────────────────────────────────────────────────────────────
# 4. BUILD HTML REPORT
# ─────────────────────────────────────────────────────────────
def build_html(analytics, surgery, scores, mapping, df_err_isin=None, df_err_agg=None):
# Assemble the report as one HTML string and return it.
#
# Parameters
#   analytics   : dict; the keys read below are "timeline", "surgery_stats",
#                 "pivot", "churn", "dates", "ever_remapped", "surgery_detail".
#   surgery     : surgery log frame (iterated for the per-reg surgery lookup
#                 and used for the "no surgeries" checks).
#   scores, mapping : accepted but not referenced in the body visible here.
#   df_err_isin, df_err_agg : optional error-account frames; the error section
#                 is built only when BOTH are provided.
#
# NOTE(review): in this copy of the file the HTML/JS string literals appear to
# have lost their markup (several literals are split across lines and the
# template contains only text). Restore the literals from the original before
# relying on this function; the comments below describe the Python logic only.
# NOTE(review): values such as ISINs and registrar IDs are interpolated into
# the output without HTML escaping — confirm the inputs are trusted.
tl = analytics["timeline"]
ss = analytics["surgery_stats"]
piv = analytics["pivot"]
ch = analytics["churn"]
dates_str = analytics["dates"]
# ── helpers to serialise for JS ─────────────────────────────
# jf: JSON array of floats rounded to `decimals`; NaN entries become null.
def jf(arr, decimals=6):
return json.dumps([round(float(v), decimals) if not np.isnan(v) else None
for v in arr])
# js: JSON-encode any iterable as a list.
def js(arr):
return json.dumps(list(arr))
# ── colour palette ───────────────────────────────────────────
# Colours are assigned by position and reused cyclically (index modulo length).
REG_COLORS = [
"#2563eb","#16a34a","#dc2626","#d97706","#7c3aed",
"#0891b2","#db2777","#65a30d","#ea580c","#6366f1",
"#059669","#b45309","#9333ea","#0284c7","#e11d48",
]
# ── 4.1 Surgery sparkline data ──────────────────────────────
surg_dates = [d.strftime("%Y-%m-%d") for d in ss.index]
n_surg = jf(ss["n_surgeries"].values, 0)
total_gain = jf(ss["total_gain"].values)
avg_gain = jf(ss["avg_gain"].values)
avg_jaccard = jf(ss["avg_jaccard"].values)
# ── 4.2 Individual trajectories ────────────────────────────
reg_ids = list(piv.columns)
traj_datasets = []
# Surgery lookup: reg_orig -> list of {date, from, to, composite}
surg_by_reg = {}
for _, row in surgery.iterrows():
surg_by_reg.setdefault(row["reg_orig"], []).append({
"date": row["date"].strftime("%Y-%m-%d"),
"reg_from": str(row["reg_from"]),
"reg_to": str(row["reg_to"]),
"composite": round(float(row["jaccard_composite"]), 4),
"gain": round(float(row["gain_vs_no_surgery"]), 6),
})
# One dataset dict per reg_id column of the pivot; ids that were ever remapped
# get a dashed border and carry their list of surgeries.
for idx, rid in enumerate(reg_ids):
remapped = rid in analytics["ever_remapped"]
traj_datasets.append({
"label": rid,
"data": [round(float(v), 6) if not np.isnan(v) else None
for v in piv[rid].values],
"borderColor": REG_COLORS[idx % len(REG_COLORS)],
"backgroundColor": REG_COLORS[idx % len(REG_COLORS)] + "22",
"borderWidth": 2,
"borderDash": [6, 3] if remapped else [],
"pointRadius": 0,
"tension": 0.3,
"fill": False,
"remapped": remapped,
"surgeries": surg_by_reg.get(rid, []),
})
traj_json = json.dumps(traj_datasets)
# ── 4.2b Error account data (optional) ────────────────────
# Both frames are required; otherwise the section and its JS are empty strings.
has_error = df_err_isin is not None and df_err_agg is not None
if has_error:
err_dates = [d.strftime("%Y-%m-%d") for d in pd.to_datetime(df_err_agg["date"])]
err_agg_stock = [round(float(v), 3) if not pd.isna(v) else None
for v in df_err_agg["stock_error_agg"].values]
err_agg_res = [round(float(v), 3) if not pd.isna(v) else None
for v in df_err_agg["residual_agg"].values]
err_agg_pct = [round(float(v), 4) if not pd.isna(v) else None
for v in df_err_agg["stock_error_agg_pct"].values]
# Top 5 ISINs by max |stock error|
top_err = (df_err_isin.groupby("isin")["stock_error"]
.apply(lambda x: x.abs().max())
.nlargest(5).index.tolist())
all_err_dates = sorted(df_err_isin["date"].unique())
ERR_COLORS = ["#ef4444","#f59e0b","#8b5cf6","#06b6d4","#10b981"]
err_isin_ds = []
# Each top ISIN series is aligned on the full list of error dates; dates with
# no row for that ISIN become None in the serialised data.
for idx, isin in enumerate(top_err):
sub = (df_err_isin[df_err_isin["isin"] == isin]
.set_index("date")["stock_error"]
.reindex(all_err_dates))
err_isin_ds.append({
"label": isin,
"data": [round(float(v), 3) if not pd.isna(v) else None for v in sub.values],
"borderColor": ERR_COLORS[idx % len(ERR_COLORS)],
"backgroundColor": ERR_COLORS[idx % len(ERR_COLORS)] + "22",
"borderWidth": 1.5, "pointRadius": 0, "tension": 0.3, "fill": False,
})
# Headline figures for the aggregate error stock. "stationarity" is
# std / mean(|x|); the max(..., 1e-9) guards against division by zero.
max_err_stock = float(df_err_agg["stock_error_agg"].abs().max())
max_err_pct = float(df_err_agg["stock_error_agg_pct"].max())
agg_std = float(df_err_agg["stock_error_agg"].std())
agg_mean = float(df_err_agg["stock_error_agg"].abs().mean())
stationarity = round(agg_std / max(agg_mean, 1e-9), 3)
err_dates_js = json.dumps(err_dates)
err_agg_stock_js = json.dumps(err_agg_stock)
err_agg_res_js = json.dumps(err_agg_res)
err_agg_pct_js = json.dumps(err_agg_pct)
err_isin_ds_js = json.dumps(err_isin_ds)
err_isin_dates_js = json.dumps([d.strftime("%Y-%m-%d") if hasattr(d, "strftime")
else str(d)[:10] for d in all_err_dates])
# ISIN detail table (top 100 worst)
# Rows are ranked by |stock_error|; the style strings pick a CSS colour from the
# sign of the value (negative -> danger) or from percentage thresholds (5 / 1).
err_rows = []
for _, r in (df_err_isin.assign(abs_s=df_err_isin["stock_error"].abs())
.sort_values("abs_s", ascending=False)
.head(100).iterrows()):
ds = r["date"].strftime("%Y-%m-%d") if hasattr(r["date"], "strftime") else str(r["date"])[:10]
sc = "color:var(--danger)" if r["stock_error"] < 0 else "color:var(--warn)"
rc = "color:var(--danger)" if r["residual"] < 0 else "color:var(--warn)"
pch = "color:var(--danger);font-weight:600" if r["stock_error_pct"] > 5 else ("color:var(--warn)" if r["stock_error_pct"] > 1 else "")
err_rows.append(
f'
| {ds} | '
f'{r["isin"]} | '
f'{r["residual"]:+,.2f} | '
f'{r["stock_error"]:+,.2f} | '
f'{r["stock_error_pct"]:.3f}% | '
f'
'
)
err_isin_detail = "".join(err_rows) if err_rows else (
'| ✓ Error account is flat |
'
)
# HTML block for error account section
err_section_html = f"""
06 · Error Account
Max |error stock|
{max_err_stock:,.1f} shares
Max % of total AUM
{max_err_pct:.3f}%
Stationarity σ/μ
{stationarity:.3f}
lower = more stationary
| Date | ISIN |
Monthly residual |
Cumul. stock |
% of max AUM |
{err_isin_detail}
"""
# JS block for error account charts
# NOTE(review): err_js_block is not referenced in the template text visible
# below — presumably it was inside markup that is missing here; verify.
err_js_block = f"""
// ── 8. Error account charts ──────────────────────────────────
const ERR_DATES = {err_dates_js};
const ERR_AGG_STOCK = {err_agg_stock_js};
const ERR_AGG_RES = {err_agg_res_js};
const ERR_ISIN_TS = {err_isin_ds_js};
const ERR_ISIN_DATES = {err_isin_dates_js};
new Chart(document.getElementById('chartErrStock'), {{
type: 'line',
data: {{ labels: ERR_DATES, datasets: [{{
label: 'Aggregate error stock', data: ERR_AGG_STOCK,
borderColor: '#ef4444', backgroundColor: '#ef444415',
borderWidth: 2, pointRadius: 0, tension: 0.3, fill: true
}}] }},
options: {{
responsive: true, maintainAspectRatio: false,
interaction: {{mode:'index', intersect:false}},
plugins: {{ legend: {{display:false}}, tooltip: tooltip() }},
scales: {{ x: timeAxis(), y: {{
...yAxis('Shares'),
grid: {{ color: ctx => ctx.tick.value === 0 ? '#ffffff55' : '#1a2030',
lineWidth: ctx => ctx.tick.value === 0 ? 1.5 : 1 }}
}} }}
}}
}});
new Chart(document.getElementById('chartErrRes'), {{
type: 'bar',
data: {{ labels: ERR_DATES, datasets: [{{
label: 'Monthly residual', data: ERR_AGG_RES,
backgroundColor: ERR_AGG_RES.map(v => v != null && v < 0 ? '#ef444488' : '#f59e0b88'),
borderColor: ERR_AGG_RES.map(v => v != null && v < 0 ? '#ef4444' : '#f59e0b'),
borderWidth: 1, borderRadius: 2
}}] }},
options: {{
responsive: true, maintainAspectRatio: false,
plugins: {{ legend: {{display:false}}, tooltip: tooltip() }},
scales: {{ x: timeAxis(), y: yAxis('Shares') }}
}}
}});
new Chart(document.getElementById('chartErrIsin'), {{
type: 'line',
data: {{ labels: ERR_ISIN_DATES, datasets: ERR_ISIN_TS }},
options: {{
responsive: true, maintainAspectRatio: false,
interaction: {{mode:'index', intersect:false}},
plugins: {{
legend: {{position:'right', labels:{{boxWidth:10, padding:8, font:{{size:10}}}}}},
tooltip: tooltip()
}},
scales: {{ x: timeAxis(), y: yAxis('Error stock (shares)') }}
}}
}});"""
else:
err_section_html = ""
err_js_block = ""
# ── 4.3 Surgery detail table rows ──────────────────────────
# Rows are emitted in chronological order; gains above 0.05 get the
# "gain-high" class name, everything else "gain-low".
sd = analytics["surgery_detail"].sort_values("date")
surg_rows_html = ""
if len(sd) == 0:
surg_rows_html = "| No surgeries performed |
"
else:
for _, r in sd.iterrows():
gain_class = "gain-high" if r["gain_vs_no_surgery"] > 0.05 else "gain-low"
# Lookback defaults to 1 when the column is absent and is only displayed
# when it exceeds one month; otherwise an em dash is shown.
lb = int(r.get("lookback_months", 1))
lb_cell = (f'{lb}m' if lb > 1 else "—")
surg_rows_html += f"""
| {r['date'].date()} |
{r['reg_orig']} |
{r['reg_from']} |
→ |
{r['reg_to']} |
{r['jaccard_composite']:.4f} |
+{r['gain_vs_no_surgery']:.6f} |
{r['gain_pct_of_score']:.1f}% |
{lb_cell} |
"""
# ── 4.4 Top accounts table ──────────────────────────────────
# Ranking uses the scores on the latest pivot date, highest first; bar_w is
# each score as an integer percentage of the top score.
last_date = piv.index.max()
top_accounts = piv.loc[last_date].dropna().sort_values(ascending=False)
top_rows_html = ""
for rank, (rid, sc) in enumerate(top_accounts.items(), 1):
remapped = "✓" if rid in analytics["ever_remapped"] else ""
bar_w = int(sc / top_accounts.max() * 100)
color = REG_COLORS[(rank - 1) % len(REG_COLORS)]
top_rows_html += f"""
| #{rank} |
{rid} |
{sc:.6f} |
|
{remapped} |
"""
# ─────────────────────────────────────────────────────────────
# HTML TEMPLATE
# ─────────────────────────────────────────────────────────────
html = f"""
Carmignac Pipeline — Analysis Report
Σ score at t_ref
{tl['sum_post'].iloc[-1]:.4f}
post-surgery
Σ score at t_min
{tl['sum_post'].iloc[0]:.4f}
post-surgery
Max recovery
{tl['recovery_pct'].max():.1f}%
score rescued by surgery
Total surgeries
{len(surgery)}
operations performed
Reg IDs universe
{piv.shape[1]}
at reference date
Ever remapped
{len(analytics['ever_remapped'])}
reg IDs w/ code change
01 · Score Integrity Over Time
02 · Individual Score Trajectories
03 · Surgery Operations
04 · Surgery Detail Log
{'
No surgeries were performed on this dataset.
' if len(surgery) == 0 else f"""
| Date |
Reg orig |
Code from |
|
Code to |
Jaccard |
Score gain |
% of score |
Lookback |
{surg_rows_html}
"""}
05 · Score Ranking at t_ref
| Rank |
Registrar ID |
Score (weight) |
Relative size |
Remapped |
{top_rows_html}
{err_section_html}
"""
return html
# ─────────────────────────────────────────────────────────────
# 5. MAIN
# ─────────────────────────────────────────────────────────────
def main():
    """Command-line entry point: load the CSVs, analyse them, write the report.

    Input paths are tried as given first and then relative to this script's
    directory. A missing required input terminates the process via
    ``sys.exit``; a missing optional error-account file only prints a warning.
    The directory of ``--out`` is created when it does not exist yet.
    """
    parser = argparse.ArgumentParser(description="Carmignac pipeline results analyser")
    parser.add_argument("--scores", default="repair_results/carmignac_scores.csv")
    parser.add_argument("--mapping", default="repair_results/carmignac_mapping.csv")
    parser.add_argument("--surgery", default="repair_results/carmignac_surgery_log.csv")
    parser.add_argument("--out", default="repair_results/carmignac_report.html")
    parser.add_argument("--error-account-isin", default=None,
                        dest="error_isin",
                        help="Path to carmignac_error_account.csv (optional)")
    parser.add_argument("--error-account-agg", default=None,
                        dest="error_agg",
                        help="Path to carmignac_error_account_agg.csv (optional)")
    args = parser.parse_args()

    # Resolve paths relative to this script's directory if files not found
    base = os.path.dirname(os.path.abspath(__file__))

    def resolve(p, required=True):
        """Return an existing path for *p*, or None for absent optional files."""
        if p is None:
            return None
        if os.path.exists(p):
            return p
        alt = os.path.join(base, p)
        if os.path.exists(alt):
            return alt
        if required:
            sys.exit(f"[ERROR] File not found: {p}")
        print(f"[WARN] Optional file not found: {p}")
        return None

    scores_path = resolve(args.scores)
    mapping_path = resolve(args.mapping)
    surgery_path = resolve(args.surgery)
    error_isin_path = resolve(args.error_isin, required=False)
    error_agg_path = resolve(args.error_agg, required=False)

    print(f"[Load] scores : {scores_path}")
    print(f"[Load] mapping : {mapping_path}")
    print(f"[Load] surgery : {surgery_path}")
    scores, mapping, surgery, df_err_isin, df_err_agg = load_outputs(
        scores_path, mapping_path, surgery_path,
        err_isin_path=error_isin_path, err_agg_path=error_agg_path
    )

    analytics = compute_analytics(scores, mapping, surgery)
    print_summary(analytics, surgery)
    html = build_html(analytics, surgery, scores, mapping,
                      df_err_isin=df_err_isin, df_err_agg=df_err_agg)

    out_path = args.out
    # Inputs may have been found next to the script while --out is relative to
    # the working directory, so the target folder is not guaranteed to exist.
    out_dir = os.path.dirname(out_path)
    if out_dir:
        os.makedirs(out_dir, exist_ok=True)
    with open(out_path, "w", encoding="utf-8") as f:
        f.write(html)
    print(f"\n[Report] Written to → {out_path}")
# Run the analysis only when this file is executed as a script, not on import.
if __name__ == "__main__":
main()