"""
Carmignac Data Challenge — Pipeline Results Analysis
=====================================================
Analyses the CSV outputs produced by carmignac_repair.py:
  - carmignac_scores.csv       (post-surgery score history)
  - carmignac_mapping.csv      (reg_id mapping history)
  - carmignac_surgery_log.csv  (surgery operations)

Produces a self-contained HTML report with interactive charts.

Usage:
    python carmignac_analysis.py
    python carmignac_analysis.py --scores path/to/scores.csv \
                                 --mapping path/to/mapping.csv \
                                 --surgery path/to/surgery_log.csv \
                                 --out report.html
"""

import argparse
import json
import os
import sys

import numpy as np
import pandas as pd


# ─────────────────────────────────────────────────────────────
# 1. LOAD & VALIDATE
# ─────────────────────────────────────────────────────────────

def load_outputs(scores_path, mapping_path, surgery_path,
                 err_isin_path=None, err_agg_path=None):
    """Read the pipeline CSVs and normalise their identifier columns.

    Every file is read with its ``date`` column parsed as a datetime.

    Args:
        scores_path: CSV with at least ``date`` and ``reg_id`` columns.
        mapping_path: CSV with ``date``, ``reg_orig``, ``reg_used``, ``changed``.
        surgery_path: CSV with ``date``, ``reg_orig``, ``reg_from``, ``reg_to``.
        err_isin_path: optional per-ISIN error-account CSV (needs ``isin``).
        err_agg_path: optional aggregate error-account CSV.

    Returns:
        ``(scores, mapping, surgery, err_isin, err_agg)``. Each of the last
        two is ``None`` when its path is falsy or does not exist on disk.
    """
    scores, mapping, surgery = (
        pd.read_csv(csv_path, parse_dates=["date"])
        for csv_path in (scores_path, mapping_path, surgery_path)
    )

    # Identifier columns are compared as strings everywhere downstream.
    scores["reg_id"] = scores["reg_id"].astype(str)
    for column in ("reg_orig", "reg_used"):
        mapping[column] = mapping[column].astype(str)
    mapping["changed"] = mapping["changed"].astype(bool)
    for column in ("reg_orig", "reg_from", "reg_to"):
        surgery[column] = surgery[column].astype(str)

    # Older surgery logs have no lookback column; default it to one month.
    if "lookback_months" not in surgery.columns:
        surgery["lookback_months"] = 1

    # The two error-account files are optional and loaded independently.
    err_isin = err_agg = None
    if err_isin_path and os.path.exists(err_isin_path):
        err_isin = pd.read_csv(err_isin_path, parse_dates=["date"])
        err_isin["isin"] = err_isin["isin"].astype(str)
    if err_agg_path and os.path.exists(err_agg_path):
        err_agg = pd.read_csv(err_agg_path, parse_dates=["date"])

    return scores, mapping, surgery, err_isin, err_agg


# ─────────────────────────────────────────────────────────────
# 1b. LOAD ERROR ACCOUNT (optional)
# ─────────────────────────────────────────────────────────────

def load_error_account(isin_path, agg_path):
    """
    Loads the error account CSVs produced by carmignac_diagnostics.py.

    Both paths are required: if either is falsy nothing is read.

    Returns (df_err_isin, df_err_agg), or (None, None) if a path is missing
    or a file cannot be read / lacks the expected ``date`` or ``isin`` column.
    """
    if not isin_path or not agg_path:
        return None, None
    try:
        ei = pd.read_csv(isin_path, parse_dates=["date"])
        ea = pd.read_csv(agg_path, parse_dates=["date"])
        ei["isin"] = ei["isin"].astype(str)
    except (OSError, ValueError, KeyError) as e:
        # Best effort: the error account is optional, so warn and carry on.
        # OSError covers missing/unreadable files, ValueError covers pandas
        # parse errors, KeyError covers a missing "isin" column.
        print(f"[WARN] Could not load error account: {e}")
        return None, None
    print(f"[Load] error account (ISIN) : {len(ei)} rows, "
          f"{ei['isin'].nunique()} ISINs")
    print(f"[Load] error account (agg) : {len(ea)} rows")
    return ei, ea


# ─────────────────────────────────────────────────────────────
# 2. COMPUTE ANALYTICS
# ─────────────────────────────────────────────────────────────

def compute_analytics(scores, mapping, surgery):
    """Derive the per-date series and tables consumed by the report.

    Args:
        scores: frame with ``date``, ``reg_id``, ``score``.
        mapping: frame with ``date``, ``reg_orig``, boolean ``changed``.
        surgery: frame with ``date``, ``reg_orig``, ``gain_vs_no_surgery``,
            ``jaccard_composite``, ``score_before``, ``score_after``.

    Returns:
        dict with keys ``timeline`` (sum_post, sum_pre, recovery_pct,
        entropy per date), ``surgery_stats``, ``pivot`` (dates × reg_ids),
        ``churn``, ``ever_remapped`` (set of reg_orig), ``surgery_detail``
        and ``dates`` (``YYYY-MM-DD`` strings).
    """
    # Series.unique() may hand back raw numpy datetime64 values, which have
    # no .strftime(); coerce every element to a pandas Timestamp up front.
    dates = [pd.Timestamp(d) for d in sorted(scores["date"].unique())]

    # ── 2.1 Sum of scores per date (post-surgery) ──────────────
    sum_post = (scores.groupby("date")["score"]
                .sum()
                .reindex(dates)
                .rename("sum_post"))

    # ── 2.2 Reconstruct pre-surgery (counterfactual) ───────────
    # Without surgery, every reg_id that had a hard break would score 0
    # from that date backwards. We propagate the surgery "gain" as a
    # cumulative deficit going back in time.
    gain_by_date = surgery.groupby("date")["gain_vs_no_surgery"].sum()
    # cumulative deficit = sum of gains for all surgeries at or after date t
    cumulative_deficit = pd.Series(
        [gain_by_date[gain_by_date.index >= d].sum() for d in dates],
        index=dates,
        dtype=float,
    )

    sum_pre = (sum_post - cumulative_deficit).clip(lower=0).rename("sum_pre")

    timeline = pd.DataFrame({"sum_post": sum_post, "sum_pre": sum_pre})
    timeline.index = pd.to_datetime(timeline.index)
    # The denominator is clipped so a zero post-surgery sum cannot divide by 0.
    timeline["recovery_pct"] = np.where(
        sum_pre < sum_post,
        (sum_post - sum_pre) / sum_post.clip(lower=1e-9) * 100,
        0.0,
    )

    # ── 2.3 Per-date surgery stats ─────────────────────────────
    # Dates without any surgery are filled with 0 in every column.
    surgery_stats = (
        surgery.groupby("date")
        .agg(
            n_surgeries=("reg_orig", "count"),
            total_gain=("gain_vs_no_surgery", "sum"),
            avg_gain=("gain_vs_no_surgery", "mean"),
            avg_jaccard=("jaccard_composite", "mean"),
            avg_score_before=("score_before", "mean"),
            avg_score_after=("score_after", "mean"),
        )
        .reindex(dates, fill_value=0)
    )

    # ── 2.4 Score distribution over time ───────────────────────
    # Wide format: rows=dates, cols=reg_ids
    pivot = scores.pivot_table(index="date", columns="reg_id",
                               values="score", aggfunc="last")
    pivot = pivot.reindex(dates)
    pivot.index = pd.to_datetime(pivot.index)

    # ── 2.5 Mapping churn ──────────────────────────────────────
    # For each date, how many reg_ids are remapped (not using their original code)?
    churn = (mapping.groupby("date")["changed"]
             .sum()
             .reindex(dates, fill_value=0)
             .rename("n_remapped"))

    # ── 2.6 Score entropy (distribution spread) ────────────────
    def entropy(row):
        """Shannon entropy (natural log) of the strictly positive scores."""
        p = row.dropna()
        p = p[p > 0]
        if len(p) == 0:
            return np.nan
        p = p / p.sum()
        return -(p * np.log(p)).sum()

    timeline["entropy"] = pivot.apply(entropy, axis=1).values

    # ── 2.7 Individual score trajectories ──────────────────────
    # Identify which reg_ids were ever remapped
    ever_remapped = set(mapping.loc[mapping["changed"], "reg_orig"].unique())

    # ── 2.8 Surgery detail table ───────────────────────────────
    surgery_detail = surgery.copy()
    surgery_detail["gain_pct_of_score"] = (
        surgery_detail["gain_vs_no_surgery"]
        / surgery_detail["score_before"].clip(lower=1e-9) * 100
    ).round(2)

    return {
        "timeline": timeline,
        "surgery_stats": surgery_stats,
        "pivot": pivot,
        "churn": churn,
        "ever_remapped": ever_remapped,
        "surgery_detail": surgery_detail,
        "dates": [d.strftime("%Y-%m-%d") for d in dates],
    }


# ─────────────────────────────────────────────────────────────
# 3.
# PRINT CONSOLE SUMMARY
# ─────────────────────────────────────────────────────────────

def print_summary(analytics, surgery):
    """Print a plain-text digest of the analytics to stdout.

    Args:
        analytics: dict holding ``timeline``, ``pivot``, ``churn`` and
            ``ever_remapped`` entries.
        surgery: surgery log frame; one table line is printed per row.
    """
    tl = analytics["timeline"]

    print("\n" + "=" * 65)
    print(" CARMIGNAC PIPELINE — RESULTS SUMMARY")
    print("=" * 65)

    print(f"\n Date range : {tl.index.min().date()} → {tl.index.max().date()}")
    print(f" Total months : {len(tl)}")
    print(f" Reg IDs : {analytics['pivot'].shape[1]}")

    print("\n ── Score (Σ) ──────────────────────────────────────────")
    print(f" At t_ref (latest) : {tl['sum_post'].iloc[-1]:.6f}")
    print(f" At t_min (earliest): {tl['sum_post'].iloc[0]:.6f}")
    print(f" Min (post-surgery) : {tl['sum_post'].min():.6f} "
          f"({tl['sum_post'].idxmin().date()})")
    print(f" Min (pre-surgery) : {tl['sum_pre'].min():.6f} "
          f"({tl['sum_pre'].idxmin().date()})")
    print(f" Max recovery (pct) : {tl['recovery_pct'].max():.2f}%")

    print("\n ── Surgeries ─────────────────────────────────────────")
    if len(surgery) == 0:
        print(" No surgeries performed.")
    else:
        print(f" Total operations : {len(surgery)}")
        print(f" Total score gained : {surgery['gain_vs_no_surgery'].sum():.6f}")
        print(f" Avg Jaccard : {surgery['jaccard_composite'].mean():.4f}")
        print(f" Avg gain / surgery : {surgery['gain_vs_no_surgery'].mean():.6f}")
        print()
        print(f" {'Date':12s} {'Reg orig':12s} {'From':15s} {'To':15s} "
              f"{'Jaccard':>8s} {'Gain':>10s}")
        print(" " + "-" * 78)
        for _, row in surgery.sort_values("date").iterrows():
            print(f" {str(row['date'].date()):12s} {row['reg_orig']:12s} "
                  f"{row['reg_from']:15s} {row['reg_to']:15s} "
                  f"{row['jaccard_composite']:8.4f} {row['gain_vs_no_surgery']:10.6f}")

    print("\n ── Mapping churn ─────────────────────────────────────")
    ch = analytics["churn"]
    print(f" Max remapped at one date : {int(ch.max())} ({ch.idxmax().date() if ch.max()>0 else 'N/A'})")
    print(f" Reg IDs ever remapped : {len(analytics['ever_remapped'])}")

    print("\n ── Score entropy (distribution spread) ───────────────")
    ent = analytics["timeline"]["entropy"]
    print(f" Mean entropy : {ent.mean():.4f}")
    print(f" Std entropy : {ent.std():.4f}")
    print()


# ─────────────────────────────────────────────────────────────
# 4. BUILD HTML REPORT
# ─────────────────────────────────────────────────────────────

def build_html(analytics, surgery, scores, mapping,
               df_err_isin=None, df_err_agg=None):
    """Render the report text from the computed analytics.

    Args:
        analytics: dict with ``timeline``, ``surgery_stats``, ``pivot``,
            ``churn``, ``dates``, ``ever_remapped`` and ``surgery_detail``.
        surgery: surgery log frame (one row per operation).
        scores: accepted for interface compatibility; not read here.
        mapping: accepted for interface compatibility; not read here.
        df_err_isin: optional per-ISIN error account (``date``, ``isin``,
            ``residual``, ``stock_error``, ``stock_error_pct``).
        df_err_agg: optional aggregate error account (``date``,
            ``stock_error_agg``, ``residual_agg``, ``stock_error_agg_pct``).
            The error section is emitted only when both frames are given.

    Returns:
        The report as a single string.

    NOTE(review): the template text in this function carries no markup, and
    several values prepared below (chart series, colours, CSS classes, the
    JS block) are not referenced by it — confirm against the canonical
    template before relying on the rendered output.
    """
    tl = analytics["timeline"]
    ss = analytics["surgery_stats"]
    piv = analytics["pivot"]
    ch = analytics["churn"]
    dates_str = analytics["dates"]
    ever_remapped = analytics["ever_remapped"]

    # ── helpers to serialise for JS ─────────────────────────────
    def jf(arr, decimals=6):
        """JSON list of rounded floats; NaN becomes null."""
        return json.dumps([round(float(v), decimals) if not np.isnan(v) else None
                           for v in arr])

    def js(arr):
        """JSON list of the items of *arr*, unchanged."""
        return json.dumps(list(arr))

    def nullable(values, ndigits):
        """Rounded floats with missing values mapped to None."""
        return [round(float(v), ndigits) if not pd.isna(v) else None
                for v in values]

    # ── colour palette ───────────────────────────────────────────
    REG_COLORS = [
        "#2563eb", "#16a34a", "#dc2626", "#d97706", "#7c3aed",
        "#0891b2", "#db2777", "#65a30d", "#ea580c", "#6366f1",
        "#059669", "#b45309", "#9333ea", "#0284c7", "#e11d48",
    ]

    # ── 4.1 Surgery sparkline data ──────────────────────────────
    surg_dates = [d.strftime("%Y-%m-%d") for d in ss.index]
    n_surg = jf(ss["n_surgeries"].values, 0)
    total_gain = jf(ss["total_gain"].values)
    avg_gain = jf(ss["avg_gain"].values)
    avg_jaccard = jf(ss["avg_jaccard"].values)

    # ── 4.2 Individual trajectories ────────────────────────────
    reg_ids = list(piv.columns)
    traj_datasets = []

    # Surgery lookup: reg_orig -> list of {date, from, to, composite}
    surg_by_reg = {}
    for _, row in surgery.iterrows():
        surg_by_reg.setdefault(row["reg_orig"], []).append({
            "date": row["date"].strftime("%Y-%m-%d"),
            "reg_from": str(row["reg_from"]),
            "reg_to": str(row["reg_to"]),
            "composite": round(float(row["jaccard_composite"]), 4),
            "gain": round(float(row["gain_vs_no_surgery"]), 6),
        })

    for idx, rid in enumerate(reg_ids):
        remapped = rid in ever_remapped
        line_color = REG_COLORS[idx % len(REG_COLORS)]
        traj_datasets.append({
            "label": rid,
            "data": [round(float(v), 6) if not np.isnan(v) else None
                     for v in piv[rid].values],
            "borderColor": line_color,
            "backgroundColor": line_color + "22",
            "borderWidth": 2,
            # Remapped accounts are drawn dashed.
            "borderDash": [6, 3] if remapped else [],
            "pointRadius": 0,
            "tension": 0.3,
            "fill": False,
            "remapped": remapped,
            "surgeries": surg_by_reg.get(rid, []),
        })

    traj_json = json.dumps(traj_datasets)

    # ── 4.2b Error account data (optional) ────────────────────
    has_error = df_err_isin is not None and df_err_agg is not None

    if has_error:
        err_dates = [d.strftime("%Y-%m-%d")
                     for d in pd.to_datetime(df_err_agg["date"])]
        err_agg_stock = nullable(df_err_agg["stock_error_agg"].values, 3)
        err_agg_res = nullable(df_err_agg["residual_agg"].values, 3)
        err_agg_pct = nullable(df_err_agg["stock_error_agg_pct"].values, 4)

        # Top 5 ISINs by max |stock error|
        top_err = (df_err_isin.groupby("isin")["stock_error"]
                   .apply(lambda x: x.abs().max())
                   .nlargest(5).index.tolist())
        all_err_dates = sorted(df_err_isin["date"].unique())
        ERR_COLORS = ["#ef4444", "#f59e0b", "#8b5cf6", "#06b6d4", "#10b981"]
        err_isin_ds = []
        for idx, isin in enumerate(top_err):
            sub = (df_err_isin[df_err_isin["isin"] == isin]
                   .set_index("date")["stock_error"]
                   .reindex(all_err_dates))
            err_color = ERR_COLORS[idx % len(ERR_COLORS)]
            err_isin_ds.append({
                "label": isin,
                "data": nullable(sub.values, 3),
                "borderColor": err_color,
                "backgroundColor": err_color + "22",
                "borderWidth": 1.5,
                "pointRadius": 0,
                "tension": 0.3,
                "fill": False,
            })

        max_err_stock = float(df_err_agg["stock_error_agg"].abs().max())
        max_err_pct = float(df_err_agg["stock_error_agg_pct"].max())
        agg_std = float(df_err_agg["stock_error_agg"].std())
        agg_mean = float(df_err_agg["stock_error_agg"].abs().mean())
        # Ratio of std to mean |stock|; the floor avoids dividing by zero.
        stationarity = round(agg_std / max(agg_mean, 1e-9), 3)

        err_dates_js = json.dumps(err_dates)
        err_agg_stock_js = json.dumps(err_agg_stock)
        err_agg_res_js = json.dumps(err_agg_res)
        err_agg_pct_js = json.dumps(err_agg_pct)
        err_isin_ds_js = json.dumps(err_isin_ds)
        err_isin_dates_js = json.dumps(
            [d.strftime("%Y-%m-%d") if hasattr(d, "strftime") else str(d)[:10]
             for d in all_err_dates])

        # ISIN detail table (top 100 worst)
        err_rows = []
        worst = (df_err_isin.assign(abs_s=df_err_isin["stock_error"].abs())
                 .sort_values("abs_s", ascending=False)
                 .head(100))
        for _, r in worst.iterrows():
            ds = (r["date"].strftime("%Y-%m-%d")
                  if hasattr(r["date"], "strftime") else str(r["date"])[:10])
            # NOTE(review): the three style strings below are computed but
            # not referenced by the row text visible here.
            sc = "color:var(--danger)" if r["stock_error"] < 0 else "color:var(--warn)"
            rc = "color:var(--danger)" if r["residual"] < 0 else "color:var(--warn)"
            pch = ("color:var(--danger);font-weight:600" if r["stock_error_pct"] > 5
                   else ("color:var(--warn)" if r["stock_error_pct"] > 1 else ""))
            err_rows.append(
                f'{ds}'
                f'{r["isin"]}'
                f'{r["residual"]:+,.2f}'
                f'{r["stock_error"]:+,.2f}'
                f'{r["stock_error_pct"]:.3f}%'
            )
        err_isin_detail = "".join(err_rows) if err_rows else (
            '✓ Error account is flat'
        )

        # HTML block for error account section
        err_section_html = f"""
06 · Error Account
Aggregate error account stock Stock_error(t_ref) = 0. The stock absorbs unreconciled residuals going backwards. A flat signal near zero = clean data. A drift = structural gap.
Max |error stock|
{max_err_stock:,.1f} shares
Max % of total AUM
{max_err_pct:.3f}%
Stationarity σ/μ
{stationarity:.3f}
lower = more stationary
Monthly aggregate residual ΔQ_total − F_total per month
Error stock — top 5 ISINs Cumulative error stock per ISIN
Error account detail — worst (ISIN, month) pairs
{err_isin_detail}
DateISIN Monthly residual Cumul. stock % of max AUM
"""

        # JS block for error account charts
        err_js_block = f"""
// ── 8. Error account charts ──────────────────────────────────
const ERR_DATES = {err_dates_js};
const ERR_AGG_STOCK = {err_agg_stock_js};
const ERR_AGG_RES = {err_agg_res_js};
const ERR_ISIN_TS = {err_isin_ds_js};
const ERR_ISIN_DATES = {err_isin_dates_js};

new Chart(document.getElementById('chartErrStock'), {{
  type: 'line',
  data: {{
    labels: ERR_DATES,
    datasets: [{{
      label: 'Aggregate error stock',
      data: ERR_AGG_STOCK,
      borderColor: '#ef4444',
      backgroundColor: '#ef444415',
      borderWidth: 2,
      pointRadius: 0,
      tension: 0.3,
      fill: true
    }}]
  }},
  options: {{
    responsive: true,
    maintainAspectRatio: false,
    interaction: {{mode:'index', intersect:false}},
    plugins: {{ legend: {{display:false}}, tooltip: tooltip() }},
    scales: {{
      x: timeAxis(),
      y: {{
        ...yAxis('Shares'),
        grid: {{
          color: ctx => ctx.tick.value === 0 ? '#ffffff55' : '#1a2030',
          lineWidth: ctx => ctx.tick.value === 0 ? 1.5 : 1
        }}
      }}
    }}
  }}
}});

new Chart(document.getElementById('chartErrRes'), {{
  type: 'bar',
  data: {{
    labels: ERR_DATES,
    datasets: [{{
      label: 'Monthly residual',
      data: ERR_AGG_RES,
      backgroundColor: ERR_AGG_RES.map(v => v != null && v < 0 ? '#ef444488' : '#f59e0b88'),
      borderColor: ERR_AGG_RES.map(v => v != null && v < 0 ? '#ef4444' : '#f59e0b'),
      borderWidth: 1,
      borderRadius: 2
    }}]
  }},
  options: {{
    responsive: true,
    maintainAspectRatio: false,
    plugins: {{ legend: {{display:false}}, tooltip: tooltip() }},
    scales: {{ x: timeAxis(), y: yAxis('Shares') }}
  }}
}});

new Chart(document.getElementById('chartErrIsin'), {{
  type: 'line',
  data: {{ labels: ERR_ISIN_DATES, datasets: ERR_ISIN_TS }},
  options: {{
    responsive: true,
    maintainAspectRatio: false,
    interaction: {{mode:'index', intersect:false}},
    plugins: {{
      legend: {{position:'right', labels:{{boxWidth:10, padding:8, font:{{size:10}}}}}},
      tooltip: tooltip()
    }},
    scales: {{ x: timeAxis(), y: yAxis('Error stock (shares)') }}
  }}
}});"""
    else:
        err_section_html = ""
        err_js_block = ""

    # ── 4.3 Surgery detail table rows ──────────────────────────
    sd = analytics["surgery_detail"].sort_values("date")
    if len(sd) == 0:
        surg_rows_html = "No surgeries performed"
    else:
        surg_rows = []
        for _, r in sd.iterrows():
            # NOTE(review): gain_class is not referenced by the row text
            # visible here.
            gain_class = "gain-high" if r["gain_vs_no_surgery"] > 0.05 else "gain-low"
            lb = int(r.get("lookback_months", 1))
            lb_cell = (f'{lb}m' if lb > 1 else "—")
            surg_rows.append(
                f""" {r['date'].date()} {r['reg_orig']} {r['reg_from']} → {r['reg_to']} {r['jaccard_composite']:.4f} +{r['gain_vs_no_surgery']:.6f} {r['gain_pct_of_score']:.1f}% {lb_cell} """
            )
        # Join once instead of growing a string row by row.
        surg_rows_html = "".join(surg_rows)

    # Built outside the main template: a triple-quoted f-string cannot be
    # nested inside another one's replacement field on every Python version,
    # and a one-line string literal cannot span several lines.
    if len(surgery) == 0:
        surgery_table_html = "\nNo surgeries were performed on this dataset.\n"
    else:
        surgery_table_html = (
            f" {surg_rows_html}\n"
            "Date Reg orig Code from Code to Jaccard Score gain % of score Lookback\n"
        )

    # ── 4.4 Top accounts table ──────────────────────────────────
    last_date = piv.index.max()
    top_accounts = piv.loc[last_date].dropna().sort_values(ascending=False)
    top_score = top_accounts.max() if len(top_accounts) else 0.0
    top_rows = []
    for rank, (rid, sc) in enumerate(top_accounts.items(), 1):
        remapped = "✓" if rid in ever_remapped else ""
        # An all-zero ranking would otherwise produce int(nan) -> ValueError.
        bar_w = int(sc / top_score * 100) if top_score else 0
        color = REG_COLORS[(rank - 1) % len(REG_COLORS)]
        top_rows.append(f" #{rank} {rid} {sc:.6f}\n{remapped} ")
    top_rows_html = "".join(top_rows)

    # ─────────────────────────────────────────────────────────────
    # HTML TEMPLATE
    # ─────────────────────────────────────────────────────────────
    html = f""" Carmignac Pipeline — Analysis Report
Carmignac × ENSAE · Data Challenge 2025

Pipeline Results — Analysis Report

Registrar ID repair · Score propagation · Surgery audit
Σ score at t_ref {tl['sum_post'].iloc[-1]:.4f} post-surgery
Σ score at t_min {tl['sum_post'].iloc[0]:.4f} post-surgery
Max recovery {tl['recovery_pct'].max():.1f}% score rescued by surgery
Total surgeries {len(surgery)} operations performed
Reg IDs universe {piv.shape[1]} at reference date
Ever remapped {len(analytics['ever_remapped'])} reg IDs w/ code change
01 · Score Integrity Over Time
Sum of scores — pre vs post surgery Post-surgery (solid) shows the corrected score after code repairs. Pre-surgery (dashed) is the counterfactual without any remapping. Gap = score rescued.
Score recovered by surgery Difference post − pre at each date
Portfolio concentration (entropy) Shannon entropy of score distribution — higher = more spread
02 · Individual Score Trajectories
Score explorer — per Registrar Account Click an account to inspect its full history. ◆ remapped = surgery was applied.
03 · Surgery Operations
Surgeries per time step Number of code remappings performed at each month
Score gain per surgery Average gain in Σ score from surgery at each month
Jaccard similarity of surgery matches Composite Jaccard score of the matched code pair — closer to 1.0 = stronger portfolio overlap. Low values may indicate uncertain matches.
04 · Surgery Detail Log
All surgery operations
{surgery_table_html}
05 · Score Ranking at t_ref
Accounts ranked by weight at reference date ✓ in last column = account was remapped at some point in history
{top_rows_html}
Rank Registrar ID Score (weight) Relative size Remapped
{err_section_html}
"""
    return html


# ─────────────────────────────────────────────────────────────
# 5. MAIN
# ─────────────────────────────────────────────────────────────

def main():
    """Parse CLI arguments, run the analysis and write the report file."""
    parser = argparse.ArgumentParser(description="Carmignac pipeline results analyser")
    parser.add_argument("--scores", default="repair_results/carmignac_scores.csv")
    parser.add_argument("--mapping", default="repair_results/carmignac_mapping.csv")
    parser.add_argument("--surgery", default="repair_results/carmignac_surgery_log.csv")
    parser.add_argument("--out", default="repair_results/carmignac_report.html")
    parser.add_argument("--error-account-isin", default=None, dest="error_isin",
                        help="Path to carmignac_error_account.csv (optional)")
    parser.add_argument("--error-account-agg", default=None, dest="error_agg",
                        help="Path to carmignac_error_account_agg.csv (optional)")
    args = parser.parse_args()

    # Resolve paths relative to this script's directory if files not found
    base = os.path.dirname(os.path.abspath(__file__))

    def resolve(p, required=True):
        """Return *p*, or *p* under the script directory, whichever exists.

        A missing required file exits the program; a missing optional file
        prints a warning and yields None.
        """
        if p is None:
            return None
        if os.path.exists(p):
            return p
        alt = os.path.join(base, p)
        if os.path.exists(alt):
            return alt
        if required:
            sys.exit(f"[ERROR] File not found: {p}")
        print(f"[WARN] Optional file not found: {p}")
        return None

    scores_path = resolve(args.scores)
    mapping_path = resolve(args.mapping)
    surgery_path = resolve(args.surgery)
    error_isin_path = resolve(args.error_isin, required=False)
    error_agg_path = resolve(args.error_agg, required=False)

    print(f"[Load] scores : {scores_path}")
    print(f"[Load] mapping : {mapping_path}")
    print(f"[Load] surgery : {surgery_path}")

    scores, mapping, surgery, df_err_isin, df_err_agg = load_outputs(
        scores_path, mapping_path, surgery_path,
        err_isin_path=error_isin_path,
        err_agg_path=error_agg_path
    )
    analytics = compute_analytics(scores, mapping, surgery)

    print_summary(analytics, surgery)

    html = build_html(analytics, surgery, scores, mapping,
                      df_err_isin=df_err_isin, df_err_agg=df_err_agg)

    out_path = args.out
    # Inputs may have been found next to the script, so the output folder
    # is not guaranteed to exist relative to the working directory.
    os.makedirs(os.path.dirname(os.path.abspath(out_path)), exist_ok=True)
    with open(out_path, "w", encoding="utf-8") as f:
        f.write(html)

    print(f"\n[Report] Written to → {out_path}")


if __name__ == "__main__":
    main()