"""
Carmignac Data Challenge — Pipeline Results Analysis
=====================================================
Analyses the CSV outputs produced by carmignac_repair.py:
  - carmignac_scores.csv      (post-surgery score history)
  - carmignac_mapping.csv     (reg_id mapping history)
  - carmignac_surgery_log.csv (surgery operations)

Produces a self-contained HTML report with interactive charts.

Usage:
    python carmignac_analysis.py
    python carmignac_analysis.py --scores path/to/scores.csv \
                                 --mapping path/to/mapping.csv \
                                 --surgery path/to/surgery_log.csv \
                                 --out report.html
"""

import argparse
import json
import os
import sys

import numpy as np
import pandas as pd


# ─────────────────────────────────────────────────────────────
# 1. LOAD & VALIDATE
# ─────────────────────────────────────────────────────────────

# Spellings accepted as "true" when the ``changed`` flag arrives as text.
_TRUE_TOKENS = frozenset({"true", "t", "yes", "y", "1", "1.0"})


def _read_log(path, id_columns):
    """Read one pipeline CSV, parsing ``date`` and keeping ID columns textual."""
    # Forcing ``str`` at read time keeps identifiers such as "007" intact;
    # letting pandas infer an integer dtype first would strip leading zeros.
    frame = pd.read_csv(
        path,
        parse_dates=["date"],
        dtype={column: str for column in id_columns},
    )
    for column in id_columns:
        # Kept from the original: raises KeyError when the column is absent
        # and normalises the column to strings.
        frame[column] = frame[column].astype(str)
    return frame


def _as_bool(flags):
    """Coerce a flag column to ``bool``, treating missing values as False."""
    if pd.api.types.is_bool_dtype(flags):
        return flags.astype(bool)
    if pd.api.types.is_numeric_dtype(flags):
        # NaN is truthy under a plain astype(bool); a missing flag must not
        # count as a remap.
        return flags.fillna(0).astype(bool)
    # Object/string column: bool("False") and bool(nan) are both True, so
    # compare against explicit true-tokens instead.
    return flags.map(
        lambda value: str(value).strip().lower() in _TRUE_TOKENS
    ).astype(bool)


def load_outputs(scores_path, mapping_path, surgery_path):
    """Load the three CSV outputs of the repair pipeline.

    Args:
        scores_path: CSV with at least ``date`` and ``reg_id`` columns.
        mapping_path: CSV with at least ``date``, ``reg_orig``, ``reg_used``
            and ``changed`` columns.
        surgery_path: CSV with at least ``date``, ``reg_orig``, ``reg_from``
            and ``reg_to`` columns.

    Returns:
        Tuple ``(scores, mapping, surgery)`` of DataFrames in which ``date``
        is parsed as a datetime, every identifier column holds strings and
        ``mapping["changed"]`` is boolean.

    Raises:
        KeyError: if one of the identifier columns or ``changed`` is missing.
    """
    scores = _read_log(scores_path, ["reg_id"])
    mapping = _read_log(mapping_path, ["reg_orig", "reg_used"])
    surgery = _read_log(surgery_path, ["reg_orig", "reg_from", "reg_to"])

    mapping["changed"] = _as_bool(mapping["changed"])
    return scores, mapping, surgery


# ─────────────────────────────────────────────────────────────
# 2.
# COMPUTE ANALYTICS
# ─────────────────────────────────────────────────────────────
def compute_analytics(scores, mapping, surgery):
    """Derive every table the console summary and the HTML report consume.

    Args:
        scores: DataFrame with ``date``, ``reg_id`` and ``score`` columns.
        mapping: DataFrame with ``date``, ``reg_orig`` and a boolean
            ``changed`` column.
        surgery: DataFrame with ``date``, ``reg_orig``, ``gain_vs_no_surgery``,
            ``jaccard_composite``, ``score_before`` and ``score_after``
            columns. May be empty.

    Returns:
        dict with keys:
            ``timeline``       DataFrame indexed by date with ``sum_post``,
                               ``sum_pre``, ``recovery_pct`` and ``entropy``.
            ``surgery_stats``  per-date surgery aggregates; count and total
                               are 0 on dates without surgery, averages NaN.
            ``pivot``          wide score table (rows = dates, cols = reg_ids).
            ``churn``          Series ``n_remapped`` per date.
            ``ever_remapped``  set of ``reg_orig`` values with any change.
            ``surgery_detail`` copy of ``surgery`` plus ``gain_pct_of_score``.
            ``dates``          list of ``YYYY-MM-DD`` strings.
    """
    # A DatetimeIndex guarantees pandas Timestamps regardless of what
    # ``Series.unique()`` returns for datetime data, and dropping NaT keeps
    # the later ``strftime`` call safe.
    dates = pd.DatetimeIndex(scores["date"].dropna().unique()).sort_values()

    # ── 2.1 Sum of scores per date (post-surgery) ──────────────
    sum_post = (
        scores.groupby("date")["score"]
        .sum()
        .reindex(dates)
        .rename("sum_post")
    )

    # ── 2.2 Reconstruct pre-surgery (counterfactual) ───────────
    # Without surgery, every reg_id that had a hard break would score 0
    # from that date backwards. We propagate the surgery "gain" as a
    # cumulative deficit going back in time: the deficit at date t is the
    # sum of gains of all surgeries dated at or after t.
    if surgery.empty:
        cumulative_deficit = pd.Series(0.0, index=dates)
    else:
        gain_by_date = surgery.groupby("date")["gain_vs_no_surgery"].sum()
        cumulative_deficit = pd.Series(
            [gain_by_date[gain_by_date.index >= d].sum() for d in dates],
            index=dates,
            dtype=float,
        )

    sum_pre = (sum_post - cumulative_deficit).clip(lower=0).rename("sum_pre")

    timeline = pd.DataFrame({"sum_post": sum_post, "sum_pre": sum_pre})
    timeline["recovery_pct"] = np.where(
        sum_pre < sum_post,
        (sum_post - sum_pre) / sum_post.clip(lower=1e-9) * 100,
        0.0,
    )

    # ── 2.3 Per-date surgery stats ─────────────────────────────
    stat_columns = [
        "n_surgeries",
        "total_gain",
        "avg_gain",
        "avg_jaccard",
        "avg_score_before",
        "avg_score_after",
    ]
    if surgery.empty:
        surgery_stats = pd.DataFrame(np.nan, index=dates, columns=stat_columns)
    else:
        surgery_stats = (
            surgery.groupby("date")
            .agg(
                n_surgeries=("reg_orig", "count"),
                total_gain=("gain_vs_no_surgery", "sum"),
                avg_gain=("gain_vs_no_surgery", "mean"),
                avg_jaccard=("jaccard_composite", "mean"),
                avg_score_before=("score_before", "mean"),
                avg_score_after=("score_after", "mean"),
            )
            .reindex(dates)
        )
    # Only the count and the total have a meaningful zero. The averages stay
    # NaN on dates without surgery so they are not mistaken for real
    # zero-valued measurements.
    surgery_stats["n_surgeries"] = (
        surgery_stats["n_surgeries"].fillna(0).astype(int)
    )
    surgery_stats["total_gain"] = surgery_stats["total_gain"].fillna(0.0)

    # ── 2.4 Score distribution over time ───────────────────────
    # Wide format: rows=dates, cols=reg_ids
    pivot = scores.pivot_table(
        index="date", columns="reg_id", values="score", aggfunc="last"
    ).reindex(dates)

    # ── 2.5 Mapping churn ──────────────────────────────────────
    # For each date, how many reg_ids are remapped (not using their
    # original code)?
    churn = (
        mapping.groupby("date")["changed"]
        .sum()
        .reindex(dates, fill_value=0)
        .rename("n_remapped")
    )

    # ── 2.6 Score entropy (distribution spread) ────────────────
    def entropy(row):
        """Shannon entropy (natural log) of the positive scores in a row."""
        weights = row.dropna()
        weights = weights[weights > 0]
        if len(weights) == 0:
            return np.nan
        weights = weights / weights.sum()
        return -(weights * np.log(weights)).sum()

    timeline["entropy"] = pivot.apply(entropy, axis=1).to_numpy()

    # ── 2.7 Individual score trajectories ──────────────────────
    # Identify which reg_ids were ever remapped
    ever_remapped = set(mapping.loc[mapping["changed"], "reg_orig"].unique())

    # ── 2.8 Surgery detail table ───────────────────────────────
    surgery_detail = surgery.copy()
    surgery_detail["gain_pct_of_score"] = (
        surgery_detail["gain_vs_no_surgery"]
        / surgery_detail["score_before"].clip(lower=1e-9)
        * 100
    ).round(2)

    return {
        "timeline": timeline,
        "surgery_stats": surgery_stats,
        "pivot": pivot,
        "churn": churn,
        "ever_remapped": ever_remapped,
        "surgery_detail": surgery_detail,
        "dates": dates.strftime("%Y-%m-%d").tolist(),
    }


# ─────────────────────────────────────────────────────────────
# 3.
# PRINT CONSOLE SUMMARY
# ─────────────────────────────────────────────────────────────
def print_summary(analytics, surgery):
    """Print a plain-text digest of the analytics to stdout.

    Args:
        analytics: dict providing ``timeline`` (date-indexed DataFrame with
            ``sum_post``, ``sum_pre``, ``recovery_pct``, ``entropy``),
            ``pivot``, ``churn`` (date-indexed Series) and ``ever_remapped``.
        surgery: surgery log DataFrame; when non-empty it must carry
            ``date``, ``reg_orig``, ``reg_from``, ``reg_to``,
            ``jaccard_composite`` and ``gain_vs_no_surgery``.
    """
    tl = analytics["timeline"]
    post = tl["sum_post"]
    pre = tl["sum_pre"]

    print("\n" + "=" * 65)
    print(" CARMIGNAC PIPELINE — RESULTS SUMMARY")
    print("=" * 65)

    print(f"\n Date range : {tl.index.min().date()} → {tl.index.max().date()}")
    print(f" Total months : {len(tl)}")
    print(f" Reg IDs : {analytics['pivot'].shape[1]}")

    print("\n ── Score (Σ) ──────────────────────────────────────────")
    print(f" At t_ref (latest) : {post.iloc[-1]:.6f}")
    print(f" At t_min (earliest): {post.iloc[0]:.6f}")
    print(f" Min (post-surgery) : {post.min():.6f} "
          f"({post.idxmin().date()})")
    print(f" Min (pre-surgery) : {pre.min():.6f} "
          f"({pre.idxmin().date()})")
    print(f" Max recovery (pct) : {tl['recovery_pct'].max():.2f}%")

    print("\n ── Surgeries ─────────────────────────────────────────")
    if surgery.empty:
        print(" No surgeries performed.")
    else:
        gains = surgery["gain_vs_no_surgery"]
        print(f" Total operations : {len(surgery)}")
        print(f" Total score gained : {gains.sum():.6f}")
        print(f" Avg Jaccard : {surgery['jaccard_composite'].mean():.4f}")
        print(f" Avg gain / surgery : {gains.mean():.6f}")
        print()
        print(f" {'Date':12s} {'Reg orig':12s} {'From':15s} {'To':15s} "
              f"{'Jaccard':>8s} {'Gain':>10s}")
        print(" " + "-" * 78)
        for _, row in surgery.sort_values("date").iterrows():
            print(f" {str(row['date'].date()):12s} {row['reg_orig']:12s} "
                  f"{row['reg_from']:15s} {row['reg_to']:15s} "
                  f"{row['jaccard_composite']:8.4f} {row['gain_vs_no_surgery']:10.6f}")

    print("\n ── Mapping churn ─────────────────────────────────────")
    ch = analytics["churn"]
    peak = ch.max()
    # idxmax on an all-zero series would name an arbitrary first date.
    peak_date = ch.idxmax().date() if peak > 0 else 'N/A'
    print(f" Max remapped at one date : {int(peak)} ({peak_date})")
    print(f" Reg IDs ever remapped : {len(analytics['ever_remapped'])}")

    print("\n ── Score entropy (distribution spread) ───────────────")
    ent = tl["entropy"]
    print(f" Mean entropy : {ent.mean():.4f}")
    print(f" Std entropy : {ent.std():.4f}")
    print()


# ─────────────────────────────────────────────────────────────
# 4. BUILD HTML REPORT
# ─────────────────────────────────────────────────────────────
def build_html(analytics, surgery, scores, mapping):
    """Render the report text from the precomputed analytics.

    Args:
        analytics: dict as consumed below (``timeline``, ``surgery_stats``,
            ``pivot``, ``churn``, ``dates``, ``ever_remapped``,
            ``surgery_detail``).
        surgery: surgery log DataFrame; only its length is used here.
        scores: accepted for interface compatibility; not read.
        mapping: accepted for interface compatibility; not read.

    Returns:
        The report as a single string.
    """
    tl = analytics["timeline"]
    ss = analytics["surgery_stats"]
    piv = analytics["pivot"]
    ever_remapped = analytics["ever_remapped"]

    # NOTE(review): several values computed below (ch, dates_str, js, the
    # surgery series, traj_json, gain_class, bar_w, color) are not referenced
    # by the template text visible in this file. They are presumably consumed
    # by chart/table markup that is not present here — confirm before
    # removing any of them.
    ch = analytics["churn"]
    dates_str = analytics["dates"]

    # ── helpers to serialise for JS ─────────────────────────────
    def jf(arr, decimals=6):
        """JSON list of rounded floats, with NaN mapped to null."""
        return json.dumps([
            round(float(v), decimals) if not np.isnan(v) else None
            for v in arr
        ])

    def js(arr):
        """JSON list of the raw items."""
        return json.dumps(list(arr))

    # ── colour palette ───────────────────────────────────────────
    REG_COLORS = [
        "#2563eb", "#16a34a", "#dc2626", "#d97706", "#7c3aed",
        "#0891b2", "#db2777", "#65a30d", "#ea580c", "#6366f1",
        "#059669", "#b45309", "#9333ea", "#0284c7", "#e11d48",
    ]

    # ── 4.1 Surgery sparkline data ──────────────────────────────
    surg_dates = [d.strftime("%Y-%m-%d") for d in ss.index]
    n_surg = jf(ss["n_surgeries"].values, 0)
    total_gain = jf(ss["total_gain"].values)
    avg_gain = jf(ss["avg_gain"].values)
    avg_jaccard = jf(ss["avg_jaccard"].values)

    # ── 4.2 Individual trajectories ────────────────────────────
    traj_datasets = []
    for idx, rid in enumerate(piv.columns):
        dashed = rid in ever_remapped
        shade = REG_COLORS[idx % len(REG_COLORS)]
        traj_datasets.append({
            "label": rid,
            "data": [round(float(v), 6) if not np.isnan(v) else None
                     for v in piv[rid].values],
            "borderColor": shade,
            "backgroundColor": shade + "22",
            # The original chose between two identical widths; only the
            # dash pattern distinguishes remapped accounts.
            "borderWidth": 2,
            "borderDash": [6, 3] if dashed else [],
            "pointRadius": 0,
            "tension": 0.3,
            "fill": False,
        })
    traj_json = json.dumps(traj_datasets)

    # ── 4.3 Surgery detail table rows ──────────────────────────
    sd = analytics["surgery_detail"].sort_values("date")
    if len(sd) == 0:
        surg_rows_html = "No surgeries performed"
    else:
        surgery_rows = []
        for _, r in sd.iterrows():
            gain_class = "gain-high" if r["gain_vs_no_surgery"] > 0.05 else "gain-low"
            surgery_rows.append(
                f""" {r['date'].date()} {r['reg_orig']} {r['reg_from']} → {r['reg_to']} {r['jaccard_composite']:.4f} +{r['gain_vs_no_surgery']:.6f} {r['gain_pct_of_score']:.1f}% """
            )
        surg_rows_html = "".join(surgery_rows)

    # ── 4.4 Top accounts table ──────────────────────────────────
    last_date = piv.index.max()
    top_accounts = piv.loc[last_date].dropna().sort_values(ascending=False)
    top_score = top_accounts.max()
    account_rows = []
    for rank, (rid, sc) in enumerate(top_accounts.items(), 1):
        remapped = "✓" if rid in ever_remapped else ""
        # When the largest score is 0 the ratio is undefined and int() of it
        # raises, so fall back to an empty bar.
        bar_w = int(sc / top_score * 100) if top_score != 0 else 0
        color = REG_COLORS[(rank - 1) % len(REG_COLORS)]
        account_rows.append(f" #{rank} {rid} {sc:.6f}\n{remapped} ")
    top_rows_html = "".join(account_rows)

    # Built outside the template: nesting a triple-quoted f-string inside
    # another one is a SyntaxError before Python 3.12.
    if len(surgery) == 0:
        surgery_section = "\nNo surgeries were performed on this dataset.\n"
    else:
        surgery_section = (
            f" {surg_rows_html}\n"
            "Date Reg orig Code from Code to Jaccard Score gain % of score\n"
        )

    # ─────────────────────────────────────────────────────────────
    # HTML TEMPLATE
    # ─────────────────────────────────────────────────────────────
    html = f""" Carmignac Pipeline — Analysis Report
Carmignac × ENSAE · Data Challenge 2025

Pipeline Results — Analysis Report

Registrar ID repair · Score propagation · Surgery audit
Σ score at t_ref {tl['sum_post'].iloc[-1]:.4f} post-surgery
Σ score at t_min {tl['sum_post'].iloc[0]:.4f} post-surgery
Max recovery {tl['recovery_pct'].max():.1f}% score rescued by surgery
Total surgeries {len(surgery)} operations performed
Reg IDs universe {piv.shape[1]} at reference date
Ever remapped {len(analytics['ever_remapped'])} reg IDs w/ code change
01 · Score Integrity Over Time
Sum of scores — pre vs post surgery Post-surgery (solid) shows the corrected score after code repairs. Pre-surgery (dashed) is the counterfactual without any remapping. Gap = score rescued.
Score recovered by surgery Difference post − pre at each date
Portfolio concentration (entropy) Shannon entropy of score distribution — higher = more spread
02 · Individual Score Trajectories
Score per Registrar Account — full history Dashed lines = accounts that were remapped at some point (surgery applied). Solid lines = stable codes throughout.
03 · Surgery Operations
Surgeries per time step Number of code remappings performed at each month
Score gain per surgery Average gain in Σ score from surgery at each month
Jaccard similarity of surgery matches Composite Jaccard score of the matched code pair — closer to 1.0 = stronger portfolio overlap. Low values may indicate uncertain matches.
04 · Surgery Detail Log
All surgery operations
{surgery_section}
05 · Score Ranking at t_ref
Accounts ranked by weight at reference date ✓ in last column = account was remapped at some point in history
{top_rows_html}
Rank Registrar ID Score (weight) Relative size Remapped
"""
    return html


# ─────────────────────────────────────────────────────────────
# 5. MAIN
# ─────────────────────────────────────────────────────────────
def main():
    """Command-line entry point: load the CSVs, print the summary, write the report.

    Exits via ``sys.exit`` with an error message when an input file is found
    neither at the given path nor relative to this script's directory.
    """
    parser = argparse.ArgumentParser(description="Carmignac pipeline results analyser")
    parser.add_argument("--scores", default="repair_results/carmignac_scores.csv")
    parser.add_argument("--mapping", default="repair_results/carmignac_mapping.csv")
    parser.add_argument("--surgery", default="repair_results/carmignac_surgery_log.csv")
    parser.add_argument("--out", default="repair_results/carmignac_report.html")
    args = parser.parse_args()

    # Resolve paths relative to this script's directory if files not found
    base = os.path.dirname(os.path.abspath(__file__))

    def resolve(p):
        if os.path.exists(p):
            return p
        alt = os.path.join(base, p)
        if os.path.exists(alt):
            return alt
        sys.exit(f"[ERROR] File not found: {p}")

    scores_path = resolve(args.scores)
    mapping_path = resolve(args.mapping)
    surgery_path = resolve(args.surgery)

    print(f"[Load] scores : {scores_path}")
    print(f"[Load] mapping : {mapping_path}")
    print(f"[Load] surgery : {surgery_path}")

    scores, mapping, surgery = load_outputs(scores_path, mapping_path, surgery_path)
    analytics = compute_analytics(scores, mapping, surgery)
    print_summary(analytics, surgery)
    html = build_html(analytics, surgery, scores, mapping)

    out_path = args.out
    # The inputs may have been found next to the script rather than in the
    # working directory, in which case the default output folder need not
    # exist here; create it so the finished report is not lost.
    out_dir = os.path.dirname(out_path)
    if out_dir:
        os.makedirs(out_dir, exist_ok=True)
    with open(out_path, "w", encoding="utf-8") as f:
        f.write(html)
    print(f"\n[Report] Written to → {out_path}")


if __name__ == "__main__":
    main()