"""
Carmignac Data Challenge — Pipeline Results Analysis
=====================================================
Analyses the CSV outputs produced by carmignac_repair.py:
- carmignac_scores.csv (post-surgery score history)
- carmignac_mapping.csv (reg_id mapping history)
- carmignac_surgery_log.csv (surgery operations)
Produces a self-contained HTML report with interactive charts.
Usage:
python carmignac_analysis.py
python carmignac_analysis.py --scores path/to/scores.csv \
--mapping path/to/mapping.csv \
--surgery path/to/surgery_log.csv \
--out report.html
"""
import argparse
import json
import os
import sys
import numpy as np
import pandas as pd
# ─────────────────────────────────────────────────────────────
# 1. LOAD & VALIDATE
# ─────────────────────────────────────────────────────────────
def load_outputs(scores_path, mapping_path, surgery_path):
    """Load the three pipeline CSV outputs and normalise their column dtypes.

    Args:
        scores_path: CSV with at least ``date`` and ``reg_id`` columns.
        mapping_path: CSV with at least ``date``, ``reg_orig``, ``reg_used``
            and ``changed`` columns.
        surgery_path: CSV with at least ``date``, ``reg_orig``, ``reg_from``
            and ``reg_to`` columns.

    Returns:
        Tuple ``(scores, mapping, surgery)`` of DataFrames. ``date`` is parsed
        as a date column, identifier columns are strings and
        ``mapping["changed"]`` is boolean.
    """
    # Identifier columns are read as text instead of being type-inferred and
    # stringified afterwards. Inference would turn "007" into 7 (-> "7") and
    # would upcast an integer-looking column containing one missing value to
    # float (-> "12.0"), so the same ID could differ between the three files.
    score_ids = ["reg_id"]
    mapping_ids = ["reg_orig", "reg_used"]
    surgery_ids = ["reg_orig", "reg_from", "reg_to"]

    scores = pd.read_csv(scores_path, parse_dates=["date"],
                         dtype={col: str for col in score_ids})
    mapping = pd.read_csv(mapping_path, parse_dates=["date"],
                          dtype={col: str for col in mapping_ids})
    surgery = pd.read_csv(surgery_path, parse_dates=["date"],
                          dtype={col: str for col in surgery_ids})

    # Keep the explicit cast so every ID cell ends up going through the same
    # string conversion the downstream formatting code relies on.
    for frame, id_columns in ((scores, score_ids),
                              (mapping, mapping_ids),
                              (surgery, surgery_ids)):
        for col in id_columns:
            frame[col] = frame[col].astype(str)

    mapping["changed"] = mapping["changed"].astype(bool)
    return scores, mapping, surgery
# ─────────────────────────────────────────────────────────────
# 2. COMPUTE ANALYTICS
# ─────────────────────────────────────────────────────────────
def compute_analytics(scores, mapping, surgery):
    """Derive every table and series consumed by the console summary and report.

    Args:
        scores: DataFrame with ``date``, ``reg_id`` and ``score`` columns.
        mapping: DataFrame with ``date``, ``reg_orig`` and boolean ``changed``.
        surgery: DataFrame with ``date``, ``reg_orig``, ``gain_vs_no_surgery``,
            ``jaccard_composite``, ``score_before`` and ``score_after``.

    Returns:
        dict with keys:
            ``timeline``       per-date ``sum_post``, ``sum_pre``,
                               ``recovery_pct`` and ``entropy``;
            ``surgery_stats``  per-date surgery aggregates (0 where none);
            ``pivot``          dates x reg_id score matrix;
            ``churn``          per-date count of rows flagged ``changed``;
            ``ever_remapped``  set of ``reg_orig`` flagged ``changed`` at least once;
            ``surgery_detail`` copy of ``surgery`` plus ``gain_pct_of_score``;
            ``dates``          the date axis as ``YYYY-MM-DD`` strings.
    """
    # Depending on the pandas version, Series.unique() on a datetime column can
    # yield numpy.datetime64 scalars, which have no .strftime(). Normalising to
    # a sorted DatetimeIndex guarantees Timestamp semantics for everything
    # below. Missing dates are dropped: groupby ignores them anyway and they
    # cannot be formatted.
    dates = pd.DatetimeIndex(scores["date"].dropna().unique()).sort_values()

    # ── 2.1 Sum of scores per date (post-surgery) ──────────────
    sum_post = (scores.groupby("date")["score"]
                .sum()
                .reindex(dates)
                .rename("sum_post"))

    # ── 2.2 Reconstruct pre-surgery (counterfactual) ───────────
    # The surgery "gain" is propagated backwards in time: the deficit at date t
    # is the sum of the gains of all surgeries dated at or after t.
    gain_by_date = surgery.groupby("date")["gain_vs_no_surgery"].sum()
    cumulative_deficit = pd.Series(
        [gain_by_date[gain_by_date.index >= d].sum() for d in dates],
        index=dates,
        dtype=float,
    )
    sum_pre = (sum_post - cumulative_deficit).clip(lower=0).rename("sum_pre")

    timeline = pd.DataFrame({"sum_post": sum_post, "sum_pre": sum_pre})
    timeline.index = pd.to_datetime(timeline.index)
    # Share of the post-surgery total attributable to surgery; the denominator
    # is floored at 1e-9 to avoid dividing by zero.
    timeline["recovery_pct"] = np.where(
        sum_pre < sum_post,
        (sum_post - sum_pre) / sum_post.clip(lower=1e-9) * 100,
        0.0,
    )

    # ── 2.3 Per-date surgery stats ─────────────────────────────
    surgery_stats = (
        surgery.groupby("date")
        .agg(
            n_surgeries=("reg_orig", "count"),
            total_gain=("gain_vs_no_surgery", "sum"),
            avg_gain=("gain_vs_no_surgery", "mean"),
            avg_jaccard=("jaccard_composite", "mean"),
            avg_score_before=("score_before", "mean"),
            avg_score_after=("score_after", "mean"),
        )
        # Dates without any surgery get 0 in every column, averages included.
        .reindex(dates, fill_value=0)
    )

    # ── 2.4 Score distribution over time ───────────────────────
    # Wide format: rows=dates, cols=reg_ids; duplicates keep the last value.
    pivot = scores.pivot_table(index="date", columns="reg_id",
                               values="score", aggfunc="last")
    pivot = pivot.reindex(dates)
    pivot.index = pd.to_datetime(pivot.index)

    # ── 2.5 Mapping churn ──────────────────────────────────────
    # Number of mapping rows flagged ``changed`` on each date.
    churn = (mapping.groupby("date")["changed"]
             .sum()
             .reindex(dates, fill_value=0)
             .rename("n_remapped"))

    # ── 2.6 Score entropy (distribution spread) ────────────────
    def entropy(row):
        """Shannon entropy (natural log) of the positive scores in *row*."""
        p = row.dropna()
        p = p[p > 0]
        if len(p) == 0:
            return np.nan
        p = p / p.sum()
        return -(p * np.log(p)).sum()

    # Assigned positionally: pivot and timeline share the same date order.
    timeline["entropy"] = pivot.apply(entropy, axis=1).values

    # ── 2.7 Reg IDs that were remapped at least once ───────────
    ever_remapped = set(mapping.loc[mapping["changed"], "reg_orig"].unique())

    # ── 2.8 Surgery detail table ───────────────────────────────
    surgery_detail = surgery.copy()
    # NOTE(review): with score_before == 0 the 1e-9 floor produces very large
    # percentages — confirm that is the intended presentation.
    surgery_detail["gain_pct_of_score"] = (
        surgery_detail["gain_vs_no_surgery"]
        / surgery_detail["score_before"].clip(lower=1e-9) * 100
    ).round(2)

    return {
        "timeline": timeline,
        "surgery_stats": surgery_stats,
        "pivot": pivot,
        "churn": churn,
        "ever_remapped": ever_remapped,
        "surgery_detail": surgery_detail,
        "dates": dates.strftime("%Y-%m-%d").tolist(),
    }
# ─────────────────────────────────────────────────────────────
# 3. PRINT CONSOLE SUMMARY
# ─────────────────────────────────────────────────────────────
def print_summary(analytics, surgery):
    """Print a plain-text digest of the analytics to stdout.

    Args:
        analytics: dict as returned by ``compute_analytics``; the keys read
            here are ``timeline``, ``pivot``, ``churn`` and ``ever_remapped``.
        surgery: surgery log DataFrame; one table line is printed per row,
            in date order.
    """
    tl = analytics["timeline"]
    sum_post = tl["sum_post"]
    sum_pre = tl["sum_pre"]

    print("\n" + "=" * 65)
    print(" CARMIGNAC PIPELINE — RESULTS SUMMARY")
    print("=" * 65)
    print(f"\n Date range : {tl.index.min().date()} → {tl.index.max().date()}")
    print(f" Total months : {len(tl)}")
    print(f" Reg IDs : {analytics['pivot'].shape[1]}")

    print("\n ── Score (Σ) ──────────────────────────────────────────")
    # The timeline is ordered by date, so the last row is the reference date.
    print(f" At t_ref (latest) : {sum_post.iloc[-1]:.6f}")
    print(f" At t_min (earliest): {sum_post.iloc[0]:.6f}")
    print(f" Min (post-surgery) : {sum_post.min():.6f} "
          f"({sum_post.idxmin().date()})")
    print(f" Min (pre-surgery) : {sum_pre.min():.6f} "
          f"({sum_pre.idxmin().date()})")
    print(f" Max recovery (pct) : {tl['recovery_pct'].max():.2f}%")

    print("\n ── Surgeries ─────────────────────────────────────────")
    if surgery.empty:
        print(" No surgeries performed.")
    else:
        gains = surgery["gain_vs_no_surgery"]
        print(f" Total operations : {len(surgery)}")
        print(f" Total score gained : {gains.sum():.6f}")
        print(f" Avg Jaccard : {surgery['jaccard_composite'].mean():.4f}")
        print(f" Avg gain / surgery : {gains.mean():.6f}")
        print()
        print(f" {'Date':12s} {'Reg orig':12s} {'From':15s} {'To':15s} "
              f"{'Jaccard':>8s} {'Gain':>10s}")
        print(" " + "-" * 78)
        for _, row in surgery.sort_values("date").iterrows():
            print(f" {str(row['date'].date()):12s} {row['reg_orig']:12s} "
                  f"{row['reg_from']:15s} {row['reg_to']:15s} "
                  f"{row['jaccard_composite']:8.4f} {row['gain_vs_no_surgery']:10.6f}")

    print("\n ── Mapping churn ─────────────────────────────────────")
    ch = analytics["churn"]
    max_remapped = ch.max()
    # Only name a peak date when at least one reg ID was actually remapped.
    peak_date = ch.idxmax().date() if max_remapped > 0 else "N/A"
    print(f" Max remapped at one date : {int(max_remapped)} ({peak_date})")
    print(f" Reg IDs ever remapped : {len(analytics['ever_remapped'])}")

    print("\n ── Score entropy (distribution spread) ───────────────")
    ent = tl["entropy"]
    print(f" Mean entropy : {ent.mean():.4f}")
    print(f" Std entropy : {ent.std():.4f}")
    print()
# ─────────────────────────────────────────────────────────────
# 4. BUILD HTML REPORT
# ─────────────────────────────────────────────────────────────
# Stylesheet and chart script live outside the f-string template so their
# literal braces need no escaping.
# NOTE(review): the markup, CSS and chart wiring below were reconstructed —
# only the text labels of the previous template were available. Confirm the
# layout and styling against the intended design.
_REPORT_CSS = """\
body { font-family: system-ui, -apple-system, "Segoe UI", sans-serif; margin: 0;
       padding: 32px; background: #f8fafc; color: #0f172a; }
h1 { font-size: 24px; margin: 0 0 24px; }
h2 { font-size: 14px; letter-spacing: .08em; text-transform: uppercase;
     color: #475569; margin: 40px 0 12px; }
.kpis { display: grid; gap: 12px;
        grid-template-columns: repeat(auto-fit, minmax(170px, 1fr)); }
.kpi, .panel { background: #fff; border: 1px solid #e2e8f0; border-radius: 8px;
               padding: 16px; }
.kpi-label { font-size: 12px; color: #64748b; }
.kpi-value { font-size: 24px; font-weight: 600; margin: 4px 0; }
.kpi-sub { font-size: 11px; color: #94a3b8; }
table { width: 100%; border-collapse: collapse; font-size: 13px; }
th, td { padding: 6px 10px; border-bottom: 1px solid #e2e8f0; text-align: left; }
.num { text-align: right; font-variant-numeric: tabular-nums; }
.gain-high { color: #16a34a; font-weight: 600; }
.gain-low { color: #64748b; }
.bar { height: 10px; border-radius: 5px; }
.empty { color: #64748b; font-style: italic; }
"""

# Reads the JSON payload embedded in the page and draws the three charts.
# Does nothing when the Chart global is unavailable (e.g. offline), so the
# KPI cards and tables still render.
_REPORT_JS = """\
(function () {
  if (typeof Chart === "undefined") { return; }
  var data = JSON.parse(document.getElementById("report-data").textContent);
  function line(label, values, color, axis) {
    return {type: "line", label: label, data: values, borderColor: color,
            backgroundColor: color + "22", borderWidth: 2, pointRadius: 0,
            tension: 0.3, fill: false, yAxisID: axis || "y"};
  }
  new Chart(document.getElementById("chart-integrity"), {
    type: "line",
    data: {labels: data.dates, datasets: [
      line("Σ score (post-surgery)", data.sum_post, "#2563eb"),
      line("Σ score (pre-surgery)", data.sum_pre, "#dc2626")
    ]},
    options: {responsive: true, interaction: {mode: "index", intersect: false}}
  });
  new Chart(document.getElementById("chart-trajectories"), {
    type: "line",
    data: {labels: data.dates, datasets: data.trajectories},
    options: {responsive: true, interaction: {mode: "nearest", intersect: false}}
  });
  new Chart(document.getElementById("chart-surgeries"), {
    type: "bar",
    data: {labels: data.surgery_dates, datasets: [
      {type: "bar", label: "Surgeries", data: data.n_surgeries,
       backgroundColor: "#2563eb66", yAxisID: "y"},
      {type: "bar", label: "Reg IDs remapped", data: data.n_remapped,
       backgroundColor: "#7c3aed66", yAxisID: "y"},
      line("Total gain", data.total_gain, "#16a34a", "y1"),
      line("Avg gain", data.avg_gain, "#d97706", "y1"),
      line("Avg Jaccard", data.avg_jaccard, "#db2777", "y1")
    ]},
    options: {responsive: true, scales: {
      y: {position: "left", beginAtZero: true},
      y1: {position: "right", beginAtZero: true, grid: {drawOnChartArea: false}}
    }}
  });
})();
"""


def build_html(analytics, surgery, scores, mapping):
    """Render the analysis report as a single HTML document string.

    Args:
        analytics: dict as returned by ``compute_analytics``; the keys read
            here are ``timeline``, ``surgery_stats``, ``pivot``, ``churn``,
            ``dates``, ``ever_remapped`` and ``surgery_detail``.
        surgery: surgery log DataFrame; only its length is used (KPI card and
            the choice between the detail table and the "no surgeries" note).
        scores: unused; kept so existing callers keep working.
        mapping: unused; kept so existing callers keep working.

    Returns:
        The complete HTML page as a ``str``.
    """
    # Function-scope import: the local result variable is called ``html`` and
    # the file's import block stays untouched.
    from html import escape

    tl = analytics["timeline"]
    ss = analytics["surgery_stats"]
    piv = analytics["pivot"]
    ch = analytics["churn"]
    dates_str = analytics["dates"]
    ever_remapped = analytics["ever_remapped"]

    # ── helper to serialise numeric arrays for JS ───────────────
    def jf(arr, decimals=6):
        """Rounded floats as a JSON-ready list; NaN becomes None (JSON null)."""
        return [None if np.isnan(v) else round(float(v), decimals) for v in arr]

    # ── colour palette ───────────────────────────────────────────
    REG_COLORS = [
        "#2563eb", "#16a34a", "#dc2626", "#d97706", "#7c3aed",
        "#0891b2", "#db2777", "#65a30d", "#ea580c", "#6366f1",
        "#059669", "#b45309", "#9333ea", "#0284c7", "#e11d48",
    ]

    # ── 4.1 / 4.2 Chart payload ─────────────────────────────────
    # One line dataset per reg ID; IDs that were ever remapped are dashed.
    traj_datasets = []
    for idx, rid in enumerate(piv.columns):
        color = REG_COLORS[idx % len(REG_COLORS)]
        traj_datasets.append({
            "label": rid,
            "data": jf(piv[rid].values),
            "borderColor": color,
            "backgroundColor": color + "22",
            "borderWidth": 2,
            "borderDash": [6, 3] if rid in ever_remapped else [],
            "pointRadius": 0,
            "tension": 0.3,
            "fill": False,
        })

    payload = {
        "dates": list(dates_str),
        "sum_post": jf(tl["sum_post"].values),
        "sum_pre": jf(tl["sum_pre"].values),
        "surgery_dates": [d.strftime("%Y-%m-%d") for d in ss.index],
        "n_surgeries": jf(ss["n_surgeries"].values, 0),
        "total_gain": jf(ss["total_gain"].values),
        "avg_gain": jf(ss["avg_gain"].values),
        "avg_jaccard": jf(ss["avg_jaccard"].values),
        "n_remapped": jf(ch.values, 0),
        "trajectories": traj_datasets,
    }
    # "<" is escaped so a value containing "</script>" cannot end the data block.
    payload_json = json.dumps(payload).replace("<", "\\u003c")

    # ── 4.3 Surgery detail table rows ──────────────────────────
    sd = analytics["surgery_detail"].sort_values("date")
    if sd.empty:
        surg_rows_html = ("<tr><td colspan='8' class='empty'>"
                          "No surgeries performed</td></tr>")
    else:
        surgery_rows = []
        for _, r in sd.iterrows():
            gain_class = "gain-high" if r["gain_vs_no_surgery"] > 0.05 else "gain-low"
            surgery_rows.append(
                "<tr>"
                f"<td>{r['date'].date()}</td>"
                f"<td>{escape(str(r['reg_orig']))}</td>"
                f"<td>{escape(str(r['reg_from']))}</td>"
                "<td>→</td>"
                f"<td>{escape(str(r['reg_to']))}</td>"
                f"<td class='num'>{r['jaccard_composite']:.4f}</td>"
                f"<td class='num {gain_class}'>+{r['gain_vs_no_surgery']:.6f}</td>"
                f"<td class='num'>{r['gain_pct_of_score']:.1f}%</td>"
                "</tr>"
            )
        surg_rows_html = "\n".join(surgery_rows)

    if len(surgery) == 0:
        surgery_section = ("<p class='empty'>"
                           "No surgeries were performed on this dataset.</p>")
    else:
        surgery_section = (
            "<table><thead><tr>"
            "<th>Date</th><th>Reg orig</th><th>Code from</th><th></th>"
            "<th>Code to</th><th class='num'>Jaccard</th>"
            "<th class='num'>Score gain</th><th class='num'>% of score</th>"
            "</tr></thead><tbody>\n" + surg_rows_html + "\n</tbody></table>"
        )

    # ── 4.4 Top accounts table ──────────────────────────────────
    last_date = piv.index.max()
    top_accounts = piv.loc[last_date].dropna().sort_values(ascending=False)
    top_score = top_accounts.max() if len(top_accounts) else 0.0
    top_rows = []
    for rank, (rid, sc) in enumerate(top_accounts.items(), 1):
        remapped = "✓" if rid in ever_remapped else ""
        # A non-positive top score would divide by zero; draw an empty bar.
        bar_w = min(100, max(0, int(sc / top_score * 100))) if top_score > 0 else 0
        color = REG_COLORS[(rank - 1) % len(REG_COLORS)]
        top_rows.append(
            "<tr>"
            f"<td>#{rank}</td>"
            f"<td>{escape(str(rid))}</td>"
            f"<td class='num'>{sc:.6f}</td>"
            f"<td><div class='bar' style='width:{bar_w}%;background:{color}'></div></td>"
            f"<td>{remapped}</td>"
            "</tr>"
        )
    top_rows_html = "\n".join(top_rows)

    # ── KPI cards ───────────────────────────────────────────────
    def kpi(label, value, sub):
        return (
            "<div class='kpi'>"
            f"<div class='kpi-label'>{label}</div>"
            f"<div class='kpi-value'>{value}</div>"
            f"<div class='kpi-sub'>{sub}</div>"
            "</div>"
        )

    kpi_html = "\n".join([
        kpi("Σ score at t_ref", f"{tl['sum_post'].iloc[-1]:.4f}", "post-surgery"),
        kpi("Σ score at t_min", f"{tl['sum_post'].iloc[0]:.4f}", "post-surgery"),
        kpi("Max recovery", f"{tl['recovery_pct'].max():.1f}%",
            "score rescued by surgery"),
        kpi("Total surgeries", len(surgery), "operations performed"),
        kpi("Reg IDs universe", piv.shape[1], "at reference date"),
        kpi("Ever remapped", len(ever_remapped), "reg IDs w/ code change"),
    ])

    # ─────────────────────────────────────────────────────────────
    # HTML TEMPLATE
    # ─────────────────────────────────────────────────────────────
    # NOTE(review): charts are drawn with Chart.js loaded from a CDN, inferred
    # from the Chart.js-style dataset options above — confirm whether the
    # library should be inlined for a fully offline report.
    html = f"""<!DOCTYPE html>
<html lang="en">
<head>
<meta charset="utf-8">
<title>Carmignac Pipeline — Analysis Report</title>
<style>
{_REPORT_CSS}</style>
</head>
<body>
<h1>Carmignac Pipeline — Analysis Report</h1>
<div class="kpis">
{kpi_html}
</div>
<h2>01 · Score Integrity Over Time</h2>
<div class="panel"><canvas id="chart-integrity"></canvas></div>
<h2>02 · Individual Score Trajectories</h2>
<div class="panel"><canvas id="chart-trajectories"></canvas></div>
<h2>03 · Surgery Operations</h2>
<div class="panel"><canvas id="chart-surgeries"></canvas></div>
<h2>04 · Surgery Detail Log</h2>
<div class="panel">
{surgery_section}
</div>
<h2>05 · Score Ranking at t_ref</h2>
<div class="panel">
<table><thead><tr>
<th>Rank</th><th>Registrar ID</th><th class="num">Score (weight)</th>
<th>Relative size</th><th>Remapped</th>
</tr></thead><tbody>
{top_rows_html}
</tbody></table>
</div>
<script type="application/json" id="report-data">{payload_json}</script>
<script src="https://cdn.jsdelivr.net/npm/chart.js@4"></script>
<script>
{_REPORT_JS}</script>
</body>
</html>
"""
    return html
# ─────────────────────────────────────────────────────────────
# 5. MAIN
# ─────────────────────────────────────────────────────────────
def main():
    """Command-line entry point: load the CSVs, print the summary, write the report.

    Input paths that do not exist as given are retried relative to this
    script's directory; the process exits with an error message when a file
    is found in neither place.
    """
    parser = argparse.ArgumentParser(description="Carmignac pipeline results analyser")
    parser.add_argument("--scores", default="repair_results/carmignac_scores.csv")
    parser.add_argument("--mapping", default="repair_results/carmignac_mapping.csv")
    parser.add_argument("--surgery", default="repair_results/carmignac_surgery_log.csv")
    parser.add_argument("--out", default="repair_results/carmignac_report.html")
    args = parser.parse_args()

    # Resolve paths relative to this script's directory if files not found
    base = os.path.dirname(os.path.abspath(__file__))

    def resolve(p):
        """Return *p* if it exists, else its script-relative twin, else exit."""
        if os.path.exists(p):
            return p
        alt = os.path.join(base, p)
        if os.path.exists(alt):
            return alt
        sys.exit(f"[ERROR] File not found: {p}")

    scores_path = resolve(args.scores)
    mapping_path = resolve(args.mapping)
    surgery_path = resolve(args.surgery)
    print(f"[Load] scores : {scores_path}")
    print(f"[Load] mapping : {mapping_path}")
    print(f"[Load] surgery : {surgery_path}")

    scores, mapping, surgery = load_outputs(scores_path, mapping_path, surgery_path)
    analytics = compute_analytics(scores, mapping, surgery)
    print_summary(analytics, surgery)
    html = build_html(analytics, surgery, scores, mapping)

    out_path = args.out
    # The inputs may have been found next to the script rather than in the
    # working directory, in which case the output directory need not exist
    # yet; create it so the run does not fail at the very last step.
    out_dir = os.path.dirname(out_path)
    if out_dir:
        os.makedirs(out_dir, exist_ok=True)
    with open(out_path, "w", encoding="utf-8") as f:
        f.write(html)
    print(f"\n[Report] Written to → {out_path}")


if __name__ == "__main__":
    main()