""" Helper methods that are used in the repair challenge """ import json import pandas as pd import numpy as np import s3fs import os def load_data_diagnostics(): fs = s3fs.S3FileSystem( client_kwargs={"endpoint_url": "https://" + "minio-simple.lab.groupe-genes.fr"}, key=os.environ["AWS_ACCESS_KEY_ID"], secret=os.environ["AWS_SECRET_ACCESS_KEY"], token=os.environ["AWS_SESSION_TOKEN"], ) with fs.open("projet-bdc-data//carmignac/Flows ENSAE V2 -20251105.csv", "rb") as f: flows = pd.read_csv(f, sep=";") with fs.open("projet-bdc-data//carmignac/AUM ENSAE V2 -20251105.csv", "rb") as f: aum = pd.read_csv(f, sep=";") aum["Centralisation Date"] = pd.to_datetime(aum["Centralisation Date"]) flows["Centralisation Date"] = pd.to_datetime(flows["Centralisation Date"]) return aum, flows def load_data_repair(): fs = s3fs.S3FileSystem( client_kwargs={"endpoint_url": "https://" + "minio-simple.lab.groupe-genes.fr"}, key=os.environ["AWS_ACCESS_KEY_ID"], secret=os.environ["AWS_SECRET_ACCESS_KEY"], token=os.environ["AWS_SESSION_TOKEN"], ) with fs.open("projet-bdc-data//carmignac/Flows ENSAE V2 -20251105.csv", "rb") as f: flows = pd.read_csv(f, sep=";") with fs.open("projet-bdc-data//carmignac/AUM ENSAE V2 -20251105.csv", "rb") as f: aum = pd.read_csv(f, sep=";") aum["Centralisation Date"] = pd.to_datetime(aum["Centralisation Date"]) flows["Centralisation Date"] = pd.to_datetime(flows["Centralisation Date"]) # Noms courts aum = aum.rename( columns={ "Registrar Account - ID": "reg_id", "Product - Isin": "isin", "Centralisation Date": "date", "Quantity - AUM": "qty_aum", "Value - AUM €": "val_eur", "Registrar Account - Region": "region", } ) flows = flows.rename( columns={ "Registrar Account - ID": "reg_id", "Product - Isin": "isin", "Centralisation Date": "date", "Quantity - NetFlows": "qty_net", "Value € - NetFlows": "val_net_eur", } ) aum["reg_id"] = aum["reg_id"].astype(str) flows["reg_id"] = flows["reg_id"].astype(str) return aum, flows def load_inputs_branch(mapping_path, surgery_path): fs = s3fs.S3FileSystem( client_kwargs={"endpoint_url": "https://" + "minio-simple.lab.groupe-genes.fr"}, key=os.environ["AWS_ACCESS_KEY_ID"], secret=os.environ["AWS_SECRET_ACCESS_KEY"], token=os.environ["AWS_SESSION_TOKEN"], ) with fs.open( "s3://projet-bdc-data/carmignac/AUM ENSAE V2 -20251105.csv", "rb" ) as f: aum = pd.read_csv(f, sep=";") mapping = pd.read_csv(mapping_path, parse_dates=["date"]) surgery = ( pd.read_csv(surgery_path, parse_dates=["date"]) if surgery_path else pd.DataFrame() ) # Normalise ID columns to string aum["Registrar Account - ID"] = aum["Registrar Account - ID"].astype(str) mapping["reg_orig"] = mapping["reg_orig"].astype(str) mapping["reg_used"] = mapping["reg_used"].astype(str) if not surgery.empty: surgery["reg_orig"] = surgery["reg_orig"].astype(str) surgery["reg_from"] = surgery["reg_from"].astype(str) surgery["reg_to"] = surgery["reg_to"].astype(str) return aum, mapping, surgery # ───────────────────────────────────────────────────────────── # BUILD HTML REPORT # ───────────────────────────────────────────────────────────── def build_html_diagnostics(df_broken, df_all, df_agg, df_err_isin, df_err_agg, alpha): # ── JS-ready data ──────────────────────────────────────────── # Timeline: n_broken and total_missing per month tl = ( df_all[df_all["broken"]] .groupby("date") .agg( n_broken=("isin", "count"), total_missing=("missing_flow", lambda x: x.abs().sum()), n_lag=("is_lag", "sum"), ) .reindex(df_all["date"].sort_values().unique()) .fillna(0) ) tl.index = pd.to_datetime(tl.index) dates_str = json.dumps([d.strftime("%Y-%m-%d") for d in tl.index]) def jf(arr, dec=4): return json.dumps( [round(float(v), dec) if not np.isnan(v) else None for v in arr] ) ISIN_COLORS = [ "#2563eb", "#16a34a", "#dc2626", "#d97706", "#7c3aed", "#0891b2", "#db2777", "#65a30d", "#ea580c", "#6366f1", ] n_broken_js = jf(tl["n_broken"].values, 0) total_miss_js = jf(tl["total_missing"].values) n_lag_js = jf(tl["n_lag"].values, 0) # Aggregate (cross-ISIN) JS data agg_dates_str = json.dumps( [d.strftime("%Y-%m-%d") for d in pd.to_datetime(df_agg["date"])] ) agg_delta_js = jf(df_agg["delta_aum"].values) agg_flow_js = jf(df_agg["flow_total"].values) agg_missing_js = jf(df_agg["missing_flow"].values) agg_pct_js = jf((df_agg["missing_pct"] * 100).values) # Aggregate KPIs n_agg_broken = int(df_agg["broken"].sum()) n_agg_lag = int(df_agg["is_lag"].sum()) n_agg_genuine = n_agg_broken - n_agg_lag max_agg_pct = float(df_agg["missing_pct"].max() * 100) if len(df_agg) else 0 # Aggregate detail table rows agg_rows = [] for _, r in df_agg[df_agg["broken"]].iterrows(): lb = 'lag' if r["is_lag"] else "" pc = "pct-high" if r["missing_pct"] > 0.1 else "pct-med" ds = ( r["date"].strftime("%Y-%m-%d") if hasattr(r["date"], "strftime") else str(r["date"])[:10] ) mc = "miss-neg" if r["missing_flow"] < 0 else "miss-pos" agg_rows.append( f"{ds}" f'{r["q_total_prev"]:,.1f}' f'{r["q_total_curr"]:,.1f}' f'{r["flow_total"]:,.1f}' f'{r["missing_flow"]:+,.1f}' f'{r["missing_pct"] * 100:.2f}%' f"{lb}" ) agg_detail_rows = ( "".join(agg_rows) if agg_rows else ( '✓ No broken months at aggregate level' ) ) # ── Error account JS data ──────────────────────────────────── err_dates_str = json.dumps( [d.strftime("%Y-%m-%d") for d in pd.to_datetime(df_err_agg["date"])] ) err_agg_stock_js = jf(df_err_agg["stock_error_agg"].values) err_agg_res_js = jf(df_err_agg["residual_agg"].values) err_agg_pct_js = jf(df_err_agg["stock_error_agg_pct"].values) # Top 5 ISINs by max absolute stock error top_err_isins = ( df_err_isin.groupby("isin")["stock_error"] .apply(lambda x: x.abs().max()) .nlargest(5) .index.tolist() ) all_err_dates = sorted(df_err_isin["date"].unique()) err_isin_datasets = [] for idx, isin in enumerate(top_err_isins): sub = ( df_err_isin[df_err_isin["isin"] == isin] .set_index("date")["stock_error"] .reindex(all_err_dates) ) err_isin_datasets.append( { "label": isin, "data": [ round(float(v), 3) if not pd.isna(v) else None for v in sub.values ], "borderColor": ISIN_COLORS[idx % len(ISIN_COLORS)], "backgroundColor": ISIN_COLORS[idx % len(ISIN_COLORS)] + "22", "borderWidth": 1.5, "pointRadius": 0, "tension": 0.3, "fill": False, } ) err_isin_ts_json = json.dumps(err_isin_datasets) err_isin_dates_str = json.dumps( [ d.strftime("%Y-%m-%d") if hasattr(d, "strftime") else str(d)[:10] for d in all_err_dates ] ) # Error account KPIs max_agg_stock_err = float(df_err_agg["stock_error_agg"].abs().max()) max_agg_stock_pct = float(df_err_agg["stock_error_agg_pct"].max()) # Stationarity proxy: std / mean_abs (lower = more stationary) agg_std = float(df_err_agg["stock_error_agg"].std()) agg_mean = float(df_err_agg["stock_error_agg"].abs().mean()) stationarity = round(agg_std / max(agg_mean, 1e-9), 3) # Error account ISIN detail table (worst months per ISIN) err_worst = ( df_err_isin.assign(abs_stock=df_err_isin["stock_error"].abs()) .sort_values("abs_stock", ascending=False) .head(200) ) err_isin_rows = [] for _, r in err_worst.iterrows(): ds = ( r["date"].strftime("%Y-%m-%d") if hasattr(r["date"], "strftime") else str(r["date"])[:10] ) sc = "miss-neg" if r["stock_error"] < 0 else "miss-pos" rc = "miss-neg" if r["residual"] < 0 else "miss-pos" pch = ( "pct-high" if r["stock_error_pct"] > 5 else ("pct-med" if r["stock_error_pct"] > 1 else "") ) err_isin_rows.append( f"{ds}" f'{r["isin"]}' f'{r["residual"]:+,.2f}' f'{r["stock_error"]:+,.2f}' f'{r["stock_error_pct"]:.3f}%' ) err_isin_detail = ( "".join(err_isin_rows) if err_isin_rows else ( '✓ Error account is flat (no residuals)' ) ) # Per-ISIN summary isin_sum = ( df_broken.groupby("isin") .agg( n_months=("date", "count"), avg_pct=("missing_pct", "mean"), total_abs=("missing_flow", lambda x: x.abs().sum()), ) .sort_values("total_abs", ascending=False) ) # Per-ISIN missing_pct timeseries for the top 5 ISINs top_isins = isin_sum.head(5).index.tolist() all_dates = sorted(df_all["date"].unique()) isin_ts_datasets = [] for idx, isin in enumerate(top_isins): sub = ( df_all[df_all["isin"] == isin] .set_index("date")["missing_pct"] .reindex(all_dates) .fillna(0) ) isin_ts_datasets.append( { "label": isin, "data": [round(float(v) * 100, 3) for v in sub.values], "borderColor": ISIN_COLORS[idx % len(ISIN_COLORS)], "backgroundColor": ISIN_COLORS[idx % len(ISIN_COLORS)] + "22", "borderWidth": 2, "pointRadius": 0, "tension": 0.3, "fill": False, } ) isin_ts_json = json.dumps(isin_ts_datasets) all_dates_str = json.dumps( [ d.strftime("%Y-%m-%d") if hasattr(d, "strftime") else str(d)[:10] for d in all_dates ] ) # Detail table rows detail_rows = "" for _, r in df_broken.head(200).iterrows(): lag_badge = 'lag' if r["is_lag"] else "" pct_class = "pct-high" if r["missing_pct"] > 0.1 else "pct-med" detail_rows += f""" {r["date"].strftime("%Y-%m-%d") if hasattr(r["date"], "strftime") else str(r["date"])[:10]} {r["isin"]} {r["q_agg_prev"]:,.1f} {r["q_agg_curr"]:,.1f} {r["flow_agg"]:,.1f} {r["missing_flow"]:+,.1f} {r["missing_pct"] * 100:.2f}% {lag_badge} """ # ISIN summary table isin_rows = "" for isin, row in isin_sum.iterrows(): isin_rows += f""" {isin} {int(row["n_months"])} {row["avg_pct"] * 100:.2f}% {row["total_abs"]:,.1f} """ # KPIs total = len(df_all) n_broken_kpi = len(df_broken) n_lag_kpi = int(df_broken["is_lag"].sum()) n_genuine = n_broken_kpi - n_lag_kpi max_pct = df_broken["missing_pct"].max() * 100 if len(df_broken) else 0 n_isins = df_broken["isin"].nunique() no_broken_msg = "" if n_broken_kpi == 0: no_broken_msg = '
✓ No broken months detected at this threshold.
' html = f""" Carmignac — Broken Months Diagnostics
Carmignac × ENSAE · Data Challenge 2025

Broken Months Diagnostics

Aggregate stock-flow equation check · ISIN level · threshold α = {alpha:.1%}
Missing % = |missing flow| / max(|ΔAUM|, |recorded flow|, 1 share) — capped at movement size, not stock level
(ISIN, month) pairs {total:,} examined
Broken months 0 else "success"}">{ n_broken_kpi:,} {n_broken_kpi / total * 100:.1f}% of pairs
Likely lags {n_lag_kpi} resolved by ±{3}d window
Genuine gaps 0 else "success"}">{ n_genuine } unresolved by lag fix
ISINs affected {n_isins} distinct ISINs
Max missing % 10 else "warn"}">{max_pct:.1f}% worst single (isin, month)
00 · Error account — cumulative residuals
Aggregate error account stock over time Stock_error(t_ref) = 0 by definition. At each prior month, the stock absorbs the residual [ΔQ_total − F_total]. A stationary signal near zero = clean data. A drifting signal = structural data quality problem.
Max |stock error|
{ max_agg_stock_err:,.1f} shares
Max % of total AUM
5 else "var(--warn)" }">{max_agg_stock_pct:.3f}%
Stationarity (σ/μ)
{stationarity:.3f}
lower = more stationary
Monthly aggregate residual ΔQ_total − F_total per month (should be near zero)
Error stock — top 5 ISINs Cumulative error stock per ISIN (most affected)
Error account detail — worst (ISIN, month) pairs Sorted by absolute cumulative error stock. stock_error_pct = |stock| / max(ISIN AUM)
{err_isin_detail}
DateISIN Monthly residual Cumulative stock % of max AUM
01 · Aggregate view — all ISINs combined
Stock-flow equation — total portfolio Σ Q(t) − Σ Q(t−1) vs Σ F(t) across all ISINs and accounts. Detects months where the global portfolio is incoherent, independent of ISIN-level breakdown.
Aggregate missing flow over time Σ Q(t) − Σ Q(t−1) − Σ F(t) — should be near zero every month
Aggregate missing % of movement |missing| / max(|ΔAUM|, |flow|) — months above α flagged in red
Aggregate broken months — detail
{agg_detail_rows}
Date Σ Q(t−1)Σ Q(t) Σ FlowMissing Missing %
01 · Timeline — per ISIN
Broken (isin, month) pairs per month Stacked: genuine gaps (red) vs likely accounting lags (amber)
Total absolute missing flow per month Sum of |missing flow| across all broken ISINs
Missing % — top 5 ISINs over time |missing flow| / max(|ΔAUM|, |recorded flow|) per ISIN — capped at movement size
02 · By ISIN
ISIN summary — most affected
{ '
No broken months detected.
' if n_broken_kpi == 0 else f''' {isin_rows}
ISINBroken months Avg missing %Total |missing| (shares)
''' }
03 · Detail log
All broken (isin, month) pairs lag = likely resolved by extending flow window ±3 days
Threshold α = {alpha:.1%} · showing up to 200 rows
{ '
✓ No broken months detected at this threshold.
' if n_broken_kpi == 0 else f''' {detail_rows}
DateISIN Q(t-1)Q(t) Net flowMissing Missing % of movement
''' }
""" return html def build_html_repair(analytics, surgery, scores, mapping, df_err_isin=None, df_err_agg=None): tl = analytics["timeline"] ss = analytics["surgery_stats"] piv = analytics["pivot"] ch = analytics["churn"] dates_str = analytics["dates"] # ── helpers to serialise for JS ───────────────────────────── def jf(arr, decimals=6): return json.dumps( [round(float(v), decimals) if not np.isnan(v) else None for v in arr] ) def js(arr): return json.dumps(list(arr)) # ── colour palette ─────────────────────────────────────────── REG_COLORS = [ "#2563eb", "#16a34a", "#dc2626", "#d97706", "#7c3aed", "#0891b2", "#db2777", "#65a30d", "#ea580c", "#6366f1", "#059669", "#b45309", "#9333ea", "#0284c7", "#e11d48", ] # ── 4.1 Surgery sparkline data ────────────────────────────── surg_dates = [d.strftime("%Y-%m-%d") for d in ss.index] n_surg = jf(ss["n_surgeries"].values, 0) total_gain = jf(ss["total_gain"].values) avg_gain = jf(ss["avg_gain"].values) avg_jaccard = jf(ss["avg_jaccard"].values) # ── 4.2 Individual trajectories ──────────────────────────── reg_ids = list(piv.columns) traj_datasets = [] # Surgery lookup: reg_orig -> list of {date, from, to, composite} surg_by_reg = {} for _, row in surgery.iterrows(): surg_by_reg.setdefault(row["reg_orig"], []).append( { "date": row["date"].strftime("%Y-%m-%d"), "reg_from": str(row["reg_from"]), "reg_to": str(row["reg_to"]), "composite": round(float(row["jaccard_composite"]), 4), "gain": round(float(row["gain_vs_no_surgery"]), 6), } ) for idx, rid in enumerate(reg_ids): remapped = rid in analytics["ever_remapped"] traj_datasets.append( { "label": rid, "data": [ round(float(v), 6) if not np.isnan(v) else None for v in piv[rid].values ], "borderColor": REG_COLORS[idx % len(REG_COLORS)], "backgroundColor": REG_COLORS[idx % len(REG_COLORS)] + "22", "borderWidth": 2, "borderDash": [6, 3] if remapped else [], "pointRadius": 0, "tension": 0.3, "fill": False, "remapped": remapped, "surgeries": surg_by_reg.get(rid, []), } ) traj_json = json.dumps(traj_datasets) # ── 4.2b Error account data (optional) ──────────────────── has_error = df_err_isin is not None and df_err_agg is not None if has_error: err_dates = [d.strftime("%Y-%m-%d") for d in pd.to_datetime(df_err_agg["date"])] err_agg_stock = [ round(float(v), 3) if not pd.isna(v) else None for v in df_err_agg["stock_error_agg"].values ] err_agg_res = [ round(float(v), 3) if not pd.isna(v) else None for v in df_err_agg["residual_agg"].values ] err_agg_pct = [ round(float(v), 4) if not pd.isna(v) else None for v in df_err_agg["stock_error_agg_pct"].values ] # Top 5 ISINs by max |stock error| top_err = ( df_err_isin.groupby("isin")["stock_error"] .apply(lambda x: x.abs().max()) .nlargest(5) .index.tolist() ) all_err_dates = sorted(df_err_isin["date"].unique()) ERR_COLORS = ["#ef4444", "#f59e0b", "#8b5cf6", "#06b6d4", "#10b981"] err_isin_ds = [] for idx, isin in enumerate(top_err): sub = ( df_err_isin[df_err_isin["isin"] == isin] .set_index("date")["stock_error"] .reindex(all_err_dates) ) err_isin_ds.append( { "label": isin, "data": [ round(float(v), 3) if not pd.isna(v) else None for v in sub.values ], "borderColor": ERR_COLORS[idx % len(ERR_COLORS)], "backgroundColor": ERR_COLORS[idx % len(ERR_COLORS)] + "22", "borderWidth": 1.5, "pointRadius": 0, "tension": 0.3, "fill": False, } ) max_err_stock = float(df_err_agg["stock_error_agg"].abs().max()) max_err_pct = float(df_err_agg["stock_error_agg_pct"].max()) agg_std = float(df_err_agg["stock_error_agg"].std()) agg_mean = float(df_err_agg["stock_error_agg"].abs().mean()) stationarity = round(agg_std / max(agg_mean, 1e-9), 3) err_dates_js = json.dumps(err_dates) err_agg_stock_js = json.dumps(err_agg_stock) err_agg_res_js = json.dumps(err_agg_res) err_agg_pct_js = json.dumps(err_agg_pct) err_isin_ds_js = json.dumps(err_isin_ds) err_isin_dates_js = json.dumps( [ d.strftime("%Y-%m-%d") if hasattr(d, "strftime") else str(d)[:10] for d in all_err_dates ] ) # ISIN detail table (top 100 worst) err_rows = [] for _, r in ( df_err_isin.assign(abs_s=df_err_isin["stock_error"].abs()) .sort_values("abs_s", ascending=False) .head(100) .iterrows() ): ds = ( r["date"].strftime("%Y-%m-%d") if hasattr(r["date"], "strftime") else str(r["date"])[:10] ) sc = "color:var(--danger)" if r["stock_error"] < 0 else "color:var(--warn)" rc = "color:var(--danger)" if r["residual"] < 0 else "color:var(--warn)" pch = ( "color:var(--danger);font-weight:600" if r["stock_error_pct"] > 5 else ("color:var(--warn)" if r["stock_error_pct"] > 1 else "") ) err_rows.append( f"{ds}" f'{r["isin"]}' f'{r["residual"]:+,.2f}' f'{r["stock_error"]:+,.2f}' f'{r["stock_error_pct"]:.3f}%' f"" ) err_isin_detail = ( "".join(err_rows) if err_rows else ( '✓ Error account is flat' ) ) # HTML block for error account section err_section_html = f"""
06 · Error Account
Aggregate error account stock Stock_error(t_ref) = 0. The stock absorbs unreconciled residuals going backwards. A flat signal near zero = clean data. A drift = structural gap.
Max |error stock|
{max_err_stock:,.1f} shares
Max % of total AUM
5 else "var(--warn)"}">{max_err_pct:.3f}%
Stationarity σ/μ
{stationarity:.3f}
lower = more stationary
Monthly aggregate residual ΔQ_total − F_total per month
Error stock — top 5 ISINs Cumulative error stock per ISIN
Error account detail — worst (ISIN, month) pairs
{err_isin_detail}
DateISIN Monthly residual Cumul. stock % of max AUM
""" # JS block for error account charts err_js_block = f""" // ── 8. Error account charts ────────────────────────────────── const ERR_DATES = {err_dates_js}; const ERR_AGG_STOCK = {err_agg_stock_js}; const ERR_AGG_RES = {err_agg_res_js}; const ERR_ISIN_TS = {err_isin_ds_js}; const ERR_ISIN_DATES = {err_isin_dates_js}; new Chart(document.getElementById('chartErrStock'), {{ type: 'line', data: {{ labels: ERR_DATES, datasets: [{{ label: 'Aggregate error stock', data: ERR_AGG_STOCK, borderColor: '#ef4444', backgroundColor: '#ef444415', borderWidth: 2, pointRadius: 0, tension: 0.3, fill: true }}] }}, options: {{ responsive: true, maintainAspectRatio: false, interaction: {{mode:'index', intersect:false}}, plugins: {{ legend: {{display:false}}, tooltip: tooltip() }}, scales: {{ x: timeAxis(), y: {{ ...yAxis('Shares'), grid: {{ color: ctx => ctx.tick.value === 0 ? '#ffffff55' : '#1a2030', lineWidth: ctx => ctx.tick.value === 0 ? 1.5 : 1 }} }} }} }} }}); new Chart(document.getElementById('chartErrRes'), {{ type: 'bar', data: {{ labels: ERR_DATES, datasets: [{{ label: 'Monthly residual', data: ERR_AGG_RES, backgroundColor: ERR_AGG_RES.map(v => v != null && v < 0 ? '#ef444488' : '#f59e0b88'), borderColor: ERR_AGG_RES.map(v => v != null && v < 0 ? '#ef4444' : '#f59e0b'), borderWidth: 1, borderRadius: 2 }}] }}, options: {{ responsive: true, maintainAspectRatio: false, plugins: {{ legend: {{display:false}}, tooltip: tooltip() }}, scales: {{ x: timeAxis(), y: yAxis('Shares') }} }} }}); new Chart(document.getElementById('chartErrIsin'), {{ type: 'line', data: {{ labels: ERR_ISIN_DATES, datasets: ERR_ISIN_TS }}, options: {{ responsive: true, maintainAspectRatio: false, interaction: {{mode:'index', intersect:false}}, plugins: {{ legend: {{position:'right', labels:{{boxWidth:10, padding:8, font:{{size:10}}}}}}, tooltip: tooltip() }}, scales: {{ x: timeAxis(), y: yAxis('Error stock (shares)') }} }} }});""" else: err_section_html = "" err_js_block = "" # ── 4.3 Surgery detail table rows ────────────────────────── sd = analytics["surgery_detail"].sort_values("date") surg_rows_html = "" if len(sd) == 0: surg_rows_html = "No surgeries performed" else: for _, r in sd.iterrows(): gain_class = "gain-high" if r["gain_vs_no_surgery"] > 0.05 else "gain-low" lb = int(r.get("lookback_months", 1)) lb_cell = ( f'{lb}m' if lb > 1 else "—" ) surg_rows_html += f""" {r["date"].date()} {r["reg_orig"]} {r["reg_from"]} → {r["reg_to"]} {r["jaccard_composite"]:.4f} +{r["gain_vs_no_surgery"]:.6f} {r["gain_pct_of_score"]:.1f}% {lb_cell} """ # ── 4.4 Top accounts table ────────────────────────────────── last_date = piv.index.max() top_accounts = piv.loc[last_date].dropna().sort_values(ascending=False) top_rows_html = "" for rank, (rid, sc) in enumerate(top_accounts.items(), 1): remapped = "✓" if rid in analytics["ever_remapped"] else "" bar_w = int(sc / top_accounts.max() * 100) color = REG_COLORS[(rank - 1) % len(REG_COLORS)] top_rows_html += f""" #{rank} {rid} {sc:.6f}
{remapped} """ # ───────────────────────────────────────────────────────────── # HTML TEMPLATE # ───────────────────────────────────────────────────────────── html = f""" Carmignac Pipeline — Analysis Report
Carmignac × ENSAE · Data Challenge 2025

Pipeline Results — Analysis Report

Registrar ID repair · Score propagation · Surgery audit
Σ score at t_ref {tl["sum_post"].iloc[-1]:.4f} post-surgery
Σ score at t_min {tl["sum_post"].iloc[0]:.4f} post-surgery
Max recovery {tl["recovery_pct"].max():.1f}% score rescued by surgery
Total surgeries {len(surgery)} operations performed
Reg IDs universe {piv.shape[1]} at reference date
Ever remapped {len(analytics["ever_remapped"])} reg IDs w/ code change
01 · Score Integrity Over Time
Sum of scores — pre vs post surgery Post-surgery (solid) shows the corrected score after code repairs. Pre-surgery (dashed) is the counterfactual without any remapping. Gap = score rescued.
Score recovered by surgery Difference post − pre at each date
Portfolio concentration (entropy) Shannon entropy of score distribution — higher = more spread
02 · Individual Score Trajectories
Score explorer — per Registrar Account Click an account to inspect its full history. ◆ remapped = surgery was applied.
03 · Surgery Operations
Surgeries per time step Number of code remappings performed at each month
Score gain per surgery Average gain in Σ score from surgery at each month
Jaccard similarity of surgery matches Composite Jaccard score of the matched code pair — closer to 1.0 = stronger portfolio overlap. Low values may indicate uncertain matches.
04 · Surgery Detail Log
All surgery operations
{ '
No surgeries were performed on this dataset.
' if len(surgery) == 0 else f''' {surg_rows_html}
Date Reg orig Code from Code to Jaccard Score gain % of score Lookback
''' }
05 · Score Ranking at t_ref
Accounts ranked by weight at reference date ✓ in last column = account was remapped at some point in history
{top_rows_html}
Rank Registrar ID Score (weight) Relative size Remapped
{err_section_html}
""" return html