Final version of the cleaning steps
This commit is contained in:
parent
53668dd6ee
commit
c2efab321b
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
|
|
@ -1,131 +0,0 @@
|
||||||
date,residual_agg,stock_error_agg,stock_error_agg_pct
|
|
||||||
2015-01-31,0.0,148292.6214,0.0363
|
|
||||||
2015-02-28,-71073.9175,77218.7039,0.0189
|
|
||||||
2015-03-31,-38212.0374,39006.6664,0.0096
|
|
||||||
2015-04-30,-93289.8893,-54283.2228,0.0133
|
|
||||||
2015-05-31,80779.4715,26496.2486,0.0065
|
|
||||||
2015-06-30,210846.4765,237342.7252,0.0581
|
|
||||||
2015-07-31,-237021.1601,321.565,0.0001
|
|
||||||
2015-08-31,52430.1346,52751.6996,0.0129
|
|
||||||
2015-09-30,115091.9839,167843.6835,0.0411
|
|
||||||
2015-10-31,362979.7972,530823.4807,0.1301
|
|
||||||
2015-11-30,89787.3998,620610.8804,0.1521
|
|
||||||
2015-12-31,-548821.2591,71789.6214,0.0176
|
|
||||||
2016-01-31,114438.8471,186228.4684,0.0456
|
|
||||||
2016-02-29,-94160.3412,92068.1273,0.0226
|
|
||||||
2016-03-31,197803.9351,289872.0624,0.071
|
|
||||||
2016-04-30,75620.3391,365492.4015,0.0895
|
|
||||||
2016-05-31,1277.0285,366769.4301,0.0899
|
|
||||||
2016-06-30,28344.6474,395114.0775,0.0968
|
|
||||||
2016-07-31,-455791.3682,-60677.2908,0.0149
|
|
||||||
2016-08-31,118896.6804,58219.3897,0.0143
|
|
||||||
2016-09-30,-433713.5055,-375494.1158,0.092
|
|
||||||
2016-10-31,87253.3822,-288240.7336,0.0706
|
|
||||||
2016-11-30,197373.3618,-90867.3718,0.0223
|
|
||||||
2016-12-31,299519.7371,208652.3652,0.0511
|
|
||||||
2017-01-31,-67316.5443,141335.8209,0.0346
|
|
||||||
2017-02-28,-294805.5376,-153469.7167,0.0376
|
|
||||||
2017-03-31,-363088.0453,-516557.762,0.1266
|
|
||||||
2017-04-30,187743.9712,-328813.7908,0.0806
|
|
||||||
2017-05-31,9241.0742,-319572.7166,0.0783
|
|
||||||
2017-06-30,-33028.3212,-352601.0378,0.0864
|
|
||||||
2017-07-31,-218597.21,-571198.2478,0.1399
|
|
||||||
2017-08-31,273953.3009,-297244.9469,0.0728
|
|
||||||
2017-09-30,115964.9839,-181279.963,0.0444
|
|
||||||
2017-10-31,-23621.0098,-204900.9728,0.0502
|
|
||||||
2017-11-30,95815.8482,-109085.1246,0.0267
|
|
||||||
2017-12-31,-121404.417,-230489.5416,0.0565
|
|
||||||
2018-01-31,1075741.0088,845251.4672,0.2071
|
|
||||||
2018-02-28,390784.0012,1236035.4684,0.3028
|
|
||||||
2018-03-31,455651.3899,1691686.8583,0.4145
|
|
||||||
2018-04-30,-374980.7,1316706.1583,0.3226
|
|
||||||
2018-05-31,209864.386,1526570.5443,0.374
|
|
||||||
2018-06-30,111872.6628,1638443.207,0.4014
|
|
||||||
2018-07-31,-158631.7954,1479811.4116,0.3626
|
|
||||||
2018-08-31,300388.692,1780200.1036,0.4362
|
|
||||||
2018-09-30,-240137.9416,1540062.1621,0.3773
|
|
||||||
2018-10-31,-393725.0807,1146337.0813,0.2809
|
|
||||||
2018-11-30,-217887.9315,928449.1499,0.2275
|
|
||||||
2018-12-31,577035.3311,1505484.4809,0.3688
|
|
||||||
2019-01-31,-374117.7166,1131366.7643,0.2772
|
|
||||||
2019-02-28,214587.8043,1345954.5686,0.3298
|
|
||||||
2019-03-31,-168958.6343,1176995.9343,0.2884
|
|
||||||
2019-04-30,459977.1588,1636973.0931,0.4011
|
|
||||||
2019-05-31,-67477.1479,1569495.9453,0.3845
|
|
||||||
2019-06-30,-535961.896,1033534.0493,0.2532
|
|
||||||
2019-07-31,-61034.0471,972500.0021,0.2383
|
|
||||||
2019-08-31,17046.9276,989546.9297,0.2424
|
|
||||||
2019-09-30,-72796.0114,916750.9183,0.2246
|
|
||||||
2019-10-31,809108.5438,1725859.4622,0.4228
|
|
||||||
2019-11-30,43679.2428,1769538.705,0.4335
|
|
||||||
2019-12-31,1288397.5385,3057936.2435,0.7492
|
|
||||||
2020-01-31,-973899.4752,2084036.7683,0.5106
|
|
||||||
2020-02-29,864992.3304,2949029.0987,0.7225
|
|
||||||
2020-03-31,-802670.8766,2146358.2221,0.5259
|
|
||||||
2020-04-30,-245269.422,1901088.8001,0.4658
|
|
||||||
2020-05-31,-262548.2012,1638540.5989,0.4014
|
|
||||||
2020-06-30,577667.5612,2216208.1601,0.543
|
|
||||||
2020-07-31,148208.9581,2364417.1182,0.5793
|
|
||||||
2020-08-31,-110665.7234,2253751.3948,0.5522
|
|
||||||
2020-09-30,-93719.2628,2160032.132,0.5292
|
|
||||||
2020-10-31,360937.203,2520969.335,0.6176
|
|
||||||
2020-11-30,-209972.8202,2310996.5148,0.5662
|
|
||||||
2020-12-31,204577.9672,2515574.482,0.6163
|
|
||||||
2021-01-31,114754.1508,2630328.6328,0.6444
|
|
||||||
2021-02-28,-369099.3196,2261229.3132,0.554
|
|
||||||
2021-03-31,285801.7938,2547031.1069,0.624
|
|
||||||
2021-04-30,-525497.0146,2021534.0923,0.4953
|
|
||||||
2021-05-31,155798.0056,2177332.098,0.5334
|
|
||||||
2021-06-30,-596321.249,1581010.849,0.3873
|
|
||||||
2021-07-31,651201.5417,2232212.3907,0.5469
|
|
||||||
2021-08-31,196268.6348,2428481.0255,0.595
|
|
||||||
2021-09-30,-427350.6319,2001130.3936,0.4903
|
|
||||||
2021-10-31,-612699.7898,1388430.6037,0.3402
|
|
||||||
2021-11-30,-245857.591,1142573.0127,0.2799
|
|
||||||
2021-12-31,166235.742,1308808.7547,0.3207
|
|
||||||
2022-01-31,1057051.561,2365860.3157,0.5796
|
|
||||||
2022-02-28,247946.452,2613806.7677,0.6404
|
|
||||||
2022-03-31,375938.62,2989745.3877,0.7325
|
|
||||||
2022-04-30,-1187760.361,1801985.0267,0.4415
|
|
||||||
2022-05-31,85351.637,1887336.6637,0.4624
|
|
||||||
2022-06-30,-1119375.9,767960.7637,0.1882
|
|
||||||
2022-07-31,472271.327,1240232.0907,0.3039
|
|
||||||
2022-08-31,-74701.74,1165530.3507,0.2856
|
|
||||||
2022-09-30,293426.2139,1458956.5646,0.3574
|
|
||||||
2022-10-31,-116613.2208,1342343.3438,0.3289
|
|
||||||
2022-11-30,-274268.7771,1068074.5667,0.2617
|
|
||||||
2022-12-31,117575.457,1185650.0237,0.2905
|
|
||||||
2023-01-31,435788.475,1621438.4987,0.3973
|
|
||||||
2023-02-28,144058.704,1765497.2027,0.4325
|
|
||||||
2023-03-31,18136.618,1783633.8207,0.437
|
|
||||||
2023-04-30,56736.261,1840370.0817,0.4509
|
|
||||||
2023-05-31,178492.877,2018862.9587,0.4946
|
|
||||||
2023-06-30,33137.773,2052000.7317,0.5027
|
|
||||||
2023-07-31,-931401.398,1120599.3337,0.2745
|
|
||||||
2023-08-31,-195355.266,925244.0677,0.2267
|
|
||||||
2023-09-30,160732.769,1085976.8367,0.2661
|
|
||||||
2023-10-31,59094.801,1145071.6377,0.2805
|
|
||||||
2023-11-30,118940.301,1264011.9387,0.3097
|
|
||||||
2023-12-31,70162.1121,1334174.0508,0.3269
|
|
||||||
2024-01-31,736085.356,2070259.4068,0.5072
|
|
||||||
2024-02-29,-39545.523,2030713.8838,0.4975
|
|
||||||
2024-03-31,75590.692,2106304.5758,0.516
|
|
||||||
2024-04-30,316241.771,2422546.3468,0.5935
|
|
||||||
2024-05-31,-81607.349,2340938.9978,0.5735
|
|
||||||
2024-06-30,-252128.4517,2088810.5462,0.5118
|
|
||||||
2024-07-31,-12000.305,2076810.2412,0.5088
|
|
||||||
2024-08-31,240549.0254,2317359.2666,0.5678
|
|
||||||
2024-09-30,-2936336.049,-618976.7824,0.1517
|
|
||||||
2024-10-31,-533999.4854,-1152976.2678,0.2825
|
|
||||||
2024-11-30,128929.4967,-1024046.7711,0.2509
|
|
||||||
2024-12-31,51631.744,-972415.0271,0.2382
|
|
||||||
2025-01-31,-93297.413,-1065712.4401,0.2611
|
|
||||||
2025-02-28,352009.6074,-713702.8327,0.1749
|
|
||||||
2025-03-31,-42189.2219,-755892.0546,0.1852
|
|
||||||
2025-04-30,38161.8849,-717730.1697,0.1758
|
|
||||||
2025-05-31,55149.856,-662580.3137,0.1623
|
|
||||||
2025-06-30,972670.348,310090.0343,0.076
|
|
||||||
2025-07-31,31815.726,341905.7603,0.0838
|
|
||||||
2025-08-31,-188855.68,153050.0803,0.0375
|
|
||||||
2025-09-30,13670.993,166721.0733,0.0408
|
|
||||||
2025-10-31,-166721.0733,0.0,0.0
|
|
||||||
|
|
|
@ -24,14 +24,26 @@
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
"cell_type": "code",
|
"cell_type": "code",
|
||||||
"execution_count": 2,
|
"execution_count": 3,
|
||||||
"id": "d43b725e",
|
"id": "d43b725e",
|
||||||
"metadata": {},
|
"metadata": {},
|
||||||
"outputs": [],
|
"outputs": [],
|
||||||
"source": [
|
"source": [
|
||||||
"push_file('carmignac_broken_months.csv', 'projet-bdc-carmignac-g3//paco/carmignac_broken_months.csv')\n",
|
"push_file('repair_challenge/alpha_5%/carmignac_broken_months.csv', 'projet-bdc-carmignac-g3//paco/carmignac_broken_months.csv')\n",
|
||||||
"push_file('carmignac_error_account_agg.csv', 'projet-bdc-carmignac-g3//paco/carmignac_error_account_agg.csv')\n",
|
"push_file('repair_challenge/alpha_5%/carmignac_error_account_agg.csv', 'projet-bdc-carmignac-g3//paco/carmignac_error_account_agg.csv')\n",
|
||||||
"push_file('carmignac_error_account.csv', 'projet-bdc-carmignac-g3//paco/carmignac_error_account.csv')"
|
"push_file('repair_challenge/alpha_5%/carmignac_error_account.csv', 'projet-bdc-carmignac-g3//paco/carmignac_error_account.csv')\n",
|
||||||
|
"push_file('AUM_repaired.csv', 'projet-bdc-carmignac-g3//paco/AUM_repaired.csv')"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 4,
|
||||||
|
"id": "d9b0290a",
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"push_file('AUM_repair_audit.csv', 'projet-bdc-carmignac-g3//paco/AUM_repair_audit.csv')\n",
|
||||||
|
"push_file('AUM_paths.csv', 'projet-bdc-carmignac-g3//paco/AUM_paths.csv')"
|
||||||
]
|
]
|
||||||
}
|
}
|
||||||
],
|
],
|
||||||
|
|
|
||||||
|
|
@ -1598,7 +1598,8 @@ def main():
|
||||||
html = build_html(analytics, surgery, scores, mapping,
|
html = build_html(analytics, surgery, scores, mapping,
|
||||||
df_err_isin=df_err_isin, df_err_agg=df_err_agg)
|
df_err_isin=df_err_isin, df_err_agg=df_err_agg)
|
||||||
|
|
||||||
out_path = args.out
|
out_path = "../" + args.out
|
||||||
|
os.makedirs(os.path.dirname(out_path), exist_ok=True)
|
||||||
with open(out_path, "w", encoding="utf-8") as f:
|
with open(out_path, "w", encoding="utf-8") as f:
|
||||||
f.write(html)
|
f.write(html)
|
||||||
print(f"\n[Report] Written to → {out_path}")
|
print(f"\n[Report] Written to → {out_path}")
|
||||||
|
|
|
||||||
|
|
@ -17,7 +17,7 @@ import s3fs
|
||||||
# PARAMÈTRES
|
# PARAMÈTRES
|
||||||
# ─────────────────────────────────────────────
|
# ─────────────────────────────────────────────
|
||||||
ALPHA = 0.05 # tolérance réconciliation : 5% du stock à t
|
ALPHA = 0.05 # tolérance réconciliation : 5% du stock à t
|
||||||
MIN_AUM_EUR = 5e6 # seuil filtrage étape 1 — 0 pour les heads de test, 5e6 en prod
|
MIN_AUM_EUR = 5e6 # seuil filtrage étape 1 — 0 pour les heads de test, 5e6 en prod
|
||||||
MIN_JACCARD = 0.3 # seuil minimal similarité portefeuille pour chirurgie
|
MIN_JACCARD = 0.3 # seuil minimal similarité portefeuille pour chirurgie
|
||||||
SCORE_DROP_THRESHOLD = 0.15 # si score chute de >15% → candidat chirurgie
|
SCORE_DROP_THRESHOLD = 0.15 # si score chute de >15% → candidat chirurgie
|
||||||
MAX_SURGERY_LOOKBACK = 6 # remonter jusqu'à 6 mois en arrière pour trouver un candidat
|
MAX_SURGERY_LOOKBACK = 6 # remonter jusqu'à 6 mois en arrière pour trouver un candidat
|
||||||
|
|
@ -61,6 +61,7 @@ def load_broken_months(broken_months_path):
|
||||||
If the file does not exist or is empty, returns two empty sets.
|
If the file does not exist or is empty, returns two empty sets.
|
||||||
"""
|
"""
|
||||||
if not broken_months_path or not os.path.exists(broken_months_path):
|
if not broken_months_path or not os.path.exists(broken_months_path):
|
||||||
|
print("Could not find the path")
|
||||||
return set(), set()
|
return set(), set()
|
||||||
try:
|
try:
|
||||||
df = pd.read_csv(broken_months_path, parse_dates=["date"])
|
df = pd.read_csv(broken_months_path, parse_dates=["date"])
|
||||||
|
|
@ -772,7 +773,7 @@ def export_results(scores_history, mapping_history, surgery_log, all_months, out
|
||||||
df_scores = pd.DataFrame(rows) if rows else pd.DataFrame(columns=['date', 'reg_id', 'score'])
|
df_scores = pd.DataFrame(rows) if rows else pd.DataFrame(columns=['date', 'reg_id', 'score'])
|
||||||
if not df_scores.empty:
|
if not df_scores.empty:
|
||||||
df_scores = df_scores.sort_values(['date', 'score'], ascending=[True, False])
|
df_scores = df_scores.sort_values(['date', 'score'], ascending=[True, False])
|
||||||
df_scores.to_csv(f"/mnt/user-data/outputs/{out_prefix}_scores.csv", index=False)
|
df_scores.to_csv(f"repair_challenge/repair_results/{out_prefix}_scores.csv", index=False)
|
||||||
|
|
||||||
# Mapping history
|
# Mapping history
|
||||||
rows_m = []
|
rows_m = []
|
||||||
|
|
@ -783,12 +784,12 @@ def export_results(scores_history, mapping_history, surgery_log, all_months, out
|
||||||
df_mapping = pd.DataFrame(rows_m) if rows_m else pd.DataFrame(columns=['date', 'reg_orig', 'reg_used', 'changed'])
|
df_mapping = pd.DataFrame(rows_m) if rows_m else pd.DataFrame(columns=['date', 'reg_orig', 'reg_used', 'changed'])
|
||||||
if not df_mapping.empty:
|
if not df_mapping.empty:
|
||||||
df_mapping = df_mapping.sort_values(['date', 'reg_orig'])
|
df_mapping = df_mapping.sort_values(['date', 'reg_orig'])
|
||||||
df_mapping.to_csv(f"repair_results/{out_prefix}_mapping.csv", index=False)
|
df_mapping.to_csv(f"repair_challenge/repair_results/{out_prefix}_mapping.csv", index=False)
|
||||||
|
|
||||||
# Surgery log
|
# Surgery log
|
||||||
if surgery_log:
|
if surgery_log:
|
||||||
df_surgery = pd.DataFrame(surgery_log).sort_values('date')
|
df_surgery = pd.DataFrame(surgery_log).sort_values('date')
|
||||||
df_surgery.to_csv(f"repair_results/{out_prefix}_surgery_log.csv", index=False)
|
df_surgery.to_csv(f"repair_challenge/repair_results/{out_prefix}_surgery_log.csv", index=False)
|
||||||
print(f"\n[Export] {len(surgery_log)} opérations de chirurgie sauvegardées.")
|
print(f"\n[Export] {len(surgery_log)} opérations de chirurgie sauvegardées.")
|
||||||
else:
|
else:
|
||||||
print("\n[Export] Aucune chirurgie effectuée sur ce subset.")
|
print("\n[Export] Aucune chirurgie effectuée sur ce subset.")
|
||||||
|
|
@ -801,7 +802,7 @@ def export_results(scores_history, mapping_history, surgery_log, all_months, out
|
||||||
# ─────────────────────────────────────────────
|
# ─────────────────────────────────────────────
|
||||||
# 8. PIPELINE PRINCIPAL
|
# 8. PIPELINE PRINCIPAL
|
||||||
# ─────────────────────────────────────────────
|
# ─────────────────────────────────────────────
|
||||||
def run_pipeline(broken_months_path="alpha_5%/carmignac_broken_months.csv"):
|
def run_pipeline(broken_months_path="repair_challenge/alpha_5%/carmignac_broken_months.csv"):
|
||||||
print("=" * 60)
|
print("=" * 60)
|
||||||
print("CARMIGNAC — Pipeline de réparation des Registrar IDs")
|
print("CARMIGNAC — Pipeline de réparation des Registrar IDs")
|
||||||
print("=" * 60)
|
print("=" * 60)
|
||||||
|
|
@ -864,5 +865,5 @@ def run_pipeline(broken_months_path="alpha_5%/carmignac_broken_months.csv"):
|
||||||
|
|
||||||
if __name__ == "__main__":
|
if __name__ == "__main__":
|
||||||
df_scores, df_mapping, surgery_log, scores_history, mapping_history = run_pipeline(
|
df_scores, df_mapping, surgery_log, scores_history, mapping_history = run_pipeline(
|
||||||
broken_months_path="carmignac_broken_months.csv" # optional
|
broken_months_path="repair_challenge/alpha_5%/carmignac_broken_months.csv" # optional
|
||||||
)
|
)
|
||||||
|
|
|
||||||
12319
repair_challenge/repair_results/carmignac_report.html
Normal file
12319
repair_challenge/repair_results/carmignac_report.html
Normal file
File diff suppressed because one or more lines are too long
Loading…
Reference in New Issue
Block a user