Final version of the cleaning steps

This commit is contained in:
Paco GOZE 2026-04-07 07:52:43 +00:00
parent 53668dd6ee
commit c2efab321b
9 changed files with 12344 additions and 73112 deletions

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

View File

@ -1,131 +0,0 @@
date,residual_agg,stock_error_agg,stock_error_agg_pct
2015-01-31,0.0,148292.6214,0.0363
2015-02-28,-71073.9175,77218.7039,0.0189
2015-03-31,-38212.0374,39006.6664,0.0096
2015-04-30,-93289.8893,-54283.2228,0.0133
2015-05-31,80779.4715,26496.2486,0.0065
2015-06-30,210846.4765,237342.7252,0.0581
2015-07-31,-237021.1601,321.565,0.0001
2015-08-31,52430.1346,52751.6996,0.0129
2015-09-30,115091.9839,167843.6835,0.0411
2015-10-31,362979.7972,530823.4807,0.1301
2015-11-30,89787.3998,620610.8804,0.1521
2015-12-31,-548821.2591,71789.6214,0.0176
2016-01-31,114438.8471,186228.4684,0.0456
2016-02-29,-94160.3412,92068.1273,0.0226
2016-03-31,197803.9351,289872.0624,0.071
2016-04-30,75620.3391,365492.4015,0.0895
2016-05-31,1277.0285,366769.4301,0.0899
2016-06-30,28344.6474,395114.0775,0.0968
2016-07-31,-455791.3682,-60677.2908,0.0149
2016-08-31,118896.6804,58219.3897,0.0143
2016-09-30,-433713.5055,-375494.1158,0.092
2016-10-31,87253.3822,-288240.7336,0.0706
2016-11-30,197373.3618,-90867.3718,0.0223
2016-12-31,299519.7371,208652.3652,0.0511
2017-01-31,-67316.5443,141335.8209,0.0346
2017-02-28,-294805.5376,-153469.7167,0.0376
2017-03-31,-363088.0453,-516557.762,0.1266
2017-04-30,187743.9712,-328813.7908,0.0806
2017-05-31,9241.0742,-319572.7166,0.0783
2017-06-30,-33028.3212,-352601.0378,0.0864
2017-07-31,-218597.21,-571198.2478,0.1399
2017-08-31,273953.3009,-297244.9469,0.0728
2017-09-30,115964.9839,-181279.963,0.0444
2017-10-31,-23621.0098,-204900.9728,0.0502
2017-11-30,95815.8482,-109085.1246,0.0267
2017-12-31,-121404.417,-230489.5416,0.0565
2018-01-31,1075741.0088,845251.4672,0.2071
2018-02-28,390784.0012,1236035.4684,0.3028
2018-03-31,455651.3899,1691686.8583,0.4145
2018-04-30,-374980.7,1316706.1583,0.3226
2018-05-31,209864.386,1526570.5443,0.374
2018-06-30,111872.6628,1638443.207,0.4014
2018-07-31,-158631.7954,1479811.4116,0.3626
2018-08-31,300388.692,1780200.1036,0.4362
2018-09-30,-240137.9416,1540062.1621,0.3773
2018-10-31,-393725.0807,1146337.0813,0.2809
2018-11-30,-217887.9315,928449.1499,0.2275
2018-12-31,577035.3311,1505484.4809,0.3688
2019-01-31,-374117.7166,1131366.7643,0.2772
2019-02-28,214587.8043,1345954.5686,0.3298
2019-03-31,-168958.6343,1176995.9343,0.2884
2019-04-30,459977.1588,1636973.0931,0.4011
2019-05-31,-67477.1479,1569495.9453,0.3845
2019-06-30,-535961.896,1033534.0493,0.2532
2019-07-31,-61034.0471,972500.0021,0.2383
2019-08-31,17046.9276,989546.9297,0.2424
2019-09-30,-72796.0114,916750.9183,0.2246
2019-10-31,809108.5438,1725859.4622,0.4228
2019-11-30,43679.2428,1769538.705,0.4335
2019-12-31,1288397.5385,3057936.2435,0.7492
2020-01-31,-973899.4752,2084036.7683,0.5106
2020-02-29,864992.3304,2949029.0987,0.7225
2020-03-31,-802670.8766,2146358.2221,0.5259
2020-04-30,-245269.422,1901088.8001,0.4658
2020-05-31,-262548.2012,1638540.5989,0.4014
2020-06-30,577667.5612,2216208.1601,0.543
2020-07-31,148208.9581,2364417.1182,0.5793
2020-08-31,-110665.7234,2253751.3948,0.5522
2020-09-30,-93719.2628,2160032.132,0.5292
2020-10-31,360937.203,2520969.335,0.6176
2020-11-30,-209972.8202,2310996.5148,0.5662
2020-12-31,204577.9672,2515574.482,0.6163
2021-01-31,114754.1508,2630328.6328,0.6444
2021-02-28,-369099.3196,2261229.3132,0.554
2021-03-31,285801.7938,2547031.1069,0.624
2021-04-30,-525497.0146,2021534.0923,0.4953
2021-05-31,155798.0056,2177332.098,0.5334
2021-06-30,-596321.249,1581010.849,0.3873
2021-07-31,651201.5417,2232212.3907,0.5469
2021-08-31,196268.6348,2428481.0255,0.595
2021-09-30,-427350.6319,2001130.3936,0.4903
2021-10-31,-612699.7898,1388430.6037,0.3402
2021-11-30,-245857.591,1142573.0127,0.2799
2021-12-31,166235.742,1308808.7547,0.3207
2022-01-31,1057051.561,2365860.3157,0.5796
2022-02-28,247946.452,2613806.7677,0.6404
2022-03-31,375938.62,2989745.3877,0.7325
2022-04-30,-1187760.361,1801985.0267,0.4415
2022-05-31,85351.637,1887336.6637,0.4624
2022-06-30,-1119375.9,767960.7637,0.1882
2022-07-31,472271.327,1240232.0907,0.3039
2022-08-31,-74701.74,1165530.3507,0.2856
2022-09-30,293426.2139,1458956.5646,0.3574
2022-10-31,-116613.2208,1342343.3438,0.3289
2022-11-30,-274268.7771,1068074.5667,0.2617
2022-12-31,117575.457,1185650.0237,0.2905
2023-01-31,435788.475,1621438.4987,0.3973
2023-02-28,144058.704,1765497.2027,0.4325
2023-03-31,18136.618,1783633.8207,0.437
2023-04-30,56736.261,1840370.0817,0.4509
2023-05-31,178492.877,2018862.9587,0.4946
2023-06-30,33137.773,2052000.7317,0.5027
2023-07-31,-931401.398,1120599.3337,0.2745
2023-08-31,-195355.266,925244.0677,0.2267
2023-09-30,160732.769,1085976.8367,0.2661
2023-10-31,59094.801,1145071.6377,0.2805
2023-11-30,118940.301,1264011.9387,0.3097
2023-12-31,70162.1121,1334174.0508,0.3269
2024-01-31,736085.356,2070259.4068,0.5072
2024-02-29,-39545.523,2030713.8838,0.4975
2024-03-31,75590.692,2106304.5758,0.516
2024-04-30,316241.771,2422546.3468,0.5935
2024-05-31,-81607.349,2340938.9978,0.5735
2024-06-30,-252128.4517,2088810.5462,0.5118
2024-07-31,-12000.305,2076810.2412,0.5088
2024-08-31,240549.0254,2317359.2666,0.5678
2024-09-30,-2936336.049,-618976.7824,0.1517
2024-10-31,-533999.4854,-1152976.2678,0.2825
2024-11-30,128929.4967,-1024046.7711,0.2509
2024-12-31,51631.744,-972415.0271,0.2382
2025-01-31,-93297.413,-1065712.4401,0.2611
2025-02-28,352009.6074,-713702.8327,0.1749
2025-03-31,-42189.2219,-755892.0546,0.1852
2025-04-30,38161.8849,-717730.1697,0.1758
2025-05-31,55149.856,-662580.3137,0.1623
2025-06-30,972670.348,310090.0343,0.076
2025-07-31,31815.726,341905.7603,0.0838
2025-08-31,-188855.68,153050.0803,0.0375
2025-09-30,13670.993,166721.0733,0.0408
2025-10-31,-166721.0733,0.0,0.0
1 date residual_agg stock_error_agg stock_error_agg_pct
2 2015-01-31 0.0 148292.6214 0.0363
3 2015-02-28 -71073.9175 77218.7039 0.0189
4 2015-03-31 -38212.0374 39006.6664 0.0096
5 2015-04-30 -93289.8893 -54283.2228 0.0133
6 2015-05-31 80779.4715 26496.2486 0.0065
7 2015-06-30 210846.4765 237342.7252 0.0581
8 2015-07-31 -237021.1601 321.565 0.0001
9 2015-08-31 52430.1346 52751.6996 0.0129
10 2015-09-30 115091.9839 167843.6835 0.0411
11 2015-10-31 362979.7972 530823.4807 0.1301
12 2015-11-30 89787.3998 620610.8804 0.1521
13 2015-12-31 -548821.2591 71789.6214 0.0176
14 2016-01-31 114438.8471 186228.4684 0.0456
15 2016-02-29 -94160.3412 92068.1273 0.0226
16 2016-03-31 197803.9351 289872.0624 0.071
17 2016-04-30 75620.3391 365492.4015 0.0895
18 2016-05-31 1277.0285 366769.4301 0.0899
19 2016-06-30 28344.6474 395114.0775 0.0968
20 2016-07-31 -455791.3682 -60677.2908 0.0149
21 2016-08-31 118896.6804 58219.3897 0.0143
22 2016-09-30 -433713.5055 -375494.1158 0.092
23 2016-10-31 87253.3822 -288240.7336 0.0706
24 2016-11-30 197373.3618 -90867.3718 0.0223
25 2016-12-31 299519.7371 208652.3652 0.0511
26 2017-01-31 -67316.5443 141335.8209 0.0346
27 2017-02-28 -294805.5376 -153469.7167 0.0376
28 2017-03-31 -363088.0453 -516557.762 0.1266
29 2017-04-30 187743.9712 -328813.7908 0.0806
30 2017-05-31 9241.0742 -319572.7166 0.0783
31 2017-06-30 -33028.3212 -352601.0378 0.0864
32 2017-07-31 -218597.21 -571198.2478 0.1399
33 2017-08-31 273953.3009 -297244.9469 0.0728
34 2017-09-30 115964.9839 -181279.963 0.0444
35 2017-10-31 -23621.0098 -204900.9728 0.0502
36 2017-11-30 95815.8482 -109085.1246 0.0267
37 2017-12-31 -121404.417 -230489.5416 0.0565
38 2018-01-31 1075741.0088 845251.4672 0.2071
39 2018-02-28 390784.0012 1236035.4684 0.3028
40 2018-03-31 455651.3899 1691686.8583 0.4145
41 2018-04-30 -374980.7 1316706.1583 0.3226
42 2018-05-31 209864.386 1526570.5443 0.374
43 2018-06-30 111872.6628 1638443.207 0.4014
44 2018-07-31 -158631.7954 1479811.4116 0.3626
45 2018-08-31 300388.692 1780200.1036 0.4362
46 2018-09-30 -240137.9416 1540062.1621 0.3773
47 2018-10-31 -393725.0807 1146337.0813 0.2809
48 2018-11-30 -217887.9315 928449.1499 0.2275
49 2018-12-31 577035.3311 1505484.4809 0.3688
50 2019-01-31 -374117.7166 1131366.7643 0.2772
51 2019-02-28 214587.8043 1345954.5686 0.3298
52 2019-03-31 -168958.6343 1176995.9343 0.2884
53 2019-04-30 459977.1588 1636973.0931 0.4011
54 2019-05-31 -67477.1479 1569495.9453 0.3845
55 2019-06-30 -535961.896 1033534.0493 0.2532
56 2019-07-31 -61034.0471 972500.0021 0.2383
57 2019-08-31 17046.9276 989546.9297 0.2424
58 2019-09-30 -72796.0114 916750.9183 0.2246
59 2019-10-31 809108.5438 1725859.4622 0.4228
60 2019-11-30 43679.2428 1769538.705 0.4335
61 2019-12-31 1288397.5385 3057936.2435 0.7492
62 2020-01-31 -973899.4752 2084036.7683 0.5106
63 2020-02-29 864992.3304 2949029.0987 0.7225
64 2020-03-31 -802670.8766 2146358.2221 0.5259
65 2020-04-30 -245269.422 1901088.8001 0.4658
66 2020-05-31 -262548.2012 1638540.5989 0.4014
67 2020-06-30 577667.5612 2216208.1601 0.543
68 2020-07-31 148208.9581 2364417.1182 0.5793
69 2020-08-31 -110665.7234 2253751.3948 0.5522
70 2020-09-30 -93719.2628 2160032.132 0.5292
71 2020-10-31 360937.203 2520969.335 0.6176
72 2020-11-30 -209972.8202 2310996.5148 0.5662
73 2020-12-31 204577.9672 2515574.482 0.6163
74 2021-01-31 114754.1508 2630328.6328 0.6444
75 2021-02-28 -369099.3196 2261229.3132 0.554
76 2021-03-31 285801.7938 2547031.1069 0.624
77 2021-04-30 -525497.0146 2021534.0923 0.4953
78 2021-05-31 155798.0056 2177332.098 0.5334
79 2021-06-30 -596321.249 1581010.849 0.3873
80 2021-07-31 651201.5417 2232212.3907 0.5469
81 2021-08-31 196268.6348 2428481.0255 0.595
82 2021-09-30 -427350.6319 2001130.3936 0.4903
83 2021-10-31 -612699.7898 1388430.6037 0.3402
84 2021-11-30 -245857.591 1142573.0127 0.2799
85 2021-12-31 166235.742 1308808.7547 0.3207
86 2022-01-31 1057051.561 2365860.3157 0.5796
87 2022-02-28 247946.452 2613806.7677 0.6404
88 2022-03-31 375938.62 2989745.3877 0.7325
89 2022-04-30 -1187760.361 1801985.0267 0.4415
90 2022-05-31 85351.637 1887336.6637 0.4624
91 2022-06-30 -1119375.9 767960.7637 0.1882
92 2022-07-31 472271.327 1240232.0907 0.3039
93 2022-08-31 -74701.74 1165530.3507 0.2856
94 2022-09-30 293426.2139 1458956.5646 0.3574
95 2022-10-31 -116613.2208 1342343.3438 0.3289
96 2022-11-30 -274268.7771 1068074.5667 0.2617
97 2022-12-31 117575.457 1185650.0237 0.2905
98 2023-01-31 435788.475 1621438.4987 0.3973
99 2023-02-28 144058.704 1765497.2027 0.4325
100 2023-03-31 18136.618 1783633.8207 0.437
101 2023-04-30 56736.261 1840370.0817 0.4509
102 2023-05-31 178492.877 2018862.9587 0.4946
103 2023-06-30 33137.773 2052000.7317 0.5027
104 2023-07-31 -931401.398 1120599.3337 0.2745
105 2023-08-31 -195355.266 925244.0677 0.2267
106 2023-09-30 160732.769 1085976.8367 0.2661
107 2023-10-31 59094.801 1145071.6377 0.2805
108 2023-11-30 118940.301 1264011.9387 0.3097
109 2023-12-31 70162.1121 1334174.0508 0.3269
110 2024-01-31 736085.356 2070259.4068 0.5072
111 2024-02-29 -39545.523 2030713.8838 0.4975
112 2024-03-31 75590.692 2106304.5758 0.516
113 2024-04-30 316241.771 2422546.3468 0.5935
114 2024-05-31 -81607.349 2340938.9978 0.5735
115 2024-06-30 -252128.4517 2088810.5462 0.5118
116 2024-07-31 -12000.305 2076810.2412 0.5088
117 2024-08-31 240549.0254 2317359.2666 0.5678
118 2024-09-30 -2936336.049 -618976.7824 0.1517
119 2024-10-31 -533999.4854 -1152976.2678 0.2825
120 2024-11-30 128929.4967 -1024046.7711 0.2509
121 2024-12-31 51631.744 -972415.0271 0.2382
122 2025-01-31 -93297.413 -1065712.4401 0.2611
123 2025-02-28 352009.6074 -713702.8327 0.1749
124 2025-03-31 -42189.2219 -755892.0546 0.1852
125 2025-04-30 38161.8849 -717730.1697 0.1758
126 2025-05-31 55149.856 -662580.3137 0.1623
127 2025-06-30 972670.348 310090.0343 0.076
128 2025-07-31 31815.726 341905.7603 0.0838
129 2025-08-31 -188855.68 153050.0803 0.0375
130 2025-09-30 13670.993 166721.0733 0.0408
131 2025-10-31 -166721.0733 0.0 0.0

View File

@ -24,14 +24,26 @@
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": 2, "execution_count": 3,
"id": "d43b725e", "id": "d43b725e",
"metadata": {}, "metadata": {},
"outputs": [], "outputs": [],
"source": [ "source": [
"push_file('carmignac_broken_months.csv', 'projet-bdc-carmignac-g3//paco/carmignac_broken_months.csv')\n", "push_file('repair_challenge/alpha_5%/carmignac_broken_months.csv', 'projet-bdc-carmignac-g3//paco/carmignac_broken_months.csv')\n",
"push_file('carmignac_error_account_agg.csv', 'projet-bdc-carmignac-g3//paco/carmignac_error_account_agg.csv')\n", "push_file('repair_challenge/alpha_5%/carmignac_error_account_agg.csv', 'projet-bdc-carmignac-g3//paco/carmignac_error_account_agg.csv')\n",
"push_file('carmignac_error_account.csv', 'projet-bdc-carmignac-g3//paco/carmignac_error_account.csv')" "push_file('repair_challenge/alpha_5%/carmignac_error_account.csv', 'projet-bdc-carmignac-g3//paco/carmignac_error_account.csv')\n",
"push_file('AUM_repaired.csv', 'projet-bdc-carmignac-g3//paco/AUM_repaired.csv')"
]
},
{
"cell_type": "code",
"execution_count": 4,
"id": "d9b0290a",
"metadata": {},
"outputs": [],
"source": [
"push_file('AUM_repair_audit.csv', 'projet-bdc-carmignac-g3//paco/AUM_repair_audit.csv')\n",
"push_file('AUM_paths.csv', 'projet-bdc-carmignac-g3//paco/AUM_paths.csv')"
] ]
} }
], ],

View File

@ -1598,7 +1598,8 @@ def main():
html = build_html(analytics, surgery, scores, mapping, html = build_html(analytics, surgery, scores, mapping,
df_err_isin=df_err_isin, df_err_agg=df_err_agg) df_err_isin=df_err_isin, df_err_agg=df_err_agg)
out_path = args.out out_path = "../" + args.out
os.makedirs(os.path.dirname(out_path), exist_ok=True)
with open(out_path, "w", encoding="utf-8") as f: with open(out_path, "w", encoding="utf-8") as f:
f.write(html) f.write(html)
print(f"\n[Report] Written to → {out_path}") print(f"\n[Report] Written to → {out_path}")

View File

@ -61,6 +61,7 @@ def load_broken_months(broken_months_path):
If the file does not exist or is empty, returns two empty sets. If the file does not exist or is empty, returns two empty sets.
""" """
if not broken_months_path or not os.path.exists(broken_months_path): if not broken_months_path or not os.path.exists(broken_months_path):
print("Could not find the path")
return set(), set() return set(), set()
try: try:
df = pd.read_csv(broken_months_path, parse_dates=["date"]) df = pd.read_csv(broken_months_path, parse_dates=["date"])
@ -772,7 +773,7 @@ def export_results(scores_history, mapping_history, surgery_log, all_months, out
df_scores = pd.DataFrame(rows) if rows else pd.DataFrame(columns=['date', 'reg_id', 'score']) df_scores = pd.DataFrame(rows) if rows else pd.DataFrame(columns=['date', 'reg_id', 'score'])
if not df_scores.empty: if not df_scores.empty:
df_scores = df_scores.sort_values(['date', 'score'], ascending=[True, False]) df_scores = df_scores.sort_values(['date', 'score'], ascending=[True, False])
df_scores.to_csv(f"/mnt/user-data/outputs/{out_prefix}_scores.csv", index=False) df_scores.to_csv(f"repair_challenge/repair_results/{out_prefix}_scores.csv", index=False)
# Mapping history # Mapping history
rows_m = [] rows_m = []
@ -783,12 +784,12 @@ def export_results(scores_history, mapping_history, surgery_log, all_months, out
df_mapping = pd.DataFrame(rows_m) if rows_m else pd.DataFrame(columns=['date', 'reg_orig', 'reg_used', 'changed']) df_mapping = pd.DataFrame(rows_m) if rows_m else pd.DataFrame(columns=['date', 'reg_orig', 'reg_used', 'changed'])
if not df_mapping.empty: if not df_mapping.empty:
df_mapping = df_mapping.sort_values(['date', 'reg_orig']) df_mapping = df_mapping.sort_values(['date', 'reg_orig'])
df_mapping.to_csv(f"repair_results/{out_prefix}_mapping.csv", index=False) df_mapping.to_csv(f"repair_challenge/repair_results/{out_prefix}_mapping.csv", index=False)
# Surgery log # Surgery log
if surgery_log: if surgery_log:
df_surgery = pd.DataFrame(surgery_log).sort_values('date') df_surgery = pd.DataFrame(surgery_log).sort_values('date')
df_surgery.to_csv(f"repair_results/{out_prefix}_surgery_log.csv", index=False) df_surgery.to_csv(f"repair_challenge/repair_results/{out_prefix}_surgery_log.csv", index=False)
print(f"\n[Export] {len(surgery_log)} opérations de chirurgie sauvegardées.") print(f"\n[Export] {len(surgery_log)} opérations de chirurgie sauvegardées.")
else: else:
print("\n[Export] Aucune chirurgie effectuée sur ce subset.") print("\n[Export] Aucune chirurgie effectuée sur ce subset.")
@ -801,7 +802,7 @@ def export_results(scores_history, mapping_history, surgery_log, all_months, out
# ───────────────────────────────────────────── # ─────────────────────────────────────────────
# 8. PIPELINE PRINCIPAL # 8. PIPELINE PRINCIPAL
# ───────────────────────────────────────────── # ─────────────────────────────────────────────
def run_pipeline(broken_months_path="alpha_5%/carmignac_broken_months.csv"): def run_pipeline(broken_months_path="repair_challenge/alpha_5%/carmignac_broken_months.csv"):
print("=" * 60) print("=" * 60)
print("CARMIGNAC — Pipeline de réparation des Registrar IDs") print("CARMIGNAC — Pipeline de réparation des Registrar IDs")
print("=" * 60) print("=" * 60)
@ -864,5 +865,5 @@ def run_pipeline(broken_months_path="alpha_5%/carmignac_broken_months.csv"):
if __name__ == "__main__": if __name__ == "__main__":
df_scores, df_mapping, surgery_log, scores_history, mapping_history = run_pipeline( df_scores, df_mapping, surgery_log, scores_history, mapping_history = run_pipeline(
broken_months_path="carmignac_broken_months.csv" # optional broken_months_path="repair_challenge/alpha_5%/carmignac_broken_months.csv" # optional
) )

File diff suppressed because one or more lines are too long