paco-dev #2
3
.gitignore
vendored
Normal file
3
.gitignore
vendored
Normal file
|
|
@ -0,0 +1,3 @@
|
||||||
|
data/
|
||||||
|
data_exploration/
|
||||||
|
*.csv
|
||||||
|
|
@ -1,6 +0,0 @@
|
||||||
{
|
|
||||||
"cells": [],
|
|
||||||
"metadata": {},
|
|
||||||
"nbformat": 4,
|
|
||||||
"nbformat_minor": 5
|
|
||||||
}
|
|
||||||
201
dataloader.ipynb
201
dataloader.ipynb
|
|
@ -1,201 +0,0 @@
|
||||||
{
|
|
||||||
"cells": [
|
|
||||||
{
|
|
||||||
"cell_type": "code",
|
|
||||||
"execution_count": null,
|
|
||||||
"id": "126c8a80-d9ad-4816-84f0-0c3d580f62c8",
|
|
||||||
"metadata": {},
|
|
||||||
"outputs": [],
|
|
||||||
"source": [
|
|
||||||
"import pandas as pd"
|
|
||||||
]
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"cell_type": "code",
|
|
||||||
"execution_count": 1,
|
|
||||||
"id": "ff2261fb-9516-4410-b42d-3acc8dc1a460",
|
|
||||||
"metadata": {},
|
|
||||||
"outputs": [],
|
|
||||||
"source": [
|
|
||||||
"import os\n",
|
|
||||||
"import s3fs\n",
|
|
||||||
"os.environ[\"AWS_ACCESS_KEY_ID\"] = 'N1DBJCHI7YTK9AVMG6XT'\n",
|
|
||||||
"os.environ[\"AWS_SECRET_ACCESS_KEY\"] = 'SRCPMh8a1eQxX6Z09GeDxZoD55MBpnkJzyBctLII'\n",
|
|
||||||
"os.environ[\"AWS_SESSION_TOKEN\"] = 'eyJhbGciOiJIUzUxMiIsInR5cCI6IkpXVCJ9.eyJhY2Nlc3NLZXkiOiJOMURCSkNISTdZVEs5QVZNRzZYVCIsImFjciI6IjAiLCJhbGxvd2VkLW9yaWdpbnMiOlsiKiJdLCJhdWQiOlsibWluaW8iLCJhY2NvdW50Il0sImF1dGhfdGltZSI6MTc2MzEzMTgzNiwiYXpwIjoib255eGlhLW1pbmlvIiwiZW1haWwiOiJzYXJhaC50aG91bXlyZUBlbnNhZS5mciIsImVtYWlsX3ZlcmlmaWVkIjp0cnVlLCJleHAiOjE3NjQzNDE0MzksImZhbWlseV9uYW1lIjoiVEhPVU1ZUkUiLCJnaXZlbl9uYW1lIjoiU2FyYWgiLCJncm91cHMiOlsiYmRjLWRhdGEiLCJiZGMtY2FybWlnbmFjLWczIl0sImlhdCI6MTc2MzEzMTgzOCwiaXNzIjoiaHR0cHM6Ly9hdXRoLmdyb3VwZS1nZW5lcy5mci9yZWFsbXMvZ2VuZXMiLCJqdGkiOiJkY2I2MWJiZi1lZjU4LTRhMTItOGYyZS1jYTI0ZmUyNTA2YzEiLCJuYW1lIjoiU2FyYWggVEhPVU1ZUkUiLCJwb2xpY3kiOiJzdHNvbmx5IiwicHJlZmVycmVkX3VzZXJuYW1lIjoic3Rob3VteXJlLWVuc2FlIiwicmVhbG1fYWNjZXNzIjp7InJvbGVzIjpbIm9mZmxpbmVfYWNjZXNzIiwiZGVmYXVsdC1yb2xlcy1nZW5lcyIsInVtYV9hdXRob3JpemF0aW9uIl19LCJyZXNvdXJjZV9hY2Nlc3MiOnsiYWNjb3VudCI6eyJyb2xlcyI6WyJtYW5hZ2UtYWNjb3VudCIsIm1hbmFnZS1hY2NvdW50LWxpbmtzIiwidmlldy1wcm9maWxlIl19fSwic2NvcGUiOiJvcGVuaWQgcHJvZmlsZSBlbWFpbCIsInNpZCI6ImQxMDI0NGVlLWE3ZDMtNDA5MC04ZDA3LWNlOWY3YjM5MDRkNCIsInN1YiI6ImVhYWVkN2QyLWM4MjYtNGIxNC05MzczLTYwYjNhODhlMWFiNiIsInR5cCI6IkJlYXJlciJ9.sLXOE8w930_dXU0yNWroaDvaTvcUUCONMcbgbKeMEduQebXQjOS7gEQxo-I7Q2oqLFb_dhg1zBlwx5VpNjyTMA'\n",
|
|
||||||
"os.environ[\"AWS_DEFAULT_REGION\"] = 'us-east-1'\n",
|
|
||||||
"fs = s3fs.S3FileSystem(\n",
|
|
||||||
" client_kwargs={'endpoint_url': 'https://'+'minio-simple.lab.groupe-genes.fr'},\n",
|
|
||||||
" key = os.environ[\"AWS_ACCESS_KEY_ID\"], \n",
|
|
||||||
" secret = os.environ[\"AWS_SECRET_ACCESS_KEY\"], \n",
|
|
||||||
" token = os.environ[\"AWS_SESSION_TOKEN\"])"
|
|
||||||
]
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"cell_type": "code",
|
|
||||||
"execution_count": 8,
|
|
||||||
"id": "dc546698-76dc-4eaf-b9e2-7602953bf8f5",
|
|
||||||
"metadata": {},
|
|
||||||
"outputs": [
|
|
||||||
{
|
|
||||||
"data": {
|
|
||||||
"text/html": [
|
|
||||||
"<div>\n",
|
|
||||||
"<style scoped>\n",
|
|
||||||
" .dataframe tbody tr th:only-of-type {\n",
|
|
||||||
" vertical-align: middle;\n",
|
|
||||||
" }\n",
|
|
||||||
"\n",
|
|
||||||
" .dataframe tbody tr th {\n",
|
|
||||||
" vertical-align: top;\n",
|
|
||||||
" }\n",
|
|
||||||
"\n",
|
|
||||||
" .dataframe thead th {\n",
|
|
||||||
" text-align: right;\n",
|
|
||||||
" }\n",
|
|
||||||
"</style>\n",
|
|
||||||
"<table border=\"1\" class=\"dataframe\">\n",
|
|
||||||
" <thead>\n",
|
|
||||||
" <tr style=\"text-align: right;\">\n",
|
|
||||||
" <th></th>\n",
|
|
||||||
" <th>Morningstar Global Asset Class</th>\n",
|
|
||||||
" <th>Morningstar Global Category</th>\n",
|
|
||||||
" <th>Morningstar Category</th>\n",
|
|
||||||
" <th>Combined Country</th>\n",
|
|
||||||
" <th>Combined Channel Type</th>\n",
|
|
||||||
" <th>Combined Type</th>\n",
|
|
||||||
" <th>Month/Year (Record Date)</th>\n",
|
|
||||||
" <th>Combined Net Assets</th>\n",
|
|
||||||
" <th>Combined Net Sales</th>\n",
|
|
||||||
" </tr>\n",
|
|
||||||
" </thead>\n",
|
|
||||||
" <tbody>\n",
|
|
||||||
" <tr>\n",
|
|
||||||
" <th>0</th>\n",
|
|
||||||
" <td>NaN</td>\n",
|
|
||||||
" <td>NaN</td>\n",
|
|
||||||
" <td>NaN</td>\n",
|
|
||||||
" <td>Luxembourg</td>\n",
|
|
||||||
" <td>Proprietary</td>\n",
|
|
||||||
" <td>Domestic</td>\n",
|
|
||||||
" <td>Jan 2015</td>\n",
|
|
||||||
" <td>11170000.0</td>\n",
|
|
||||||
" <td>799301.47</td>\n",
|
|
||||||
" </tr>\n",
|
|
||||||
" <tr>\n",
|
|
||||||
" <th>1</th>\n",
|
|
||||||
" <td>NaN</td>\n",
|
|
||||||
" <td>NaN</td>\n",
|
|
||||||
" <td>NaN</td>\n",
|
|
||||||
" <td>Luxembourg</td>\n",
|
|
||||||
" <td>Proprietary</td>\n",
|
|
||||||
" <td>Domestic</td>\n",
|
|
||||||
" <td>Feb 2015</td>\n",
|
|
||||||
" <td>21210000.0</td>\n",
|
|
||||||
" <td>8922456.46</td>\n",
|
|
||||||
" </tr>\n",
|
|
||||||
" <tr>\n",
|
|
||||||
" <th>2</th>\n",
|
|
||||||
" <td>NaN</td>\n",
|
|
||||||
" <td>NaN</td>\n",
|
|
||||||
" <td>NaN</td>\n",
|
|
||||||
" <td>Luxembourg</td>\n",
|
|
||||||
" <td>Proprietary</td>\n",
|
|
||||||
" <td>Domestic</td>\n",
|
|
||||||
" <td>Mar 2015</td>\n",
|
|
||||||
" <td>23670000.0</td>\n",
|
|
||||||
" <td>1718627.81</td>\n",
|
|
||||||
" </tr>\n",
|
|
||||||
" <tr>\n",
|
|
||||||
" <th>3</th>\n",
|
|
||||||
" <td>NaN</td>\n",
|
|
||||||
" <td>NaN</td>\n",
|
|
||||||
" <td>NaN</td>\n",
|
|
||||||
" <td>Luxembourg</td>\n",
|
|
||||||
" <td>Proprietary</td>\n",
|
|
||||||
" <td>Domestic</td>\n",
|
|
||||||
" <td>Apr 2015</td>\n",
|
|
||||||
" <td>22720000.0</td>\n",
|
|
||||||
" <td>-670097.35</td>\n",
|
|
||||||
" </tr>\n",
|
|
||||||
" <tr>\n",
|
|
||||||
" <th>4</th>\n",
|
|
||||||
" <td>NaN</td>\n",
|
|
||||||
" <td>NaN</td>\n",
|
|
||||||
" <td>NaN</td>\n",
|
|
||||||
" <td>Luxembourg</td>\n",
|
|
||||||
" <td>Proprietary</td>\n",
|
|
||||||
" <td>Domestic</td>\n",
|
|
||||||
" <td>May 2015</td>\n",
|
|
||||||
" <td>23550000.0</td>\n",
|
|
||||||
" <td>204625.93</td>\n",
|
|
||||||
" </tr>\n",
|
|
||||||
" </tbody>\n",
|
|
||||||
"</table>\n",
|
|
||||||
"</div>"
|
|
||||||
],
|
|
||||||
"text/plain": [
|
|
||||||
" Morningstar Global Asset Class Morningstar Global Category \\\n",
|
|
||||||
"0 NaN NaN \n",
|
|
||||||
"1 NaN NaN \n",
|
|
||||||
"2 NaN NaN \n",
|
|
||||||
"3 NaN NaN \n",
|
|
||||||
"4 NaN NaN \n",
|
|
||||||
"\n",
|
|
||||||
" Morningstar Category Combined Country Combined Channel Type Combined Type \\\n",
|
|
||||||
"0 NaN Luxembourg Proprietary Domestic \n",
|
|
||||||
"1 NaN Luxembourg Proprietary Domestic \n",
|
|
||||||
"2 NaN Luxembourg Proprietary Domestic \n",
|
|
||||||
"3 NaN Luxembourg Proprietary Domestic \n",
|
|
||||||
"4 NaN Luxembourg Proprietary Domestic \n",
|
|
||||||
"\n",
|
|
||||||
" Month/Year (Record Date) Combined Net Assets Combined Net Sales \n",
|
|
||||||
"0 Jan 2015 11170000.0 799301.47 \n",
|
|
||||||
"1 Feb 2015 21210000.0 8922456.46 \n",
|
|
||||||
"2 Mar 2015 23670000.0 1718627.81 \n",
|
|
||||||
"3 Apr 2015 22720000.0 -670097.35 \n",
|
|
||||||
"4 May 2015 23550000.0 204625.93 "
|
|
||||||
]
|
|
||||||
},
|
|
||||||
"execution_count": 8,
|
|
||||||
"metadata": {},
|
|
||||||
"output_type": "execute_result"
|
|
||||||
}
|
|
||||||
],
|
|
||||||
"source": [
|
|
||||||
"with fs.open('projet-bdc-data/carmignac/Data Modélisation/market data/broadridge_Global Market data MS.csv', 'rb') as f:\n",
|
|
||||||
" df = pd.read_csv(f)\n",
|
|
||||||
"\n",
|
|
||||||
"df.head()"
|
|
||||||
]
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"cell_type": "code",
|
|
||||||
"execution_count": null,
|
|
||||||
"id": "7494bffd-83b5-42e2-b17e-d04c90f3b59e",
|
|
||||||
"metadata": {},
|
|
||||||
"outputs": [],
|
|
||||||
"source": []
|
|
||||||
}
|
|
||||||
],
|
|
||||||
"metadata": {
|
|
||||||
"kernelspec": {
|
|
||||||
"display_name": "Python 3 (ipykernel)",
|
|
||||||
"language": "python",
|
|
||||||
"name": "python3"
|
|
||||||
},
|
|
||||||
"language_info": {
|
|
||||||
"codemirror_mode": {
|
|
||||||
"name": "ipython",
|
|
||||||
"version": 3
|
|
||||||
},
|
|
||||||
"file_extension": ".py",
|
|
||||||
"mimetype": "text/x-python",
|
|
||||||
"name": "python",
|
|
||||||
"nbconvert_exporter": "python",
|
|
||||||
"pygments_lexer": "ipython3",
|
|
||||||
"version": "3.13.8"
|
|
||||||
}
|
|
||||||
},
|
|
||||||
"nbformat": 4,
|
|
||||||
"nbformat_minor": 5
|
|
||||||
}
|
|
||||||
424822
dataset_features.csv
Normal file
424822
dataset_features.csv
Normal file
File diff suppressed because it is too large
Load Diff
1193
notebooks/aum_flows_analysis.ipynb
Normal file
1193
notebooks/aum_flows_analysis.ipynb
Normal file
File diff suppressed because one or more lines are too long
872
notebooks/competitors_analysis.ipynb
Normal file
872
notebooks/competitors_analysis.ipynb
Normal file
File diff suppressed because one or more lines are too long
2837
notebooks/dataloader.ipynb
Normal file
2837
notebooks/dataloader.ipynb
Normal file
File diff suppressed because one or more lines are too long
70
notebooks/push_s3.ipynb
Normal file
70
notebooks/push_s3.ipynb
Normal file
|
|
@ -0,0 +1,70 @@
|
||||||
|
{
|
||||||
|
"cells": [
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"id": "d2701d07",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"# Helper notebook to allow pushing data on S3"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 4,
|
||||||
|
"id": "5c8fc6c5",
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"import os\n",
|
||||||
|
"import s3fs\n",
|
||||||
|
"\n",
|
||||||
|
"def push_file(local_path, s3_path):\n",
|
||||||
|
" fs = s3fs.S3FileSystem(\n",
|
||||||
|
" client_kwargs={'endpoint_url': 'https://' + 'minio-simple.lab.groupe-genes.fr'},\n",
|
||||||
|
" key=os.environ[\"AWS_ACCESS_KEY_ID\"],\n",
|
||||||
|
" secret=os.environ[\"AWS_SECRET_ACCESS_KEY\"],\n",
|
||||||
|
" token=os.environ[\"AWS_SESSION_TOKEN\"]\n",
|
||||||
|
" )\n",
|
||||||
|
"\n",
|
||||||
|
" with open(local_path, 'rb') as local_f, fs.open(s3_path, 'wb') as s3_f:\n",
|
||||||
|
" s3_f.write(local_f.read())"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": null,
|
||||||
|
"id": "d43b725e",
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"push_file('repair_challenge/alpha_5%/carmignac_broken_months.csv', 'projet-bdc-carmignac-g3//paco/carmignac_broken_months.csv')\n",
|
||||||
|
"push_file('repair_challenge/alpha_5%/carmignac_error_account_agg.csv', 'projet-bdc-carmignac-g3//paco/carmignac_error_account_agg.csv')\n",
|
||||||
|
"push_file('repair_challenge/alpha_5%/carmignac_error_account.csv', 'projet-bdc-carmignac-g3//paco/carmignac_error_account.csv')\n",
|
||||||
|
"push_file('AUM_repaired.csv', 'projet-bdc-carmignac-g3//paco/AUM_repaired.csv')\n",
|
||||||
|
"push_file('AUM_paths.csv', 'projet-bdc-carmignac-g3//paco/AUM_paths.csv')\n",
|
||||||
|
"push_file('AUM_repair_audit.csv', 'projet-bdc-carmignac-g3//paco/AUM_repair_audit.csv')"
|
||||||
|
]
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"metadata": {
|
||||||
|
"kernelspec": {
|
||||||
|
"display_name": "Python 3",
|
||||||
|
"language": "python",
|
||||||
|
"name": "python3"
|
||||||
|
},
|
||||||
|
"language_info": {
|
||||||
|
"codemirror_mode": {
|
||||||
|
"name": "ipython",
|
||||||
|
"version": 3
|
||||||
|
},
|
||||||
|
"file_extension": ".py",
|
||||||
|
"mimetype": "text/x-python",
|
||||||
|
"name": "python",
|
||||||
|
"nbconvert_exporter": "python",
|
||||||
|
"pygments_lexer": "ipython3",
|
||||||
|
"version": "3.13.12"
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"nbformat": 4,
|
||||||
|
"nbformat_minor": 5
|
||||||
|
}
|
||||||
28
peers_summary.csv
Normal file
28
peers_summary.csv
Normal file
|
|
@ -0,0 +1,28 @@
|
||||||
|
strategy,n_carmignac_sc,n_competitors,n_index_funds,ms_categories,broad_category
|
||||||
|
CAD,2,27,2,"EAA Fund Asia ex-Japan Equity, EAA Fund Asia ex-Japan Small/Mid-Cap Equity, EAA Fund Asia-Pacific Equity, EAA Fund Asia-Pacific ex-Japan Equity, EAA Fund Global Emerging Markets ex-China Equity",Equity
|
||||||
|
CARE,2,22,0,"EAA Fund Equity Market Neutral EUR, EAA Fund Long/Short Equity - Global, EAA Fund Long/Short Equity - Europe, EAA Fund Macro Trading EUR",Alternative
|
||||||
|
CCNE,1,28,0,"EAA Fund Greater China Equity, EAA Fund China Equity, EAA Fund China Equity - A Shares",Equity
|
||||||
|
CCR,3,36,1,"EAA Fund EUR Corporate Bond, EAA Fund EUR Flexible Bond, EAA Fund Global Flexible Bond - EUR Hedged, EAA Fund EUR High Yield Bond, EAA Fund Global Corporate Bond - EUR Hedged",Fixed Income
|
||||||
|
CEMD,1,34,0,"EAA Fund Global Emerging Markets Bond, EAA Fund Global Emerging Markets Bond - EUR Hedged, EAA Fund Other Bond, EAA Fund Global Emerging Markets Bond - Local Currency, Global Emerging Markets Bond, Global Emerging Markets Bond - EUR Hedged, Global Emerging Markets Bond - Local Currency",Fixed Income
|
||||||
|
CEMP,2,11,0,"EAA Fund Global Emerging Markets Allocation, EAA Fund Other Allocation, EAA Fund Asia Allocation, EAA Fund Greater China Allocation, Global Emerging Markets Allocation",Allocation
|
||||||
|
CE,3,40,1,"EAA Fund Global Emerging Markets Equity, Global Emerging Markets Equity",Equity
|
||||||
|
CFB,2,20,1,"EAA Fund EUR Flexible Bond, EAA Fund EUR Diversified Bond, EAA Fund Global Flexible Bond - EUR Hedged, EAA Fund Global Diversified Bond - EUR Hedged, EUR Flexible Bond",Fixed Income
|
||||||
|
CFG,1,10,0,"EAA Fund Europe ex-UK Small/Mid-Cap Equity, EAA Fund Europe Flex-Cap Equity, EAA Fund Europe Mid-Cap Equity, EAA Fund Europe Small-Cap Equity, EAA Fund Eurozone Large-Cap Equity, EAA Fund Eurozone Mid-Cap Equity, EAA Fund Global Flex-Cap Equity, EAA Fund Global Large-Cap Growth Equity",Equity
|
||||||
|
CGB,2,35,2,"EAA Fund Global Diversified Bond, EAA Fund Global Flexible Bond - EUR Hedged, Global Diversified Bond, EAA Fund Global Flexible Bond, EAA Fund Other Bond, EAA Fund EUR Diversified Bond - Short Term, EAA Fund EUR Flexible Bond, EAA Fund Global Government Bond, EAA Fund Global Corporate Bond - EUR Hedged, EAA Fund Global Diversified Bond - EUR Hedged, EAA Fund Global Government Bond - EUR Hedged",Fixed Income
|
||||||
|
CGC,1,22,0,"EAA Fund Global Large-Cap Growth Equity, EAA Fund Other Equity, EAA Fund Global Large-Cap Blend Equity",Equity
|
||||||
|
CGE,2,52,0,"EAA Fund Europe Large-Cap Blend Equity, EAA Fund Europe Large-Cap Growth Equity, EAA Fund Europe Large-Cap Value Equity, EAA Fund Eurozone Large-Cap Equity, EAA Fund Europe Flex-Cap Equity, EAA Fund Europe Equity Income, Europe Large-Cap Growth Equity",Equity
|
||||||
|
CHX,1,10,0,"EAA Fund Europe Large-Cap Blend Equity, EAA Fund Europe Mid-Cap Equity, EAA Fund Eurozone Flex-Cap Equity, EAA Fund Eurozone Large-Cap Equity, EAA Fund Global Large-Cap Blend Equity, EAA Fund Global Large-Cap Growth Equity, EAA Fund Other Equity, EAA Fund Sector Equity Consumer Goods & Services, EAA Fund Sector Equity Ecology",Equity
|
||||||
|
CIL,2,12,0,"EAA Fund EUR Flexible Allocation - Global, EAA Fund EUR Flexible Allocation, EAA Fund EUR Moderate Allocation - Global, EAA Fund EUR Cautious Allocation - Global, EUR Flexible Allocation - Global",Allocation
|
||||||
|
CI,3,28,0,"EAA Fund Global Large-Cap Growth Equity, EAA Fund Global Large-Cap Value Equity, EAA Fund Global Large-Cap Blend Equity, EAA Fund Other Equity, EAA Fund Global Equity Income, EAA Fund Global Flex-Cap Equity, EAA Fund Europe Flex-Cap Equity",Equity
|
||||||
|
CMAP,1,21,0,"EAA Fund Event Driven, EAA Fund Relative Value Arbitrage",Alternative
|
||||||
|
CMA,1,4,0,EAA Fund Event Driven,Alternative
|
||||||
|
CPE,2,19,0,"EAA Fund EUR Moderate Allocation, EAA Fund EUR Cautious Allocation, EAA Fund EUR Flexible Allocation, EAA Fund EUR Aggressive Allocation, EAA Fund EUR Moderate Allocation - Global, EUR Moderate Allocation",Allocation
|
||||||
|
CPI,2,18,0,"EAA Fund EUR Flexible Allocation - Global, EAA Fund EUR Moderate Allocation - Global, EAA Fund EUR Flexible Allocation, EAA Fund EUR Cautious Allocation - Global, EAA Fund Other Allocation, EAA Fund USD Moderate Allocation, EAA Fund EUR Cautious Allocation, EAA Fund Macro Trading EUR, EAA Fund GBP Flexible Allocation, EAA Fund Global Inflation-Linked Bond - EUR Hedged, EAA Fund Commodities - Broad Basket",Allocation
|
||||||
|
CP,2,34,0,"EAA Fund EUR Moderate Allocation - Global, EAA Fund USD Moderate Allocation, EAA Fund EUR Flexible Allocation - Global, EAA Fund EUR Cautious Allocation - Global, EAA Fund EUR Aggressive Allocation - Global, EAA Fund EUR Cautious Allocation, EAA Fund EUR Flexible Allocation, EAA Fund EUR Diversified Bond, EAA Fund EUR Moderate Allocation, EUR Moderate Allocation - Global",Allocation
|
||||||
|
CS,2,27,2,"EAA Fund EUR Diversified Bond - Short Term, EAA Fund EUR Government Bond - Short Term, EAA Fund Global Flexible Bond - EUR Hedged, EAA Fund EUR Ultra Short-Term Bond, EAA Fund EUR Flexible Bond, EAA Fund EUR Corporate Bond - Short Term, EAA Fund EUR Diversified Bond, EAA Fund EUR Corporate Bond",Fixed Income
|
||||||
|
CTS,2,24,0,"EAA Fund Sector Equity Technology, EAA Fund US Flex-Cap Equity, Sector Equity Technology",Equity
|
||||||
|
PLSEE,2,27,0,"EAA Fund Long/Short Equity - Global, EAA Fund Equity Market Neutral EUR, EAA Fund Long/Short Equity - Europe, EAA Fund Long/Short Equity - Other, EAA Fund Europe Large-Cap Blend Equity",Equity
|
||||||
|
UKCEL,2,27,0,"EAA Fund Europe ex-UK Equity, EAA Fund Europe ex-UK Small/Mid-Cap Equity, EAA Fund Other Equity, EAA Fund Europe Large-Cap Blend Equity",Equity
|
||||||
|
UKCE,2,21,0,EAA Fund Global Emerging Markets Equity,Equity
|
||||||
|
UKCGB,5,26,0,"EAA Fund Global Flexible Bond - GBP Hedged, EAA Fund Global Flexible Bond, EAA Fund Global Diversified Bond, EAA Fund Global Diversified Bond - GBP Hedged, EAA Fund GBP Allocation 0-20% Equity",Fixed Income
|
||||||
|
UKCGEC,3,17,0,"EAA Fund Global Large-Cap Growth Equity, EAA Fund Global Large-Cap Blend Equity",Equity
|
||||||
|
199946
relative_performance.csv
Normal file
199946
relative_performance.csv
Normal file
File diff suppressed because it is too large
Load Diff
BIN
src/__pycache__/feature_engineering.cpython-313.pyc
Normal file
BIN
src/__pycache__/feature_engineering.cpython-313.pyc
Normal file
Binary file not shown.
BIN
src/__pycache__/peers_loader.cpython-313.pyc
Normal file
BIN
src/__pycache__/peers_loader.cpython-313.pyc
Normal file
Binary file not shown.
BIN
src/__pycache__/predictive_model.cpython-313.pyc
Normal file
BIN
src/__pycache__/predictive_model.cpython-313.pyc
Normal file
Binary file not shown.
BIN
src/__pycache__/relative_performance.cpython-313.pyc
Normal file
BIN
src/__pycache__/relative_performance.cpython-313.pyc
Normal file
Binary file not shown.
328
src/repair_challenge/carmignac_analysis.py
Normal file
328
src/repair_challenge/carmignac_analysis.py
Normal file
|
|
@ -0,0 +1,328 @@
|
||||||
|
"""
|
||||||
|
Pipeline Results Analysis
|
||||||
|
=====================================================
|
||||||
|
Analyses the CSV outputs produced by carmignac_repair.py:
|
||||||
|
- carmignac_scores.csv (post-surgery score history)
|
||||||
|
- carmignac_mapping.csv (reg_id mapping history)
|
||||||
|
- carmignac_surgery_log.csv (surgery operations)
|
||||||
|
|
||||||
|
Produces a self-contained HTML report with interactive charts.
|
||||||
|
|
||||||
|
Usage:
|
||||||
|
python carmignac_analysis.py
|
||||||
|
python carmignac_analysis.py --scores path/to/scores.csv \
|
||||||
|
--mapping path/to/mapping.csv \
|
||||||
|
--surgery path/to/surgery_log.csv \
|
||||||
|
--out report.html
|
||||||
|
"""
|
||||||
|
|
||||||
|
import argparse
|
||||||
|
import os
|
||||||
|
import sys
|
||||||
|
import numpy as np
|
||||||
|
import pandas as pd
|
||||||
|
|
||||||
|
from helpers import build_html_repair
|
||||||
|
|
||||||
|
# ─────────────────────────────────────────────────────────────
|
||||||
|
# 1. LOAD & VALIDATE
|
||||||
|
# ─────────────────────────────────────────────────────────────
|
||||||
|
|
||||||
|
|
||||||
|
def load_outputs(
|
||||||
|
scores_path, mapping_path, surgery_path, err_isin_path=None, err_agg_path=None
|
||||||
|
):
|
||||||
|
scores = pd.read_csv(scores_path, parse_dates=["date"])
|
||||||
|
mapping = pd.read_csv(mapping_path, parse_dates=["date"])
|
||||||
|
surgery = pd.read_csv(surgery_path, parse_dates=["date"])
|
||||||
|
|
||||||
|
# Normalise dtypes
|
||||||
|
scores["reg_id"] = scores["reg_id"].astype(str)
|
||||||
|
mapping["reg_orig"] = mapping["reg_orig"].astype(str)
|
||||||
|
mapping["reg_used"] = mapping["reg_used"].astype(str)
|
||||||
|
mapping["changed"] = mapping["changed"].astype(bool)
|
||||||
|
surgery["reg_orig"] = surgery["reg_orig"].astype(str)
|
||||||
|
surgery["reg_from"] = surgery["reg_from"].astype(str)
|
||||||
|
surgery["reg_to"] = surgery["reg_to"].astype(str)
|
||||||
|
if "lookback_months" not in surgery.columns:
|
||||||
|
surgery["lookback_months"] = 1 # backwards compat
|
||||||
|
|
||||||
|
# Error account (optional)
|
||||||
|
err_isin = None
|
||||||
|
err_agg = None
|
||||||
|
if err_isin_path and os.path.exists(err_isin_path):
|
||||||
|
err_isin = pd.read_csv(err_isin_path, parse_dates=["date"])
|
||||||
|
err_isin["isin"] = err_isin["isin"].astype(str)
|
||||||
|
if err_agg_path and os.path.exists(err_agg_path):
|
||||||
|
err_agg = pd.read_csv(err_agg_path, parse_dates=["date"])
|
||||||
|
|
||||||
|
return scores, mapping, surgery, err_isin, err_agg
|
||||||
|
|
||||||
|
|
||||||
|
# ─────────────────────────────────────────────────────────────
|
||||||
|
# LOAD ERROR ACCOUNT (optional)
|
||||||
|
# ─────────────────────────────────────────────────────────────
|
||||||
|
|
||||||
|
|
||||||
|
def load_error_account(isin_path, agg_path):
|
||||||
|
"""
|
||||||
|
Loads the error account CSVs produced by carmignac_diagnostics.py.
|
||||||
|
Returns (df_err_isin, df_err_agg) or (None, None) if files not found.
|
||||||
|
"""
|
||||||
|
if not isin_path or not agg_path:
|
||||||
|
return None, None
|
||||||
|
try:
|
||||||
|
ei = pd.read_csv(isin_path, parse_dates=["date"])
|
||||||
|
ea = pd.read_csv(agg_path, parse_dates=["date"])
|
||||||
|
ei["isin"] = ei["isin"].astype(str)
|
||||||
|
print(
|
||||||
|
f"[Load] error account (ISIN) : {len(ei)} rows, "
|
||||||
|
f"{ei['isin'].nunique()} ISINs"
|
||||||
|
)
|
||||||
|
print(f"[Load] error account (agg) : {len(ea)} rows")
|
||||||
|
return ei, ea
|
||||||
|
except Exception as e:
|
||||||
|
print(f"[WARN] Could not load error account: {e}")
|
||||||
|
return None, None
|
||||||
|
|
||||||
|
|
||||||
|
# ─────────────────────────────────────────────────────────────
|
||||||
|
# 2. COMPUTE ANALYTICS
|
||||||
|
# ─────────────────────────────────────────────────────────────
|
||||||
|
|
||||||
|
|
||||||
|
def compute_analytics(scores, mapping, surgery):
|
||||||
|
dates = sorted(scores["date"].unique())
|
||||||
|
|
||||||
|
# ── 2.1 Sum of scores per date (post-surgery) ──────────────
|
||||||
|
sum_post = scores.groupby("date")["score"].sum().reindex(dates).rename("sum_post")
|
||||||
|
|
||||||
|
# ── 2.2 Reconstruct pre-surgery (counterfactual) ───────────
|
||||||
|
# Without surgery, every reg_id that had a hard break would score 0
|
||||||
|
# from that date backwards. We propagate the surgery "gain" as a
|
||||||
|
# cumulative deficit going back in time.
|
||||||
|
gain_by_date = surgery.groupby("date")["gain_vs_no_surgery"].sum()
|
||||||
|
# cumulative deficit = sum of gains for all surgeries at or after date t
|
||||||
|
cumulative_deficit = pd.Series(0.0, index=dates)
|
||||||
|
for d in dates:
|
||||||
|
cumulative_deficit[d] = gain_by_date[gain_by_date.index >= d].sum()
|
||||||
|
sum_pre = (sum_post - cumulative_deficit).clip(lower=0).rename("sum_pre")
|
||||||
|
|
||||||
|
timeline = pd.DataFrame({"sum_post": sum_post, "sum_pre": sum_pre})
|
||||||
|
timeline.index = pd.to_datetime(timeline.index)
|
||||||
|
timeline["recovery_pct"] = np.where(
|
||||||
|
sum_pre < sum_post,
|
||||||
|
(sum_post - sum_pre) / sum_post.clip(lower=1e-9) * 100,
|
||||||
|
0.0,
|
||||||
|
)
|
||||||
|
|
||||||
|
# ── 2.3 Per-date surgery stats ─────────────────────────────
|
||||||
|
surgery_stats = (
|
||||||
|
surgery.groupby("date")
|
||||||
|
.agg(
|
||||||
|
n_surgeries=("reg_orig", "count"),
|
||||||
|
total_gain=("gain_vs_no_surgery", "sum"),
|
||||||
|
avg_gain=("gain_vs_no_surgery", "mean"),
|
||||||
|
avg_jaccard=("jaccard_composite", "mean"),
|
||||||
|
avg_score_before=("score_before", "mean"),
|
||||||
|
avg_score_after=("score_after", "mean"),
|
||||||
|
)
|
||||||
|
.reindex(dates, fill_value=0)
|
||||||
|
)
|
||||||
|
|
||||||
|
# ── 2.4 Score distribution over time ───────────────────────
|
||||||
|
# Wide format: rows=dates, cols=reg_ids
|
||||||
|
pivot = scores.pivot_table(
|
||||||
|
index="date", columns="reg_id", values="score", aggfunc="last"
|
||||||
|
)
|
||||||
|
pivot = pivot.reindex(dates)
|
||||||
|
pivot.index = pd.to_datetime(pivot.index)
|
||||||
|
|
||||||
|
# ── 2.5 Mapping churn ──────────────────────────────────────
|
||||||
|
# For each date, how many reg_ids are remapped (not using their original code)?
|
||||||
|
churn = (
|
||||||
|
mapping.groupby("date")["changed"]
|
||||||
|
.sum()
|
||||||
|
.reindex(dates, fill_value=0)
|
||||||
|
.rename("n_remapped")
|
||||||
|
)
|
||||||
|
|
||||||
|
# ── 2.6 Score entropy (distribution spread) ────────────────
|
||||||
|
def entropy(row):
|
||||||
|
p = row.dropna()
|
||||||
|
p = p[p > 0]
|
||||||
|
if len(p) == 0:
|
||||||
|
return np.nan
|
||||||
|
p = p / p.sum()
|
||||||
|
return -(p * np.log(p)).sum()
|
||||||
|
|
||||||
|
timeline["entropy"] = pivot.apply(entropy, axis=1).values
|
||||||
|
|
||||||
|
# ── 2.7 Individual score trajectories ──────────────────────
|
||||||
|
# Identify which reg_ids were ever remapped
|
||||||
|
ever_remapped = set(mapping.loc[mapping["changed"], "reg_orig"].unique())
|
||||||
|
|
||||||
|
# ── 2.8 Surgery detail table ───────────────────────────────
|
||||||
|
surgery_detail = surgery.copy()
|
||||||
|
surgery_detail["gain_pct_of_score"] = (
|
||||||
|
surgery_detail["gain_vs_no_surgery"]
|
||||||
|
/ surgery_detail["score_before"].clip(lower=1e-9)
|
||||||
|
* 100
|
||||||
|
).round(2)
|
||||||
|
|
||||||
|
return {
|
||||||
|
"timeline": timeline,
|
||||||
|
"surgery_stats": surgery_stats,
|
||||||
|
"pivot": pivot,
|
||||||
|
"churn": churn,
|
||||||
|
"ever_remapped": ever_remapped,
|
||||||
|
"surgery_detail": surgery_detail,
|
||||||
|
"dates": [d.strftime("%Y-%m-%d") for d in dates],
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
# ─────────────────────────────────────────────────────────────
|
||||||
|
# 3. PRINT CONSOLE SUMMARY
|
||||||
|
# ─────────────────────────────────────────────────────────────
|
||||||
|
|
||||||
|
|
||||||
|
def print_summary(analytics, surgery):
|
||||||
|
tl = analytics["timeline"]
|
||||||
|
ss = analytics["surgery_stats"]
|
||||||
|
|
||||||
|
print("\n" + "=" * 65)
|
||||||
|
print(" CARMIGNAC PIPELINE — RESULTS SUMMARY")
|
||||||
|
print("=" * 65)
|
||||||
|
|
||||||
|
print(f"\n Date range : {tl.index.min().date()} → {tl.index.max().date()}")
|
||||||
|
print(f" Total months : {len(tl)}")
|
||||||
|
print(f" Reg IDs : {analytics['pivot'].shape[1]}")
|
||||||
|
|
||||||
|
print("\n ── Score (Σ) ──────────────────────────────────────────")
|
||||||
|
print(f" At t_ref (latest) : {tl['sum_post'].iloc[-1]:.6f}")
|
||||||
|
print(f" At t_min (earliest): {tl['sum_post'].iloc[0]:.6f}")
|
||||||
|
print(
|
||||||
|
f" Min (post-surgery) : {tl['sum_post'].min():.6f} "
|
||||||
|
f"({tl['sum_post'].idxmin().date()})"
|
||||||
|
)
|
||||||
|
print(
|
||||||
|
f" Min (pre-surgery) : {tl['sum_pre'].min():.6f} "
|
||||||
|
f"({tl['sum_pre'].idxmin().date()})"
|
||||||
|
)
|
||||||
|
print(f" Max recovery (pct) : {tl['recovery_pct'].max():.2f}%")
|
||||||
|
|
||||||
|
print("\n ── Surgeries ─────────────────────────────────────────")
|
||||||
|
if len(surgery) == 0:
|
||||||
|
print(" No surgeries performed.")
|
||||||
|
else:
|
||||||
|
print(f" Total operations : {len(surgery)}")
|
||||||
|
print(f" Total score gained : {surgery['gain_vs_no_surgery'].sum():.6f}")
|
||||||
|
print(f" Avg Jaccard : {surgery['jaccard_composite'].mean():.4f}")
|
||||||
|
print(f" Avg gain / surgery : {surgery['gain_vs_no_surgery'].mean():.6f}")
|
||||||
|
print()
|
||||||
|
print(
|
||||||
|
f" {'Date':12s} {'Reg orig':12s} {'From':15s} {'To':15s} "
|
||||||
|
f"{'Jaccard':>8s} {'Gain':>10s}"
|
||||||
|
)
|
||||||
|
print(" " + "-" * 78)
|
||||||
|
for _, row in surgery.sort_values("date").iterrows():
|
||||||
|
print(
|
||||||
|
f" {str(row['date'].date()):12s} {row['reg_orig']:12s} "
|
||||||
|
f"{row['reg_from']:15s} {row['reg_to']:15s} "
|
||||||
|
f"{row['jaccard_composite']:8.4f} {row['gain_vs_no_surgery']:10.6f}"
|
||||||
|
)
|
||||||
|
|
||||||
|
print("\n ── Mapping churn ─────────────────────────────────────")
|
||||||
|
ch = analytics["churn"]
|
||||||
|
print(
|
||||||
|
f" Max remapped at one date : {int(ch.max())} ({ch.idxmax().date() if ch.max() > 0 else 'N/A'})"
|
||||||
|
)
|
||||||
|
print(f" Reg IDs ever remapped : {len(analytics['ever_remapped'])}")
|
||||||
|
|
||||||
|
print("\n ── Score entropy (distribution spread) ───────────────")
|
||||||
|
ent = analytics["timeline"]["entropy"]
|
||||||
|
print(f" Mean entropy : {ent.mean():.4f}")
|
||||||
|
print(f" Std entropy : {ent.std():.4f}")
|
||||||
|
print()
|
||||||
|
|
||||||
|
|
||||||
|
# ─────────────────────────────────────────────────────────────
|
||||||
|
# MAIN
|
||||||
|
# ─────────────────────────────────────────────────────────────
|
||||||
|
|
||||||
|
|
||||||
|
def main():
|
||||||
|
parser = argparse.ArgumentParser(description="Carmignac pipeline results analyser")
|
||||||
|
parser.add_argument("--scores", default="repair_results/carmignac_scores.csv")
|
||||||
|
parser.add_argument("--mapping", default="repair_results/carmignac_mapping.csv")
|
||||||
|
parser.add_argument("--surgery", default="repair_results/carmignac_surgery_log.csv")
|
||||||
|
parser.add_argument("--out", default="repair_results/carmignac_report.html")
|
||||||
|
parser.add_argument(
|
||||||
|
"--error-account-isin",
|
||||||
|
default=None,
|
||||||
|
dest="error_isin",
|
||||||
|
help="Path to carmignac_error_account.csv (optional)",
|
||||||
|
)
|
||||||
|
parser.add_argument(
|
||||||
|
"--error-account-agg",
|
||||||
|
default=None,
|
||||||
|
dest="error_agg",
|
||||||
|
help="Path to carmignac_error_account_agg.csv (optional)",
|
||||||
|
)
|
||||||
|
args = parser.parse_args()
|
||||||
|
|
||||||
|
# Resolve paths relative to this script's directory if files not found
|
||||||
|
base = os.path.dirname(os.path.abspath(__file__))
|
||||||
|
|
||||||
|
def resolve(p, required=True):
|
||||||
|
if p is None:
|
||||||
|
return None
|
||||||
|
if os.path.exists(p):
|
||||||
|
return p
|
||||||
|
alt = os.path.join(base, p)
|
||||||
|
if os.path.exists(alt):
|
||||||
|
return alt
|
||||||
|
if required:
|
||||||
|
sys.exit(f"[ERROR] File not found: {p}")
|
||||||
|
print(f"[WARN] Optional file not found: {p}")
|
||||||
|
return None
|
||||||
|
|
||||||
|
scores_path = resolve(args.scores)
|
||||||
|
mapping_path = resolve(args.mapping)
|
||||||
|
surgery_path = resolve(args.surgery)
|
||||||
|
error_isin_path = resolve(args.error_isin, required=False)
|
||||||
|
error_agg_path = resolve(args.error_agg, required=False)
|
||||||
|
|
||||||
|
print(f"[Load] scores : {scores_path}")
|
||||||
|
print(f"[Load] mapping : {mapping_path}")
|
||||||
|
print(f"[Load] surgery : {surgery_path}")
|
||||||
|
|
||||||
|
scores, mapping, surgery, df_err_isin, df_err_agg = load_outputs(
|
||||||
|
scores_path,
|
||||||
|
mapping_path,
|
||||||
|
surgery_path,
|
||||||
|
err_isin_path=error_isin_path,
|
||||||
|
err_agg_path=error_agg_path,
|
||||||
|
)
|
||||||
|
analytics = compute_analytics(scores, mapping, surgery)
|
||||||
|
|
||||||
|
print_summary(analytics, surgery)
|
||||||
|
|
||||||
|
html = build_html_repair(
|
||||||
|
analytics,
|
||||||
|
surgery,
|
||||||
|
scores,
|
||||||
|
mapping,
|
||||||
|
df_err_isin=df_err_isin,
|
||||||
|
df_err_agg=df_err_agg,
|
||||||
|
)
|
||||||
|
|
||||||
|
out_path = "../" + args.out
|
||||||
|
os.makedirs(os.path.dirname(out_path), exist_ok=True)
|
||||||
|
with open(out_path, "w", encoding="utf-8") as f:
|
||||||
|
f.write(html)
|
||||||
|
print(f"\n[Report] Written to → {out_path}")
|
||||||
|
|
||||||
|
|
||||||
|
if __name__ == "__main__":
|
||||||
|
main()
|
||||||
410
src/repair_challenge/carmignac_branch.py
Normal file
410
src/repair_challenge/carmignac_branch.py
Normal file
|
|
@ -0,0 +1,410 @@
|
||||||
|
"""
|
||||||
|
AUM Branching / Repair
|
||||||
|
==================================================
|
||||||
|
Takes as input:
|
||||||
|
- The original AUM file (pre-repair)
|
||||||
|
- The mapping CSV produced by carmignac_repair.py
|
||||||
|
- (Optionally) the surgery log, for audit annotation
|
||||||
|
|
||||||
|
Produces:
|
||||||
|
- A repaired AUM file where every Registrar Account ID is replaced
|
||||||
|
by its canonical identity (reg_orig) as determined by the pipeline.
|
||||||
|
|
||||||
|
Core logic
|
||||||
|
----------
|
||||||
|
The mapping table encodes, for every (date, reg_orig) pair, which
|
||||||
|
physical code (reg_used) was actually present in the data at that date.
|
||||||
|
|
||||||
|
reg_orig = the stable canonical identity (output label)
|
||||||
|
reg_used = the code that appeared in the raw data at that date
|
||||||
|
|
||||||
|
For rows where reg_used != reg_orig (changed=True), the raw code is a
|
||||||
|
historical alias that the surgery pass identified as belonging to
|
||||||
|
reg_orig. The repair simply relabels those rows.
|
||||||
|
|
||||||
|
For accounts not in the repair universe (below the AUM threshold, or
|
||||||
|
excluded categories), rows are passed through unchanged.
|
||||||
|
|
||||||
|
Self-mapped surgeries (reg_from == reg_to in the surgery log) do not
|
||||||
|
require any relabelling — they signal a data quality issue on that
|
||||||
|
month, not a code change.
|
||||||
|
|
||||||
|
Usage
|
||||||
|
-----
|
||||||
|
python carmignac_branch.py # default paths
|
||||||
|
python carmignac_branch.py \\
|
||||||
|
--aum raw_AUM.csv \\
|
||||||
|
--mapping carmignac_mapping.csv \\
|
||||||
|
--surgery carmignac_surgery_log.csv \\
|
||||||
|
--out AUM_repaired.csv
|
||||||
|
"""
|
||||||
|
|
||||||
|
import argparse
|
||||||
|
import os
|
||||||
|
import sys
|
||||||
|
import pandas as pd
|
||||||
|
|
||||||
|
from helpers import load_inputs_branch
|
||||||
|
|
||||||
|
|
||||||
|
# ─────────────────────────────────────────────────────────────
|
||||||
|
# BUILD RENAME LOOKUP
|
||||||
|
# ─────────────────────────────────────────────────────────────
|
||||||
|
|
||||||
|
|
||||||
|
def build_rename_lookup(mapping):
|
||||||
|
"""
|
||||||
|
Returns a dict {(date, reg_used) -> reg_orig}
|
||||||
|
restricted to rows where reg_used != reg_orig (actual changes).
|
||||||
|
|
||||||
|
For self-mapped surgeries or stable accounts, no entry is needed.
|
||||||
|
"""
|
||||||
|
changed = mapping[mapping["changed"] & (mapping["reg_orig"] != mapping["reg_used"])]
|
||||||
|
|
||||||
|
lookup = {}
|
||||||
|
for _, row in changed.iterrows():
|
||||||
|
key = (row["date"], row["reg_used"])
|
||||||
|
if key in lookup and lookup[key] != row["reg_orig"]:
|
||||||
|
print(
|
||||||
|
f" [WARN] Conflicting mapping at {row['date'].date()} "
|
||||||
|
f"reg_used={row['reg_used']}: "
|
||||||
|
f"{lookup[key]} vs {row['reg_orig']} — keeping first"
|
||||||
|
)
|
||||||
|
else:
|
||||||
|
lookup[key] = row["reg_orig"]
|
||||||
|
|
||||||
|
return lookup
|
||||||
|
|
||||||
|
|
||||||
|
# ─────────────────────────────────────────────────────────────
|
||||||
|
# BRANCHING
|
||||||
|
# ─────────────────────────────────────────────────────────────
|
||||||
|
|
||||||
|
|
||||||
|
def apply_branching(aum, lookup):
|
||||||
|
"""
|
||||||
|
Renames Registrar Account - ID in the AUM dataframe according to
|
||||||
|
the lookup {(date, reg_used) -> reg_orig}.
|
||||||
|
|
||||||
|
Rows not in the lookup are left untouched.
|
||||||
|
|
||||||
|
Returns:
|
||||||
|
- repaired : the full AUM dataframe with corrected IDs
|
||||||
|
- audit : a subset showing only the renamed rows, with both
|
||||||
|
the original and canonical IDs for verification
|
||||||
|
"""
|
||||||
|
aum = aum.copy()
|
||||||
|
aum["Centralisation Date"] = pd.to_datetime(aum["Centralisation Date"])
|
||||||
|
aum["_date_key"] = aum["Centralisation Date"]
|
||||||
|
aum["_reg_key"] = aum["Registrar Account - ID"].astype(str)
|
||||||
|
|
||||||
|
# Vectorised lookup via merge
|
||||||
|
lookup_df = pd.DataFrame(
|
||||||
|
[(d, reg_used, reg_orig) for (d, reg_used), reg_orig in lookup.items()],
|
||||||
|
columns=["_date_key", "_reg_key", "_canonical_id"],
|
||||||
|
)
|
||||||
|
|
||||||
|
merged = aum.merge(lookup_df, on=["_date_key", "_reg_key"], how="left")
|
||||||
|
|
||||||
|
# Audit: rows that were actually renamed
|
||||||
|
renamed_mask = merged["_canonical_id"].notna()
|
||||||
|
audit = merged[renamed_mask].copy()
|
||||||
|
audit["original_reg_id"] = audit["_reg_key"]
|
||||||
|
audit["canonical_reg_id"] = audit["_canonical_id"]
|
||||||
|
audit = audit[
|
||||||
|
[
|
||||||
|
"Centralisation Date",
|
||||||
|
"original_reg_id",
|
||||||
|
"canonical_reg_id",
|
||||||
|
"Product - Isin",
|
||||||
|
"Quantity - AUM",
|
||||||
|
"Value - AUM €",
|
||||||
|
]
|
||||||
|
]
|
||||||
|
|
||||||
|
# Rename
|
||||||
|
merged.loc[renamed_mask, "Registrar Account - ID"] = merged.loc[
|
||||||
|
renamed_mask, "_canonical_id"
|
||||||
|
]
|
||||||
|
|
||||||
|
# Drop helper columns
|
||||||
|
repaired = merged.drop(columns=["_date_key", "_reg_key", "_canonical_id"])
|
||||||
|
|
||||||
|
return repaired, audit
|
||||||
|
|
||||||
|
|
||||||
|
# ─────────────────────────────────────────────────────────────
|
||||||
|
# CONSISTENCY CHECK
|
||||||
|
# ─────────────────────────────────────────────────────────────
|
||||||
|
|
||||||
|
|
||||||
|
def consistency_check(original, repaired, mapping, surgery):
|
||||||
|
"""
|
||||||
|
Sanity checks after branching:
|
||||||
|
|
||||||
|
1. Row count preserved
|
||||||
|
2. No reg_used alias remains in the repaired file (for changed entries)
|
||||||
|
3. For each (reg_orig, isin, date) there is at most one row
|
||||||
|
(branching should not create duplicates)
|
||||||
|
4. Summary of surgery operations applied
|
||||||
|
"""
|
||||||
|
print("\n[Consistency checks]")
|
||||||
|
|
||||||
|
# Row count
|
||||||
|
if len(original) == len(repaired):
|
||||||
|
print(f" ✓ Row count preserved : {len(repaired)}")
|
||||||
|
else:
|
||||||
|
print(f" ✗ Row count changed : {len(original)} → {len(repaired)}")
|
||||||
|
|
||||||
|
# Aliases eliminated
|
||||||
|
changed = mapping[mapping["changed"] & (mapping["reg_orig"] != mapping["reg_used"])]
|
||||||
|
aliases = set(changed["reg_used"].unique())
|
||||||
|
still_present = set(repaired["Registrar Account - ID"].astype(str)) & aliases
|
||||||
|
if not still_present:
|
||||||
|
print(f" ✓ All {len(aliases)} aliased code(s) successfully relabelled")
|
||||||
|
else:
|
||||||
|
print(
|
||||||
|
f" ✗ {len(still_present)} aliased code(s) still present: {still_present}"
|
||||||
|
)
|
||||||
|
|
||||||
|
# Duplicates
|
||||||
|
key_cols = ["Registrar Account - ID", "Product - Isin", "Centralisation Date"]
|
||||||
|
dup_count = repaired.duplicated(subset=key_cols).sum()
|
||||||
|
if dup_count == 0:
|
||||||
|
print(" ✓ No duplicate (reg_id, isin, date) keys")
|
||||||
|
else:
|
||||||
|
print(
|
||||||
|
f" ✗ {dup_count} duplicate (reg_id, isin, date) rows found — inspect manually"
|
||||||
|
)
|
||||||
|
print(
|
||||||
|
repaired[repaired.duplicated(subset=key_cols, keep=False)][
|
||||||
|
key_cols + ["Quantity - AUM"]
|
||||||
|
]
|
||||||
|
.head(10)
|
||||||
|
.to_string(index=False)
|
||||||
|
)
|
||||||
|
|
||||||
|
# Surgery summary
|
||||||
|
if not surgery.empty:
|
||||||
|
print("\n[Surgery operations applied]")
|
||||||
|
for _, op in surgery.sort_values("date").iterrows():
|
||||||
|
self_map = (
|
||||||
|
" [self-map — data quality flag, no rename]"
|
||||||
|
if op["reg_from"] == op["reg_to"]
|
||||||
|
else ""
|
||||||
|
)
|
||||||
|
print(
|
||||||
|
f" {op['date'].date()} | {op['reg_orig']} : "
|
||||||
|
f"{op['reg_from']} → {op['reg_to']}"
|
||||||
|
f" (Jaccard={op['jaccard_composite']:.4f}, "
|
||||||
|
f"gain={op['gain_vs_no_surgery']:.6f}){self_map}"
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
|
# ─────────────────────────────────────────────────────────────
|
||||||
|
# EXPORT PATHS (branched accounts only)
|
||||||
|
# ─────────────────────────────────────────────────────────────
|
||||||
|
|
||||||
|
|
||||||
|
def export_paths(aum, mapping, surgery, repaired):
|
||||||
|
"""
|
||||||
|
Builds a condensed AUM file for ALL accounts in the repair universe
|
||||||
|
(i.e. every reg_orig present in the mapping).
|
||||||
|
|
||||||
|
- Stable accounts (no surgery): single leg where reg_used == reg_orig
|
||||||
|
throughout, pulled directly from the repaired AUM.
|
||||||
|
- Branched accounts (at least one genuine surgery): multiple legs,
|
||||||
|
reg_used shows which physical code was active at each date.
|
||||||
|
|
||||||
|
The output makes every account's full path explicit:
|
||||||
|
|
||||||
|
reg_orig | reg_used | date | isin | qty_aum | ...
|
||||||
|
─────────┼───────────────┼────────────┼──────┼─────────┼───
|
||||||
|
REG_001 | REG_001 | 2020-01-31 | ... | ... | <- stable
|
||||||
|
REG_002 | REG_002_OLD | 2020-01-31 | ... | ... | <- leg 1
|
||||||
|
REG_002 | REG_002 | 2022-07-31 | ... | ... | <- leg 2
|
||||||
|
|
||||||
|
Self-mapped surgeries (reg_from == reg_to) are noted in the summary
|
||||||
|
but do not add extra legs — the account kept its code.
|
||||||
|
|
||||||
|
Returns the paths DataFrame (never None if mapping is non-empty).
|
||||||
|
"""
|
||||||
|
# All canonical accounts in the universe
|
||||||
|
all_accounts = sorted(mapping["reg_orig"].astype(str).unique())
|
||||||
|
|
||||||
|
# Branched accounts (genuine code changes only)
|
||||||
|
branched_accounts = set()
|
||||||
|
if not surgery.empty:
|
||||||
|
genuine = surgery[surgery["reg_from"] != surgery["reg_to"]]
|
||||||
|
branched_accounts = set(genuine["reg_orig"].astype(str).unique())
|
||||||
|
|
||||||
|
print(
|
||||||
|
f"\n[Paths] {len(all_accounts)} account(s) in universe, "
|
||||||
|
f"{len(branched_accounts)} branched: "
|
||||||
|
f"{sorted(branched_accounts) or 'none'}"
|
||||||
|
)
|
||||||
|
|
||||||
|
# Build (date, reg_orig) → reg_used lookup from mapping
|
||||||
|
map_df = mapping[["date", "reg_orig", "reg_used"]].copy()
|
||||||
|
map_df["date"] = pd.to_datetime(map_df["date"])
|
||||||
|
map_df["reg_orig"] = map_df["reg_orig"].astype(str)
|
||||||
|
map_df["reg_used"] = map_df["reg_used"].astype(str)
|
||||||
|
map_df = map_df.rename(columns={"date": "_date_key", "reg_orig": "_reg_key"})
|
||||||
|
|
||||||
|
# Pull all universe rows from the repaired AUM
|
||||||
|
aum_universe = repaired[
|
||||||
|
repaired["Registrar Account - ID"].astype(str).isin(all_accounts)
|
||||||
|
].copy()
|
||||||
|
aum_universe["Centralisation Date"] = pd.to_datetime(
|
||||||
|
aum_universe["Centralisation Date"]
|
||||||
|
)
|
||||||
|
aum_universe["_date_key"] = aum_universe["Centralisation Date"]
|
||||||
|
aum_universe["_reg_key"] = aum_universe["Registrar Account - ID"].astype(str)
|
||||||
|
|
||||||
|
# Join reg_used from mapping
|
||||||
|
paths = aum_universe.merge(
|
||||||
|
map_df[["_date_key", "_reg_key", "reg_used"]],
|
||||||
|
on=["_date_key", "_reg_key"],
|
||||||
|
how="left",
|
||||||
|
).drop(columns=["_date_key", "_reg_key"])
|
||||||
|
|
||||||
|
# For stable accounts, mapping may not cover every AUM date (e.g. sparse
|
||||||
|
# months) — fall back to reg_orig (= Registrar Account - ID) for those.
|
||||||
|
paths["reg_used"] = paths["reg_used"].fillna(
|
||||||
|
paths["Registrar Account - ID"].astype(str)
|
||||||
|
)
|
||||||
|
|
||||||
|
# Rename canonical column
|
||||||
|
paths = paths.rename(columns={"Registrar Account - ID": "reg_orig"})
|
||||||
|
|
||||||
|
# Column order
|
||||||
|
other_cols = [c for c in paths.columns if c not in ("reg_orig", "reg_used")]
|
||||||
|
paths = paths[["reg_orig", "reg_used"] + other_cols]
|
||||||
|
paths = paths.sort_values(["reg_orig", "Centralisation Date", "Product - Isin"])
|
||||||
|
paths = paths.reset_index(drop=True)
|
||||||
|
|
||||||
|
# Summary
|
||||||
|
for acc in all_accounts:
|
||||||
|
sub = paths[paths["reg_orig"] == acc]
|
||||||
|
legs = list(sub["reg_used"].unique())
|
||||||
|
tag = " [branched]" if acc in branched_accounts else " [stable]"
|
||||||
|
print(f" {acc}: {len(sub)} rows, legs = {legs}{tag}")
|
||||||
|
|
||||||
|
return paths
|
||||||
|
|
||||||
|
|
||||||
|
# ─────────────────────────────────────────────────────────────
|
||||||
|
# MAIN
|
||||||
|
# ─────────────────────────────────────────────────────────────
|
||||||
|
|
||||||
|
|
||||||
|
def main():
|
||||||
|
parser = argparse.ArgumentParser(
|
||||||
|
description="Apply Carmignac repair mapping to the raw AUM file"
|
||||||
|
)
|
||||||
|
parser.add_argument(
|
||||||
|
"--mapping",
|
||||||
|
default="repair_results/carmignac_mapping.csv",
|
||||||
|
help="Path to mapping CSV from carmignac_repair.py",
|
||||||
|
)
|
||||||
|
parser.add_argument(
|
||||||
|
"--surgery",
|
||||||
|
default="repair_results/carmignac_surgery_log.csv",
|
||||||
|
help="Path to surgery log CSV (optional, for audit)",
|
||||||
|
)
|
||||||
|
parser.add_argument(
|
||||||
|
"--out", default="AUM_repaired.csv", help="Output path for repaired AUM CSV"
|
||||||
|
)
|
||||||
|
parser.add_argument(
|
||||||
|
"--audit",
|
||||||
|
default="AUM_repair_audit.csv",
|
||||||
|
help="Output path for audit CSV (renamed rows only)",
|
||||||
|
)
|
||||||
|
parser.add_argument(
|
||||||
|
"--paths",
|
||||||
|
default="AUM_paths.csv",
|
||||||
|
help="Output path for condensed paths CSV (branched accounts only)",
|
||||||
|
)
|
||||||
|
args = parser.parse_args()
|
||||||
|
|
||||||
|
def resolve(p, required=True):
|
||||||
|
if os.path.exists(p):
|
||||||
|
return p
|
||||||
|
alt = os.path.join(os.path.dirname(os.path.abspath(__file__)), p)
|
||||||
|
if os.path.exists(alt):
|
||||||
|
return alt
|
||||||
|
if required:
|
||||||
|
sys.exit(f"[ERROR] File not found: {p}")
|
||||||
|
return None
|
||||||
|
|
||||||
|
mapping_path = resolve(args.mapping)
|
||||||
|
surgery_path = resolve(args.surgery, required=False)
|
||||||
|
|
||||||
|
print("=" * 60)
|
||||||
|
print("CARMIGNAC — AUM Branching / Repair")
|
||||||
|
print("=" * 60)
|
||||||
|
print(f" Mapping : {mapping_path}")
|
||||||
|
print(f" Surgery : {surgery_path or '(not provided)'}")
|
||||||
|
|
||||||
|
# Load
|
||||||
|
aum, mapping, surgery = load_inputs_branch(mapping_path, surgery_path)
|
||||||
|
print(f"\n Raw AUM rows : {len(aum)}")
|
||||||
|
print(f" Mapping rows : {len(mapping)}")
|
||||||
|
print(f" Mapping changed rows : {mapping['changed'].sum()}")
|
||||||
|
print(f" Surgery operations : {len(surgery)}")
|
||||||
|
|
||||||
|
# Build lookup
|
||||||
|
lookup = build_rename_lookup(mapping)
|
||||||
|
print(f"\n Rename operations : {len(lookup)}")
|
||||||
|
if lookup:
|
||||||
|
sample = list(lookup.items())[:3]
|
||||||
|
for (d, used), orig in sample:
|
||||||
|
print(f" ({d.date()}, {used}) → {orig}")
|
||||||
|
if len(lookup) > 3:
|
||||||
|
print(f" ... and {len(lookup) - 3} more")
|
||||||
|
|
||||||
|
# Apply
|
||||||
|
repaired, audit = apply_branching(aum, lookup)
|
||||||
|
print(f"\n Rows renamed : {len(audit)}")
|
||||||
|
|
||||||
|
# Checks
|
||||||
|
consistency_check(aum, repaired, mapping, surgery)
|
||||||
|
|
||||||
|
# Save
|
||||||
|
out_dir = os.path.dirname(os.path.abspath(args.out))
|
||||||
|
os.makedirs(out_dir, exist_ok=True)
|
||||||
|
|
||||||
|
repaired.to_csv(args.out, index=False)
|
||||||
|
print(f"\n ✓ Repaired AUM → {args.out}")
|
||||||
|
|
||||||
|
if len(audit) > 0:
|
||||||
|
audit.to_csv(args.audit, index=False)
|
||||||
|
print(f" ✓ Audit log → {args.audit}")
|
||||||
|
else:
|
||||||
|
print("(No rows renamed — audit log not written)")
|
||||||
|
|
||||||
|
# Paths: condensed AUM for branched accounts
|
||||||
|
df_paths = export_paths(aum, mapping, surgery, repaired)
|
||||||
|
if df_paths is not None:
|
||||||
|
df_paths.to_csv(args.paths, index=False)
|
||||||
|
print(f" ✓ Paths file → {args.paths}")
|
||||||
|
|
||||||
|
# Print renamed reg_ids summary
|
||||||
|
if len(audit) > 0:
|
||||||
|
print("\n[Renamed identifiers]")
|
||||||
|
summary = (
|
||||||
|
audit.groupby(["original_reg_id", "canonical_reg_id"])
|
||||||
|
.size()
|
||||||
|
.reset_index(name="n_rows")
|
||||||
|
)
|
||||||
|
for _, row in summary.iterrows():
|
||||||
|
print(
|
||||||
|
f" {row['original_reg_id']:20s} → {row['canonical_reg_id']:20s} "
|
||||||
|
f"({row['n_rows']} rows)"
|
||||||
|
)
|
||||||
|
|
||||||
|
print("\nDone.")
|
||||||
|
|
||||||
|
|
||||||
|
if __name__ == "__main__":
|
||||||
|
main()
|
||||||
632
src/repair_challenge/carmignac_diagnostics.py
Normal file
632
src/repair_challenge/carmignac_diagnostics.py
Normal file
|
|
@ -0,0 +1,632 @@
|
||||||
|
"""
|
||||||
|
Broken Months Diagnostics
|
||||||
|
=====================================================
|
||||||
|
Detects months where the aggregate stock-flow equation is violated at the ISIN level (across all accounts)
|
||||||
|
The residual is the "missing flow":
|
||||||
|
missing_{s}(t) = [Q_agg(t) - Q_agg(t-1)] - F_agg(t)
|
||||||
|
|
||||||
|
This is a market-level check, independent of individual account identity.
|
||||||
|
It captures:
|
||||||
|
- Genuinely missing flow records
|
||||||
|
- End-of-month accounting lags (transactions dated at boundary)
|
||||||
|
- Corporate actions (dividends, splits) not reflected in flows
|
||||||
|
|
||||||
|
Outputs
|
||||||
|
-------
|
||||||
|
carmignac_broken_months.csv — machine-readable, loaded by carmignac_repair.py
|
||||||
|
carmignac_diagnostics.html — interactive HTML report
|
||||||
|
|
||||||
|
Usage
|
||||||
|
-----
|
||||||
|
python carmignac_diagnostics.py
|
||||||
|
python carmignac_diagnostics.py \\
|
||||||
|
--aum raw_AUM.csv \\
|
||||||
|
--flows raw_flows.csv \\
|
||||||
|
--out carmignac_broken_months.csv \\
|
||||||
|
--html carmignac_diagnostics.html \\
|
||||||
|
--alpha 0.02
|
||||||
|
"""
|
||||||
|
|
||||||
|
import argparse
|
||||||
|
import os
|
||||||
|
import sys
|
||||||
|
|
||||||
|
import numpy as np
|
||||||
|
import pandas as pd
|
||||||
|
|
||||||
|
from helpers import build_html_diagnostics, load_data_diagnostics
|
||||||
|
|
||||||
|
# ─────────────────────────────────────────────────────────────
|
||||||
|
# AGGREGATE AND DETECT BROKEN MONTHS
|
||||||
|
# ─────────────────────────────────────────────────────────────
|
||||||
|
|
||||||
|
|
||||||
|
def detect_broken_months(aum, flows, alpha=0.02, lag_days=3):
|
||||||
|
"""
|
||||||
|
For each (isin, month-end t), compute:
|
||||||
|
- Q_agg(t) : total shares held across all accounts
|
||||||
|
- Q_agg(t-1) : idem previous month (forward-filled)
|
||||||
|
- F_agg(t) : total net flows recorded in ]EOM(t-1), EOM(t)]
|
||||||
|
- missing(t) : [Q_agg(t) - Q_agg(t-1)] - F_agg(t)
|
||||||
|
- missing_pct : |missing| / max(Q_agg(t), Q_agg(t-1))
|
||||||
|
|
||||||
|
A month is flagged as "broken" when missing_pct > alpha.
|
||||||
|
|
||||||
|
Additionally, a month is flagged as a potential "lag" when:
|
||||||
|
- It is broken with the standard window
|
||||||
|
- But would NOT be broken if flows dated within lag_days of EOM
|
||||||
|
are shifted to the adjacent month
|
||||||
|
|
||||||
|
Parameters :
|
||||||
|
alpha : tolerance threshold (same as ALPHA in carmignac_repair.py)
|
||||||
|
lag_days : number of boundary days to test for accounting lag
|
||||||
|
|
||||||
|
Returns :
|
||||||
|
df_broken : DataFrame with all (isin, date) pairs where missing_pct > alpha
|
||||||
|
df_all : Full DataFrame including non-broken months (for plotting)
|
||||||
|
"""
|
||||||
|
# Monthly calendar
|
||||||
|
t_min = aum["Centralisation Date"].min()
|
||||||
|
t_max = aum["Centralisation Date"].max()
|
||||||
|
all_months = pd.date_range(t_min, t_max, freq="ME")
|
||||||
|
|
||||||
|
# ── Aggregate AUM per (isin, month-end) ──────────────────────
|
||||||
|
aum_agg = (
|
||||||
|
aum.groupby(["Product - Isin", "Centralisation Date"])["Quantity - AUM"]
|
||||||
|
.sum()
|
||||||
|
.reset_index()
|
||||||
|
.rename(
|
||||||
|
columns={
|
||||||
|
"Product - Isin": "isin",
|
||||||
|
"Centralisation Date": "date",
|
||||||
|
"Quantity - AUM": "qty_agg",
|
||||||
|
}
|
||||||
|
)
|
||||||
|
)
|
||||||
|
# Forward-fill sparse panel
|
||||||
|
aum_pivot = aum_agg.pivot(index="date", columns="isin", values="qty_agg")
|
||||||
|
aum_pivot = aum_pivot.reindex(all_months).ffill()
|
||||||
|
|
||||||
|
# ── Aggregate flows per (isin, month-end) — standard window ──
|
||||||
|
def bucket_flows(flows_df, months, lower_offset=0, upper_offset=0):
|
||||||
|
"""Aggregate flows with optional boundary extension (in days)."""
|
||||||
|
fc = flows_df.copy()
|
||||||
|
|
||||||
|
def assign_month(d):
|
||||||
|
# Extended window: ]EOM(t-1) - lower_offset, EOM(t) + upper_offset]
|
||||||
|
for m in months:
|
||||||
|
eom_prev = m - pd.offsets.MonthEnd(1)
|
||||||
|
lo = eom_prev - pd.Timedelta(days=lower_offset)
|
||||||
|
hi = m + pd.Timedelta(days=upper_offset)
|
||||||
|
if lo < d <= hi:
|
||||||
|
return m
|
||||||
|
return pd.NaT
|
||||||
|
|
||||||
|
fc["month_end"] = fc["Centralisation Date"].apply(assign_month)
|
||||||
|
fc = fc.dropna(subset=["month_end"])
|
||||||
|
agg = (
|
||||||
|
fc.groupby(["Product - Isin", "month_end"])["Quantity - NetFlows"]
|
||||||
|
.sum()
|
||||||
|
.reset_index()
|
||||||
|
.rename(
|
||||||
|
columns={
|
||||||
|
"Product - Isin": "isin",
|
||||||
|
"month_end": "date",
|
||||||
|
"Quantity - NetFlows": "flow_agg",
|
||||||
|
}
|
||||||
|
)
|
||||||
|
)
|
||||||
|
return agg
|
||||||
|
|
||||||
|
flows_std = bucket_flows(flows, all_months)
|
||||||
|
flows_lag = bucket_flows(
|
||||||
|
flows, all_months, lower_offset=lag_days, upper_offset=lag_days
|
||||||
|
)
|
||||||
|
|
||||||
|
def flows_to_pivot(df, months):
|
||||||
|
piv = df.pivot(index="date", columns="isin", values="flow_agg")
|
||||||
|
return piv.reindex(months).fillna(0.0)
|
||||||
|
|
||||||
|
fpiv_std = flows_to_pivot(flows_std, all_months)
|
||||||
|
fpiv_lag = flows_to_pivot(flows_lag, all_months)
|
||||||
|
|
||||||
|
# ── Compute residuals ─────────────────────────────────────────
|
||||||
|
rows = []
|
||||||
|
isins = aum_pivot.columns.tolist()
|
||||||
|
|
||||||
|
for i in range(1, len(all_months)):
|
||||||
|
t_curr = all_months[i]
|
||||||
|
t_prev = all_months[i - 1]
|
||||||
|
|
||||||
|
for isin in isins:
|
||||||
|
q_curr = (
|
||||||
|
aum_pivot[isin].get(t_curr, np.nan)
|
||||||
|
if isin in aum_pivot.columns
|
||||||
|
else np.nan
|
||||||
|
)
|
||||||
|
q_prev = (
|
||||||
|
aum_pivot[isin].get(t_prev, np.nan)
|
||||||
|
if isin in aum_pivot.columns
|
||||||
|
else np.nan
|
||||||
|
)
|
||||||
|
|
||||||
|
if pd.isna(q_curr) or pd.isna(q_prev):
|
||||||
|
continue
|
||||||
|
|
||||||
|
delta = q_curr - q_prev
|
||||||
|
|
||||||
|
# Standard window
|
||||||
|
f_std = fpiv_std[isin].get(t_curr, 0.0) if isin in fpiv_std.columns else 0.0
|
||||||
|
missing_std = delta - f_std
|
||||||
|
|
||||||
|
# Extended lag window
|
||||||
|
f_lag = fpiv_lag[isin].get(t_curr, 0.0) if isin in fpiv_lag.columns else 0.0
|
||||||
|
missing_lag = delta - f_lag
|
||||||
|
|
||||||
|
# ── Denominator choice ────────────────────────────────
|
||||||
|
# Normalise by the size of the *movement* (max of delta_AUM
|
||||||
|
# and recorded flow), not by the stock level. This avoids
|
||||||
|
# astronomically large percentages when a position is tiny
|
||||||
|
# but the missing flow is a normal-sized number.
|
||||||
|
#
|
||||||
|
# Interpretation: "what fraction of the expected movement
|
||||||
|
# is unaccounted for?"
|
||||||
|
#
|
||||||
|
# A minimum absolute threshold (min_abs_shares) suppresses
|
||||||
|
# noise from residual micro-positions (rounding artefacts).
|
||||||
|
min_abs_shares = 1.0 # ignore positions smaller than 1 share
|
||||||
|
movement = max(abs(delta), abs(f_std), min_abs_shares)
|
||||||
|
denom_std = movement
|
||||||
|
|
||||||
|
movement_lag = max(abs(delta), abs(f_lag), min_abs_shares)
|
||||||
|
denom_lag = movement_lag
|
||||||
|
|
||||||
|
pct_std = abs(missing_std) / denom_std
|
||||||
|
pct_lag = abs(missing_lag) / denom_lag
|
||||||
|
|
||||||
|
broken_std = pct_std > alpha
|
||||||
|
broken_lag = pct_lag > alpha
|
||||||
|
|
||||||
|
# A "lag" month: broken with standard, NOT broken with extended window
|
||||||
|
is_lag = broken_std and (not broken_lag)
|
||||||
|
|
||||||
|
rows.append(
|
||||||
|
{
|
||||||
|
"date": t_curr,
|
||||||
|
"isin": isin,
|
||||||
|
"q_agg_prev": round(q_prev, 3),
|
||||||
|
"q_agg_curr": round(q_curr, 3),
|
||||||
|
"delta_aum": round(delta, 3),
|
||||||
|
"flow_agg": round(f_std, 3),
|
||||||
|
"missing_flow": round(missing_std, 3),
|
||||||
|
"missing_pct": round(pct_std, 6),
|
||||||
|
"broken": broken_std,
|
||||||
|
"is_lag": is_lag,
|
||||||
|
}
|
||||||
|
)
|
||||||
|
|
||||||
|
df_all = pd.DataFrame(rows)
|
||||||
|
df_broken = df_all[df_all["broken"]].sort_values("missing_pct", ascending=False)
|
||||||
|
return df_broken, df_all
|
||||||
|
|
||||||
|
|
||||||
|
# ─────────────────────────────────────────────────────────────
|
||||||
|
# AGGREGATE (CROSS-ISIN) BROKEN MONTHS
|
||||||
|
# ─────────────────────────────────────────────────────────────
|
||||||
|
|
||||||
|
|
||||||
|
def detect_aggregate_broken_months(aum, flows, alpha=0.02, lag_days=3):
|
||||||
|
"""
|
||||||
|
Same stock-flow check as detect_broken_months, but aggregated
|
||||||
|
across ALL ISINs for each month:
|
||||||
|
|
||||||
|
Q_total(t) - Q_total(t-1) != F_total(t)
|
||||||
|
|
||||||
|
where Q_total(t) = sum over all (reg_id, isin) of Q_{r,s}(t).
|
||||||
|
|
||||||
|
This catches months where the global portfolio is incoherent even
|
||||||
|
if every individual ISIN is fine (e.g. cross-ISIN netting errors),
|
||||||
|
and provides a cleaner high-level view.
|
||||||
|
|
||||||
|
Returns :
|
||||||
|
df_agg : DataFrame indexed by month with columns:
|
||||||
|
q_total_prev, q_total_curr, delta_aum,
|
||||||
|
flow_total, missing_flow, missing_pct, broken, is_lag
|
||||||
|
"""
|
||||||
|
t_min = aum["Centralisation Date"].min()
|
||||||
|
t_max = aum["Centralisation Date"].max()
|
||||||
|
all_months = pd.date_range(t_min, t_max, freq="ME")
|
||||||
|
|
||||||
|
# ── Total AUM per month (all ISIN, all accounts) ─────────────
|
||||||
|
aum_monthly = (
|
||||||
|
aum.groupby("Centralisation Date")["Quantity - AUM"]
|
||||||
|
.sum()
|
||||||
|
.reindex(all_months)
|
||||||
|
.ffill()
|
||||||
|
.rename("q_total")
|
||||||
|
)
|
||||||
|
|
||||||
|
# ── Bucket flows helper (reuse same window logic) ─────────────
|
||||||
|
def bucket_total_flows(flows_df, months, lower_offset=0, upper_offset=0):
|
||||||
|
fc = flows_df.copy()
|
||||||
|
|
||||||
|
def assign_month(d):
|
||||||
|
for m in months:
|
||||||
|
eom_prev = m - pd.offsets.MonthEnd(1)
|
||||||
|
lo = eom_prev - pd.Timedelta(days=lower_offset)
|
||||||
|
hi = m + pd.Timedelta(days=upper_offset)
|
||||||
|
if lo < d <= hi:
|
||||||
|
return m
|
||||||
|
return pd.NaT
|
||||||
|
|
||||||
|
fc["month_end"] = fc["Centralisation Date"].apply(assign_month)
|
||||||
|
fc = fc.dropna(subset=["month_end"])
|
||||||
|
return (
|
||||||
|
fc.groupby("month_end")["Quantity - NetFlows"]
|
||||||
|
.sum()
|
||||||
|
.reindex(months)
|
||||||
|
.fillna(0.0)
|
||||||
|
)
|
||||||
|
|
||||||
|
flow_std = bucket_total_flows(flows, all_months)
|
||||||
|
flow_lag = bucket_total_flows(
|
||||||
|
flows, all_months, lower_offset=lag_days, upper_offset=lag_days
|
||||||
|
)
|
||||||
|
|
||||||
|
# ── Compute residuals ─────────────────────────────────────────
|
||||||
|
rows = []
|
||||||
|
min_abs_shares = 1.0
|
||||||
|
|
||||||
|
for i in range(1, len(all_months)):
|
||||||
|
t_curr = all_months[i]
|
||||||
|
t_prev = all_months[i - 1]
|
||||||
|
|
||||||
|
q_curr = aum_monthly.get(t_curr, np.nan)
|
||||||
|
q_prev = aum_monthly.get(t_prev, np.nan)
|
||||||
|
if pd.isna(q_curr) or pd.isna(q_prev):
|
||||||
|
continue
|
||||||
|
|
||||||
|
delta = q_curr - q_prev
|
||||||
|
|
||||||
|
f_std = flow_std.get(t_curr, 0.0)
|
||||||
|
f_lag = flow_lag.get(t_curr, 0.0)
|
||||||
|
miss_std = delta - f_std
|
||||||
|
miss_lag = delta - f_lag
|
||||||
|
|
||||||
|
movement_std = max(abs(delta), abs(f_std), min_abs_shares)
|
||||||
|
movement_lag = max(abs(delta), abs(f_lag), min_abs_shares)
|
||||||
|
pct_std = abs(miss_std) / movement_std
|
||||||
|
pct_lag = abs(miss_lag) / movement_lag
|
||||||
|
|
||||||
|
broken_std = pct_std > alpha
|
||||||
|
broken_lag = pct_lag > alpha
|
||||||
|
is_lag = broken_std and (not broken_lag)
|
||||||
|
|
||||||
|
rows.append(
|
||||||
|
{
|
||||||
|
"date": t_curr,
|
||||||
|
"q_total_prev": round(q_prev, 3),
|
||||||
|
"q_total_curr": round(q_curr, 3),
|
||||||
|
"delta_aum": round(delta, 3),
|
||||||
|
"flow_total": round(f_std, 3),
|
||||||
|
"missing_flow": round(miss_std, 3),
|
||||||
|
"missing_pct": round(pct_std, 6),
|
||||||
|
"broken": broken_std,
|
||||||
|
"is_lag": is_lag,
|
||||||
|
}
|
||||||
|
)
|
||||||
|
|
||||||
|
df_agg = pd.DataFrame(rows)
|
||||||
|
return df_agg
|
||||||
|
|
||||||
|
|
||||||
|
# ─────────────────────────────────────────────────────────────
|
||||||
|
# ERROR ACCOUNT
|
||||||
|
# ─────────────────────────────────────────────────────────────
|
||||||
|
|
||||||
|
|
||||||
|
def build_error_account(aum, flows, lag_days=3):
|
||||||
|
"""
|
||||||
|
Builds a synthetic "error account" that absorbs the stock-flow
|
||||||
|
residuals that cannot be explained by recorded flows.
|
||||||
|
|
||||||
|
Construction (backwards from t_ref):
|
||||||
|
Stock_error(t_ref) = 0 (by definition)
|
||||||
|
Stock_error(t-1) = Stock_error(t) - Residual(t)
|
||||||
|
|
||||||
|
where Residual(t) = [Σ_r Q_{r,s}(t) - Σ_r Q_{r,s}(t-1)] - Σ_r F_{r,s}(t)
|
||||||
|
for each ISIN s independently.
|
||||||
|
|
||||||
|
By construction, adding this error account to the AUM restores the
|
||||||
|
stock-flow equality at every (isin, month).
|
||||||
|
|
||||||
|
Also computes an aggregated error account (summed over all ISINs).
|
||||||
|
|
||||||
|
Returns
|
||||||
|
-------
|
||||||
|
df_err_isin : DataFrame with columns
|
||||||
|
(date, isin, residual, stock_error, stock_error_pct)
|
||||||
|
where stock_error_pct = stock_error / max(total_isin_aum, 1)
|
||||||
|
|
||||||
|
df_err_agg : DataFrame with columns
|
||||||
|
(date, residual_agg, stock_error_agg, stock_error_agg_pct)
|
||||||
|
"""
|
||||||
|
t_min = aum["Centralisation Date"].min()
|
||||||
|
t_max = aum["Centralisation Date"].max()
|
||||||
|
all_months = pd.date_range(t_min, t_max, freq="ME")
|
||||||
|
|
||||||
|
# ── ISIN-level AUM panel (forward-filled) ────────────────────
|
||||||
|
aum_agg = (
|
||||||
|
aum.groupby(["Product - Isin", "Centralisation Date"])["Quantity - AUM"]
|
||||||
|
.sum()
|
||||||
|
.reset_index()
|
||||||
|
.rename(
|
||||||
|
columns={
|
||||||
|
"Product - Isin": "isin",
|
||||||
|
"Centralisation Date": "date",
|
||||||
|
"Quantity - AUM": "qty",
|
||||||
|
}
|
||||||
|
)
|
||||||
|
)
|
||||||
|
aum_pivot = aum_agg.pivot(index="date", columns="isin", values="qty")
|
||||||
|
aum_pivot = aum_pivot.reindex(all_months).ffill()
|
||||||
|
|
||||||
|
# ── ISIN-level flow aggregation (standard window) ─────────────
|
||||||
|
def bucket_isin_flows(flows_df, months):
|
||||||
|
fc = flows_df.copy()
|
||||||
|
|
||||||
|
def assign_month(d):
|
||||||
|
for m in months:
|
||||||
|
eom_prev = m - pd.offsets.MonthEnd(1)
|
||||||
|
if eom_prev < d <= m:
|
||||||
|
return m
|
||||||
|
return pd.NaT
|
||||||
|
|
||||||
|
fc["month_end"] = fc["Centralisation Date"].apply(assign_month)
|
||||||
|
fc = fc.dropna(subset=["month_end"])
|
||||||
|
return (
|
||||||
|
fc.groupby(["Product - Isin", "month_end"])["Quantity - NetFlows"]
|
||||||
|
.sum()
|
||||||
|
.unstack("Product - Isin")
|
||||||
|
.reindex(months)
|
||||||
|
.fillna(0.0)
|
||||||
|
)
|
||||||
|
|
||||||
|
flow_pivot = bucket_isin_flows(flows, all_months)
|
||||||
|
|
||||||
|
# ── Compute residuals per (isin, month) ───────────────────────
|
||||||
|
isins = aum_pivot.columns.tolist()
|
||||||
|
# residual[t] = delta_AUM[t] - flow[t]
|
||||||
|
residuals = {} # {isin: Series indexed by month}
|
||||||
|
|
||||||
|
for isin in isins:
|
||||||
|
res_series = {}
|
||||||
|
for i in range(1, len(all_months)):
|
||||||
|
t_curr = all_months[i]
|
||||||
|
t_prev = all_months[i - 1]
|
||||||
|
q_curr = aum_pivot[isin].get(t_curr, np.nan)
|
||||||
|
q_prev = aum_pivot[isin].get(t_prev, np.nan)
|
||||||
|
if pd.isna(q_curr) or pd.isna(q_prev):
|
||||||
|
continue
|
||||||
|
delta = q_curr - q_prev
|
||||||
|
f = flow_pivot[isin].get(t_curr, 0.0) if isin in flow_pivot.columns else 0.0
|
||||||
|
res_series[t_curr] = delta - f
|
||||||
|
residuals[isin] = pd.Series(res_series)
|
||||||
|
|
||||||
|
# ── Build error stock backwards from t_ref ────────────────────
|
||||||
|
t_ref = all_months[-1]
|
||||||
|
rows_isin = []
|
||||||
|
|
||||||
|
for isin in isins:
|
||||||
|
res = residuals[isin]
|
||||||
|
# Maximum AUM for this ISIN (for normalisation)
|
||||||
|
max_aum = aum_pivot[isin].max()
|
||||||
|
if pd.isna(max_aum) or max_aum < 1:
|
||||||
|
max_aum = 1.0
|
||||||
|
|
||||||
|
# Propagate backwards: stock(t_ref) = 0
|
||||||
|
stock = 0.0
|
||||||
|
# Build dict keyed by date
|
||||||
|
stock_by_date = {t_ref: 0.0}
|
||||||
|
for i in range(len(all_months) - 2, -1, -1):
|
||||||
|
t_curr = all_months[i + 1]
|
||||||
|
t_prev = all_months[i]
|
||||||
|
r = res.get(t_curr, 0.0)
|
||||||
|
stock = stock - r
|
||||||
|
stock_by_date[t_prev] = stock
|
||||||
|
|
||||||
|
for t in all_months:
|
||||||
|
s = stock_by_date.get(t, np.nan)
|
||||||
|
r = res.get(t, 0.0)
|
||||||
|
rows_isin.append(
|
||||||
|
{
|
||||||
|
"date": t,
|
||||||
|
"isin": isin,
|
||||||
|
"residual": round(r, 4),
|
||||||
|
"stock_error": round(s, 4) if not pd.isna(s) else np.nan,
|
||||||
|
"stock_error_pct": round(abs(s) / max_aum * 100, 4)
|
||||||
|
if not pd.isna(s)
|
||||||
|
else np.nan,
|
||||||
|
}
|
||||||
|
)
|
||||||
|
|
||||||
|
df_err_isin = pd.DataFrame(rows_isin).sort_values(["date", "isin"])
|
||||||
|
|
||||||
|
# ── Aggregated error account ──────────────────────────────────
|
||||||
|
# Total AUM across all ISINs at each month
|
||||||
|
total_aum_by_month = aum_pivot.sum(axis=1)
|
||||||
|
max_total_aum = total_aum_by_month.max()
|
||||||
|
if pd.isna(max_total_aum) or max_total_aum < 1:
|
||||||
|
max_total_aum = 1.0
|
||||||
|
|
||||||
|
# Aggregate residual = sum of ISIN residuals
|
||||||
|
agg_res = {}
|
||||||
|
for i in range(1, len(all_months)):
|
||||||
|
t_curr = all_months[i]
|
||||||
|
total_r = sum(residuals[isin].get(t_curr, 0.0) for isin in isins)
|
||||||
|
agg_res[t_curr] = total_r
|
||||||
|
|
||||||
|
agg_stock = 0.0
|
||||||
|
agg_stock_by_date = {t_ref: 0.0}
|
||||||
|
for i in range(len(all_months) - 2, -1, -1):
|
||||||
|
t_curr = all_months[i + 1]
|
||||||
|
t_prev = all_months[i]
|
||||||
|
agg_stock = agg_stock - agg_res.get(t_curr, 0.0)
|
||||||
|
agg_stock_by_date[t_prev] = agg_stock
|
||||||
|
|
||||||
|
rows_agg = []
|
||||||
|
for t in all_months:
|
||||||
|
s = agg_stock_by_date.get(t, np.nan)
|
||||||
|
r = agg_res.get(t, 0.0)
|
||||||
|
rows_agg.append(
|
||||||
|
{
|
||||||
|
"date": t,
|
||||||
|
"residual_agg": round(r, 4),
|
||||||
|
"stock_error_agg": round(s, 4) if not pd.isna(s) else np.nan,
|
||||||
|
"stock_error_agg_pct": round(abs(s) / max_total_aum * 100, 4)
|
||||||
|
if not pd.isna(s)
|
||||||
|
else np.nan,
|
||||||
|
}
|
||||||
|
)
|
||||||
|
|
||||||
|
df_err_agg = pd.DataFrame(rows_agg).sort_values("date")
|
||||||
|
return df_err_isin, df_err_agg
|
||||||
|
|
||||||
|
|
||||||
|
# ─────────────────────────────────────────────────────────────
|
||||||
|
# PRINT SUMMARY
|
||||||
|
# ─────────────────────────────────────────────────────────────
|
||||||
|
|
||||||
|
|
||||||
|
def print_summary(df_broken, df_all, alpha):
|
||||||
|
total = len(df_all)
|
||||||
|
n_broken = len(df_broken)
|
||||||
|
n_lag = df_broken["is_lag"].sum()
|
||||||
|
|
||||||
|
print("\n" + "=" * 60)
|
||||||
|
print(" CARMIGNAC — Broken Months Diagnostics")
|
||||||
|
print("=" * 60)
|
||||||
|
print(f" (isin, month) pairs examined : {total}")
|
||||||
|
print(
|
||||||
|
f" Broken (missing_pct > {alpha:.0%}) : {n_broken} "
|
||||||
|
f"({n_broken / total * 100:.1f}%)"
|
||||||
|
)
|
||||||
|
print(f" Of which likely lag : {n_lag}")
|
||||||
|
print(f" Of which genuine gap : {n_broken - n_lag}")
|
||||||
|
|
||||||
|
if n_broken:
|
||||||
|
print("\n Top 10 by missing_pct:")
|
||||||
|
cols = ["date", "isin", "missing_flow", "missing_pct", "is_lag"]
|
||||||
|
print(df_broken[cols].head(10).to_string(index=False))
|
||||||
|
|
||||||
|
# Monthly breakdown
|
||||||
|
by_month = (
|
||||||
|
df_broken.groupby("date")
|
||||||
|
.agg(
|
||||||
|
n_broken=("isin", "count"),
|
||||||
|
total_missing=("missing_flow", lambda x: x.abs().sum()),
|
||||||
|
)
|
||||||
|
.sort_values("n_broken", ascending=False)
|
||||||
|
.head(5)
|
||||||
|
)
|
||||||
|
if len(by_month):
|
||||||
|
print("\n Most affected months:")
|
||||||
|
print(by_month.to_string())
|
||||||
|
print()
|
||||||
|
|
||||||
|
|
||||||
|
# ─────────────────────────────────────────────────────────────
|
||||||
|
# MAIN
|
||||||
|
# ─────────────────────────────────────────────────────────────
|
||||||
|
|
||||||
|
|
||||||
|
def main():
|
||||||
|
parser = argparse.ArgumentParser(
|
||||||
|
description="Detect broken months in Carmignac AUM/Flows data"
|
||||||
|
)
|
||||||
|
parser.add_argument(
|
||||||
|
"--out",
|
||||||
|
default="carmignac_broken_months.csv",
|
||||||
|
help="Machine-readable output (loaded by carmignac_repair.py)",
|
||||||
|
)
|
||||||
|
parser.add_argument("--html", default="carmignac_diagnostics.html")
|
||||||
|
parser.add_argument(
|
||||||
|
"--alpha",
|
||||||
|
type=float,
|
||||||
|
default=0.05,
|
||||||
|
help="Tolerance threshold (default 0.05 = 5%%)",
|
||||||
|
)
|
||||||
|
parser.add_argument(
|
||||||
|
"--lag",
|
||||||
|
type=int,
|
||||||
|
default=3,
|
||||||
|
help="Boundary days to test for accounting lag (default 3)",
|
||||||
|
)
|
||||||
|
args = parser.parse_args()
|
||||||
|
|
||||||
|
def resolve(p):
|
||||||
|
if os.path.exists(p):
|
||||||
|
return p
|
||||||
|
alt = os.path.join(os.path.dirname(os.path.abspath(__file__)), p)
|
||||||
|
if os.path.exists(alt):
|
||||||
|
return alt
|
||||||
|
sys.exit(f"[ERROR] File not found: {p}")
|
||||||
|
|
||||||
|
print("[Load] AUM")
|
||||||
|
print("[Load] Flows")
|
||||||
|
aum, flows = load_data_diagnostics()
|
||||||
|
|
||||||
|
print(
|
||||||
|
f"\n[Detect] Running broken-month detection (α={args.alpha:.1%}, lag=±{args.lag}d)..."
|
||||||
|
)
|
||||||
|
df_broken, df_all = detect_broken_months(
|
||||||
|
aum, flows, alpha=args.alpha, lag_days=args.lag
|
||||||
|
)
|
||||||
|
df_agg = detect_aggregate_broken_months(
|
||||||
|
aum, flows, alpha=args.alpha, lag_days=args.lag
|
||||||
|
)
|
||||||
|
|
||||||
|
print("\n[Error account] Building error account...")
|
||||||
|
df_err_isin, df_err_agg = build_error_account(aum, flows, lag_days=args.lag)
|
||||||
|
|
||||||
|
print_summary(df_broken, df_all, args.alpha)
|
||||||
|
|
||||||
|
n_agg_broken = int(df_agg["broken"].sum())
|
||||||
|
print(
|
||||||
|
f" Aggregate broken months : {n_agg_broken} "
|
||||||
|
f"(of which lags: {int(df_agg['is_lag'].sum())})"
|
||||||
|
)
|
||||||
|
max_err = float(df_err_agg["stock_error_agg"].abs().max())
|
||||||
|
print(
|
||||||
|
f" Max aggregate error stock : {max_err:,.1f} shares "
|
||||||
|
f"({float(df_err_agg['stock_error_agg_pct'].max()):.3f}% of total AUM)"
|
||||||
|
)
|
||||||
|
|
||||||
|
# CSV output — this is what carmignac_repair.py loads
|
||||||
|
if len(df_broken):
|
||||||
|
df_broken.to_csv(args.out, index=False)
|
||||||
|
print(f"[Export] Broken months CSV → {args.out}")
|
||||||
|
else:
|
||||||
|
pd.DataFrame(columns=["date", "isin", "missing_pct", "is_lag"]).to_csv(
|
||||||
|
args.out, index=False
|
||||||
|
)
|
||||||
|
print(f"[Export] No broken months — empty CSV → {args.out}")
|
||||||
|
|
||||||
|
# Error account CSV
|
||||||
|
err_out = args.out.replace("broken_months", "error_account")
|
||||||
|
df_err_isin.to_csv(err_out, index=False)
|
||||||
|
err_agg_out = err_out.replace("error_account", "error_account_agg")
|
||||||
|
df_err_agg.to_csv(err_agg_out, index=False)
|
||||||
|
print(f"[Export] Error account (ISIN) → {err_out}")
|
||||||
|
print(f"[Export] Error account (agg) → {err_agg_out}")
|
||||||
|
|
||||||
|
html = build_html_diagnostics(
|
||||||
|
df_broken, df_all, df_agg, df_err_isin, df_err_agg, args.alpha
|
||||||
|
)
|
||||||
|
with open(args.html, "w", encoding="utf-8") as f:
|
||||||
|
f.write(html)
|
||||||
|
print(f"[Export] HTML report → {args.html}")
|
||||||
|
|
||||||
|
|
||||||
|
if __name__ == "__main__":
|
||||||
|
main()
|
||||||
995
src/repair_challenge/carmignac_repair.py
Normal file
995
src/repair_challenge/carmignac_repair.py
Normal file
|
|
@ -0,0 +1,995 @@
|
||||||
|
"""
|
||||||
|
Registrar ID Repair Pipeline
|
||||||
|
=========================================================
|
||||||
|
Étape 1 : Filtrage & univers de référence à t=31/10/2025
|
||||||
|
Étape 2 : Score de cohérence temporelle (propagation vers le passé)
|
||||||
|
Étape 3 : Chirurgie de code (matching 1-to-1)
|
||||||
|
|
||||||
|
À appliquer après le diagnostic de broken months
|
||||||
|
"""
|
||||||
|
|
||||||
|
import os
|
||||||
|
import pandas as pd
|
||||||
|
import numpy as np
|
||||||
|
|
||||||
|
from helpers import load_data_repair
|
||||||
|
|
||||||
|
# ─────────────────────────────────────────────
|
||||||
|
# PARAMÈTRES
|
||||||
|
# ─────────────────────────────────────────────
|
||||||
|
ALPHA = 0.05 # tolérance réconciliation : 5% du stock à t
|
||||||
|
MIN_AUM_EUR = 5e6 # seuil filtrage étape 1
|
||||||
|
MIN_JACCARD = 0.3 # seuil minimal similarité portefeuille pour chirurgie
|
||||||
|
SCORE_DROP_THRESHOLD = 0.15 # si score chute de >15% → candidat chirurgie
|
||||||
|
MAX_SURGERY_LOOKBACK = 6 # remonter jusqu'à 6 mois en arrière pour trouver un candidat
|
||||||
|
SYMMETRY_ATTENUATION = (
|
||||||
|
0.05 # facteur d'atténuation si rupture symétrique détectée (cas 1/3)
|
||||||
|
)
|
||||||
|
|
||||||
|
# ── Broken months ──────────────────────────────────────────────
|
||||||
|
# Attenuation factor applied to reconciliation errors on months flagged
|
||||||
|
# as "broken" by carmignac_diagnostics.py. On a broken month the error
|
||||||
|
# is multiplied by this factor before degrading the score, so a genuine
|
||||||
|
# data-quality problem at market level does not unfairly penalise an
|
||||||
|
# account. Set to 1.0 to disable attenuation.
|
||||||
|
BROKEN_MONTH_ATTENUATION = 0.2 # reduce error to 20% on broken months
|
||||||
|
|
||||||
|
# ── Accounting lag window ──────────────────────────────────────
|
||||||
|
# Transactions dated within this many days of a month-end boundary are
|
||||||
|
# considered "boundary" flows. When the standard-window reconciliation
|
||||||
|
# fails but the lag-adjusted reconciliation passes, the error is
|
||||||
|
# attenuated (same factor as broken months).
|
||||||
|
LAG_ATTENUATION = 0.1 # reduce error to 10% on likely lag months
|
||||||
|
|
||||||
|
# ── Fenêtre de chirurgie étendue ───────────────────────────────
|
||||||
|
# Quand aucun bon candidat n'est trouvé à t-1, la chirurgie remonte
|
||||||
|
# jusqu'à MAX_SURGERY_LOOKBACK mois en arrière. Pour chaque mois k
|
||||||
|
# supplémentaire, le score composite est multiplié par un facteur de
|
||||||
|
# confiance décroissant : confidence(k) = 1 - (k-1)/MAX_SURGERY_LOOKBACK.
|
||||||
|
# Carmignac suggère 6 mois (délai maximal de résolution des transferts
|
||||||
|
# asymétriques, lié au cycle de paiement des rétrocessions trimestrielles).
|
||||||
|
MAX_SURGERY_LOOKBACK = 6
|
||||||
|
|
||||||
|
EXCLUDE_REGISTRAR = ["Off Distribution", "Private Clients"]
|
||||||
|
|
||||||
|
|
||||||
|
# ─────────────────────────────────────────────
|
||||||
|
# CHARGEMENT
|
||||||
|
# ─────────────────────────────────────────────
|
||||||
|
def load_broken_months(broken_months_path):
|
||||||
|
"""
|
||||||
|
Loads the broken-months CSV produced by carmignac_diagnostics.py.
|
||||||
|
Returns a set of (date, isin) tuples flagged as broken, and a
|
||||||
|
separate set flagged as likely accounting lags.
|
||||||
|
"""
|
||||||
|
|
||||||
|
if not broken_months_path or not os.path.exists(broken_months_path):
|
||||||
|
print("Could not find the path")
|
||||||
|
return set(), set()
|
||||||
|
try:
|
||||||
|
df = pd.read_csv(broken_months_path, parse_dates=["date"])
|
||||||
|
broken = set(zip(pd.to_datetime(df["date"]), df["isin"].astype(str)))
|
||||||
|
lags = set(
|
||||||
|
zip(
|
||||||
|
pd.to_datetime(df.loc[df["is_lag"], "date"]),
|
||||||
|
df.loc[df["is_lag"], "isin"].astype(str),
|
||||||
|
)
|
||||||
|
)
|
||||||
|
print(
|
||||||
|
f"[Broken months] Loaded {len(broken)} flagged (isin, month) pairs "
|
||||||
|
f"({len(lags)} likely lags)"
|
||||||
|
)
|
||||||
|
return broken, lags
|
||||||
|
except Exception as e:
|
||||||
|
print(f"[Broken months] Could not load '{broken_months_path}': {e}")
|
||||||
|
return set(), set()
|
||||||
|
|
||||||
|
|
||||||
|
# ─────────────────────────────────────────────
|
||||||
|
# ÉTAPE 1 — Univers de référence à T_REF
|
||||||
|
# ─────────────────────────────────────────────
|
||||||
|
def build_reference_universe(aum, t_ref=None):
|
||||||
|
"""
|
||||||
|
Construit l'univers de référence à t_ref (dernière date par défaut).
|
||||||
|
Retourne :
|
||||||
|
- aum_ref : AUM à t_ref pour chaque (reg_id, isin)
|
||||||
|
- weights : poids normalisé par reg_id
|
||||||
|
- universe : ensemble des reg_id retenus (>= MIN_AUM_EUR)
|
||||||
|
"""
|
||||||
|
if t_ref is None:
|
||||||
|
t_ref = aum["date"].max()
|
||||||
|
|
||||||
|
print(f"\n[Étape 1] Date de référence : {t_ref.date()}")
|
||||||
|
|
||||||
|
# Exclure Off Distribution / Private Clients (sur région ou nom)
|
||||||
|
mask_excl = aum["reg_id"].isin(EXCLUDE_REGISTRAR)
|
||||||
|
if "region" in aum.columns:
|
||||||
|
mask_excl |= aum["region"].isin(EXCLUDE_REGISTRAR)
|
||||||
|
aum_clean = aum[~mask_excl].copy()
|
||||||
|
|
||||||
|
# AUM à t_ref
|
||||||
|
aum_ref = aum_clean[aum_clean["date"] == t_ref][
|
||||||
|
["reg_id", "isin", "qty_aum", "val_eur"]
|
||||||
|
].copy()
|
||||||
|
|
||||||
|
# AUM total par reg_id à t_ref
|
||||||
|
aum_by_reg = aum_ref.groupby("reg_id")["val_eur"].sum().rename("total_eur")
|
||||||
|
|
||||||
|
# Filtrage >= MIN_AUM_EUR
|
||||||
|
universe = set(aum_by_reg[aum_by_reg >= MIN_AUM_EUR].index)
|
||||||
|
|
||||||
|
total_eur_universe = aum_by_reg[aum_by_reg.index.isin(universe)].sum()
|
||||||
|
total_eur_all = aum_by_reg.sum()
|
||||||
|
coverage = total_eur_universe / total_eur_all if total_eur_all > 0 else 0
|
||||||
|
|
||||||
|
print(f" Registrar IDs à t_ref : {len(aum_by_reg)}")
|
||||||
|
print(f" Dont >= {MIN_AUM_EUR / 1e6:.0f}M€ : {len(universe)}")
|
||||||
|
print(f" Couverture encours : {coverage:.1%}")
|
||||||
|
|
||||||
|
# Poids initiaux (scores à t_ref)
|
||||||
|
weights = (
|
||||||
|
aum_by_reg[aum_by_reg.index.isin(universe)] / total_eur_universe
|
||||||
|
).to_dict()
|
||||||
|
|
||||||
|
return aum_ref, weights, universe, t_ref
|
||||||
|
|
||||||
|
|
||||||
|
# ─────────────────────────────────────────────
|
||||||
|
# 3. PANEL AUM MENSUEL (forward-fill)
|
||||||
|
# ─────────────────────────────────────────────
|
||||||
|
def build_monthly_panel(aum, universe, t_ref):
|
||||||
|
"""
|
||||||
|
Construit un panel mensuel complet (forward-fill des quantités AUM)
|
||||||
|
pour TOUS les reg_ids présents dans l'historique AUM — y compris les codes
|
||||||
|
historiques hors univers de référence, nécessaires pour la chirurgie.
|
||||||
|
"""
|
||||||
|
# Toutes les fin de mois entre la première date et t_ref
|
||||||
|
date_min = aum["date"].min()
|
||||||
|
all_months = pd.date_range(start=date_min, end=t_ref, freq="ME")
|
||||||
|
|
||||||
|
# Pivot : (reg_id, isin) → série temporelle de qty_aum
|
||||||
|
aum_sorted = aum.sort_values(["reg_id", "isin", "date"])
|
||||||
|
|
||||||
|
# On ne garde que les lignes jusqu'à t_ref
|
||||||
|
aum_sorted = aum_sorted[aum_sorted["date"] <= t_ref]
|
||||||
|
|
||||||
|
# Multi-index pivot
|
||||||
|
panel = aum_sorted.pivot_table(
|
||||||
|
index="date", columns=["reg_id", "isin"], values="qty_aum", aggfunc="last"
|
||||||
|
)
|
||||||
|
|
||||||
|
# Réindexer sur toutes les fins de mois
|
||||||
|
panel = panel.reindex(all_months)
|
||||||
|
|
||||||
|
# Forward-fill : si pas de mouvement, la quantité reste la même
|
||||||
|
panel = panel.ffill()
|
||||||
|
|
||||||
|
# Backward-fill initial pour les comptes qui démarrent après la première date
|
||||||
|
# (on ne remonte pas avant leur première apparition → on garde NaN)
|
||||||
|
|
||||||
|
print(
|
||||||
|
f"\n[Panel mensuel] {len(all_months)} mois, {panel.shape[1]} (reg_id, isin) paires"
|
||||||
|
)
|
||||||
|
|
||||||
|
return panel, all_months
|
||||||
|
|
||||||
|
|
||||||
|
# ─────────────────────────────────────────────
|
||||||
|
# 4. FLOWS AGRÉGÉS PAR MOIS
|
||||||
|
# ─────────────────────────────────────────────
|
||||||
|
def aggregate_flows_monthly(flows, all_months, lag_days=3):
|
||||||
|
"""
|
||||||
|
Agrège les flows infra-mensuels sur chaque fenêtre ]fin_mois(t-1), fin_mois(t)].
|
||||||
|
Retourne deux DataFrames indexés par (fin_mois, reg_id, isin) :
|
||||||
|
- monthly_flows : agrégation standard (fenêtre exacte)
|
||||||
|
- monthly_flows_lag : agrégation avec fenêtre élargie de ±lag_days jours
|
||||||
|
autour de chaque fin de mois. Utilisé pour détecter
|
||||||
|
les ruptures dues à un décalage comptable de fin de mois.
|
||||||
|
"""
|
||||||
|
flows_f = flows[flows["date"] <= all_months[-1]].copy()
|
||||||
|
|
||||||
|
def assign_month(d, lower_offset=0, upper_offset=0):
|
||||||
|
for m in all_months:
|
||||||
|
eom_prev = m - pd.offsets.MonthEnd(1)
|
||||||
|
lo = eom_prev - pd.Timedelta(days=lower_offset)
|
||||||
|
hi = m + pd.Timedelta(days=upper_offset)
|
||||||
|
if lo < d <= hi:
|
||||||
|
return m
|
||||||
|
return pd.NaT
|
||||||
|
|
||||||
|
# Standard window
|
||||||
|
flows_f["month_end"] = flows_f["date"].apply(lambda d: assign_month(d))
|
||||||
|
flows_std = flows_f.dropna(subset=["month_end"])
|
||||||
|
monthly_flows = (
|
||||||
|
flows_std.groupby(["month_end", "reg_id", "isin"])["qty_net"]
|
||||||
|
.sum()
|
||||||
|
.reset_index()
|
||||||
|
)
|
||||||
|
monthly_flows.columns = ["date", "reg_id", "isin", "qty_net_month"]
|
||||||
|
|
||||||
|
# Lag window (±lag_days around each EOM)
|
||||||
|
flows_f2 = flows[flows["date"] <= all_months[-1]].copy()
|
||||||
|
flows_f2["month_end"] = flows_f2["date"].apply(
|
||||||
|
lambda d: assign_month(d, lower_offset=lag_days, upper_offset=lag_days)
|
||||||
|
)
|
||||||
|
flows_lag = flows_f2.dropna(subset=["month_end"])
|
||||||
|
monthly_flows_lag = (
|
||||||
|
flows_lag.groupby(["month_end", "reg_id", "isin"])["qty_net"]
|
||||||
|
.sum()
|
||||||
|
.reset_index()
|
||||||
|
)
|
||||||
|
monthly_flows_lag.columns = ["date", "reg_id", "isin", "qty_net_month"]
|
||||||
|
|
||||||
|
print(
|
||||||
|
f"\n[Flows mensuels] {len(monthly_flows)} enregistrements (standard) | "
|
||||||
|
f"{len(monthly_flows_lag)} (lag window ±{lag_days}d)"
|
||||||
|
)
|
||||||
|
|
||||||
|
return monthly_flows, monthly_flows_lag
|
||||||
|
|
||||||
|
|
||||||
|
# ─────────────────────────────────────────────
|
||||||
|
# ÉTAPE 2 — Score de cohérence temporelle
|
||||||
|
# ─────────────────────────────────────────────
|
||||||
|
def compute_reconciliation_error(qty_t_minus1, qty_t, net_flow, alpha=ALPHA):
|
||||||
|
"""
|
||||||
|
Calcule l'erreur de réconciliation normalisée pour un (reg_id, isin, mois).
|
||||||
|
|
||||||
|
Attendu : qty_t_minus1 + net_flow ≈ qty_t
|
||||||
|
Erreur : |qty_t_minus1 + net_flow - qty_t| / max(|qty_t|, |qty_t_minus1|)
|
||||||
|
|
||||||
|
Retourne :
|
||||||
|
- err_ratio : erreur relative (0 = parfait)
|
||||||
|
- is_break : True si err_ratio > alpha
|
||||||
|
"""
|
||||||
|
denom = max(abs(qty_t), abs(qty_t_minus1), 1e-9)
|
||||||
|
err = abs(qty_t_minus1 + net_flow - qty_t)
|
||||||
|
err_ratio = err / denom
|
||||||
|
return err_ratio, err_ratio > alpha
|
||||||
|
|
||||||
|
|
||||||
|
def score_propagation(
|
||||||
|
panel,
|
||||||
|
monthly_flows,
|
||||||
|
monthly_flows_lag,
|
||||||
|
weights,
|
||||||
|
universe,
|
||||||
|
all_months,
|
||||||
|
broken_months=None,
|
||||||
|
lag_months=None,
|
||||||
|
):
|
||||||
|
"""
|
||||||
|
Propage les scores de t_ref vers t=0 (passé).
|
||||||
|
|
||||||
|
À chaque mois t (en remontant), pour chaque reg_id dans l'univers courant :
|
||||||
|
- Calculer l'erreur de réconciliation pondérée par ISIN
|
||||||
|
- Dégrader le score proportionnellement
|
||||||
|
- Atténuer l'erreur si le mois est flagué comme "broken" ou "lag"
|
||||||
|
|
||||||
|
broken_months : set of (date, isin) pairs flagged as broken by diagnostics
|
||||||
|
lag_months : subset of broken_months identified as likely accounting lags
|
||||||
|
|
||||||
|
Retourne :
|
||||||
|
- scores_history : dict {date → {reg_id → score}}
|
||||||
|
- errors_history : dict {date → {reg_id → err_pondérée}}
|
||||||
|
- mapping : dict {reg_id_original → reg_id_courant} (après chirurgie)
|
||||||
|
"""
|
||||||
|
broken_months = broken_months or set()
|
||||||
|
lag_months = lag_months or set()
|
||||||
|
|
||||||
|
# Initialisation
|
||||||
|
scores = dict(weights) # scores à t_ref
|
||||||
|
scores_history = {all_months[-1]: dict(scores)}
|
||||||
|
errors_history = {}
|
||||||
|
|
||||||
|
# Mapping actuel (identité au départ)
|
||||||
|
mapping = {r: r for r in universe}
|
||||||
|
|
||||||
|
# Flows indexés pour accès rapide
|
||||||
|
flows_idx = monthly_flows.set_index(["date", "reg_id", "isin"])["qty_net_month"]
|
||||||
|
flows_idx_lag = monthly_flows_lag.set_index(["date", "reg_id", "isin"])[
|
||||||
|
"qty_net_month"
|
||||||
|
]
|
||||||
|
|
||||||
|
# ── Pré-calcul des AUM agrégés par (isin, mois) pour détection de symétrie ──
|
||||||
|
# Pour chaque (isin, t), on calcule la somme des variations de stock par compte.
|
||||||
|
# Une rupture symétrique = un compte perd X parts sur un ISIN, un autre en gagne X.
|
||||||
|
# On détecte cela via le résidu net agrégé : si faible → symétrie probable.
|
||||||
|
# Structure : {(t_curr, isin) → {reg_id → delta_qty}}
|
||||||
|
# Calculé à la volée dans la boucle, pas en pré-calcul (trop mémoire pour 400 comptes).
|
||||||
|
|
||||||
|
# Remonter dans le temps
|
||||||
|
for i in range(len(all_months) - 2, -1, -1):
|
||||||
|
t_prev = all_months[i]
|
||||||
|
t_curr = all_months[i + 1]
|
||||||
|
|
||||||
|
# ── Détection de ruptures symétriques à ce pas de temps ──────
|
||||||
|
# Pour chaque ISIN, calculer la variation de stock par compte.
|
||||||
|
# Si la somme des variations positives ≈ somme des variations négatives
|
||||||
|
# → il y a probablement compensation (cas 1 ou 3, pas de perte nette).
|
||||||
|
# On stocke pour chaque (reg_id, isin) si sa rupture est symétrique.
|
||||||
|
symmetric_breaks = set() # ensemble de (reg_id, isin) à atténuer
|
||||||
|
|
||||||
|
for reg in panel.columns.get_level_values(0):
|
||||||
|
for isin in panel[reg].columns:
|
||||||
|
q_t = panel[reg][isin].get(t_curr, np.nan)
|
||||||
|
q_prev = panel[reg][isin].get(t_prev, np.nan)
|
||||||
|
if pd.isna(q_t) or pd.isna(q_prev):
|
||||||
|
continue
|
||||||
|
try:
|
||||||
|
f = flows_idx.loc[(t_curr, reg, isin)]
|
||||||
|
except KeyError:
|
||||||
|
f = 0.0
|
||||||
|
residual = (q_t - q_prev) - f
|
||||||
|
if abs(residual) < ALPHA * max(abs(q_t), abs(q_prev), 1e-9):
|
||||||
|
continue # pas de rupture sur ce compte/ISIN
|
||||||
|
|
||||||
|
# Agrégation par ISIN : si le résidu net agrégé est petit,
|
||||||
|
# les ruptures individuelles se compensent → symétrie.
|
||||||
|
isin_residuals = {}
|
||||||
|
isin_total_abs = {}
|
||||||
|
for reg in panel.columns.get_level_values(0):
|
||||||
|
for isin in panel[reg].columns:
|
||||||
|
q_t = panel[reg][isin].get(t_curr, np.nan)
|
||||||
|
q_prev = panel[reg][isin].get(t_prev, np.nan)
|
||||||
|
if pd.isna(q_t) or pd.isna(q_prev):
|
||||||
|
continue
|
||||||
|
try:
|
||||||
|
f = flows_idx.loc[(t_curr, reg, isin)]
|
||||||
|
except KeyError:
|
||||||
|
f = 0.0
|
||||||
|
residual = (q_t - q_prev) - f
|
||||||
|
denom = max(abs(q_t), abs(q_prev), 1e-9)
|
||||||
|
err = abs(residual) / denom
|
||||||
|
if err < ALPHA:
|
||||||
|
continue
|
||||||
|
isin_residuals[isin] = isin_residuals.get(isin, 0.0) + residual
|
||||||
|
isin_total_abs[isin] = isin_total_abs.get(isin, 0.0) + abs(residual)
|
||||||
|
|
||||||
|
# Un ISIN est "symétrique" si le résidu net < 20% du résidu brut total
|
||||||
|
# (les erreurs individuelles s'annulent en grande partie)
|
||||||
|
symmetric_isins = set()
|
||||||
|
for isin, net in isin_residuals.items():
|
||||||
|
total = isin_total_abs.get(isin, 0.0)
|
||||||
|
if total > 0 and abs(net) / total < 0.20:
|
||||||
|
symmetric_isins.add(isin)
|
||||||
|
|
||||||
|
errors_at_t = {}
|
||||||
|
new_scores = {}
|
||||||
|
|
||||||
|
for reg_orig, reg_curr in mapping.items():
|
||||||
|
score_curr = scores.get(reg_orig, 0)
|
||||||
|
if score_curr == 0:
|
||||||
|
new_scores[reg_orig] = 0
|
||||||
|
continue
|
||||||
|
|
||||||
|
# ISIN détenus par ce reg à t_curr (après mapping)
|
||||||
|
if reg_curr in panel.columns.get_level_values(0):
|
||||||
|
isin_list = panel[reg_curr].columns.tolist()
|
||||||
|
else:
|
||||||
|
# reg_curr n'existe pas du tout dans le panel → rupture totale
|
||||||
|
new_scores[reg_orig] = 0
|
||||||
|
errors_at_t[reg_orig] = 1.0
|
||||||
|
continue
|
||||||
|
|
||||||
|
total_aum_t = 0
|
||||||
|
weighted_err = 0
|
||||||
|
valid_isin_count = 0
|
||||||
|
all_nan_at_prev = True # détecte si le compte n'existait pas à t_prev
|
||||||
|
|
||||||
|
for isin in isin_list:
|
||||||
|
qty_t = panel[reg_curr][isin].get(t_curr, np.nan)
|
||||||
|
qty_t_prev = panel[reg_curr][isin].get(t_prev, np.nan)
|
||||||
|
|
||||||
|
if pd.isna(qty_t):
|
||||||
|
continue
|
||||||
|
|
||||||
|
if not pd.isna(qty_t_prev):
|
||||||
|
all_nan_at_prev = False
|
||||||
|
|
||||||
|
if pd.isna(qty_t_prev):
|
||||||
|
# ISIN existait à t_curr mais pas à t_prev → rupture sur cet ISIN
|
||||||
|
# On le traite comme une erreur maximale pondérée par son AUM
|
||||||
|
weight_isin = abs(qty_t)
|
||||||
|
weighted_err += 1.0 * weight_isin
|
||||||
|
total_aum_t += weight_isin
|
||||||
|
valid_isin_count += 1
|
||||||
|
continue
|
||||||
|
|
||||||
|
if qty_t == 0 and qty_t_prev == 0:
|
||||||
|
continue
|
||||||
|
# Flow agrégé sur ]t_prev, t_curr]
|
||||||
|
try:
|
||||||
|
net_flow = flows_idx.loc[(t_curr, reg_curr, isin)]
|
||||||
|
except KeyError:
|
||||||
|
net_flow = 0.0
|
||||||
|
|
||||||
|
err_ratio, is_break = compute_reconciliation_error(
|
||||||
|
qty_t_prev, qty_t, net_flow, alpha=ALPHA
|
||||||
|
)
|
||||||
|
|
||||||
|
# ── Attenuation on broken / lag / symmetric months ───
|
||||||
|
# Priority: symmetric > broken > lag
|
||||||
|
if err_ratio > 0:
|
||||||
|
key = (t_curr, isin)
|
||||||
|
if isin in symmetric_isins:
|
||||||
|
# Rupture compensée à l'agrégé → cas 1 ou 3,
|
||||||
|
# pas de perte nette de données → atténuation forte
|
||||||
|
err_ratio = err_ratio * SYMMETRY_ATTENUATION
|
||||||
|
elif key in broken_months or key in lag_months:
|
||||||
|
# Try lag-window flow to distinguish lag vs genuine gap
|
||||||
|
try:
|
||||||
|
net_flow_lag = flows_idx_lag.loc[(t_curr, reg_curr, isin)]
|
||||||
|
except KeyError:
|
||||||
|
net_flow_lag = net_flow
|
||||||
|
err_lag, _ = compute_reconciliation_error(
|
||||||
|
qty_t_prev, qty_t, net_flow_lag, alpha=ALPHA
|
||||||
|
)
|
||||||
|
# Use whichever flow window gives the smaller error,
|
||||||
|
# then attenuate the result
|
||||||
|
best_err = min(err_ratio, err_lag)
|
||||||
|
attenuation = (
|
||||||
|
BROKEN_MONTH_ATTENUATION
|
||||||
|
if key in broken_months
|
||||||
|
else LAG_ATTENUATION
|
||||||
|
)
|
||||||
|
err_ratio = best_err * attenuation
|
||||||
|
|
||||||
|
# Pondération par AUM à t_curr
|
||||||
|
weight_isin = abs(qty_t)
|
||||||
|
weighted_err += err_ratio * weight_isin
|
||||||
|
total_aum_t += weight_isin
|
||||||
|
valid_isin_count += 1
|
||||||
|
|
||||||
|
if total_aum_t > 0 and valid_isin_count > 0:
|
||||||
|
avg_err = weighted_err / total_aum_t
|
||||||
|
else:
|
||||||
|
avg_err = 0.0
|
||||||
|
|
||||||
|
errors_at_t[reg_orig] = avg_err
|
||||||
|
|
||||||
|
# Dégradation du score : score(t-1) = score(t) * (1 - err_pondérée)
|
||||||
|
# Clippée entre 0 et score_curr
|
||||||
|
degradation = min(avg_err, 1.0)
|
||||||
|
new_scores[reg_orig] = score_curr * (1.0 - degradation)
|
||||||
|
|
||||||
|
scores = new_scores
|
||||||
|
scores_history[t_prev] = dict(scores)
|
||||||
|
errors_history[t_prev] = dict(errors_at_t)
|
||||||
|
|
||||||
|
total_score = sum(scores.values())
|
||||||
|
print(
|
||||||
|
f" {t_prev.date()} | Σ scores = {total_score:.4f} | "
|
||||||
|
f"Comptes actifs = {sum(1 for v in scores.values() if v > 0)}"
|
||||||
|
)
|
||||||
|
|
||||||
|
return scores_history, errors_history, mapping
|
||||||
|
|
||||||
|
|
||||||
|
# ─────────────────────────────────────────────
|
||||||
|
# ÉTAPE 3 — Chirurgie de code
|
||||||
|
# ─────────────────────────────────────────────
|
||||||
|
def jaccard_isin(set_a, set_b):
|
||||||
|
"""Coefficient de Jaccard entre deux ensembles d'ISIN."""
|
||||||
|
if not set_a or not set_b:
|
||||||
|
return 0.0
|
||||||
|
inter = len(set_a & set_b)
|
||||||
|
union = len(set_a | set_b)
|
||||||
|
return inter / union if union > 0 else 0.0
|
||||||
|
|
||||||
|
|
||||||
|
def find_best_candidate(
|
||||||
|
reg_orig,
|
||||||
|
reg_curr,
|
||||||
|
t_prev,
|
||||||
|
t_curr,
|
||||||
|
panel,
|
||||||
|
flows_idx,
|
||||||
|
all_regs_at_t_prev,
|
||||||
|
mapping_inv,
|
||||||
|
):
|
||||||
|
"""
|
||||||
|
Pour un reg_id dont le score a fortement chuté, cherche le meilleur
|
||||||
|
candidat j à t_prev tel que :
|
||||||
|
- j n'est pas déjà mappé à un autre compte original
|
||||||
|
- Le portefeuille ISIN de j à t_prev est similaire à celui de reg_curr à t_curr
|
||||||
|
- La réconciliation est bonne
|
||||||
|
|
||||||
|
Retourne (best_candidate, best_score_composite) ou (None, 0)
|
||||||
|
"""
|
||||||
|
# ISIN du compte cible à t_curr
|
||||||
|
if reg_curr not in panel.columns.get_level_values(0):
|
||||||
|
return None, 0.0
|
||||||
|
|
||||||
|
isin_curr = set(
|
||||||
|
panel[reg_curr]
|
||||||
|
.columns[
|
||||||
|
panel[reg_curr].loc[t_curr].notna() & (panel[reg_curr].loc[t_curr] != 0)
|
||||||
|
]
|
||||||
|
.tolist()
|
||||||
|
)
|
||||||
|
|
||||||
|
if not isin_curr:
|
||||||
|
return None, 0.0
|
||||||
|
|
||||||
|
best_candidate = None
|
||||||
|
best_composite = 0.0
|
||||||
|
|
||||||
|
for j in all_regs_at_t_prev:
|
||||||
|
# Ne pas réutiliser un code déjà mappé
|
||||||
|
if j in mapping_inv:
|
||||||
|
continue
|
||||||
|
# Ne pas mapper sur soi-même si déjà présent
|
||||||
|
if j == reg_curr:
|
||||||
|
continue
|
||||||
|
|
||||||
|
if j not in panel.columns.get_level_values(0):
|
||||||
|
continue
|
||||||
|
|
||||||
|
# ISIN de j à t_prev
|
||||||
|
col_j = panel[j]
|
||||||
|
isin_j = (
|
||||||
|
set(
|
||||||
|
col_j.columns[
|
||||||
|
col_j.loc[t_prev].notna() & (col_j.loc[t_prev] != 0)
|
||||||
|
].tolist()
|
||||||
|
)
|
||||||
|
if t_prev in col_j.index
|
||||||
|
else set()
|
||||||
|
)
|
||||||
|
|
||||||
|
if not isin_j:
|
||||||
|
continue
|
||||||
|
|
||||||
|
jac = jaccard_isin(isin_curr, isin_j)
|
||||||
|
if jac < MIN_JACCARD:
|
||||||
|
continue
|
||||||
|
|
||||||
|
# Erreur de réconciliation pour les ISIN communs
|
||||||
|
common_isin = isin_curr & isin_j
|
||||||
|
total_aum = 0
|
||||||
|
weighted_err = 0
|
||||||
|
|
||||||
|
for isin in common_isin:
|
||||||
|
qty_t = (
|
||||||
|
panel[reg_curr][isin].get(t_curr, np.nan)
|
||||||
|
if isin in panel[reg_curr].columns
|
||||||
|
else np.nan
|
||||||
|
)
|
||||||
|
qty_t_prev = (
|
||||||
|
panel[j][isin].get(t_prev, np.nan)
|
||||||
|
if isin in panel[j].columns
|
||||||
|
else np.nan
|
||||||
|
)
|
||||||
|
|
||||||
|
if pd.isna(qty_t) or pd.isna(qty_t_prev):
|
||||||
|
continue
|
||||||
|
|
||||||
|
try:
|
||||||
|
net_flow = flows_idx.loc[(t_curr, j, isin)]
|
||||||
|
except KeyError:
|
||||||
|
net_flow = 0.0
|
||||||
|
|
||||||
|
err_ratio, _ = compute_reconciliation_error(qty_t_prev, qty_t, net_flow)
|
||||||
|
weight_isin = abs(qty_t)
|
||||||
|
weighted_err += err_ratio * weight_isin
|
||||||
|
total_aum += weight_isin
|
||||||
|
|
||||||
|
avg_err = weighted_err / total_aum if total_aum > 0 else 1.0
|
||||||
|
|
||||||
|
composite = jac * (1.0 - min(avg_err, 1.0))
|
||||||
|
|
||||||
|
if composite > best_composite:
|
||||||
|
best_composite = composite
|
||||||
|
best_candidate = j
|
||||||
|
|
||||||
|
return best_candidate, best_composite
|
||||||
|
|
||||||
|
|
||||||
|
def _recompute_score_with_candidate(
|
||||||
|
reg_orig, candidate, t_prev, t_curr, panel, flows_idx, score_curr
|
||||||
|
):
|
||||||
|
"""
|
||||||
|
Recalcule proprement l'erreur de réconciliation pour un candidat donné,
|
||||||
|
et retourne le score après chirurgie.
|
||||||
|
"""
|
||||||
|
if candidate not in panel.columns.get_level_values(0):
|
||||||
|
return score_curr * 0 # candidat inexistant
|
||||||
|
|
||||||
|
isin_list_cand = panel[candidate].columns.tolist()
|
||||||
|
isin_list_curr = (
|
||||||
|
panel[reg_orig].columns.tolist()
|
||||||
|
if reg_orig in panel.columns.get_level_values(0)
|
||||||
|
else []
|
||||||
|
)
|
||||||
|
|
||||||
|
total_aum = 0
|
||||||
|
weighted_err = 0
|
||||||
|
|
||||||
|
for isin in isin_list_curr:
|
||||||
|
qty_t = (
|
||||||
|
panel[reg_orig][isin].get(t_curr, np.nan)
|
||||||
|
if isin in panel[reg_orig].columns
|
||||||
|
else np.nan
|
||||||
|
)
|
||||||
|
if pd.isna(qty_t) or qty_t == 0:
|
||||||
|
continue
|
||||||
|
|
||||||
|
qty_t_prev = (
|
||||||
|
panel[candidate][isin].get(t_prev, np.nan)
|
||||||
|
if isin in panel[candidate].columns
|
||||||
|
else np.nan
|
||||||
|
)
|
||||||
|
|
||||||
|
try:
|
||||||
|
net_flow = flows_idx.loc[(t_curr, candidate, isin)]
|
||||||
|
except KeyError:
|
||||||
|
net_flow = 0.0
|
||||||
|
|
||||||
|
if pd.isna(qty_t_prev):
|
||||||
|
err_ratio = 1.0
|
||||||
|
else:
|
||||||
|
err_ratio, _ = compute_reconciliation_error(qty_t_prev, qty_t, net_flow)
|
||||||
|
|
||||||
|
weight_isin = abs(qty_t)
|
||||||
|
weighted_err += err_ratio * weight_isin
|
||||||
|
total_aum += weight_isin
|
||||||
|
|
||||||
|
avg_err = weighted_err / total_aum if total_aum > 0 else 1.0
|
||||||
|
return score_curr * (1.0 - min(avg_err, 1.0))
|
||||||
|
|
||||||
|
|
||||||
|
def run_surgery_pass(
|
||||||
|
scores_history,
|
||||||
|
errors_history,
|
||||||
|
panel,
|
||||||
|
monthly_flows,
|
||||||
|
monthly_flows_lag,
|
||||||
|
weights,
|
||||||
|
universe,
|
||||||
|
all_months,
|
||||||
|
broken_months=None,
|
||||||
|
lag_months=None,
|
||||||
|
):
|
||||||
|
"""
|
||||||
|
Deuxième passe : pour chaque mois avec des ruptures fortes,
|
||||||
|
tente une chirurgie de code et recalcule les scores.
|
||||||
|
|
||||||
|
Corrections par rapport à la passe naïve :
|
||||||
|
- Après chirurgie, le score est recalculé proprement (pas juste composite)
|
||||||
|
- Le mapping propagé en arrière utilise le bon code à chaque étape
|
||||||
|
- Pré-filtre ISIN pour performance sur grand dataset
|
||||||
|
|
||||||
|
Retourne :
|
||||||
|
- mapping_history : {date → {reg_orig → reg_used}}
|
||||||
|
- surgery_log : liste des opérations effectuées
|
||||||
|
- scores_final : scores au dernier mois
|
||||||
|
"""
|
||||||
|
flows_idx = monthly_flows.set_index(["date", "reg_id", "isin"])["qty_net_month"]
|
||||||
|
flows_idx_lag = monthly_flows_lag.set_index(["date", "reg_id", "isin"])[
|
||||||
|
"qty_net_month"
|
||||||
|
]
|
||||||
|
|
||||||
|
# Tous les reg_ids présents dans le panel (univers + codes historiques)
|
||||||
|
all_regs_in_panel = set(panel.columns.get_level_values(0))
|
||||||
|
|
||||||
|
# Pré-calcul : ensemble d'ISIN par reg_id à chaque date (pour pré-filtre rapide)
|
||||||
|
# {reg_id → {date → set(isin)}}
|
||||||
|
reg_isin_at_date = {}
|
||||||
|
for reg in all_regs_in_panel:
|
||||||
|
reg_isin_at_date[reg] = {}
|
||||||
|
col = panel[reg]
|
||||||
|
for date in col.index:
|
||||||
|
active = set(
|
||||||
|
col.columns[(col.loc[date].notna()) & (col.loc[date] != 0)].tolist()
|
||||||
|
)
|
||||||
|
if active:
|
||||||
|
reg_isin_at_date[reg][date] = active
|
||||||
|
|
||||||
|
# Mapping courant : reg_orig → reg_used
|
||||||
|
mapping = {r: r for r in universe}
|
||||||
|
mapping_inv = {r: r for r in universe}
|
||||||
|
|
||||||
|
surgery_log = []
|
||||||
|
mapping_history = {all_months[-1]: dict(mapping)}
|
||||||
|
scores_history_corrected = {all_months[-1]: dict(weights)}
|
||||||
|
|
||||||
|
# Scores courants (initialisés à t_ref)
|
||||||
|
scores = dict(weights)
|
||||||
|
|
||||||
|
for i in range(len(all_months) - 2, -1, -1):
|
||||||
|
t_prev = all_months[i]
|
||||||
|
t_curr = all_months[i + 1]
|
||||||
|
|
||||||
|
new_scores = {}
|
||||||
|
new_mapping = {}
|
||||||
|
|
||||||
|
for reg_orig in list(mapping.keys()):
|
||||||
|
reg_curr = mapping[reg_orig]
|
||||||
|
score_curr = scores.get(reg_orig, 0)
|
||||||
|
|
||||||
|
if score_curr == 0:
|
||||||
|
new_scores[reg_orig] = 0
|
||||||
|
new_mapping[reg_orig] = reg_curr
|
||||||
|
continue
|
||||||
|
|
||||||
|
# Erreur sans chirurgie (depuis étape 2)
|
||||||
|
err = errors_history.get(t_prev, {}).get(reg_orig, 0.0)
|
||||||
|
score_prev_no_surgery = score_curr * (1.0 - min(err, 1.0))
|
||||||
|
drop_ratio = (
|
||||||
|
1.0 - (score_prev_no_surgery / score_curr) if score_curr > 0 else 0
|
||||||
|
)
|
||||||
|
|
||||||
|
if drop_ratio > SCORE_DROP_THRESHOLD:
|
||||||
|
# ── ISIN du compte courant à t_curr (pour pré-filtre) ──
|
||||||
|
isin_curr = reg_isin_at_date.get(reg_curr, {}).get(t_curr, set())
|
||||||
|
|
||||||
|
# ── Candidats disponibles ──
|
||||||
|
# On exclut les codes déjà mappés à un autre compte,
|
||||||
|
# mais reg_curr lui-même est un candidat valide (self-mapping).
|
||||||
|
available = (all_regs_in_panel - set(mapping_inv.keys())) | {reg_curr}
|
||||||
|
|
||||||
|
best_candidate = None
|
||||||
|
best_score_after = score_prev_no_surgery # baseline = pas de chirurgie
|
||||||
|
best_composite = 0.0
|
||||||
|
best_lookback = 0 # nombre de mois remontés pour trouver ce candidat
|
||||||
|
|
||||||
|
# ── Fenêtre de recherche étendue : jusqu'à MAX_SURGERY_LOOKBACK mois ──
|
||||||
|
# On cherche d'abord à t-1 (k=1), puis t-2 … t-MAX si rien trouvé.
|
||||||
|
# La confiance décroît avec la distance : confidence(k) = 1 - (k-1)/MAX
|
||||||
|
for k in range(1, MAX_SURGERY_LOOKBACK + 1):
|
||||||
|
if i - (k - 1) < 0:
|
||||||
|
break # on a atteint le début de l'historique
|
||||||
|
t_lookup = all_months[
|
||||||
|
i - (k - 1)
|
||||||
|
] # date candidate = t_prev - (k-1)
|
||||||
|
confidence = 1.0 - (k - 1) / MAX_SURGERY_LOOKBACK
|
||||||
|
|
||||||
|
for j in available:
|
||||||
|
# Pré-filtre rapide : overlap ISIN minimal
|
||||||
|
isin_j = reg_isin_at_date.get(j, {}).get(t_lookup, set())
|
||||||
|
if not isin_curr or not isin_j:
|
||||||
|
continue
|
||||||
|
inter = len(isin_curr & isin_j)
|
||||||
|
if inter == 0:
|
||||||
|
continue
|
||||||
|
jac = inter / len(isin_curr | isin_j)
|
||||||
|
if jac < MIN_JACCARD:
|
||||||
|
continue
|
||||||
|
|
||||||
|
# Score après chirurgie avec ce candidat à t_lookup
|
||||||
|
# (on utilise t_curr comme référence de stock, t_lookup comme prior)
|
||||||
|
score_after_raw = _recompute_score_with_candidate(
|
||||||
|
reg_curr, j, t_lookup, t_curr, panel, flows_idx, score_curr
|
||||||
|
)
|
||||||
|
# Appliquer le facteur de confiance lié à la distance temporelle
|
||||||
|
score_after = (
|
||||||
|
score_curr * confidence * (score_after_raw / score_curr)
|
||||||
|
if score_curr > 0
|
||||||
|
else score_after_raw
|
||||||
|
)
|
||||||
|
composite = (
|
||||||
|
jac * confidence * (score_after_raw / score_curr)
|
||||||
|
if score_curr > 0
|
||||||
|
else 0
|
||||||
|
)
|
||||||
|
|
||||||
|
if score_after > best_score_after:
|
||||||
|
best_score_after = score_after
|
||||||
|
best_candidate = j
|
||||||
|
best_composite = composite
|
||||||
|
best_lookback = k
|
||||||
|
|
||||||
|
# Si on a trouvé un bon candidat à cette distance, on s'arrête
|
||||||
|
if best_candidate is not None:
|
||||||
|
break
|
||||||
|
|
||||||
|
if best_candidate:
|
||||||
|
lookback_note = (
|
||||||
|
f", lookback={best_lookback}m" if best_lookback > 1 else ""
|
||||||
|
)
|
||||||
|
surgery_log.append(
|
||||||
|
{
|
||||||
|
"date": t_prev,
|
||||||
|
"reg_orig": reg_orig,
|
||||||
|
"reg_from": reg_curr,
|
||||||
|
"reg_to": best_candidate,
|
||||||
|
"jaccard_composite": round(best_composite, 4),
|
||||||
|
"score_before": round(score_curr, 6),
|
||||||
|
"score_after": round(best_score_after, 6),
|
||||||
|
"drop_without_surgery": round(drop_ratio, 4),
|
||||||
|
"gain_vs_no_surgery": round(
|
||||||
|
best_score_after - score_prev_no_surgery, 6
|
||||||
|
),
|
||||||
|
"lookback_months": best_lookback,
|
||||||
|
}
|
||||||
|
)
|
||||||
|
print(
|
||||||
|
f" 🔧 CHIRURGIE {t_prev.date()} | {reg_orig} : "
|
||||||
|
f"{reg_curr} → {best_candidate} "
|
||||||
|
f"(composite={best_composite:.3f}, "
|
||||||
|
f"score {score_curr:.4f}→{best_score_after:.4f}"
|
||||||
|
f"{lookback_note})"
|
||||||
|
)
|
||||||
|
|
||||||
|
# Mise à jour mapping
|
||||||
|
if best_candidate != reg_curr:
|
||||||
|
if reg_curr in mapping_inv:
|
||||||
|
del mapping_inv[reg_curr]
|
||||||
|
mapping_inv[best_candidate] = reg_orig
|
||||||
|
new_mapping[reg_orig] = best_candidate
|
||||||
|
new_scores[reg_orig] = best_score_after
|
||||||
|
else:
|
||||||
|
new_mapping[reg_orig] = reg_curr
|
||||||
|
new_scores[reg_orig] = score_prev_no_surgery
|
||||||
|
else:
|
||||||
|
new_mapping[reg_orig] = reg_curr
|
||||||
|
new_scores[reg_orig] = score_prev_no_surgery
|
||||||
|
|
||||||
|
mapping = new_mapping
|
||||||
|
mapping_inv = {v: k for k, v in mapping.items()}
|
||||||
|
scores = new_scores
|
||||||
|
mapping_history[t_prev] = dict(mapping)
|
||||||
|
scores_history_corrected[t_prev] = dict(scores)
|
||||||
|
|
||||||
|
total_score = sum(s for s in scores.values() if not np.isnan(s))
|
||||||
|
n_surgeries = sum(1 for op in surgery_log if op["date"] == t_prev)
|
||||||
|
print(
|
||||||
|
f" {t_prev.date()} | Σ scores = {total_score:.4f} | "
|
||||||
|
f"Chirurgies = {n_surgeries}"
|
||||||
|
)
|
||||||
|
|
||||||
|
return mapping_history, surgery_log, scores, scores_history_corrected
|
||||||
|
|
||||||
|
|
||||||
|
# ─────────────────────────────────────────────
|
||||||
|
# EXPORT RÉSULTATS
|
||||||
|
# ─────────────────────────────────────────────
|
||||||
|
def export_results(
|
||||||
|
scores_history, mapping_history, surgery_log, all_months, out_prefix="carmignac"
|
||||||
|
):
|
||||||
|
"""Exporte les résultats clés en CSV."""
|
||||||
|
|
||||||
|
# Score history
|
||||||
|
rows = []
|
||||||
|
for date, sc in scores_history.items():
|
||||||
|
for reg, score in sc.items():
|
||||||
|
rows.append({"date": date, "reg_id": reg, "score": score})
|
||||||
|
df_scores = (
|
||||||
|
pd.DataFrame(rows)
|
||||||
|
if rows
|
||||||
|
else pd.DataFrame(columns=["date", "reg_id", "score"])
|
||||||
|
)
|
||||||
|
if not df_scores.empty:
|
||||||
|
df_scores = df_scores.sort_values(["date", "score"], ascending=[True, False])
|
||||||
|
df_scores.to_csv(
|
||||||
|
f"repair_challenge/repair_results/{out_prefix}_scores.csv", index=False
|
||||||
|
)
|
||||||
|
|
||||||
|
# Mapping history
|
||||||
|
rows_m = []
|
||||||
|
for date, mp in mapping_history.items():
|
||||||
|
for reg_orig, reg_used in mp.items():
|
||||||
|
rows_m.append(
|
||||||
|
{
|
||||||
|
"date": date,
|
||||||
|
"reg_orig": reg_orig,
|
||||||
|
"reg_used": reg_used,
|
||||||
|
"changed": reg_orig != reg_used,
|
||||||
|
}
|
||||||
|
)
|
||||||
|
df_mapping = (
|
||||||
|
pd.DataFrame(rows_m)
|
||||||
|
if rows_m
|
||||||
|
else pd.DataFrame(columns=["date", "reg_orig", "reg_used", "changed"])
|
||||||
|
)
|
||||||
|
if not df_mapping.empty:
|
||||||
|
df_mapping = df_mapping.sort_values(["date", "reg_orig"])
|
||||||
|
df_mapping.to_csv(
|
||||||
|
f"repair_challenge/repair_results/{out_prefix}_mapping.csv", index=False
|
||||||
|
)
|
||||||
|
|
||||||
|
# Surgery log
|
||||||
|
if surgery_log:
|
||||||
|
df_surgery = pd.DataFrame(surgery_log).sort_values("date")
|
||||||
|
df_surgery.to_csv(
|
||||||
|
f"repair_challenge/repair_results/{out_prefix}_surgery_log.csv", index=False
|
||||||
|
)
|
||||||
|
print(f"\n[Export] {len(surgery_log)} opérations de chirurgie sauvegardées.")
|
||||||
|
else:
|
||||||
|
print("\n[Export] Aucune chirurgie effectuée sur ce subset.")
|
||||||
|
|
||||||
|
print(f"[Export] Scores → {out_prefix}_scores.csv")
|
||||||
|
print(f"[Export] Mapping → {out_prefix}_mapping.csv")
|
||||||
|
|
||||||
|
return df_scores, df_mapping
|
||||||
|
|
||||||
|
|
||||||
|
# ─────────────────────────────────────────────
|
||||||
|
# PIPELINE PRINCIPAL
|
||||||
|
# ─────────────────────────────────────────────
|
||||||
|
def run_pipeline(
|
||||||
|
broken_months_path="repair_challenge/alpha_5%/carmignac_broken_months.csv",
|
||||||
|
):
|
||||||
|
print("=" * 60)
|
||||||
|
print("CARMIGNAC — Pipeline de réparation des Registrar IDs")
|
||||||
|
print("=" * 60)
|
||||||
|
|
||||||
|
# Chargement
|
||||||
|
aum, flows = load_data_repair()
|
||||||
|
|
||||||
|
# Broken months (optional — produced by carmignac_diagnostics.py)
|
||||||
|
broken_months, lag_months = load_broken_months(broken_months_path)
|
||||||
|
|
||||||
|
# Étape 1 — Univers de référence
|
||||||
|
aum_ref, weights, universe, t_ref = build_reference_universe(aum)
|
||||||
|
|
||||||
|
print("\n Top 5 comptes par poids :")
|
||||||
|
for reg, w in sorted(weights.items(), key=lambda x: -x[1])[:5]:
|
||||||
|
print(f" {reg} : {w:.4f} ({w * 100:.2f}%)")
|
||||||
|
|
||||||
|
# Panel mensuel
|
||||||
|
panel, all_months = build_monthly_panel(aum, universe, t_ref)
|
||||||
|
|
||||||
|
# Flows mensuels agrégés (standard + lag window)
|
||||||
|
monthly_flows, monthly_flows_lag = aggregate_flows_monthly(flows, all_months)
|
||||||
|
|
||||||
|
# Étape 2 — Score de cohérence (sans chirurgie)
|
||||||
|
print("\n[Étape 2] Propagation des scores (sans chirurgie)...")
|
||||||
|
scores_history, errors_history, _ = score_propagation(
|
||||||
|
panel,
|
||||||
|
monthly_flows,
|
||||||
|
monthly_flows_lag,
|
||||||
|
weights,
|
||||||
|
universe,
|
||||||
|
all_months,
|
||||||
|
broken_months=broken_months,
|
||||||
|
lag_months=lag_months,
|
||||||
|
)
|
||||||
|
|
||||||
|
# Étape 3 — Chirurgie
|
||||||
|
print("\n[Étape 3] Passe de chirurgie...")
|
||||||
|
mapping_history, surgery_log, final_scores, scores_history_corrected = (
|
||||||
|
run_surgery_pass(
|
||||||
|
scores_history,
|
||||||
|
errors_history,
|
||||||
|
panel,
|
||||||
|
monthly_flows,
|
||||||
|
monthly_flows_lag,
|
||||||
|
weights,
|
||||||
|
universe,
|
||||||
|
all_months,
|
||||||
|
broken_months=broken_months,
|
||||||
|
lag_months=lag_months,
|
||||||
|
)
|
||||||
|
)
|
||||||
|
|
||||||
|
# Export — on utilise les scores corrigés (post-chirurgie) comme référence
|
||||||
|
print("\n[Export des résultats...]")
|
||||||
|
df_scores, df_mapping = export_results(
|
||||||
|
scores_history_corrected, mapping_history, surgery_log, all_months
|
||||||
|
)
|
||||||
|
|
||||||
|
# Résumé final
|
||||||
|
print("\n" + "=" * 60)
|
||||||
|
print("RÉSUMÉ FINAL")
|
||||||
|
print("=" * 60)
|
||||||
|
print(
|
||||||
|
f" Dates couvertes : {all_months[0].date()} → {all_months[-1].date()}"
|
||||||
|
)
|
||||||
|
print(f" Comptes dans l'univers : {len(universe)}")
|
||||||
|
print(f" Chirurgies effectuées : {len(surgery_log)}")
|
||||||
|
score_by_date = {
|
||||||
|
d: sum(s for s in sc.values() if s == s)
|
||||||
|
for d, sc in scores_history_corrected.items()
|
||||||
|
}
|
||||||
|
print(f" Σ scores à t_ref : {score_by_date[t_ref]:.4f}")
|
||||||
|
print(f" Σ scores à t_min : {score_by_date[all_months[0]]:.4f}")
|
||||||
|
|
||||||
|
return df_scores, df_mapping, surgery_log, scores_history_corrected, mapping_history
|
||||||
|
|
||||||
|
|
||||||
|
if __name__ == "__main__":
|
||||||
|
df_scores, df_mapping, surgery_log, scores_history, mapping_history = run_pipeline(
|
||||||
|
broken_months_path="repair_challenge/alpha_5%/carmignac_broken_months.csv" # optional
|
||||||
|
)
|
||||||
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long
2329
src/repair_challenge/helpers.py
Normal file
2329
src/repair_challenge/helpers.py
Normal file
File diff suppressed because it is too large
Load Diff
12319
src/repair_challenge/repair_results/carmignac_report.html
Normal file
12319
src/repair_challenge/repair_results/carmignac_report.html
Normal file
File diff suppressed because one or more lines are too long
Loading…
Reference in New Issue
Block a user