2988 lines
161 KiB
Plaintext
2988 lines
161 KiB
Plaintext
|
|
{
|
|||
|
|
"cells": [
|
|||
|
|
{
|
|||
|
|
"cell_type": "code",
|
|||
|
|
"execution_count": 1,
|
|||
|
|
"id": "29f7e620-7b04-45f6-ac87-f17505f140c3",
|
|||
|
|
"metadata": {},
|
|||
|
|
"outputs": [],
|
|||
|
|
"source": [
|
|||
|
|
"import pandas as pd\n",
|
|||
|
|
"import numpy as np\n",
|
|||
|
|
"import plotly.graph_objects as go\n",
|
|||
|
|
"import matplotlib.pyplot as plt"
|
|||
|
|
]
|
|||
|
|
},
|
|||
|
|
{
|
|||
|
|
"cell_type": "code",
|
|||
|
|
"execution_count": 6,
|
|||
|
|
"id": "a48ad016-e4f2-40d9-a607-344a316f5f02",
|
|||
|
|
"metadata": {},
|
|||
|
|
"outputs": [
|
|||
|
|
{
|
|||
|
|
"name": "stderr",
|
|||
|
|
"output_type": "stream",
|
|||
|
|
"text": [
|
|||
|
|
"\n",
|
|||
|
|
"KeyboardInterrupt\n",
|
|||
|
|
"\n"
|
|||
|
|
]
|
|||
|
|
}
|
|||
|
|
],
|
|||
|
|
"source": [
|
|||
|
|
"stocks = pd.read_csv(\"stocks.csv\")\n",
|
|||
|
|
"flows = pd.read_csv(\"flows.csv\")\n",
|
|||
|
|
"\n",
|
|||
|
|
"stocks[\"Centralisation Date\"] = pd.to_datetime(stocks[\"Centralisation Date\"])\n",
|
|||
|
|
"flows[\"Centralisation Date\"] = pd.to_datetime(flows[\"Centralisation Date\"])"
|
|||
|
|
]
|
|||
|
|
},
|
|||
|
|
{
|
|||
|
|
"cell_type": "code",
|
|||
|
|
"execution_count": 11,
|
|||
|
|
"id": "221a4c7b-0f50-431a-875b-ad40bed7f0ac",
|
|||
|
|
"metadata": {},
|
|||
|
|
"outputs": [],
|
|||
|
|
"source": [
|
|||
|
|
"import os\n",
|
|||
|
|
"import s3fs\n",
|
|||
|
|
"os.environ[\"AWS_ACCESS_KEY_ID\"] = 'N0C5PK75FDX2WXI8OVP1'\n",
|
|||
|
|
"os.environ[\"AWS_SECRET_ACCESS_KEY\"] = 'nZvC2urUkG7EvhDsFDyaOslqr160aoWMs+5MP3Ft'\n",
|
|||
|
|
"os.environ[\"AWS_SESSION_TOKEN\"] = 'eyJhbGciOiJIUzUxMiIsInR5cCI6IkpXVCJ9.eyJhY2Nlc3NLZXkiOiJOMEM1UEs3NUZEWDJXWEk4T1ZQMSIsImFjciI6IjAiLCJhbGxvd2VkLW9yaWdpbnMiOlsiKiJdLCJhdWQiOlsibWluaW8iLCJhY2NvdW50Il0sImF1dGhfdGltZSI6MTc3MzIyNzI3OCwiYXpwIjoib255eGlhLW1pbmlvIiwiZW1haWwiOiJzYXJhaC50aG91bXlyZUBlbnNhZS5mciIsImVtYWlsX3ZlcmlmaWVkIjp0cnVlLCJleHAiOjE3NzQ0MzY4OTksImZhbWlseV9uYW1lIjoiVEhPVU1ZUkUiLCJnaXZlbl9uYW1lIjoiU2FyYWgiLCJncm91cHMiOlsiYmRjLWRhdGEiLCJiZGMtY2FybWlnbmFjLWczIl0sImlhdCI6MTc3MzIyNzI5OSwiaXNzIjoiaHR0cHM6Ly9hdXRoLmdyb3VwZS1nZW5lcy5mci9yZWFsbXMvZ2VuZXMiLCJqdGkiOiI5Mjc0ODgyMy04OTgzLTQzYjktYTZhNy0xYjhlNDdiOTRjNTUiLCJuYW1lIjoiU2FyYWggVEhPVU1ZUkUiLCJwb2xpY3kiOiJzdHNvbmx5IiwicHJlZmVycmVkX3VzZXJuYW1lIjoic3Rob3VteXJlLWVuc2FlIiwicmVhbG1fYWNjZXNzIjp7InJvbGVzIjpbIm9mZmxpbmVfYWNjZXNzIiwiZGVmYXVsdC1yb2xlcy1nZW5lcyIsInVtYV9hdXRob3JpemF0aW9uIl19LCJyZXNvdXJjZV9hY2Nlc3MiOnsiYWNjb3VudCI6eyJyb2xlcyI6WyJtYW5hZ2UtYWNjb3VudCIsIm1hbmFnZS1hY2NvdW50LWxpbmtzIiwidmlldy1wcm9maWxlIl19fSwic2NvcGUiOiJvcGVuaWQgcHJvZmlsZSBlbWFpbCIsInNpZCI6IjRkODM3NWVmLTQwY2QtNDYyMi05NzIyLTI4YjhjZTQ2MWQ5YyIsInN1YiI6ImVhYWVkN2QyLWM4MjYtNGIxNC05MzczLTYwYjNhODhlMWFiNiIsInR5cCI6IkJlYXJlciJ9.hl_SekvaH9A22PMb3W0VQBSNO67LnaneIuLC-X5XBnzOO5GLV61aocDRfYC6hvVVhdzyewSTtD2kvdyJdeu6qA'\n",
|
|||
|
|
"os.environ[\"AWS_DEFAULT_REGION\"] = 'us-east-1'\n",
|
|||
|
|
"fs = s3fs.S3FileSystem(\n",
|
|||
|
|
" client_kwargs={'endpoint_url': 'https://'+'minio-simple.lab.groupe-genes.fr'},\n",
|
|||
|
|
" key = os.environ[\"AWS_ACCESS_KEY_ID\"], \n",
|
|||
|
|
" secret = os.environ[\"AWS_SECRET_ACCESS_KEY\"], \n",
|
|||
|
|
" token = os.environ[\"AWS_SESSION_TOKEN\"])\n"
|
|||
|
|
]
|
|||
|
|
},
|
|||
|
|
{
|
|||
|
|
"cell_type": "code",
|
|||
|
|
"execution_count": 9,
|
|||
|
|
"id": "87505949-ecd8-4fad-a19b-d29130be587e",
|
|||
|
|
"metadata": {},
|
|||
|
|
"outputs": [
|
|||
|
|
{
|
|||
|
|
"name": "stdout",
|
|||
|
|
"output_type": "stream",
|
|||
|
|
"text": [
|
|||
|
|
"Index(['Registrar Account - ID', 'Product - Isin', 'Centralisation Date',\n",
|
|||
|
|
" 'Quantity - AUM', 'corrected_aum', 'repair_flag'],\n",
|
|||
|
|
" dtype='object')\n"
|
|||
|
|
]
|
|||
|
|
}
|
|||
|
|
],
|
|||
|
|
"source": [
|
|||
|
|
"print(stocks.columns)"
|
|||
|
|
]
|
|||
|
|
},
|
|||
|
|
{
|
|||
|
|
"cell_type": "code",
|
|||
|
|
"execution_count": null,
|
|||
|
|
"id": "3c6d9d05-b203-49ae-869f-7f85ead2c69e",
|
|||
|
|
"metadata": {},
|
|||
|
|
"outputs": [],
|
|||
|
|
"source": [
|
|||
|
|
"keys = [\n",
|
|||
|
|
" \"Registrar Account - ID\",\n",
|
|||
|
|
" \"Product - Isin\",\n",
|
|||
|
|
" \"Centralisation Date\"\n",
|
|||
|
|
"]\n",
|
|||
|
|
"\n",
|
|||
|
|
"stocks = stocks[keys + [\"Quantity - AUM\"]]\n",
|
|||
|
|
"\n",
|
|||
|
|
"flows = flows[keys + [\"Quantity - NetFlows\"]]\n",
|
|||
|
|
"\n",
|
|||
|
|
"flows = (\n",
|
|||
|
|
" flows\n",
|
|||
|
|
" .groupby(keys, as_index=False)\n",
|
|||
|
|
" .sum()\n",
|
|||
|
|
")"
|
|||
|
|
]
|
|||
|
|
},
|
|||
|
|
{
|
|||
|
|
"cell_type": "code",
|
|||
|
|
"execution_count": null,
|
|||
|
|
"id": "d30c2235-281b-41a6-828b-abb6fcfc4183",
|
|||
|
|
"metadata": {},
|
|||
|
|
"outputs": [],
|
|||
|
|
"source": [
|
|||
|
|
"df = stocks.merge(flows, on=keys, how=\"left\")\n",
|
|||
|
|
"\n",
|
|||
|
|
"df[\"Quantity - NetFlows\"] = df[\"Quantity - NetFlows\"].fillna(0)\n",
|
|||
|
|
"\n",
|
|||
|
|
"df = df.sort_values(keys)\n",
|
|||
|
|
"\n",
|
|||
|
|
"\n",
|
|||
|
|
"\n",
|
|||
|
|
"# REBUILD ACCOUNTING IDENTITY\n",
|
|||
|
|
"\n",
|
|||
|
|
"\n",
|
|||
|
|
"df[\"prev_aum\"] = df.groupby(\n",
|
|||
|
|
" [\"Registrar Account - ID\",\"Product - Isin\"]\n",
|
|||
|
|
")[\"Quantity - AUM\"].shift(1)\n",
|
|||
|
|
"\n",
|
|||
|
|
"df[\"prev_flow\"] = df.groupby(\n",
|
|||
|
|
" [\"Registrar Account - ID\",\"Product - Isin\"]\n",
|
|||
|
|
")[\"Quantity - NetFlows\"].shift(1).fillna(0)\n",
|
|||
|
|
"\n",
|
|||
|
|
"df[\"expected_aum\"] = df[\"prev_aum\"] + df[\"prev_flow\"]\n",
|
|||
|
|
"\n",
|
|||
|
|
"\n",
|
|||
|
|
"\n",
|
|||
|
|
"# GAP ANALYSIS\n",
|
|||
|
|
"\n",
|
|||
|
|
"\n",
|
|||
|
|
"df[\"gap\"] = df[\"Quantity - AUM\"] - df[\"expected_aum\"]\n",
|
|||
|
|
"df[\"gap_abs\"] = df[\"gap\"].abs()\n",
|
|||
|
|
"\n",
|
|||
|
|
"EPS = 10\n",
|
|||
|
|
"\n",
|
|||
|
|
"df[\"rupture_flag\"] = (\n",
|
|||
|
|
" df[\"prev_aum\"].notna()\n",
|
|||
|
|
" & (df[\"gap_abs\"] > EPS)\n",
|
|||
|
|
")\n",
|
|||
|
|
"\n",
|
|||
|
|
"\n",
|
|||
|
|
"\n",
|
|||
|
|
"# PARAMETERS\n",
|
|||
|
|
"\n",
|
|||
|
|
"\n",
|
|||
|
|
"GAP_TOL = 1e-6\n",
|
|||
|
|
"REL_GAP_THR = 0.05\n",
|
|||
|
|
"MIN_PERSISTENCE = 3\n",
|
|||
|
|
"\n",
|
|||
|
|
"\n",
|
|||
|
|
"\n",
|
|||
|
|
"# SORT DATA\n",
|
|||
|
|
"\n",
|
|||
|
|
"\n",
|
|||
|
|
"df = df.sort_values(\n",
|
|||
|
|
" [\"Registrar Account - ID\", \"Product - Isin\", \"Centralisation Date\"]\n",
|
|||
|
|
")\n",
|
|||
|
|
"\n",
|
|||
|
|
"df[\"corrected_aum\"] = df[\"Quantity - AUM\"]\n",
|
|||
|
|
"df[\"repair_flag\"] = False\n",
|
|||
|
|
"\n",
|
|||
|
|
"\n",
|
|||
|
|
"# REBUILD EXPECTED AUM BEFORE REPAIR\n",
|
|||
|
|
"\n",
|
|||
|
|
"\n",
|
|||
|
|
"df = df.sort_values(\n",
|
|||
|
|
" [\"Registrar Account - ID\", \"Product - Isin\", \"Centralisation Date\"]\n",
|
|||
|
|
")\n",
|
|||
|
|
"\n",
|
|||
|
|
"df[\"prev_aum\"] = (\n",
|
|||
|
|
" df.groupby([\"Registrar Account - ID\",\"Product - Isin\"])\n",
|
|||
|
|
" [\"Quantity - AUM\"]\n",
|
|||
|
|
" .shift(1)\n",
|
|||
|
|
")\n",
|
|||
|
|
"\n",
|
|||
|
|
"df[\"prev_flow\"] = (\n",
|
|||
|
|
" df.groupby([\"Registrar Account - ID\",\"Product - Isin\"])\n",
|
|||
|
|
" [\"Quantity - NetFlows\"]\n",
|
|||
|
|
" .shift(1)\n",
|
|||
|
|
" .fillna(0)\n",
|
|||
|
|
")\n",
|
|||
|
|
"\n",
|
|||
|
|
"df[\"expected_stock\"] = df[\"prev_aum\"] + df[\"prev_flow\"]\n",
|
|||
|
|
"\n",
|
|||
|
|
"#delete negative AUM\n",
|
|||
|
|
"df = df[df[\"Quantity - AUM\"] >= 0]"
|
|||
|
|
]
|
|||
|
|
},
|
|||
|
|
{
|
|||
|
|
"cell_type": "code",
|
|||
|
|
"execution_count": null,
|
|||
|
|
"id": "efd374d0-6393-45f2-926e-2c29249cd078",
|
|||
|
|
"metadata": {},
|
|||
|
|
"outputs": [],
|
|||
|
|
"source": [
|
|||
|
|
"def repair_group(g):\n",
|
|||
|
|
"\n",
|
|||
|
|
" g = g.copy()\n",
|
|||
|
|
"\n",
|
|||
|
|
" obs = g[\"Quantity - AUM\"].values\n",
|
|||
|
|
" flows = g[\"Quantity - NetFlows\"].values\n",
|
|||
|
|
"\n",
|
|||
|
|
" corrected = obs.copy()\n",
|
|||
|
|
"\n",
|
|||
|
|
" \n",
|
|||
|
|
" # Build expected AUM path\n",
|
|||
|
|
" \n",
|
|||
|
|
"\n",
|
|||
|
|
" expected = np.empty_like(obs)\n",
|
|||
|
|
" expected[0] = np.nan\n",
|
|||
|
|
"\n",
|
|||
|
|
" for t in range(1, len(obs)):\n",
|
|||
|
|
" expected[t] = corrected[t-1] + flows[t-1]\n",
|
|||
|
|
"\n",
|
|||
|
|
" gap = obs - expected\n",
|
|||
|
|
"\n",
|
|||
|
|
" rel_gap = np.abs(gap) / np.maximum(np.abs(expected), 1.0)\n",
|
|||
|
|
"\n",
|
|||
|
|
" idx = None\n",
|
|||
|
|
"\n",
|
|||
|
|
" \n",
|
|||
|
|
" # Detect persistent shift\n",
|
|||
|
|
" \n",
|
|||
|
|
"\n",
|
|||
|
|
" for i in range(1, len(obs) - MIN_PERSISTENCE):\n",
|
|||
|
|
"\n",
|
|||
|
|
" if (\n",
|
|||
|
|
" rel_gap[i] > REL_GAP_THR\n",
|
|||
|
|
" and np.all(np.abs(gap[i:i+MIN_PERSISTENCE] - gap[i]) < GAP_TOL)\n",
|
|||
|
|
" and np.all(np.abs(np.diff(flows[i:i+MIN_PERSISTENCE])) < GAP_TOL)\n",
|
|||
|
|
" ):\n",
|
|||
|
|
" idx = i\n",
|
|||
|
|
" break\n",
|
|||
|
|
"\n",
|
|||
|
|
" if idx is None:\n",
|
|||
|
|
" return g\n",
|
|||
|
|
"\n",
|
|||
|
|
" \n",
|
|||
|
|
" # Compute shift\n",
|
|||
|
|
" \n",
|
|||
|
|
"\n",
|
|||
|
|
" shift = gap[idx]\n",
|
|||
|
|
"\n",
|
|||
|
|
" candidate = obs[idx:] - shift\n",
|
|||
|
|
"\n",
|
|||
|
|
" \n",
|
|||
|
|
" # SAFETY CHECKS\n",
|
|||
|
|
" \n",
|
|||
|
|
"\n",
|
|||
|
|
" # 1. do not allow negative AUM\n",
|
|||
|
|
" # refuse repair if it creates NEW negative AUM\n",
|
|||
|
|
" if ((candidate < 0) & (obs[idx:] >= 0)).any():\n",
|
|||
|
|
" return g\n",
|
|||
|
|
"\n",
|
|||
|
|
" # 2. avoid extreme corrections\n",
|
|||
|
|
" if abs(shift) > 2 * np.nanmax(np.abs(obs)):\n",
|
|||
|
|
" return g\n",
|
|||
|
|
"\n",
|
|||
|
|
" \n",
|
|||
|
|
" # Apply correction\n",
|
|||
|
|
" \n",
|
|||
|
|
"\n",
|
|||
|
|
" corrected[idx:] = candidate\n",
|
|||
|
|
"\n",
|
|||
|
|
" g.loc[g.index[idx]:, \"repair_flag\"] = True\n",
|
|||
|
|
"\n",
|
|||
|
|
" \n",
|
|||
|
|
" # Rebuild expected path after repair\n",
|
|||
|
|
" \n",
|
|||
|
|
"\n",
|
|||
|
|
" expected_corr = np.empty_like(obs)\n",
|
|||
|
|
" expected_corr[0] = np.nan\n",
|
|||
|
|
"\n",
|
|||
|
|
" for t in range(1, len(obs)):\n",
|
|||
|
|
" expected_corr[t] = corrected[t-1] + flows[t-1]\n",
|
|||
|
|
"\n",
|
|||
|
|
" g[\"corrected_aum\"] = corrected\n",
|
|||
|
|
" g[\"expected_stock_corr\"] = expected_corr\n",
|
|||
|
|
"\n",
|
|||
|
|
" return g"
|
|||
|
|
]
|
|||
|
|
},
|
|||
|
|
{
|
|||
|
|
"cell_type": "code",
|
|||
|
|
"execution_count": null,
|
|||
|
|
"id": "fe1f869c-0a00-47e0-9355-3705b23561c7",
|
|||
|
|
"metadata": {},
|
|||
|
|
"outputs": [],
|
|||
|
|
"source": [
|
|||
|
|
"def repair_group(g):\n",
|
|||
|
|
"\n",
|
|||
|
|
" g = g.copy()\n",
|
|||
|
|
"\n",
|
|||
|
|
" obs = g[\"Quantity - AUM\"].values\n",
|
|||
|
|
" flows = g[\"Quantity - NetFlows\"].values\n",
|
|||
|
|
"\n",
|
|||
|
|
" corrected = obs.copy()\n",
|
|||
|
|
"\n",
|
|||
|
|
" # Build expected AUM path\n",
|
|||
|
|
" expected = np.empty_like(obs)\n",
|
|||
|
|
" expected[0] = np.nan\n",
|
|||
|
|
"\n",
|
|||
|
|
" for t in range(1, len(obs)):\n",
|
|||
|
|
" expected[t] = corrected[t-1] + flows[t-1]\n",
|
|||
|
|
"\n",
|
|||
|
|
" gap = obs - expected\n",
|
|||
|
|
" rel_gap = np.abs(gap) / np.maximum(np.abs(expected), 1.0)\n",
|
|||
|
|
"\n",
|
|||
|
|
" idx = None\n",
|
|||
|
|
" shift = None\n",
|
|||
|
|
"\n",
|
|||
|
|
" for i in range(1, len(obs) - MIN_PERSISTENCE - 1):\n",
|
|||
|
|
"\n",
|
|||
|
|
" # ------------------------------------------------\n",
|
|||
|
|
" # CASE 1 — standard persistent shift (original algo)\n",
|
|||
|
|
" # ------------------------------------------------\n",
|
|||
|
|
" if (\n",
|
|||
|
|
" rel_gap[i] > REL_GAP_THR\n",
|
|||
|
|
" and np.all(np.abs(gap[i:i+MIN_PERSISTENCE] - gap[i]) < GAP_TOL)\n",
|
|||
|
|
" and np.all(np.abs(np.diff(flows[i:i+MIN_PERSISTENCE])) < GAP_TOL)\n",
|
|||
|
|
" ):\n",
|
|||
|
|
" idx = i\n",
|
|||
|
|
" shift = gap[i]\n",
|
|||
|
|
" break\n",
|
|||
|
|
"\n",
|
|||
|
|
" # ------------------------------------------------\n",
|
|||
|
|
" # CASE 2 — double shift\n",
|
|||
|
|
" # ------------------------------------------------\n",
|
|||
|
|
" if (\n",
|
|||
|
|
" rel_gap[i] > REL_GAP_THR\n",
|
|||
|
|
" and rel_gap[i+1] > REL_GAP_THR\n",
|
|||
|
|
" and np.all(np.abs(gap[i+1:i+1+MIN_PERSISTENCE] - gap[i+1]) < GAP_TOL)\n",
|
|||
|
|
" and np.all(np.abs(np.diff(flows[i+1:i+1+MIN_PERSISTENCE])) < GAP_TOL)\n",
|
|||
|
|
" ):\n",
|
|||
|
|
" idx = i\n",
|
|||
|
|
" shift = gap[i+1]\n",
|
|||
|
|
" break\n",
|
|||
|
|
"\n",
|
|||
|
|
" if idx is None:\n",
|
|||
|
|
" return g\n",
|
|||
|
|
"\n",
|
|||
|
|
" # Apply shift\n",
|
|||
|
|
" candidate = obs[idx:] - shift\n",
|
|||
|
|
"\n",
|
|||
|
|
" # Safety checks\n",
|
|||
|
|
"\n",
|
|||
|
|
" # avoid creating new negative AUM\n",
|
|||
|
|
" if ((candidate < 0) & (obs[idx:] >= 0)).any():\n",
|
|||
|
|
" return g\n",
|
|||
|
|
"\n",
|
|||
|
|
" # avoid extreme corrections\n",
|
|||
|
|
" if abs(shift) > 2 * np.nanmax(np.abs(obs)):\n",
|
|||
|
|
" return g\n",
|
|||
|
|
"\n",
|
|||
|
|
" corrected[idx:] = candidate\n",
|
|||
|
|
"\n",
|
|||
|
|
" g.loc[g.index[idx]:, \"repair_flag\"] = True\n",
|
|||
|
|
"\n",
|
|||
|
|
" # rebuild expected path after repair\n",
|
|||
|
|
" expected_corr = np.empty_like(obs)\n",
|
|||
|
|
" expected_corr[0] = np.nan\n",
|
|||
|
|
"\n",
|
|||
|
|
" for t in range(1, len(obs)):\n",
|
|||
|
|
" expected_corr[t] = corrected[t-1] + flows[t-1]\n",
|
|||
|
|
"\n",
|
|||
|
|
" g[\"corrected_aum\"] = corrected\n",
|
|||
|
|
" g[\"expected_stock_corr\"] = expected_corr\n",
|
|||
|
|
"\n",
|
|||
|
|
" return g"
|
|||
|
|
]
|
|||
|
|
},
|
|||
|
|
{
|
|||
|
|
"cell_type": "code",
|
|||
|
|
"execution_count": 12,
|
|||
|
|
"id": "1ca2a5ab-354f-49af-b1aa-75c93d48de06",
|
|||
|
|
"metadata": {},
|
|||
|
|
"outputs": [
|
|||
|
|
{
|
|||
|
|
"name": "stderr",
|
|||
|
|
"output_type": "stream",
|
|||
|
|
"text": [
|
|||
|
|
"/tmp/ipykernel_5465/2911292439.py:10: DtypeWarning: Columns (0,1,2,3) have mixed types. Specify dtype option on import or set low_memory=False.\n",
|
|||
|
|
" stocks = pd.read_csv(f, sep=\";\")\n",
|
|||
|
|
"/tmp/ipykernel_5465/2911292439.py:16: DtypeWarning: Columns (0,1,2,3) have mixed types. Specify dtype option on import or set low_memory=False.\n",
|
|||
|
|
" flows = pd.read_csv(f, sep=\";\")\n",
|
|||
|
|
"/tmp/ipykernel_5465/2911292439.py:127: FutureWarning: DataFrameGroupBy.apply operated on the grouping columns. This behavior is deprecated, and in a future version of pandas the grouping columns will be excluded from the operation. Either pass `include_groups=False` to exclude the groupings or explicitly select the grouping columns after groupby to silence this warning.\n",
|
|||
|
|
" .apply(repair_group)\n"
|
|||
|
|
]
|
|||
|
|
},
|
|||
|
|
{
|
|||
|
|
"name": "stdout",
|
|||
|
|
"output_type": "stream",
|
|||
|
|
"text": [
|
|||
|
|
"stock_repaired.csv successfully created\n"
|
|||
|
|
]
|
|||
|
|
}
|
|||
|
|
],
|
|||
|
|
"source": [
|
|||
|
|
"#FULL STOCK REPAIR\n",
|
|||
|
|
"\n",
|
|||
|
|
"# ============================================================\n",
|
|||
|
|
"# LOAD DATA\n",
|
|||
|
|
"# ============================================================\n",
|
|||
|
|
"\n",
|
|||
|
|
"with fs.open('projet-bdc-data/carmignac/AUM ENSAE V2 -20251105.csv', 'rb') as f:\n",
|
|||
|
|
" stocks = pd.read_csv(f, sep=\";\")\n",
|
|||
|
|
"\n",
|
|||
|
|
"with fs.open(\n",
|
|||
|
|
" \"projet-bdc-data/carmignac/Flows ENSAE V2 -20251105.csv\",\n",
|
|||
|
|
" \"rb\"\n",
|
|||
|
|
") as f:\n",
|
|||
|
|
" flows = pd.read_csv(f, sep=\";\")\n",
|
|||
|
|
"\n",
|
|||
|
|
"stocks[\"Centralisation Date\"] = pd.to_datetime(stocks[\"Centralisation Date\"])\n",
|
|||
|
|
"flows[\"Centralisation Date\"] = pd.to_datetime(flows[\"Centralisation Date\"])\n",
|
|||
|
|
"\n",
|
|||
|
|
"# ============================================================\n",
|
|||
|
|
"# MERGE FLOWS\n",
|
|||
|
|
"# ============================================================\n",
|
|||
|
|
"\n",
|
|||
|
|
"df = stocks.merge(\n",
|
|||
|
|
" flows,\n",
|
|||
|
|
" on=[\"Registrar Account - ID\",\"Product - Isin\",\"Centralisation Date\"],\n",
|
|||
|
|
" how=\"left\"\n",
|
|||
|
|
")\n",
|
|||
|
|
"\n",
|
|||
|
|
"df[\"Quantity - NetFlows\"] = df[\"Quantity - NetFlows\"].fillna(0)\n",
|
|||
|
|
"\n",
|
|||
|
|
"df = df.sort_values(\n",
|
|||
|
|
" [\"Registrar Account - ID\",\"Product - Isin\",\"Centralisation Date\"]\n",
|
|||
|
|
")\n",
|
|||
|
|
"\n",
|
|||
|
|
"# ============================================================\n",
|
|||
|
|
"# PARAMETERS\n",
|
|||
|
|
"# ============================================================\n",
|
|||
|
|
"\n",
|
|||
|
|
"REL_GAP_THR = 0.3\n",
|
|||
|
|
"MIN_PERSISTENCE = 3\n",
|
|||
|
|
"GAP_TOL = 1e-6\n",
|
|||
|
|
"\n",
|
|||
|
|
"# ============================================================\n",
|
|||
|
|
"# REPAIR FUNCTION\n",
|
|||
|
|
"# ============================================================\n",
|
|||
|
|
"\n",
|
|||
|
|
"def repair_group(g):\n",
|
|||
|
|
"\n",
|
|||
|
|
" g = g.copy()\n",
|
|||
|
|
"\n",
|
|||
|
|
" obs = g[\"Quantity - AUM\"].values\n",
|
|||
|
|
" flows = g[\"Quantity - NetFlows\"].values\n",
|
|||
|
|
"\n",
|
|||
|
|
" corrected = obs.copy()\n",
|
|||
|
|
"\n",
|
|||
|
|
" expected = np.empty_like(obs)\n",
|
|||
|
|
" expected[0] = np.nan\n",
|
|||
|
|
"\n",
|
|||
|
|
" for t in range(1, len(obs)):\n",
|
|||
|
|
" expected[t] = corrected[t-1] + flows[t-1]\n",
|
|||
|
|
"\n",
|
|||
|
|
" gap = obs - expected\n",
|
|||
|
|
" rel_gap = np.abs(gap) / np.maximum(np.abs(expected), 1.0)\n",
|
|||
|
|
"\n",
|
|||
|
|
" idx = None\n",
|
|||
|
|
" shift = None\n",
|
|||
|
|
"\n",
|
|||
|
|
" for i in range(1, len(obs) - MIN_PERSISTENCE - 1):\n",
|
|||
|
|
"\n",
|
|||
|
|
" # CASE 1 — persistent shift\n",
|
|||
|
|
" if (\n",
|
|||
|
|
" rel_gap[i] > REL_GAP_THR\n",
|
|||
|
|
" and np.all(np.abs(gap[i:i+MIN_PERSISTENCE] - gap[i]) < GAP_TOL)\n",
|
|||
|
|
" and np.all(np.abs(np.diff(flows[i:i+MIN_PERSISTENCE])) < GAP_TOL)\n",
|
|||
|
|
" ):\n",
|
|||
|
|
" idx = i\n",
|
|||
|
|
" shift = gap[i]\n",
|
|||
|
|
" break\n",
|
|||
|
|
"\n",
|
|||
|
|
" # CASE 2 — double shift\n",
|
|||
|
|
" if (\n",
|
|||
|
|
" rel_gap[i] > REL_GAP_THR\n",
|
|||
|
|
" and rel_gap[i+1] > REL_GAP_THR\n",
|
|||
|
|
" and np.all(np.abs(gap[i+1:i+1+MIN_PERSISTENCE] - gap[i+1]) < GAP_TOL)\n",
|
|||
|
|
" and np.all(np.abs(np.diff(flows[i+1:i+1+MIN_PERSISTENCE])) < GAP_TOL)\n",
|
|||
|
|
" ):\n",
|
|||
|
|
" idx = i\n",
|
|||
|
|
" shift = gap[i+1]\n",
|
|||
|
|
" break\n",
|
|||
|
|
"\n",
|
|||
|
|
" if idx is None:\n",
|
|||
|
|
" return g\n",
|
|||
|
|
"\n",
|
|||
|
|
" candidate = obs[idx:] - shift\n",
|
|||
|
|
"\n",
|
|||
|
|
" if ((candidate < 0) & (obs[idx:] >= 0)).any():\n",
|
|||
|
|
" return g\n",
|
|||
|
|
"\n",
|
|||
|
|
" if abs(shift) > 2 * np.nanmax(np.abs(obs)):\n",
|
|||
|
|
" return g\n",
|
|||
|
|
"\n",
|
|||
|
|
" corrected[idx:] = candidate\n",
|
|||
|
|
"\n",
|
|||
|
|
" g.loc[g.index[idx]:, \"repair_flag\"] = True\n",
|
|||
|
|
"\n",
|
|||
|
|
" expected_corr = np.empty_like(obs)\n",
|
|||
|
|
" expected_corr[0] = np.nan\n",
|
|||
|
|
"\n",
|
|||
|
|
" for t in range(1, len(obs)):\n",
|
|||
|
|
" expected_corr[t] = corrected[t-1] + flows[t-1]\n",
|
|||
|
|
"\n",
|
|||
|
|
" g[\"corrected_aum\"] = corrected\n",
|
|||
|
|
"\n",
|
|||
|
|
" return g\n",
|
|||
|
|
"\n",
|
|||
|
|
"# ============================================================\n",
|
|||
|
|
"# APPLY REPAIR\n",
|
|||
|
|
"# ============================================================\n",
|
|||
|
|
"\n",
|
|||
|
|
"df_repaired = (\n",
|
|||
|
|
" df.groupby(\n",
|
|||
|
|
" [\"Registrar Account - ID\",\"Product - Isin\"],\n",
|
|||
|
|
" group_keys=False\n",
|
|||
|
|
" )\n",
|
|||
|
|
" .apply(repair_group)\n",
|
|||
|
|
")\n",
|
|||
|
|
"\n",
|
|||
|
|
"# ============================================================\n",
|
|||
|
|
"# REBUILD STOCK FILE\n",
|
|||
|
|
"# ============================================================\n",
|
|||
|
|
"\n",
|
|||
|
|
"stocks_repaired = stocks.copy()\n",
|
|||
|
|
"\n",
|
|||
|
|
"stocks_repaired[\"Quantity - AUM\"] = df_repaired[\"corrected_aum\"].fillna(\n",
|
|||
|
|
" stocks[\"Quantity - AUM\"]\n",
|
|||
|
|
")\n",
|
|||
|
|
"\n",
|
|||
|
|
"# ============================================================\n",
|
|||
|
|
"# SAVE WITH ORIGINAL FORMAT\n",
|
|||
|
|
"# ============================================================\n",
|
|||
|
|
"\n",
|
|||
|
|
"stocks_repaired.to_csv(\n",
|
|||
|
|
" \"stock_repaired.csv\",\n",
|
|||
|
|
" sep=\";\",\n",
|
|||
|
|
" index=False\n",
|
|||
|
|
")\n",
|
|||
|
|
"\n",
|
|||
|
|
"print(\"stock_repaired.csv successfully created\")"
|
|||
|
|
]
|
|||
|
|
},
|
|||
|
|
{
|
|||
|
|
"cell_type": "code",
|
|||
|
|
"execution_count": 15,
|
|||
|
|
"id": "f94f07b4-e053-4828-bbb1-3697f9a11751",
|
|||
|
|
"metadata": {},
|
|||
|
|
"outputs": [
|
|||
|
|
{
|
|||
|
|
"name": "stderr",
|
|||
|
|
"output_type": "stream",
|
|||
|
|
"text": [
|
|||
|
|
"/tmp/ipykernel_5465/3656779442.py:4: FutureWarning: DataFrameGroupBy.apply operated on the grouping columns. This behavior is deprecated, and in a future version of pandas the grouping columns will be excluded from the operation. Either pass `include_groups=False` to exclude the groupings or explicitly select the grouping columns after groupby to silence this warning.\n",
|
|||
|
|
" .apply(repair_group)\n"
|
|||
|
|
]
|
|||
|
|
},
|
|||
|
|
{
|
|||
|
|
"ename": "KeyError",
|
|||
|
|
"evalue": "'expected_stock_corr'",
|
|||
|
|
"output_type": "error",
|
|||
|
|
"traceback": [
|
|||
|
|
"\u001b[31m---------------------------------------------------------------------------\u001b[39m",
|
|||
|
|
"\u001b[31mKeyError\u001b[39m Traceback (most recent call last)",
|
|||
|
|
"\u001b[36mFile \u001b[39m\u001b[32m/opt/python/lib/python3.13/site-packages/pandas/core/indexes/base.py:3812\u001b[39m, in \u001b[36mIndex.get_loc\u001b[39m\u001b[34m(self, key)\u001b[39m\n\u001b[32m 3811\u001b[39m \u001b[38;5;28;01mtry\u001b[39;00m:\n\u001b[32m-> \u001b[39m\u001b[32m3812\u001b[39m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28;43mself\u001b[39;49m\u001b[43m.\u001b[49m\u001b[43m_engine\u001b[49m\u001b[43m.\u001b[49m\u001b[43mget_loc\u001b[49m\u001b[43m(\u001b[49m\u001b[43mcasted_key\u001b[49m\u001b[43m)\u001b[49m\n\u001b[32m 3813\u001b[39m \u001b[38;5;28;01mexcept\u001b[39;00m \u001b[38;5;167;01mKeyError\u001b[39;00m \u001b[38;5;28;01mas\u001b[39;00m err:\n",
|
|||
|
|
"\u001b[36mFile \u001b[39m\u001b[32mpandas/_libs/index.pyx:167\u001b[39m, in \u001b[36mpandas._libs.index.IndexEngine.get_loc\u001b[39m\u001b[34m()\u001b[39m\n",
|
|||
|
|
"\u001b[36mFile \u001b[39m\u001b[32mpandas/_libs/index.pyx:196\u001b[39m, in \u001b[36mpandas._libs.index.IndexEngine.get_loc\u001b[39m\u001b[34m()\u001b[39m\n",
|
|||
|
|
"\u001b[36mFile \u001b[39m\u001b[32mpandas/_libs/hashtable_class_helper.pxi:7088\u001b[39m, in \u001b[36mpandas._libs.hashtable.PyObjectHashTable.get_item\u001b[39m\u001b[34m()\u001b[39m\n",
|
|||
|
|
"\u001b[36mFile \u001b[39m\u001b[32mpandas/_libs/hashtable_class_helper.pxi:7096\u001b[39m, in \u001b[36mpandas._libs.hashtable.PyObjectHashTable.get_item\u001b[39m\u001b[34m()\u001b[39m\n",
|
|||
|
|
"\u001b[31mKeyError\u001b[39m: 'expected_stock_corr'",
|
|||
|
|
"\nThe above exception was the direct cause of the following exception:\n",
|
|||
|
|
"\u001b[31mKeyError\u001b[39m Traceback (most recent call last)",
|
|||
|
|
"\u001b[36mCell\u001b[39m\u001b[36m \u001b[39m\u001b[32mIn[15]\u001b[39m\u001b[32m, line 25\u001b[39m\n\u001b[32m 22\u001b[39m df[\u001b[33m\"\u001b[39m\u001b[33mexpected_stock\u001b[39m\u001b[33m\"\u001b[39m] = df[\u001b[33m\"\u001b[39m\u001b[33mprev_aum\u001b[39m\u001b[33m\"\u001b[39m] + df[\u001b[33m\"\u001b[39m\u001b[33mprev_flow\u001b[39m\u001b[33m\"\u001b[39m]\n\u001b[32m 23\u001b[39m df[\u001b[33m\"\u001b[39m\u001b[33mgap_before\u001b[39m\u001b[33m\"\u001b[39m] = df[\u001b[33m\"\u001b[39m\u001b[33mQuantity - AUM\u001b[39m\u001b[33m\"\u001b[39m] - df[\u001b[33m\"\u001b[39m\u001b[33mexpected_stock\u001b[39m\u001b[33m\"\u001b[39m]\n\u001b[32m---> \u001b[39m\u001b[32m25\u001b[39m df[\u001b[33m\"\u001b[39m\u001b[33mgap_after\u001b[39m\u001b[33m\"\u001b[39m] = df[\u001b[33m\"\u001b[39m\u001b[33mcorrected_aum\u001b[39m\u001b[33m\"\u001b[39m] - \u001b[43mdf\u001b[49m\u001b[43m[\u001b[49m\u001b[33;43m\"\u001b[39;49m\u001b[33;43mexpected_stock_corr\u001b[39;49m\u001b[33;43m\"\u001b[39;49m\u001b[43m]\u001b[49m\n\u001b[32m 27\u001b[39m df[\u001b[33m\"\u001b[39m\u001b[33mrupture_before\u001b[39m\u001b[33m\"\u001b[39m] = df[\u001b[33m\"\u001b[39m\u001b[33mgap_before\u001b[39m\u001b[33m\"\u001b[39m].abs() > GAP_TOL\n\u001b[32m 29\u001b[39m df[\u001b[33m\"\u001b[39m\u001b[33mrupture_after\u001b[39m\u001b[33m\"\u001b[39m] = df[\u001b[33m\"\u001b[39m\u001b[33mgap_after\u001b[39m\u001b[33m\"\u001b[39m].abs() > GAP_TOL\n",
|
|||
|
|
"\u001b[36mFile \u001b[39m\u001b[32m/opt/python/lib/python3.13/site-packages/pandas/core/frame.py:4113\u001b[39m, in \u001b[36mDataFrame.__getitem__\u001b[39m\u001b[34m(self, key)\u001b[39m\n\u001b[32m 4111\u001b[39m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28mself\u001b[39m.columns.nlevels > \u001b[32m1\u001b[39m:\n\u001b[32m 4112\u001b[39m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28mself\u001b[39m._getitem_multilevel(key)\n\u001b[32m-> \u001b[39m\u001b[32m4113\u001b[39m indexer = \u001b[38;5;28;43mself\u001b[39;49m\u001b[43m.\u001b[49m\u001b[43mcolumns\u001b[49m\u001b[43m.\u001b[49m\u001b[43mget_loc\u001b[49m\u001b[43m(\u001b[49m\u001b[43mkey\u001b[49m\u001b[43m)\u001b[49m\n\u001b[32m 4114\u001b[39m \u001b[38;5;28;01mif\u001b[39;00m is_integer(indexer):\n\u001b[32m 4115\u001b[39m indexer = [indexer]\n",
|
|||
|
|
"\u001b[36mFile \u001b[39m\u001b[32m/opt/python/lib/python3.13/site-packages/pandas/core/indexes/base.py:3819\u001b[39m, in \u001b[36mIndex.get_loc\u001b[39m\u001b[34m(self, key)\u001b[39m\n\u001b[32m 3814\u001b[39m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28misinstance\u001b[39m(casted_key, \u001b[38;5;28mslice\u001b[39m) \u001b[38;5;129;01mor\u001b[39;00m (\n\u001b[32m 3815\u001b[39m \u001b[38;5;28misinstance\u001b[39m(casted_key, abc.Iterable)\n\u001b[32m 3816\u001b[39m \u001b[38;5;129;01mand\u001b[39;00m \u001b[38;5;28many\u001b[39m(\u001b[38;5;28misinstance\u001b[39m(x, \u001b[38;5;28mslice\u001b[39m) \u001b[38;5;28;01mfor\u001b[39;00m x \u001b[38;5;129;01min\u001b[39;00m casted_key)\n\u001b[32m 3817\u001b[39m ):\n\u001b[32m 3818\u001b[39m \u001b[38;5;28;01mraise\u001b[39;00m InvalidIndexError(key)\n\u001b[32m-> \u001b[39m\u001b[32m3819\u001b[39m \u001b[38;5;28;01mraise\u001b[39;00m \u001b[38;5;167;01mKeyError\u001b[39;00m(key) \u001b[38;5;28;01mfrom\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[34;01merr\u001b[39;00m\n\u001b[32m 3820\u001b[39m \u001b[38;5;28;01mexcept\u001b[39;00m \u001b[38;5;167;01mTypeError\u001b[39;00m:\n\u001b[32m 3821\u001b[39m \u001b[38;5;66;03m# If we have a listlike key, _check_indexing_error will raise\u001b[39;00m\n\u001b[32m 3822\u001b[39m \u001b[38;5;66;03m# InvalidIndexError. Otherwise we fall through and re-raise\u001b[39;00m\n\u001b[32m 3823\u001b[39m \u001b[38;5;66;03m# the TypeError.\u001b[39;00m\n\u001b[32m 3824\u001b[39m \u001b[38;5;28mself\u001b[39m._check_indexing_error(key)\n",
|
|||
|
|
"\u001b[31mKeyError\u001b[39m: 'expected_stock_corr'"
|
|||
|
|
]
|
|||
|
|
}
|
|||
|
|
],
|
|||
|
|
"source": [
|
|||
|
|
"df = (\n",
|
|||
|
|
" df\n",
|
|||
|
|
" .groupby([\"Registrar Account - ID\", \"Product - Isin\"], group_keys=False)\n",
|
|||
|
|
" .apply(repair_group)\n",
|
|||
|
|
")\n",
|
|||
|
|
"\n",
|
|||
|
|
"# VALIDATION BEFORE / AFTER\n",
|
|||
|
|
"\n",
|
|||
|
|
"df[\"prev_aum\"] = (\n",
|
|||
|
|
" df.groupby([\"Registrar Account - ID\",\"Product - Isin\"])\n",
|
|||
|
|
" [\"Quantity - AUM\"]\n",
|
|||
|
|
" .shift(1)\n",
|
|||
|
|
")\n",
|
|||
|
|
"\n",
|
|||
|
|
"df[\"prev_flow\"] = (\n",
|
|||
|
|
" df.groupby([\"Registrar Account - ID\",\"Product - Isin\"])\n",
|
|||
|
|
" [\"Quantity - NetFlows\"]\n",
|
|||
|
|
" .shift(1)\n",
|
|||
|
|
" .fillna(0)\n",
|
|||
|
|
")\n",
|
|||
|
|
"\n",
|
|||
|
|
"df[\"expected_stock\"] = df[\"prev_aum\"] + df[\"prev_flow\"]\n",
|
|||
|
|
"df[\"gap_before\"] = df[\"Quantity - AUM\"] - df[\"expected_stock\"]\n",
|
|||
|
|
"\n",
|
|||
|
|
"df[\"gap_after\"] = df[\"corrected_aum\"] - df[\"expected_stock_corr\"]\n",
|
|||
|
|
"\n",
|
|||
|
|
"df[\"rupture_before\"] = df[\"gap_before\"].abs() > GAP_TOL\n",
|
|||
|
|
"\n",
|
|||
|
|
"df[\"rupture_after\"] = df[\"gap_after\"].abs() > GAP_TOL\n",
|
|||
|
|
"\n",
|
|||
|
|
"\n",
|
|||
|
|
"\n",
|
|||
|
|
"# SUMMARY\n",
|
|||
|
|
"\n",
|
|||
|
|
"\n",
|
|||
|
|
"summary = pd.DataFrame({\n",
|
|||
|
|
" \"Before repair\": [df[\"rupture_before\"].sum()],\n",
|
|||
|
|
" \"After repair\": [df[\"rupture_after\"].sum()],\n",
|
|||
|
|
" \"Repaired points\": [df[\"repair_flag\"].sum()]\n",
|
|||
|
|
"})\n",
|
|||
|
|
"\n",
|
|||
|
|
"print(summary)\n",
|
|||
|
|
"\n",
|
|||
|
|
"\n",
|
|||
|
|
"\n",
|
|||
|
|
"# BUILD REPAIRED DATASET\n",
|
|||
|
|
"\n",
|
|||
|
|
"\n",
|
|||
|
|
"stocks_repaired = stocks.copy()\n",
|
|||
|
|
"\n",
|
|||
|
|
"repair_map = df[[\n",
|
|||
|
|
" \"Registrar Account - ID\",\n",
|
|||
|
|
" \"Product - Isin\",\n",
|
|||
|
|
" \"Centralisation Date\",\n",
|
|||
|
|
" \"corrected_aum\",\n",
|
|||
|
|
" \"repair_flag\"\n",
|
|||
|
|
"]]\n",
|
|||
|
|
"\n",
|
|||
|
|
"stocks_repaired = stocks_repaired.merge(\n",
|
|||
|
|
" repair_map,\n",
|
|||
|
|
" on=keys,\n",
|
|||
|
|
" how=\"left\"\n",
|
|||
|
|
")\n",
|
|||
|
|
"\n",
|
|||
|
|
"stocks_repaired[\"Quantity - AUM\"] = np.where(\n",
|
|||
|
|
" stocks_repaired[\"repair_flag\"] == True,\n",
|
|||
|
|
" stocks_repaired[\"corrected_aum\"],\n",
|
|||
|
|
" stocks_repaired[\"Quantity - AUM\"]\n",
|
|||
|
|
")\n",
|
|||
|
|
"\n",
|
|||
|
|
"stocks_repaired.to_csv(\"stock_repaired.csv\", index=False)\n",
|
|||
|
|
"\n",
|
|||
|
|
"\n",
|
|||
|
|
"\n",
|
|||
|
|
"# COMPARISON RAW VS REPAIRED\n",
|
|||
|
|
"\n",
|
|||
|
|
"\n",
|
|||
|
|
"df_compare = stocks.merge(\n",
|
|||
|
|
" stocks_repaired,\n",
|
|||
|
|
" on=keys,\n",
|
|||
|
|
" how=\"inner\",\n",
|
|||
|
|
" suffixes=(\"_raw\",\"_repaired\")\n",
|
|||
|
|
")\n",
|
|||
|
|
"\n",
|
|||
|
|
"df_compare[\"aum_diff\"] = (\n",
|
|||
|
|
" df_compare[\"Quantity - AUM_repaired\"]\n",
|
|||
|
|
" - df_compare[\"Quantity - AUM_raw\"]\n",
|
|||
|
|
")\n",
|
|||
|
|
"\n",
|
|||
|
|
"print(\"\\nNUMBER OF MODIFIED OBSERVATIONS:\",\n",
|
|||
|
|
" (df_compare[\"aum_diff\"] != 0).sum())\n",
|
|||
|
|
"\n",
|
|||
|
|
"print(\"Share modified:\",\n",
|
|||
|
|
" round((df_compare[\"aum_diff\"] != 0).mean()*100,2), \"%\")\n",
|
|||
|
|
"\n",
|
|||
|
|
"print(\"\\nTOTAL AUM\")\n",
|
|||
|
|
"\n",
|
|||
|
|
"print(\"Raw total :\", df_compare[\"Quantity - AUM_raw\"].sum())\n",
|
|||
|
|
"print(\"Repaired total :\", df_compare[\"Quantity - AUM_repaired\"].sum())\n",
|
|||
|
|
"\n",
|
|||
|
|
"\n",
|
|||
|
|
"\n",
|
|||
|
|
"# RUPTURE DISTRIBUTION BEFORE / AFTER\n",
|
|||
|
|
"\n",
|
|||
|
|
"\n",
|
|||
|
|
"def rupture_distribution(df, flag):\n",
|
|||
|
|
"\n",
|
|||
|
|
" rupture_summary = (\n",
|
|||
|
|
" df.groupby([\"Registrar Account - ID\",\"Product - Isin\"])\n",
|
|||
|
|
" .agg(\n",
|
|||
|
|
" n_ruptures=(flag,\"sum\"),\n",
|
|||
|
|
" total_obs=(flag,\"count\"),\n",
|
|||
|
|
" rupture_ratio=(flag,\"mean\")\n",
|
|||
|
|
" )\n",
|
|||
|
|
" .reset_index()\n",
|
|||
|
|
" )\n",
|
|||
|
|
"\n",
|
|||
|
|
" rs = rupture_summary.copy()\n",
|
|||
|
|
"\n",
|
|||
|
|
" bins = [0,0.01,0.10,0.30,1.01]\n",
|
|||
|
|
"\n",
|
|||
|
|
" labels = [\n",
|
|||
|
|
" \"Clean / quasi-clean (≤1%)\",\n",
|
|||
|
|
" \"Moderate (1–10%)\",\n",
|
|||
|
|
" \"High (10–30%)\",\n",
|
|||
|
|
" \"Severe (>30%)\"\n",
|
|||
|
|
" ]\n",
|
|||
|
|
"\n",
|
|||
|
|
" rs[\"rupture_class\"] = pd.cut(\n",
|
|||
|
|
" rs[\"rupture_ratio\"],\n",
|
|||
|
|
" bins=bins,\n",
|
|||
|
|
" labels=labels,\n",
|
|||
|
|
" include_lowest=True\n",
|
|||
|
|
" )\n",
|
|||
|
|
"\n",
|
|||
|
|
" dist = (\n",
|
|||
|
|
" rs[\"rupture_class\"]\n",
|
|||
|
|
" .value_counts(normalize=True)\n",
|
|||
|
|
" .sort_index()\n",
|
|||
|
|
" * 100\n",
|
|||
|
|
" ).round(1)\n",
|
|||
|
|
"\n",
|
|||
|
|
" return dist\n",
|
|||
|
|
"\n",
|
|||
|
|
"\n",
|
|||
|
|
"dist_before = rupture_distribution(df,\"rupture_before\")\n",
|
|||
|
|
"dist_after = rupture_distribution(df,\"rupture_after\")"
|
|||
|
|
]
|
|||
|
|
},
|
|||
|
|
{
|
|||
|
|
"cell_type": "code",
|
|||
|
|
"execution_count": null,
|
|||
|
|
"id": "54491736-58b3-4ef7-b6c4-5534ec796bce",
|
|||
|
|
"metadata": {},
|
|||
|
|
"outputs": [],
|
|||
|
|
"source": [
|
|||
|
|
"# DONUT CHART BEFORE / AFTER\n",
|
|||
|
|
"\n",
|
|||
|
|
"fig = go.Figure()\n",
|
|||
|
|
"\n",
|
|||
|
|
"fig.add_trace(go.Pie(\n",
|
|||
|
|
" labels=dist_before.index,\n",
|
|||
|
|
" values=dist_before.values,\n",
|
|||
|
|
" hole=0.45,\n",
|
|||
|
|
" name=\"Before repair\",\n",
|
|||
|
|
" domain=dict(x=[0,0.48]),\n",
|
|||
|
|
" textinfo=\"percent\"\n",
|
|||
|
|
"))\n",
|
|||
|
|
"\n",
|
|||
|
|
"fig.add_trace(go.Pie(\n",
|
|||
|
|
" labels=dist_after.index,\n",
|
|||
|
|
" values=dist_after.values,\n",
|
|||
|
|
" hole=0.45,\n",
|
|||
|
|
" name=\"After repair\",\n",
|
|||
|
|
" domain=dict(x=[0.52,1]),\n",
|
|||
|
|
" textinfo=\"percent\"\n",
|
|||
|
|
"))\n",
|
|||
|
|
"\n",
|
|||
|
|
"fig.update_layout(\n",
|
|||
|
|
" title=\"Rupture intensity distribution (Before vs After repair)\",\n",
|
|||
|
|
" annotations=[\n",
|
|||
|
|
" dict(text=\"Before repair\", x=0.22, y=0.5, showarrow=False),\n",
|
|||
|
|
" dict(text=\"After repair\", x=0.78, y=0.5, showarrow=False)\n",
|
|||
|
|
" ]\n",
|
|||
|
|
")\n",
|
|||
|
|
"\n",
|
|||
|
|
"fig.show()"
|
|||
|
|
]
|
|||
|
|
},
|
|||
|
|
{
|
|||
|
|
"cell_type": "code",
|
|||
|
|
"execution_count": null,
|
|||
|
|
"id": "d844f6f0-c0f4-4f71-8280-1fd39ced83b7",
|
|||
|
|
"metadata": {},
|
|||
|
|
"outputs": [],
|
|||
|
|
"source": [
|
|||
|
|
"# LOAD DATA\n",
|
|||
|
|
"\n",
|
|||
|
|
"aum = pd.read_csv(\"stock_repaired.csv\")\n",
|
|||
|
|
"\n",
|
|||
|
|
"aum[\"Centralisation Date\"] = pd.to_datetime(aum[\"Centralisation Date\"])\n",
|
|||
|
|
"\n",
|
|||
|
|
"\n",
|
|||
|
|
"# KEEP USEFUL COLUMNS\n",
|
|||
|
|
"\n",
|
|||
|
|
"aum = aum[[\n",
|
|||
|
|
" \"Registrar Account - ID\",\n",
|
|||
|
|
" \"Product - Isin\",\n",
|
|||
|
|
" \"Centralisation Date\",\n",
|
|||
|
|
" \"Quantity - AUM\",\n",
|
|||
|
|
" \"repair_flag\"\n",
|
|||
|
|
"]]\n",
|
|||
|
|
"\n",
|
|||
|
|
"flows = flows[[\n",
|
|||
|
|
" \"Registrar Account - ID\",\n",
|
|||
|
|
" \"Product - Isin\",\n",
|
|||
|
|
" \"Centralisation Date\",\n",
|
|||
|
|
" \"Quantity - NetFlows\"\n",
|
|||
|
|
"]]\n",
|
|||
|
|
"\n",
|
|||
|
|
"\n",
|
|||
|
|
"\n",
|
|||
|
|
"# AGGREGATE FLOWS\n",
|
|||
|
|
"\n",
|
|||
|
|
"flows = (\n",
|
|||
|
|
" flows\n",
|
|||
|
|
" .groupby(\n",
|
|||
|
|
" [\"Registrar Account - ID\",\"Product - Isin\",\"Centralisation Date\"],\n",
|
|||
|
|
" as_index=False\n",
|
|||
|
|
" )[\"Quantity - NetFlows\"]\n",
|
|||
|
|
" .sum()\n",
|
|||
|
|
")\n",
|
|||
|
|
"\n",
|
|||
|
|
"\n",
|
|||
|
|
"\n",
|
|||
|
|
"# MERGE DATASETS\n",
|
|||
|
|
"\n",
|
|||
|
|
"df = aum.merge(\n",
|
|||
|
|
" flows,\n",
|
|||
|
|
" on=[\"Registrar Account - ID\",\"Product - Isin\",\"Centralisation Date\"],\n",
|
|||
|
|
" how=\"left\"\n",
|
|||
|
|
")\n",
|
|||
|
|
"\n",
|
|||
|
|
"df[\"Quantity - NetFlows\"] = df[\"Quantity - NetFlows\"].fillna(0)\n",
|
|||
|
|
"\n",
|
|||
|
|
"print(\"Dataset size:\", df.shape)\n",
|
|||
|
|
"\n",
|
|||
|
|
"\n",
|
|||
|
|
"\n",
|
|||
|
|
"# SORT DATA\n",
|
|||
|
|
"\n",
|
|||
|
|
"df = df.sort_values(\n",
|
|||
|
|
" [\"Registrar Account - ID\",\"Product - Isin\",\"Centralisation Date\"]\n",
|
|||
|
|
")\n",
|
|||
|
|
"\n",
|
|||
|
|
"\n",
|
|||
|
|
"\n",
|
|||
|
|
"# REBUILD ACCOUNTING IDENTITY\n",
|
|||
|
|
"\n",
|
|||
|
|
"df[\"prev_aum\"] = (\n",
|
|||
|
|
" df.groupby([\"Registrar Account - ID\",\"Product - Isin\"])\n",
|
|||
|
|
" [\"Quantity - AUM\"]\n",
|
|||
|
|
" .shift(1)\n",
|
|||
|
|
")\n",
|
|||
|
|
"\n",
|
|||
|
|
"df[\"prev_flow\"] = (\n",
|
|||
|
|
" df.groupby([\"Registrar Account - ID\",\"Product - Isin\"])\n",
|
|||
|
|
" [\"Quantity - NetFlows\"]\n",
|
|||
|
|
" .shift(1)\n",
|
|||
|
|
" .fillna(0)\n",
|
|||
|
|
")\n",
|
|||
|
|
"\n",
|
|||
|
|
"df[\"expected_aum\"] = df[\"prev_aum\"] + df[\"prev_flow\"]\n",
|
|||
|
|
"\n",
|
|||
|
|
"\n",
|
|||
|
|
"\n",
|
|||
|
|
"# GAPS\n",
|
|||
|
|
"\n",
|
|||
|
|
"df[\"gap\"] = df[\"Quantity - AUM\"] - df[\"expected_aum\"]\n",
|
|||
|
|
"\n",
|
|||
|
|
"df[\"gap_abs\"] = df[\"gap\"].abs()\n",
|
|||
|
|
"\n",
|
|||
|
|
"df[\"gap_rel\"] = (\n",
|
|||
|
|
" df[\"gap_abs\"] /\n",
|
|||
|
|
" df[\"expected_aum\"].abs().clip(lower=1)\n",
|
|||
|
|
")\n",
|
|||
|
|
"\n",
|
|||
|
|
"\n",
|
|||
|
|
"\n",
|
|||
|
|
"# ACCOUNTING CONSISTENCY\n",
|
|||
|
|
"\n",
|
|||
|
|
"print(\"\\nACCOUNTING GAP DISTRIBUTION\")\n",
|
|||
|
|
"\n",
|
|||
|
|
"print(df[\"gap_abs\"].describe())\n",
|
|||
|
|
"\n",
|
|||
|
|
"print(\"\\nRelative gap quantiles\")\n",
|
|||
|
|
"\n",
|
|||
|
|
"print(df[\"gap_rel\"].quantile([0.90,0.95,0.99]))\n",
|
|||
|
|
"\n",
|
|||
|
|
"\n",
|
|||
|
|
"\n",
|
|||
|
|
"# NEGATIVE AUM\n",
|
|||
|
|
"\n",
|
|||
|
|
"neg = (df[\"Quantity - AUM\"] < 0).sum()\n",
|
|||
|
|
"\n",
|
|||
|
|
"print(\"\\nNEGATIVE AUM:\", neg)\n",
|
|||
|
|
"\n",
|
|||
|
|
"\n",
|
|||
|
|
"\n",
|
|||
|
|
"# REPAIR RATE\n",
|
|||
|
|
"\n",
|
|||
|
|
"print(\"\\nREPAIR RATE\")\n",
|
|||
|
|
"\n",
|
|||
|
|
"print(df[\"repair_flag\"].mean())\n",
|
|||
|
|
"\n",
|
|||
|
|
"\n",
|
|||
|
|
"\n",
|
|||
|
|
"# AUM JUMPS\n",
|
|||
|
|
"\n",
|
|||
|
|
"\n",
|
|||
|
|
"df[\"prev_obs\"] = (\n",
|
|||
|
|
" df.groupby([\"Registrar Account - ID\",\"Product - Isin\"])\n",
|
|||
|
|
" [\"Quantity - AUM\"]\n",
|
|||
|
|
" .shift(1)\n",
|
|||
|
|
")\n",
|
|||
|
|
"\n",
|
|||
|
|
"df[\"aum_jump\"] = (\n",
|
|||
|
|
" df[\"Quantity - AUM\"] /\n",
|
|||
|
|
" df[\"prev_obs\"].replace(0,np.nan)\n",
|
|||
|
|
")\n",
|
|||
|
|
"\n",
|
|||
|
|
"print(\"\\nAUM JUMP QUANTILES\")\n",
|
|||
|
|
"\n",
|
|||
|
|
"print(df[\"aum_jump\"].quantile([0.90,0.95,0.99]))\n",
|
|||
|
|
"\n",
|
|||
|
|
"\n",
|
|||
|
|
"\n",
|
|||
|
|
"# VISUAL CHECK\n",
|
|||
|
|
"\n",
|
|||
|
|
"\n",
|
|||
|
|
"def plot_series(account, isin):\n",
|
|||
|
|
"\n",
|
|||
|
|
" sub = df[\n",
|
|||
|
|
" (df[\"Registrar Account - ID\"] == account) &\n",
|
|||
|
|
" (df[\"Product - Isin\"] == isin)\n",
|
|||
|
|
" ]\n",
|
|||
|
|
"\n",
|
|||
|
|
" plt.figure(figsize=(8,3))\n",
|
|||
|
|
"\n",
|
|||
|
|
" plt.plot(\n",
|
|||
|
|
" sub[\"Centralisation Date\"],\n",
|
|||
|
|
" sub[\"Quantity - AUM\"],\n",
|
|||
|
|
" label=\"AUM\"\n",
|
|||
|
|
" )\n",
|
|||
|
|
"\n",
|
|||
|
|
" plt.plot(\n",
|
|||
|
|
" sub[\"Centralisation Date\"],\n",
|
|||
|
|
" sub[\"expected_aum\"],\n",
|
|||
|
|
" linestyle=\"--\",\n",
|
|||
|
|
" label=\"Expected AUM\"\n",
|
|||
|
|
" )\n",
|
|||
|
|
"\n",
|
|||
|
|
" plt.legend()\n",
|
|||
|
|
" plt.title(f\"Account {account} — ISIN {isin}\")\n",
|
|||
|
|
"\n",
|
|||
|
|
" plt.show()"
|
|||
|
|
]
|
|||
|
|
},
|
|||
|
|
{
|
|||
|
|
"cell_type": "code",
|
|||
|
|
"execution_count": null,
|
|||
|
|
"id": "f7d759f7-64be-4d82-a79c-98cda407cfec",
|
|||
|
|
"metadata": {
|
|||
|
|
"scrolled": true
|
|||
|
|
},
|
|||
|
|
"outputs": [],
|
|||
|
|
"source": [
|
|||
|
|
"# COMPUTE AUM CHANGE\n",
|
|||
|
|
"\n",
|
|||
|
|
"df[\"prev_aum\"] = (\n",
|
|||
|
|
" df.groupby([\"Registrar Account - ID\",\"Product - Isin\"])\n",
|
|||
|
|
" [\"Quantity - AUM\"]\n",
|
|||
|
|
" .shift(1)\n",
|
|||
|
|
")\n",
|
|||
|
|
"\n",
|
|||
|
|
"df[\"delta_aum\"] = df[\"Quantity - AUM\"] - df[\"prev_aum\"]\n",
|
|||
|
|
"\n",
|
|||
|
|
"df[\"flow_lag\"] = (\n",
|
|||
|
|
" df.groupby([\"Registrar Account - ID\",\"Product - Isin\"])\n",
|
|||
|
|
" [\"Quantity - NetFlows\"]\n",
|
|||
|
|
" .shift(1)\n",
|
|||
|
|
")\n",
|
|||
|
|
"\n",
|
|||
|
|
"\n",
|
|||
|
|
"\n",
|
|||
|
|
"# FILTER VALID OBSERVATIONS\n",
|
|||
|
|
"\n",
|
|||
|
|
"\n",
|
|||
|
|
"diag = df[\n",
|
|||
|
|
" df[\"prev_aum\"].notna() &\n",
|
|||
|
|
" df[\"flow_lag\"].notna()\n",
|
|||
|
|
"]\n",
|
|||
|
|
"\n",
|
|||
|
|
"\n",
|
|||
|
|
"\n",
|
|||
|
|
"# SAMPLE FOR PLOTTING (dataset is large)\n",
|
|||
|
|
"\n",
|
|||
|
|
"\n",
|
|||
|
|
"sample = diag.sample(20000, random_state=1)\n",
|
|||
|
|
"\n",
|
|||
|
|
"\n",
|
|||
|
|
"\n",
|
|||
|
|
"# SCATTER PLOT\n",
|
|||
|
|
"\n",
|
|||
|
|
"\n",
|
|||
|
|
"plt.figure(figsize=(7,7))\n",
|
|||
|
|
"\n",
|
|||
|
|
"plt.scatter(\n",
|
|||
|
|
" sample[\"flow_lag\"],\n",
|
|||
|
|
" sample[\"delta_aum\"],\n",
|
|||
|
|
" alpha=0.3,\n",
|
|||
|
|
" s=5\n",
|
|||
|
|
")\n",
|
|||
|
|
"\n",
|
|||
|
|
"# perfect accounting identity\n",
|
|||
|
|
"x = np.linspace(\n",
|
|||
|
|
" sample[\"flow_lag\"].min(),\n",
|
|||
|
|
" sample[\"flow_lag\"].max(),\n",
|
|||
|
|
" 100\n",
|
|||
|
|
")\n",
|
|||
|
|
"\n",
|
|||
|
|
"plt.plot(x, x, color=\"red\", label=\"Perfect identity\")\n",
|
|||
|
|
"\n",
|
|||
|
|
"plt.xlabel(\"Flow (t-1)\")\n",
|
|||
|
|
"plt.ylabel(\"Δ AUM\")\n",
|
|||
|
|
"\n",
|
|||
|
|
"plt.title(\"AUM / Flow Accounting Diagnostic\")\n",
|
|||
|
|
"\n",
|
|||
|
|
"plt.legend()\n",
|
|||
|
|
"\n",
|
|||
|
|
"plt.show()"
|
|||
|
|
]
|
|||
|
|
},
|
|||
|
|
{
|
|||
|
|
"cell_type": "code",
|
|||
|
|
"execution_count": null,
|
|||
|
|
"id": "d0a959c9-cfff-44cb-a1df-6c7275ec5b43",
|
|||
|
|
"metadata": {},
|
|||
|
|
"outputs": [],
|
|||
|
|
"source": [
|
|||
|
|
"df[\"implied_return\"] = (\n",
|
|||
|
|
" df[\"Quantity - AUM\"] - df[\"prev_aum\"] - df[\"flow_lag\"]\n",
|
|||
|
|
") / df[\"prev_aum\"].replace(0, np.nan)\n",
|
|||
|
|
"\n",
|
|||
|
|
"print(df[\"implied_return\"].quantile([0.5,0.9,0.95,0.99]))"
|
|||
|
|
]
|
|||
|
|
},
|
|||
|
|
{
|
|||
|
|
"cell_type": "code",
|
|||
|
|
"execution_count": null,
|
|||
|
|
"id": "15111e8a-7d87-4c37-8122-daafe90a1ad5",
|
|||
|
|
"metadata": {
|
|||
|
|
"scrolled": true
|
|||
|
|
},
|
|||
|
|
"outputs": [],
|
|||
|
|
"source": [
|
|||
|
|
"# ============================================================\n",
|
|||
|
|
"# ADDITIONAL DATASET VALIDATION CHECKS (ROBUST VERSION)\n",
|
|||
|
|
"# ============================================================\n",
|
|||
|
|
"\n",
|
|||
|
|
"import numpy as np\n",
|
|||
|
|
"import pandas as pd\n",
|
|||
|
|
"\n",
|
|||
|
|
"print(\"\\n==============================\")\n",
|
|||
|
|
"print(\"ADDITIONAL DATA QUALITY CHECKS\")\n",
|
|||
|
|
"print(\"==============================\")\n",
|
|||
|
|
"\n",
|
|||
|
|
"# ------------------------------------------------------------\n",
|
|||
|
|
"# RECOMPUTE KEY VARIABLES IF NEEDED\n",
|
|||
|
|
"# ------------------------------------------------------------\n",
|
|||
|
|
"\n",
|
|||
|
|
"df = df.sort_values([\"Registrar Account - ID\",\"Product - Isin\",\"Centralisation Date\"])\n",
|
|||
|
|
"\n",
|
|||
|
|
"df[\"prev_aum\"] = (\n",
|
|||
|
|
" df.groupby([\"Registrar Account - ID\",\"Product - Isin\"])\n",
|
|||
|
|
" [\"Quantity - AUM\"]\n",
|
|||
|
|
" .shift(1)\n",
|
|||
|
|
")\n",
|
|||
|
|
"\n",
|
|||
|
|
"df[\"flow_lag\"] = (\n",
|
|||
|
|
" df.groupby([\"Registrar Account - ID\",\"Product - Isin\"])\n",
|
|||
|
|
" [\"Quantity - NetFlows\"]\n",
|
|||
|
|
" .shift(1)\n",
|
|||
|
|
")\n",
|
|||
|
|
"\n",
|
|||
|
|
"df[\"flow_lag\"] = df[\"flow_lag\"].fillna(0)\n",
|
|||
|
|
"\n",
|
|||
|
|
"df[\"expected_aum\"] = df[\"prev_aum\"] + df[\"flow_lag\"]\n",
|
|||
|
|
"\n",
|
|||
|
|
"df[\"gap\"] = df[\"Quantity - AUM\"] - df[\"expected_aum\"]\n",
|
|||
|
|
"df[\"gap_abs\"] = df[\"gap\"].abs()\n",
|
|||
|
|
"\n",
|
|||
|
|
"df[\"delta_aum\"] = df[\"Quantity - AUM\"] - df[\"prev_aum\"]\n",
|
|||
|
|
"\n",
|
|||
|
|
"df[\"implied_return\"] = (\n",
|
|||
|
|
" df[\"Quantity - AUM\"] - df[\"prev_aum\"] - df[\"flow_lag\"]\n",
|
|||
|
|
") / df[\"prev_aum\"].replace(0,np.nan)\n",
|
|||
|
|
"\n",
|
|||
|
|
"df[\"aum_jump\"] = (\n",
|
|||
|
|
" df[\"Quantity - AUM\"] /\n",
|
|||
|
|
" df[\"prev_aum\"].replace(0,np.nan)\n",
|
|||
|
|
")\n",
|
|||
|
|
"\n",
|
|||
|
|
"\n",
|
|||
|
|
"# ------------------------------------------------------------\n",
|
|||
|
|
"# 1. CHECK SERIES WHERE GAP IS STILL LARGE\n",
|
|||
|
|
"# ------------------------------------------------------------\n",
|
|||
|
|
"\n",
|
|||
|
|
"remaining_gaps = df[df[\"gap_abs\"] > 10]\n",
|
|||
|
|
"\n",
|
|||
|
|
"series_remaining = (\n",
|
|||
|
|
" remaining_gaps\n",
|
|||
|
|
" .groupby([\"Registrar Account - ID\",\"Product - Isin\"])\n",
|
|||
|
|
" .size()\n",
|
|||
|
|
")\n",
|
|||
|
|
"\n",
|
|||
|
|
"print(\"\\nSERIES STILL WITH LARGE ACCOUNTING GAPS:\", len(series_remaining))\n",
|
|||
|
|
"\n",
|
|||
|
|
"\n",
|
|||
|
|
"# ------------------------------------------------------------\n",
|
|||
|
|
"# 2. GAP DISTRIBUTION\n",
|
|||
|
|
"# ------------------------------------------------------------\n",
|
|||
|
|
"\n",
|
|||
|
|
"print(\"\\nACCOUNTING GAP DISTRIBUTION\")\n",
|
|||
|
|
"print(df[\"gap_abs\"].quantile([0.5,0.9,0.95,0.99]))\n",
|
|||
|
|
"\n",
|
|||
|
|
"\n",
|
|||
|
|
"# ------------------------------------------------------------\n",
|
|||
|
|
"# 3. IMPLIED RETURNS DISTRIBUTION\n",
|
|||
|
|
"# ------------------------------------------------------------\n",
|
|||
|
|
"\n",
|
|||
|
|
"print(\"\\nIMPLIED RETURN DISTRIBUTION\")\n",
|
|||
|
|
"print(df[\"implied_return\"].quantile([0.5,0.9,0.95,0.99]))\n",
|
|||
|
|
"\n",
|
|||
|
|
"\n",
|
|||
|
|
"# ------------------------------------------------------------\n",
|
|||
|
|
"# 4. EXTREME RETURNS\n",
|
|||
|
|
"# ------------------------------------------------------------\n",
|
|||
|
|
"\n",
|
|||
|
|
"extreme_returns = df[df[\"implied_return\"].abs() > 2]\n",
|
|||
|
|
"\n",
|
|||
|
|
"print(\"\\nOBSERVATIONS WITH EXTREME IMPLIED RETURNS (>200%):\",\n",
|
|||
|
|
" len(extreme_returns))\n",
|
|||
|
|
"\n",
|
|||
|
|
"\n",
|
|||
|
|
"# ------------------------------------------------------------\n",
|
|||
|
|
"# 5. FLOW / AUM ACCOUNTING CORRELATION\n",
|
|||
|
|
"# ------------------------------------------------------------\n",
|
|||
|
|
"\n",
|
|||
|
|
"valid = df[\n",
|
|||
|
|
" df[\"prev_aum\"].notna() &\n",
|
|||
|
|
" df[\"flow_lag\"].notna()\n",
|
|||
|
|
"]\n",
|
|||
|
|
"\n",
|
|||
|
|
"corr = valid[\"delta_aum\"].corr(valid[\"flow_lag\"])\n",
|
|||
|
|
"\n",
|
|||
|
|
"print(\"\\nCORRELATION ΔAUM vs FLOW:\", corr)\n",
|
|||
|
|
"\n",
|
|||
|
|
"\n",
|
|||
|
|
"# ------------------------------------------------------------\n",
|
|||
|
|
"# 6. LARGE AUM JUMPS\n",
|
|||
|
|
"# ------------------------------------------------------------\n",
|
|||
|
|
"\n",
|
|||
|
|
"large_jumps = df[df[\"aum_jump\"].abs() > 5]\n",
|
|||
|
|
"\n",
|
|||
|
|
"print(\"\\nLARGE AUM JUMPS (>5x):\", len(large_jumps))\n",
|
|||
|
|
"\n",
|
|||
|
|
"\n",
|
|||
|
|
"# ------------------------------------------------------------\n",
|
|||
|
|
"# 7. SERIES WITH HIGH GAP RATE\n",
|
|||
|
|
"# ------------------------------------------------------------\n",
|
|||
|
|
"\n",
|
|||
|
|
"series_gap_rate = (\n",
|
|||
|
|
" (df[\"gap_abs\"] > 10)\n",
|
|||
|
|
" .groupby([df[\"Registrar Account - ID\"], df[\"Product - Isin\"]])\n",
|
|||
|
|
" .mean()\n",
|
|||
|
|
")\n",
|
|||
|
|
"\n",
|
|||
|
|
"problem_series = series_gap_rate[series_gap_rate > 0.2]\n",
|
|||
|
|
"\n",
|
|||
|
|
"print(\"\\nSERIES WITH >20% ACCOUNTING GAPS:\", len(problem_series))\n",
|
|||
|
|
"\n",
|
|||
|
|
"\n",
|
|||
|
|
"# ------------------------------------------------------------\n",
|
|||
|
|
"# 8. TOTAL AUM STABILITY\n",
|
|||
|
|
"# ------------------------------------------------------------\n",
|
|||
|
|
"\n",
|
|||
|
|
"if \"df_compare\" in globals():\n",
|
|||
|
|
"\n",
|
|||
|
|
" raw_total = df_compare[\"Quantity - AUM_raw\"].sum()\n",
|
|||
|
|
" repaired_total = df_compare[\"Quantity - AUM_repaired\"].sum()\n",
|
|||
|
|
"\n",
|
|||
|
|
" print(\"\\nTOTAL AUM RAW:\", raw_total)\n",
|
|||
|
|
" print(\"TOTAL AUM REPAIRED:\", repaired_total)\n",
|
|||
|
|
"\n",
|
|||
|
|
" print(\"RELATIVE CHANGE:\",\n",
|
|||
|
|
" (repaired_total - raw_total) / raw_total)\n",
|
|||
|
|
"\n",
|
|||
|
|
"\n",
|
|||
|
|
"# ------------------------------------------------------------\n",
|
|||
|
|
"# 9. PROPORTION OF SERIES REPAIRED\n",
|
|||
|
|
"# ------------------------------------------------------------\n",
|
|||
|
|
"\n",
|
|||
|
|
"if \"repair_flag\" in df.columns:\n",
|
|||
|
|
"\n",
|
|||
|
|
" series_repaired = (\n",
|
|||
|
|
" df.groupby([\"Registrar Account - ID\",\"Product - Isin\"])\n",
|
|||
|
|
" [\"repair_flag\"]\n",
|
|||
|
|
" .max()\n",
|
|||
|
|
" )\n",
|
|||
|
|
"\n",
|
|||
|
|
" print(\"\\nSERIES WITH AT LEAST ONE REPAIR:\",\n",
|
|||
|
|
" series_repaired.mean())\n",
|
|||
|
|
"\n",
|
|||
|
|
"\n",
|
|||
|
|
"# ------------------------------------------------------------\n",
|
|||
|
|
"# 10. WORST SERIES (MANUAL CHECK)\n",
|
|||
|
|
"# ------------------------------------------------------------\n",
|
|||
|
|
"\n",
|
|||
|
|
"worst_series = (\n",
|
|||
|
|
" df.groupby([\"Registrar Account - ID\",\"Product - Isin\"])\n",
|
|||
|
|
" [\"gap_abs\"]\n",
|
|||
|
|
" .max()\n",
|
|||
|
|
" .sort_values(ascending=False)\n",
|
|||
|
|
" .head(10)\n",
|
|||
|
|
")\n",
|
|||
|
|
"\n",
|
|||
|
|
"print(\"\\nWORST SERIES AFTER REPAIR\")\n",
|
|||
|
|
"print(worst_series)"
|
|||
|
|
]
|
|||
|
|
},
|
|||
|
|
{
|
|||
|
|
"cell_type": "code",
|
|||
|
|
"execution_count": null,
|
|||
|
|
"id": "f9a4fd91-bb8b-4172-a267-cbe7f2e4fae7",
|
|||
|
|
"metadata": {},
|
|||
|
|
"outputs": [],
|
|||
|
|
"source": [
|
|||
|
|
"print(\"RUPTURES BEFORE:\", summary[\"Before repair\"].iloc[0])\n",
|
|||
|
|
"print(\"RUPTURES AFTER :\", summary[\"After repair\"].iloc[0])\n",
|
|||
|
|
"print(\"REDUCTION RATE :\", 1 - summary[\"After repair\"].iloc[0] /\n",
|
|||
|
|
" summary[\"Before repair\"].iloc[0])"
|
|||
|
|
]
|
|||
|
|
},
|
|||
|
|
{
|
|||
|
|
"cell_type": "code",
|
|||
|
|
"execution_count": null,
|
|||
|
|
"id": "68596521-a10a-479a-a6cd-36b6be3c55b9",
|
|||
|
|
"metadata": {},
|
|||
|
|
"outputs": [],
|
|||
|
|
"source": [
|
|||
|
|
"# CHECK IF REPAIR CREATED NEW NEGATIVE AUM\n",
|
|||
|
|
"\n",
|
|||
|
|
"df_compare = stocks.merge(\n",
|
|||
|
|
" stocks_repaired,\n",
|
|||
|
|
" on=[\"Registrar Account - ID\",\"Product - Isin\",\"Centralisation Date\"],\n",
|
|||
|
|
" how=\"inner\",\n",
|
|||
|
|
" suffixes=(\"_raw\",\"_repaired\")\n",
|
|||
|
|
")\n",
|
|||
|
|
"\n",
|
|||
|
|
"neg_raw = (\n",
|
|||
|
|
" df_compare[df_compare[\"Quantity - AUM_raw\"] < 0]\n",
|
|||
|
|
" .groupby([\"Registrar Account - ID\",\"Product - Isin\"])\n",
|
|||
|
|
" .size()\n",
|
|||
|
|
")\n",
|
|||
|
|
"\n",
|
|||
|
|
"neg_rep = (\n",
|
|||
|
|
" df_compare[df_compare[\"Quantity - AUM_repaired\"] < 0]\n",
|
|||
|
|
" .groupby([\"Registrar Account - ID\",\"Product - Isin\"])\n",
|
|||
|
|
" .size()\n",
|
|||
|
|
")\n",
|
|||
|
|
"\n",
|
|||
|
|
"print(\"Negative series BEFORE repair:\", len(neg_raw))\n",
|
|||
|
|
"print(\"Negative series AFTER repair:\", len(neg_rep))\n",
|
|||
|
|
"\n",
|
|||
|
|
"created_neg = set(neg_rep.index) - set(neg_raw.index)\n",
|
|||
|
|
"\n",
|
|||
|
|
"print(\"\\nNumber of series where repair created negatives:\", len(created_neg))\n",
|
|||
|
|
"\n",
|
|||
|
|
"if len(created_neg) > 0:\n",
|
|||
|
|
" print(\"\\nSeries concerned:\")\n",
|
|||
|
|
" print(created_neg)\n",
|
|||
|
|
"\n",
|
|||
|
|
"\n",
|
|||
|
|
"\n",
|
|||
|
|
"for acc, isin in created_neg:\n",
|
|||
|
|
"\n",
|
|||
|
|
" sub = df_compare[\n",
|
|||
|
|
" (df_compare[\"Registrar Account - ID\"] == acc) &\n",
|
|||
|
|
" (df_compare[\"Product - Isin\"] == isin)\n",
|
|||
|
|
" ].sort_values(\"Centralisation Date\").reset_index(drop=True)\n",
|
|||
|
|
"\n",
|
|||
|
|
" # indices where repaired AUM becomes negative\n",
|
|||
|
|
" neg_idx = sub.index[sub[\"Quantity - AUM_repaired\"] < 0]\n",
|
|||
|
|
"\n",
|
|||
|
|
" print(\"\\n======================================\")\n",
|
|||
|
|
" print(\"Account:\", acc, \"ISIN:\", isin)\n",
|
|||
|
|
"\n",
|
|||
|
|
" for i in neg_idx:\n",
|
|||
|
|
"\n",
|
|||
|
|
" start = max(0, i-3)\n",
|
|||
|
|
" end = min(len(sub), i+3)\n",
|
|||
|
|
"\n",
|
|||
|
|
" print(\"\\nContext around created negative:\")\n",
|
|||
|
|
" print(\n",
|
|||
|
|
" sub.loc[start:end, [\n",
|
|||
|
|
" \"Centralisation Date\",\n",
|
|||
|
|
" \"Quantity - AUM_raw\",\n",
|
|||
|
|
" \"Quantity - AUM_repaired\"\n",
|
|||
|
|
" ]]\n",
|
|||
|
|
" )"
|
|||
|
|
]
|
|||
|
|
},
|
|||
|
|
{
|
|||
|
|
"cell_type": "code",
|
|||
|
|
"execution_count": null,
|
|||
|
|
"id": "203797d1-c380-406d-ac6e-78c4e1228966",
|
|||
|
|
"metadata": {},
|
|||
|
|
"outputs": [],
|
|||
|
|
"source": [
|
|||
|
|
"# top N worst series\n",
|
|||
|
|
"N = 20\n",
|
|||
|
|
"\n",
|
|||
|
|
"worst_series = (\n",
|
|||
|
|
" df.groupby([\"Registrar Account - ID\",\"Product - Isin\"])[\"gap_abs\"]\n",
|
|||
|
|
" .max()\n",
|
|||
|
|
" .sort_values(ascending=False)\n",
|
|||
|
|
" .head(N)\n",
|
|||
|
|
")\n",
|
|||
|
|
"\n",
|
|||
|
|
"print(worst_series)"
|
|||
|
|
]
|
|||
|
|
},
|
|||
|
|
{
|
|||
|
|
"cell_type": "code",
|
|||
|
|
"execution_count": null,
|
|||
|
|
"id": "17c6be03-3b76-41e7-906f-d86472bda274",
|
|||
|
|
"metadata": {},
|
|||
|
|
"outputs": [],
|
|||
|
|
"source": [
|
|||
|
|
"def plot_account_series(account, isin):\n",
|
|||
|
|
"\n",
|
|||
|
|
" sub = df[\n",
|
|||
|
|
" (df[\"Registrar Account - ID\"] == account) &\n",
|
|||
|
|
" (df[\"Product - Isin\"] == isin)\n",
|
|||
|
|
" ].sort_values(\"Centralisation Date\")\n",
|
|||
|
|
"\n",
|
|||
|
|
" plt.figure(figsize=(10,4))\n",
|
|||
|
|
"\n",
|
|||
|
|
" plt.plot(\n",
|
|||
|
|
" sub[\"Centralisation Date\"],\n",
|
|||
|
|
" sub[\"Quantity - AUM\"],\n",
|
|||
|
|
" label=\"AUM\",\n",
|
|||
|
|
" linewidth=2\n",
|
|||
|
|
" )\n",
|
|||
|
|
"\n",
|
|||
|
|
" plt.plot(\n",
|
|||
|
|
" sub[\"Centralisation Date\"],\n",
|
|||
|
|
" sub[\"expected_aum\"],\n",
|
|||
|
|
" linestyle=\"--\",\n",
|
|||
|
|
" label=\"Expected AUM\"\n",
|
|||
|
|
" )\n",
|
|||
|
|
"\n",
|
|||
|
|
" # highlight large gaps\n",
|
|||
|
|
" ruptures = sub[sub[\"gap_abs\"] > 10]\n",
|
|||
|
|
"\n",
|
|||
|
|
" plt.scatter(\n",
|
|||
|
|
" ruptures[\"Centralisation Date\"],\n",
|
|||
|
|
" ruptures[\"Quantity - AUM\"],\n",
|
|||
|
|
" color=\"red\",\n",
|
|||
|
|
" label=\"Rupture\",\n",
|
|||
|
|
" s=40\n",
|
|||
|
|
" )\n",
|
|||
|
|
"\n",
|
|||
|
|
" plt.title(f\"Account {account} | ISIN {isin}\")\n",
|
|||
|
|
" plt.legend()\n",
|
|||
|
|
" plt.grid(alpha=0.3)\n",
|
|||
|
|
"\n",
|
|||
|
|
" plt.show()"
|
|||
|
|
]
|
|||
|
|
},
|
|||
|
|
{
|
|||
|
|
"cell_type": "code",
|
|||
|
|
"execution_count": null,
|
|||
|
|
"id": "02a22e9a-a71c-4212-8f0a-f33c18e4b530",
|
|||
|
|
"metadata": {},
|
|||
|
|
"outputs": [],
|
|||
|
|
"source": [
|
|||
|
|
"for acc, isin in worst_series.index:\n",
|
|||
|
|
" plot_account_series(acc, isin)"
|
|||
|
|
]
|
|||
|
|
},
|
|||
|
|
{
|
|||
|
|
"cell_type": "code",
|
|||
|
|
"execution_count": null,
|
|||
|
|
"id": "c72e2dc2-c35d-4608-a3da-57a37acf64c7",
|
|||
|
|
"metadata": {},
|
|||
|
|
"outputs": [],
|
|||
|
|
"source": [
|
|||
|
|
"def run_data_challenge(stocks, flows):\n",
|
|||
|
|
"\n",
|
|||
|
|
" # conversion dates\n",
|
|||
|
|
" stocks[\"Centralisation Date\"] = pd.to_datetime(stocks[\"Centralisation Date\"])\n",
|
|||
|
|
" flows[\"Centralisation Date\"] = pd.to_datetime(flows[\"Centralisation Date\"])\n",
|
|||
|
|
"\n",
|
|||
|
|
" # merge datasets\n",
|
|||
|
|
" df = stocks.merge(\n",
|
|||
|
|
" flows,\n",
|
|||
|
|
" on=[\"Registrar Account - ID\",\"Product - Isin\",\"Centralisation Date\"],\n",
|
|||
|
|
" how=\"left\"\n",
|
|||
|
|
" )\n",
|
|||
|
|
"\n",
|
|||
|
|
" df[\"Quantity - NetFlows\"] = df[\"Quantity - NetFlows\"].fillna(0)\n",
|
|||
|
|
"\n",
|
|||
|
|
" # sort\n",
|
|||
|
|
" df = df.sort_values(\n",
|
|||
|
|
" [\"Registrar Account - ID\",\"Product - Isin\",\"Centralisation Date\"]\n",
|
|||
|
|
" )\n",
|
|||
|
|
"\n",
|
|||
|
|
" # previous values\n",
|
|||
|
|
" df[\"prev_aum\"] = (\n",
|
|||
|
|
" df.groupby([\"Registrar Account - ID\",\"Product - Isin\"])\n",
|
|||
|
|
" [\"Quantity - AUM\"]\n",
|
|||
|
|
" .shift(1)\n",
|
|||
|
|
" )\n",
|
|||
|
|
"\n",
|
|||
|
|
" df[\"flow_lag\"] = (\n",
|
|||
|
|
" df.groupby([\"Registrar Account - ID\",\"Product - Isin\"])\n",
|
|||
|
|
" [\"Quantity - NetFlows\"]\n",
|
|||
|
|
" .shift(1)\n",
|
|||
|
|
" ).fillna(0)\n",
|
|||
|
|
"\n",
|
|||
|
|
" # expected AUM\n",
|
|||
|
|
" df[\"expected_aum\"] = df[\"prev_aum\"] + df[\"flow_lag\"]\n",
|
|||
|
|
"\n",
|
|||
|
|
" # gap\n",
|
|||
|
|
" df[\"gap\"] = df[\"Quantity - AUM\"] - df[\"expected_aum\"]\n",
|
|||
|
|
"\n",
|
|||
|
|
" # build score\n",
|
|||
|
|
" df[\"score\"] = np.exp(-np.abs(df[\"gap\"]) / (df[\"expected_aum\"].abs()+1))\n",
|
|||
|
|
"\n",
|
|||
|
|
" # score timeline\n",
|
|||
|
|
" score_timeline = (\n",
|
|||
|
|
" df.groupby(\"Centralisation Date\")[\"score\"]\n",
|
|||
|
|
" .sum()\n",
|
|||
|
|
" .reset_index()\n",
|
|||
|
|
" .rename(columns={\"Centralisation Date\":\"date\",\n",
|
|||
|
|
" \"score\":\"sum_scores\"})\n",
|
|||
|
|
" .sort_values(\"date\")\n",
|
|||
|
|
" )\n",
|
|||
|
|
"\n",
|
|||
|
|
" code_changes = pd.DataFrame() # placeholder\n",
|
|||
|
|
"\n",
|
|||
|
|
" return score_timeline, code_changes"
|
|||
|
|
]
|
|||
|
|
},
|
|||
|
|
{
|
|||
|
|
"cell_type": "code",
|
|||
|
|
"execution_count": null,
|
|||
|
|
"id": "67787ce5-39ae-4a3a-ba22-e9d38bcaf8d3",
|
|||
|
|
"metadata": {},
|
|||
|
|
"outputs": [],
|
|||
|
|
"source": [
|
|||
|
|
"# ============================================================\n",
|
|||
|
|
"# RUN DATA CHALLENGE ON RAW AND CLEAN DATASETS\n",
|
|||
|
|
"# ============================================================\n",
|
|||
|
|
"\n",
|
|||
|
|
"DATASETS = {\n",
|
|||
|
|
" \"raw\": \"stocks.csv\",\n",
|
|||
|
|
" \"clean\": \"stock_repaired.csv\"\n",
|
|||
|
|
"}\n",
|
|||
|
|
"\n",
|
|||
|
|
"results = {}\n",
|
|||
|
|
"\n",
|
|||
|
|
"for name, file in DATASETS.items():\n",
|
|||
|
|
"\n",
|
|||
|
|
" print(\"\\n====================================\")\n",
|
|||
|
|
" print(\"RUNNING DATA CHALLENGE ON:\", name)\n",
|
|||
|
|
" print(\"====================================\")\n",
|
|||
|
|
"\n",
|
|||
|
|
" # load datasets\n",
|
|||
|
|
" stocks = pd.read_csv(file, low_memory=False)\n",
|
|||
|
|
" flows = pd.read_csv(\"flows.csv\", low_memory=False)\n",
|
|||
|
|
"\n",
|
|||
|
|
" # run the full algorithm\n",
|
|||
|
|
" score_timeline, code_changes = run_data_challenge(stocks, flows)\n",
|
|||
|
|
"\n",
|
|||
|
|
" # store results\n",
|
|||
|
|
" results[name] = score_timeline.copy()\n",
|
|||
|
|
"\n",
|
|||
|
|
"\n",
|
|||
|
|
"# ============================================================\n",
|
|||
|
|
"# BUILD COMPARISON TABLE\n",
|
|||
|
|
"# ============================================================\n",
|
|||
|
|
"\n",
|
|||
|
|
"comparison = (\n",
|
|||
|
|
" results[\"raw\"][[\"date\", \"sum_scores\"]]\n",
|
|||
|
|
" .rename(columns={\"sum_scores\": \"raw_scores\"})\n",
|
|||
|
|
" .merge(\n",
|
|||
|
|
" results[\"clean\"][[\"date\", \"sum_scores\"]]\n",
|
|||
|
|
" .rename(columns={\"sum_scores\": \"clean_scores\"}),\n",
|
|||
|
|
" on=\"date\",\n",
|
|||
|
|
" how=\"outer\"\n",
|
|||
|
|
" )\n",
|
|||
|
|
" .sort_values(\"date\")\n",
|
|||
|
|
")\n",
|
|||
|
|
"\n",
|
|||
|
|
"# improvement from cleaning\n",
|
|||
|
|
"comparison[\"improvement\"] = (\n",
|
|||
|
|
" comparison[\"clean_scores\"] - comparison[\"raw_scores\"]\n",
|
|||
|
|
")\n",
|
|||
|
|
"\n",
|
|||
|
|
"# relative improvement\n",
|
|||
|
|
"comparison[\"relative_improvement\"] = (\n",
|
|||
|
|
" comparison[\"improvement\"] /\n",
|
|||
|
|
" comparison[\"raw_scores\"].replace(0, np.nan)\n",
|
|||
|
|
")\n",
|
|||
|
|
"\n",
|
|||
|
|
"print(\"\\n==============================\")\n",
|
|||
|
|
"print(\"RAW VS CLEAN SCORE COMPARISON\")\n",
|
|||
|
|
"print(\"==============================\")\n",
|
|||
|
|
"\n",
|
|||
|
|
"print(comparison.head())\n",
|
|||
|
|
"print(comparison.tail())"
|
|||
|
|
]
|
|||
|
|
},
|
|||
|
|
{
|
|||
|
|
"cell_type": "code",
|
|||
|
|
"execution_count": null,
|
|||
|
|
"id": "5bdb367f-3764-400a-a20d-793f8d004d82",
|
|||
|
|
"metadata": {},
|
|||
|
|
"outputs": [],
|
|||
|
|
"source": [
|
|||
|
|
"# ============================================================\n",
|
|||
|
|
"# PARAMETERS\n",
|
|||
|
|
"# ============================================================\n",
|
|||
|
|
"\n",
|
|||
|
|
"TARGET_DATE = pd.Timestamp(\"2025-10-31\")\n",
|
|||
|
|
"AUM_THRESHOLD = 5_000_000\n",
|
|||
|
|
"EXCLUDED = [\"OFF DISTRIBUTION\", \"PRIVATE CLIENTS\"]\n",
|
|||
|
|
"\n",
|
|||
|
|
"ALPHA = 5 # penalty strength for accounting error\n",
|
|||
|
|
"\n",
|
|||
|
|
"stocks[\"Centralisation Date\"] = pd.to_datetime(stocks[\"Centralisation Date\"])\n",
|
|||
|
|
"flows[\"Centralisation Date\"] = pd.to_datetime(flows[\"Centralisation Date\"])\n",
|
|||
|
|
"\n",
|
|||
|
|
"stocks[\"Registrar Account - ID\"] = stocks[\"Registrar Account - ID\"].astype(str).str.strip()\n",
|
|||
|
|
"flows[\"Registrar Account - ID\"] = flows[\"Registrar Account - ID\"].astype(str).str.strip()\n",
|
|||
|
|
"\n",
|
|||
|
|
"stocks[\"Product - Isin\"] = stocks[\"Product - Isin\"].astype(str)\n",
|
|||
|
|
"flows[\"Product - Isin\"] = flows[\"Product - Isin\"].astype(str)\n",
|
|||
|
|
"\n",
|
|||
|
|
"# ============================================================\n",
|
|||
|
|
"# REMOVE EXCLUDED ACCOUNTS\n",
|
|||
|
|
"# ============================================================\n",
|
|||
|
|
"\n",
|
|||
|
|
"stocks = stocks[~stocks[\"Registrar Account - ID\"].str.upper().isin(EXCLUDED)]\n",
|
|||
|
|
"flows = flows[~flows[\"Registrar Account - ID\"].str.upper().isin(EXCLUDED)]"
|
|||
|
|
]
|
|||
|
|
},
|
|||
|
|
{
|
|||
|
|
"cell_type": "code",
|
|||
|
|
"execution_count": null,
|
|||
|
|
"id": "b10832d6-988a-42cd-a6b6-4dfb690315b7",
|
|||
|
|
"metadata": {},
|
|||
|
|
"outputs": [],
|
|||
|
|
"source": [
|
|||
|
|
"# ============================================================\n",
|
|||
|
|
"# SELECT UNIVERSE AT TARGET DATE\n",
|
|||
|
|
"# ============================================================\n",
|
|||
|
|
"\n",
|
|||
|
|
"latest = stocks[stocks[\"Centralisation Date\"] == TARGET_DATE]\n",
|
|||
|
|
"\n",
|
|||
|
|
"account_aum = (\n",
|
|||
|
|
" latest.groupby(\"Registrar Account - ID\")[\"Quantity - AUM\"]\n",
|
|||
|
|
" .sum()\n",
|
|||
|
|
" .sort_values(ascending=False)\n",
|
|||
|
|
")\n",
|
|||
|
|
"\n",
|
|||
|
|
"# remove negative values if any\n",
|
|||
|
|
"account_aum = account_aum[account_aum > 0]\n",
|
|||
|
|
"\n",
|
|||
|
|
"# cumulative coverage\n",
|
|||
|
|
"cum_aum = account_aum.cumsum()\n",
|
|||
|
|
"total_aum = account_aum.sum()\n",
|
|||
|
|
"\n",
|
|||
|
|
"coverage = cum_aum / total_aum\n",
|
|||
|
|
"\n",
|
|||
|
|
"# select accounts covering 97% of AUM\n",
|
|||
|
|
"selected_accounts = account_aum[coverage <= 0.97]\n",
|
|||
|
|
"\n",
|
|||
|
|
"# ensure at least one more account\n",
|
|||
|
|
"selected_accounts = account_aum.iloc[:len(selected_accounts)+1]\n",
|
|||
|
|
"\n",
|
|||
|
|
"# weights at t0\n",
|
|||
|
|
"weights = selected_accounts / selected_accounts.sum()\n",
|
|||
|
|
"\n",
|
|||
|
|
"print(\"\\nUNIVERSE SELECTION\")\n",
|
|||
|
|
"print(\"------------------\")\n",
|
|||
|
|
"print(\"Number of selected accounts:\", len(selected_accounts))\n",
|
|||
|
|
"print(\"Coverage:\", selected_accounts.sum() / total_aum)\n",
|
|||
|
|
"\n",
|
|||
|
|
"print(\"\\nTop accounts:\")\n",
|
|||
|
|
"print(selected_accounts.head(10))\n",
|
|||
|
|
"\n",
|
|||
|
|
"# ============================================================\n",
|
|||
|
|
"# BUILD DATASET\n",
|
|||
|
|
"# ============================================================\n",
|
|||
|
|
"\n",
|
|||
|
|
"df = stocks.merge(\n",
|
|||
|
|
" flows,\n",
|
|||
|
|
" on=[\"Registrar Account - ID\",\"Product - Isin\",\"Centralisation Date\"],\n",
|
|||
|
|
" how=\"left\"\n",
|
|||
|
|
")\n",
|
|||
|
|
"\n",
|
|||
|
|
"df[\"Quantity - NetFlows\"] = df[\"Quantity - NetFlows\"].fillna(0)\n",
|
|||
|
|
"\n",
|
|||
|
|
"df = df.sort_values(\n",
|
|||
|
|
" [\"Registrar Account - ID\",\"Product - Isin\",\"Centralisation Date\"]\n",
|
|||
|
|
")\n",
|
|||
|
|
"\n",
|
|||
|
|
"# ============================================================\n",
|
|||
|
|
"# COMPUTE PREVIOUS VALUES\n",
|
|||
|
|
"# ============================================================\n",
|
|||
|
|
"\n",
|
|||
|
|
"df[\"prev_aum\"] = (\n",
|
|||
|
|
" df.groupby([\"Registrar Account - ID\",\"Product - Isin\"])\n",
|
|||
|
|
" [\"Quantity - AUM\"]\n",
|
|||
|
|
" .shift(1)\n",
|
|||
|
|
")\n",
|
|||
|
|
"\n",
|
|||
|
|
"df[\"flow_lag\"] = (\n",
|
|||
|
|
" df.groupby([\"Registrar Account - ID\",\"Product - Isin\"])\n",
|
|||
|
|
" [\"Quantity - NetFlows\"]\n",
|
|||
|
|
" .shift(1)\n",
|
|||
|
|
").fillna(0)\n",
|
|||
|
|
"\n",
|
|||
|
|
"df[\"expected_aum\"] = df[\"prev_aum\"] + df[\"flow_lag\"]\n",
|
|||
|
|
"\n",
|
|||
|
|
"df[\"gap\"] = df[\"Quantity - AUM\"] - df[\"expected_aum\"]\n",
|
|||
|
|
"\n",
|
|||
|
|
"df[\"rel_error\"] = (\n",
|
|||
|
|
" df[\"gap\"].abs() /\n",
|
|||
|
|
" df[\"expected_aum\"].abs().clip(lower=1)\n",
|
|||
|
|
")\n",
|
|||
|
|
"\n",
|
|||
|
|
"# ============================================================\n",
|
|||
|
|
"# ACCOUNT LEVEL ERROR\n",
|
|||
|
|
"# ============================================================\n",
|
|||
|
|
"\n",
|
|||
|
|
"account_error = (\n",
|
|||
|
|
" df.groupby([\"Centralisation Date\",\"Registrar Account - ID\"])\n",
|
|||
|
|
" [\"rel_error\"]\n",
|
|||
|
|
" .mean()\n",
|
|||
|
|
" .reset_index()\n",
|
|||
|
|
")\n",
|
|||
|
|
"\n",
|
|||
|
|
"# ============================================================\n",
|
|||
|
|
"# FAST SCORE BACKWARD PROPAGATION\n",
|
|||
|
|
"# ============================================================\n",
|
|||
|
|
"\n",
|
|||
|
|
"# pivot errors into matrix\n",
|
|||
|
|
"error_matrix = (\n",
|
|||
|
|
" account_error\n",
|
|||
|
|
" .pivot(\n",
|
|||
|
|
" index=\"Centralisation Date\",\n",
|
|||
|
|
" columns=\"Registrar Account - ID\",\n",
|
|||
|
|
" values=\"rel_error\"\n",
|
|||
|
|
" )\n",
|
|||
|
|
")\n",
|
|||
|
|
"\n",
|
|||
|
|
"# keep only selected accounts\n",
|
|||
|
|
"error_matrix = error_matrix[selected_accounts.index]\n",
|
|||
|
|
"\n",
|
|||
|
|
"# fill missing errors\n",
|
|||
|
|
"error_matrix = error_matrix.fillna(0)\n",
|
|||
|
|
"\n",
|
|||
|
|
"# keep only dates <= target\n",
|
|||
|
|
"error_matrix = error_matrix.loc[error_matrix.index <= TARGET_DATE]\n",
|
|||
|
|
"\n",
|
|||
|
|
"# sort dates\n",
|
|||
|
|
"error_matrix = error_matrix.sort_index()\n",
|
|||
|
|
"\n",
|
|||
|
|
"dates = error_matrix.index.values\n",
|
|||
|
|
"\n",
|
|||
|
|
"# convert to numpy for speed\n",
|
|||
|
|
"errors = error_matrix.values\n",
|
|||
|
|
"\n",
|
|||
|
|
"# initial scores\n",
|
|||
|
|
"scores = weights.loc[selected_accounts.index].values\n",
|
|||
|
|
"\n",
|
|||
|
|
"score_history = []\n",
|
|||
|
|
"\n",
|
|||
|
|
"# backward propagation\n",
|
|||
|
|
"for i in range(len(dates)-1, -1, -1):\n",
|
|||
|
|
"\n",
|
|||
|
|
" err = errors[i]\n",
|
|||
|
|
"\n",
|
|||
|
|
" quality = np.exp(-ALPHA * err)\n",
|
|||
|
|
"\n",
|
|||
|
|
" scores = scores * quality\n",
|
|||
|
|
"\n",
|
|||
|
|
" score_history.append({\n",
|
|||
|
|
" \"date\": dates[i],\n",
|
|||
|
|
" \"sum_scores\": scores.sum()\n",
|
|||
|
|
" })\n",
|
|||
|
|
"\n",
|
|||
|
|
"score_timeline = (\n",
|
|||
|
|
" pd.DataFrame(score_history)\n",
|
|||
|
|
" .sort_values(\"date\")\n",
|
|||
|
|
")\n",
|
|||
|
|
"\n",
|
|||
|
|
"# ============================================================\n",
|
|||
|
|
"# NORMALISE\n",
|
|||
|
|
"# ============================================================\n",
|
|||
|
|
"\n",
|
|||
|
|
"initial_score = score_timeline[\"sum_scores\"].iloc[-1]\n",
|
|||
|
|
"\n",
|
|||
|
|
"score_timeline[\"score_retention\"] = (\n",
|
|||
|
|
" score_timeline[\"sum_scores\"] / initial_score\n",
|
|||
|
|
")\n",
|
|||
|
|
"\n",
|
|||
|
|
"# ============================================================\n",
|
|||
|
|
"# RESULTS\n",
|
|||
|
|
"# ============================================================\n",
|
|||
|
|
"\n",
|
|||
|
|
"print(\"\\nSCORE TIMELINE\")\n",
|
|||
|
|
"print(score_timeline.head())\n",
|
|||
|
|
"print(score_timeline.tail())\n",
|
|||
|
|
"\n",
|
|||
|
|
"# ============================================================\n",
|
|||
|
|
"# PLOT\n",
|
|||
|
|
"# ============================================================\n",
|
|||
|
|
"\n",
|
|||
|
|
"plt.figure(figsize=(8,4))\n",
|
|||
|
|
"\n",
|
|||
|
|
"plt.plot(\n",
|
|||
|
|
" score_timeline[\"date\"],\n",
|
|||
|
|
" score_timeline[\"score_retention\"]\n",
|
|||
|
|
")\n",
|
|||
|
|
"\n",
|
|||
|
|
"plt.title(\"Score retention when moving backward in time\")\n",
|
|||
|
|
"plt.xlabel(\"Date\")\n",
|
|||
|
|
"plt.ylabel(\"Σ Scores / Σ Scores (t0)\")\n",
|
|||
|
|
"plt.grid(alpha=0.3)\n",
|
|||
|
|
"\n",
|
|||
|
|
"plt.show()\n",
|
|||
|
|
"\n",
|
|||
|
|
"# ============================================================\n",
|
|||
|
|
"# SAVE\n",
|
|||
|
|
"# ============================================================\n",
|
|||
|
|
"\n",
|
|||
|
|
"score_timeline.to_csv(\n",
|
|||
|
|
" \"data_challenge_score_timeline.csv\",\n",
|
|||
|
|
" index=False\n",
|
|||
|
|
")"
|
|||
|
|
]
|
|||
|
|
},
|
|||
|
|
{
|
|||
|
|
"cell_type": "code",
|
|||
|
|
"execution_count": null,
|
|||
|
|
"id": "86d82213-d155-4feb-817e-ef1df50578e8",
|
|||
|
|
"metadata": {},
|
|||
|
|
"outputs": [],
|
|||
|
|
"source": [
|
|||
|
|
"# ============================================================\n",
|
|||
|
|
"# PRECOMPUTE PORTFOLIOS BY DATE\n",
|
|||
|
|
"# ============================================================\n",
|
|||
|
|
"\n",
|
|||
|
|
"portfolio_by_date = {\n",
|
|||
|
|
" d: g.droplevel(0)\n",
|
|||
|
|
" for d, g in portfolio.groupby(level=0)\n",
|
|||
|
|
"}\n",
|
|||
|
|
"\n",
|
|||
|
|
"flows_by_date = {\n",
|
|||
|
|
" d: g.droplevel(0)\n",
|
|||
|
|
" for d, g in flows_matrix.groupby(level=0)\n",
|
|||
|
|
"}\n",
|
|||
|
|
"\n",
|
|||
|
|
"# ============================================================\n",
|
|||
|
|
"# CODE SURGERY SEARCH (FAST + ROBUST)\n",
|
|||
|
|
"# ============================================================\n",
|
|||
|
|
"\n",
|
|||
|
|
"code_changes = []\n",
|
|||
|
|
"\n",
|
|||
|
|
"for row in ruptures.itertuples():\n",
|
|||
|
|
"\n",
|
|||
|
|
" date = row._1\n",
|
|||
|
|
" acc = row._2\n",
|
|||
|
|
"\n",
|
|||
|
|
" # find previous date\n",
|
|||
|
|
" prev_date = score_timeline.loc[\n",
|
|||
|
|
" score_timeline[\"date\"] < date,\n",
|
|||
|
|
" \"date\"\n",
|
|||
|
|
" ].max()\n",
|
|||
|
|
"\n",
|
|||
|
|
" if pd.isna(prev_date):\n",
|
|||
|
|
" continue\n",
|
|||
|
|
"\n",
|
|||
|
|
" if date not in portfolio_by_date:\n",
|
|||
|
|
" continue\n",
|
|||
|
|
"\n",
|
|||
|
|
" if prev_date not in portfolio_by_date:\n",
|
|||
|
|
" continue\n",
|
|||
|
|
"\n",
|
|||
|
|
" # portfolio at t\n",
|
|||
|
|
" port_today = portfolio_by_date[date]\n",
|
|||
|
|
"\n",
|
|||
|
|
" if acc not in port_today.index:\n",
|
|||
|
|
" continue\n",
|
|||
|
|
"\n",
|
|||
|
|
" port_t = port_today.loc[acc]\n",
|
|||
|
|
"\n",
|
|||
|
|
" # portfolio at t-1\n",
|
|||
|
|
" prev_port = portfolio_by_date[prev_date]\n",
|
|||
|
|
"\n",
|
|||
|
|
" # flows at t-1\n",
|
|||
|
|
" prev_flow = flows_by_date.get(prev_date)\n",
|
|||
|
|
"\n",
|
|||
|
|
" # align flows to portfolio\n",
|
|||
|
|
" if prev_flow is not None:\n",
|
|||
|
|
"\n",
|
|||
|
|
" prev_flow = (\n",
|
|||
|
|
" prev_flow\n",
|
|||
|
|
" .reindex(index=prev_port.index, columns=prev_port.columns)\n",
|
|||
|
|
" .fillna(0)\n",
|
|||
|
|
" )\n",
|
|||
|
|
"\n",
|
|||
|
|
" else:\n",
|
|||
|
|
"\n",
|
|||
|
|
" prev_flow = pd.DataFrame(\n",
|
|||
|
|
" 0,\n",
|
|||
|
|
" index=prev_port.index,\n",
|
|||
|
|
" columns=prev_port.columns\n",
|
|||
|
|
" )\n",
|
|||
|
|
"\n",
|
|||
|
|
" # convert to numpy\n",
|
|||
|
|
" prev_port_mat = prev_port.values\n",
|
|||
|
|
" prev_flow_mat = prev_flow.values\n",
|
|||
|
|
"\n",
|
|||
|
|
" # predicted portfolio\n",
|
|||
|
|
" predicted = prev_port_mat + prev_flow_mat\n",
|
|||
|
|
"\n",
|
|||
|
|
" port_t_vec = port_t.reindex(prev_port.columns).fillna(0).values\n",
|
|||
|
|
"\n",
|
|||
|
|
" # compute error vectorised\n",
|
|||
|
|
" diff = np.abs(predicted - port_t_vec)\n",
|
|||
|
|
"\n",
|
|||
|
|
" errors = diff.sum(axis=1) / (np.abs(port_t_vec).sum() + 1)\n",
|
|||
|
|
"\n",
|
|||
|
|
" if len(errors) == 0:\n",
|
|||
|
|
" continue\n",
|
|||
|
|
"\n",
|
|||
|
|
" best_idx = errors.argmin()\n",
|
|||
|
|
"\n",
|
|||
|
|
" if best_idx >= len(prev_port.index):\n",
|
|||
|
|
" continue\n",
|
|||
|
|
"\n",
|
|||
|
|
" best_code = prev_port.index[best_idx]\n",
|
|||
|
|
" best_error = errors[best_idx]\n",
|
|||
|
|
"\n",
|
|||
|
|
" if best_code != acc and best_error < 0.3:\n",
|
|||
|
|
"\n",
|
|||
|
|
" code_changes.append({\n",
|
|||
|
|
" \"date\": date,\n",
|
|||
|
|
" \"old_code\": acc,\n",
|
|||
|
|
" \"new_code\": best_code,\n",
|
|||
|
|
" \"portfolio_error\": best_error\n",
|
|||
|
|
" })\n",
|
|||
|
|
"\n",
|
|||
|
|
"\n",
|
|||
|
|
"# ============================================================\n",
|
|||
|
|
"# RESULTS\n",
|
|||
|
|
"# ============================================================\n",
|
|||
|
|
"\n",
|
|||
|
|
"code_changes = pd.DataFrame(code_changes)\n",
|
|||
|
|
"\n",
|
|||
|
|
"print(\"\\nDetected distributor code changes:\")\n",
|
|||
|
|
"print(code_changes.head())\n",
|
|||
|
|
"\n",
|
|||
|
|
"code_changes.to_csv(\"detected_code_changes.csv\", index=False)"
|
|||
|
|
]
|
|||
|
|
},
|
|||
|
|
{
|
|||
|
|
"cell_type": "code",
|
|||
|
|
"execution_count": null,
|
|||
|
|
"id": "452b8321-26c5-4229-9992-43c38eb5253f",
|
|||
|
|
"metadata": {},
|
|||
|
|
"outputs": [],
|
|||
|
|
"source": [
|
|||
|
|
"def detect_code_changes(portfolio, flows_matrix, ruptures, score_timeline):\n",
|
|||
|
|
"\n",
|
|||
|
|
" # ============================================================\n",
|
|||
|
|
" # PRECOMPUTE PORTFOLIOS BY DATE\n",
|
|||
|
|
" # ============================================================\n",
|
|||
|
|
"\n",
|
|||
|
|
" portfolio_by_date = {\n",
|
|||
|
|
" d: g.droplevel(0)\n",
|
|||
|
|
" for d, g in portfolio.groupby(level=0)\n",
|
|||
|
|
" }\n",
|
|||
|
|
"\n",
|
|||
|
|
" flows_by_date = {\n",
|
|||
|
|
" d: g.droplevel(0)\n",
|
|||
|
|
" for d, g in flows_matrix.groupby(level=0)\n",
|
|||
|
|
" }\n",
|
|||
|
|
"\n",
|
|||
|
|
" # ============================================================\n",
|
|||
|
|
" # CODE SURGERY SEARCH\n",
|
|||
|
|
" # ============================================================\n",
|
|||
|
|
"\n",
|
|||
|
|
" code_changes = []\n",
|
|||
|
|
"\n",
|
|||
|
|
" for row in ruptures.itertuples():\n",
|
|||
|
|
"\n",
|
|||
|
|
" date = row._1\n",
|
|||
|
|
" acc = row._2\n",
|
|||
|
|
"\n",
|
|||
|
|
" # find previous date\n",
|
|||
|
|
" prev_date = score_timeline.loc[\n",
|
|||
|
|
" score_timeline[\"date\"] < date,\n",
|
|||
|
|
" \"date\"\n",
|
|||
|
|
" ].max()\n",
|
|||
|
|
"\n",
|
|||
|
|
" if pd.isna(prev_date):\n",
|
|||
|
|
" continue\n",
|
|||
|
|
"\n",
|
|||
|
|
" if date not in portfolio_by_date:\n",
|
|||
|
|
" continue\n",
|
|||
|
|
"\n",
|
|||
|
|
" if prev_date not in portfolio_by_date:\n",
|
|||
|
|
" continue\n",
|
|||
|
|
"\n",
|
|||
|
|
" # portfolio at t\n",
|
|||
|
|
" port_today = portfolio_by_date[date]\n",
|
|||
|
|
"\n",
|
|||
|
|
" if acc not in port_today.index:\n",
|
|||
|
|
" continue\n",
|
|||
|
|
"\n",
|
|||
|
|
" port_t = port_today.loc[acc]\n",
|
|||
|
|
"\n",
|
|||
|
|
" # portfolio at t-1\n",
|
|||
|
|
" prev_port = portfolio_by_date[prev_date]\n",
|
|||
|
|
"\n",
|
|||
|
|
" # flows at t-1\n",
|
|||
|
|
" prev_flow = flows_by_date.get(prev_date)\n",
|
|||
|
|
"\n",
|
|||
|
|
" # align flows to portfolio\n",
|
|||
|
|
" if prev_flow is not None:\n",
|
|||
|
|
"\n",
|
|||
|
|
" prev_flow = (\n",
|
|||
|
|
" prev_flow\n",
|
|||
|
|
" .reindex(index=prev_port.index, columns=prev_port.columns)\n",
|
|||
|
|
" .fillna(0)\n",
|
|||
|
|
" )\n",
|
|||
|
|
"\n",
|
|||
|
|
" else:\n",
|
|||
|
|
"\n",
|
|||
|
|
" prev_flow = pd.DataFrame(\n",
|
|||
|
|
" 0,\n",
|
|||
|
|
" index=prev_port.index,\n",
|
|||
|
|
" columns=prev_port.columns\n",
|
|||
|
|
" )\n",
|
|||
|
|
"\n",
|
|||
|
|
" # convert to numpy\n",
|
|||
|
|
" prev_port_mat = prev_port.values\n",
|
|||
|
|
" prev_flow_mat = prev_flow.values\n",
|
|||
|
|
"\n",
|
|||
|
|
" # predicted portfolio\n",
|
|||
|
|
" predicted = prev_port_mat + prev_flow_mat\n",
|
|||
|
|
"\n",
|
|||
|
|
" port_t_vec = port_t.reindex(prev_port.columns).fillna(0).values\n",
|
|||
|
|
"\n",
|
|||
|
|
" # compute error vectorised\n",
|
|||
|
|
" diff = np.abs(predicted - port_t_vec)\n",
|
|||
|
|
"\n",
|
|||
|
|
" errors = diff.sum(axis=1) / (np.abs(port_t_vec).sum() + 1)\n",
|
|||
|
|
"\n",
|
|||
|
|
" if len(errors) == 0:\n",
|
|||
|
|
" continue\n",
|
|||
|
|
"\n",
|
|||
|
|
" best_idx = errors.argmin()\n",
|
|||
|
|
"\n",
|
|||
|
|
" if best_idx >= len(prev_port.index):\n",
|
|||
|
|
" continue\n",
|
|||
|
|
"\n",
|
|||
|
|
" best_code = prev_port.index[best_idx]\n",
|
|||
|
|
" best_error = errors[best_idx]\n",
|
|||
|
|
"\n",
|
|||
|
|
" if best_code != acc and best_error < 0.3:\n",
|
|||
|
|
"\n",
|
|||
|
|
" code_changes.append({\n",
|
|||
|
|
" \"date\": date,\n",
|
|||
|
|
" \"old_code\": acc,\n",
|
|||
|
|
" \"new_code\": best_code,\n",
|
|||
|
|
" \"portfolio_error\": best_error\n",
|
|||
|
|
" })\n",
|
|||
|
|
"\n",
|
|||
|
|
" # ============================================================\n",
|
|||
|
|
" # RESULTS\n",
|
|||
|
|
" # ============================================================\n",
|
|||
|
|
"\n",
|
|||
|
|
" code_changes = pd.DataFrame(code_changes)\n",
|
|||
|
|
"\n",
|
|||
|
|
" print(\"\\nDetected distributor code changes:\")\n",
|
|||
|
|
" print(code_changes.head())\n",
|
|||
|
|
"\n",
|
|||
|
|
" # nouveau nom de fichier\n",
|
|||
|
|
" code_changes.to_csv(\"detected_code_changes_filtered.csv\", index=False)\n",
|
|||
|
|
"\n",
|
|||
|
|
" return code_changes\n",
|
|||
|
|
"\n",
|
|||
|
|
"code_changes = detect_code_changes(\n",
|
|||
|
|
" portfolio,\n",
|
|||
|
|
" flows_matrix,\n",
|
|||
|
|
" ruptures,\n",
|
|||
|
|
" score_timeline\n",
|
|||
|
|
")"
|
|||
|
|
]
|
|||
|
|
},
|
|||
|
|
{
|
|||
|
|
"cell_type": "code",
|
|||
|
|
"execution_count": 4,
|
|||
|
|
"id": "068da1e3-9de7-49d1-bda4-663b02f6d76a",
|
|||
|
|
"metadata": {},
|
|||
|
|
"outputs": [],
|
|||
|
|
"source": [
|
|||
|
|
"def detect_code_changes_fast(portfolio, flows_matrix, ruptures, score_timeline):\n",
|
|||
|
|
"\n",
|
|||
|
|
" # ============================================================\n",
|
|||
|
|
" # PRECOMPUTE PORTFOLIOS BY DATE\n",
|
|||
|
|
" # ============================================================\n",
|
|||
|
|
"\n",
|
|||
|
|
" portfolio_by_date = {\n",
|
|||
|
|
" d: g.droplevel(0)\n",
|
|||
|
|
" for d, g in portfolio.groupby(level=0)\n",
|
|||
|
|
" }\n",
|
|||
|
|
"\n",
|
|||
|
|
" flows_by_date = {\n",
|
|||
|
|
" d: g.droplevel(0)\n",
|
|||
|
|
" for d, g in flows_matrix.groupby(level=0)\n",
|
|||
|
|
" }\n",
|
|||
|
|
"\n",
|
|||
|
|
" # ============================================================\n",
|
|||
|
|
" # PRECOMPUTE PREVIOUS DATES\n",
|
|||
|
|
" # ============================================================\n",
|
|||
|
|
"\n",
|
|||
|
|
" dates = sorted(score_timeline[\"date\"].unique())\n",
|
|||
|
|
"\n",
|
|||
|
|
" prev_date_map = {\n",
|
|||
|
|
" dates[i]: dates[i - 1] if i > 0 else None\n",
|
|||
|
|
" for i in range(len(dates))\n",
|
|||
|
|
" }\n",
|
|||
|
|
"\n",
|
|||
|
|
" # ============================================================\n",
|
|||
|
|
" # CODE SURGERY SEARCH\n",
|
|||
|
|
" # ============================================================\n",
|
|||
|
|
"\n",
|
|||
|
|
" code_changes = []\n",
|
|||
|
|
"\n",
|
|||
|
|
" from tqdm import tqdm\n",
|
|||
|
|
" \n",
|
|||
|
|
" for row in tqdm(ruptures.itertuples(), total=len(ruptures)):\n",
|
|||
|
|
" date = row._1\n",
|
|||
|
|
" acc = row._2\n",
|
|||
|
|
"\n",
|
|||
|
|
" prev_date = prev_date_map.get(date)\n",
|
|||
|
|
"\n",
|
|||
|
|
" if prev_date is None:\n",
|
|||
|
|
" continue\n",
|
|||
|
|
"\n",
|
|||
|
|
" if date not in portfolio_by_date:\n",
|
|||
|
|
" continue\n",
|
|||
|
|
"\n",
|
|||
|
|
" if prev_date not in portfolio_by_date:\n",
|
|||
|
|
" continue\n",
|
|||
|
|
"\n",
|
|||
|
|
" port_today = portfolio_by_date[date]\n",
|
|||
|
|
"\n",
|
|||
|
|
" if acc not in port_today.index:\n",
|
|||
|
|
" continue\n",
|
|||
|
|
"\n",
|
|||
|
|
" port_t = port_today.loc[acc]\n",
|
|||
|
|
"\n",
|
|||
|
|
" prev_port = portfolio_by_date[prev_date]\n",
|
|||
|
|
"\n",
|
|||
|
|
" # ========================================================\n",
|
|||
|
|
" # LIMIT CANDIDATES (shared ISINs)\n",
|
|||
|
|
" # ========================================================\n",
|
|||
|
|
"\n",
|
|||
|
|
" held_isins = port_t[port_t > 0].index\n",
|
|||
|
|
"\n",
|
|||
|
|
" if len(held_isins) == 0:\n",
|
|||
|
|
" continue\n",
|
|||
|
|
"\n",
|
|||
|
|
" candidate_mask = prev_port[held_isins].sum(axis=1) > 0\n",
|
|||
|
|
"\n",
|
|||
|
|
" candidates = prev_port.index[candidate_mask]\n",
|
|||
|
|
"\n",
|
|||
|
|
" if len(candidates) == 0:\n",
|
|||
|
|
" continue\n",
|
|||
|
|
"\n",
|
|||
|
|
" prev_port = prev_port.loc[candidates]\n",
|
|||
|
|
"\n",
|
|||
|
|
" prev_flow = flows_by_date.get(prev_date)\n",
|
|||
|
|
"\n",
|
|||
|
|
" if prev_flow is not None:\n",
|
|||
|
|
"\n",
|
|||
|
|
" prev_flow = prev_flow.reindex(\n",
|
|||
|
|
" index=candidates,\n",
|
|||
|
|
" columns=prev_port.columns\n",
|
|||
|
|
" ).fillna(0)\n",
|
|||
|
|
"\n",
|
|||
|
|
" else:\n",
|
|||
|
|
"\n",
|
|||
|
|
" prev_flow = pd.DataFrame(\n",
|
|||
|
|
" 0,\n",
|
|||
|
|
" index=candidates,\n",
|
|||
|
|
" columns=prev_port.columns\n",
|
|||
|
|
" )\n",
|
|||
|
|
"\n",
|
|||
|
|
" # ========================================================\n",
|
|||
|
|
" # VECTORISED ERROR COMPUTATION\n",
|
|||
|
|
" # ========================================================\n",
|
|||
|
|
"\n",
|
|||
|
|
" prev_port_mat = prev_port.values\n",
|
|||
|
|
" prev_flow_mat = prev_flow.values\n",
|
|||
|
|
"\n",
|
|||
|
|
" predicted = prev_port_mat + prev_flow_mat\n",
|
|||
|
|
"\n",
|
|||
|
|
" port_t_vec = port_t.reindex(prev_port.columns).fillna(0).values\n",
|
|||
|
|
"\n",
|
|||
|
|
" diff = np.abs(predicted - port_t_vec)\n",
|
|||
|
|
"\n",
|
|||
|
|
" errors = diff.sum(axis=1) / (np.abs(port_t_vec).sum() + 1)\n",
|
|||
|
|
"\n",
|
|||
|
|
" if len(errors) == 0:\n",
|
|||
|
|
" continue\n",
|
|||
|
|
"\n",
|
|||
|
|
" best_idx = errors.argmin()\n",
|
|||
|
|
"\n",
|
|||
|
|
" best_code = prev_port.index[best_idx]\n",
|
|||
|
|
" best_error = errors[best_idx]\n",
|
|||
|
|
"\n",
|
|||
|
|
" if best_code != acc and best_error < 0.3:\n",
|
|||
|
|
"\n",
|
|||
|
|
" code_changes.append({\n",
|
|||
|
|
" \"date\": date,\n",
|
|||
|
|
" \"old_code\": acc,\n",
|
|||
|
|
" \"new_code\": best_code,\n",
|
|||
|
|
" \"portfolio_error\": best_error\n",
|
|||
|
|
" })\n",
|
|||
|
|
"\n",
|
|||
|
|
" # ============================================================\n",
|
|||
|
|
" # RESULTS\n",
|
|||
|
|
" # ============================================================\n",
|
|||
|
|
"\n",
|
|||
|
|
" code_changes = pd.DataFrame(code_changes)\n",
|
|||
|
|
"\n",
|
|||
|
|
" print(\"\\nDetected distributor code changes:\")\n",
|
|||
|
|
" print(code_changes.head())\n",
|
|||
|
|
"\n",
|
|||
|
|
" code_changes.to_csv(\"detected_code_changes_fast.csv\", index=False)\n",
|
|||
|
|
"\n",
|
|||
|
|
" return code_changes"
|
|||
|
|
]
|
|||
|
|
},
|
|||
|
|
{
|
|||
|
|
"cell_type": "code",
|
|||
|
|
"execution_count": 3,
|
|||
|
|
"id": "2b332049-db18-470a-9249-248f01e8ca36",
|
|||
|
|
"metadata": {},
|
|||
|
|
"outputs": [
|
|||
|
|
{
|
|||
|
|
"name": "stderr",
|
|||
|
|
"output_type": "stream",
|
|||
|
|
"text": [
|
|||
|
|
" 11%|█ | 2068/18582 [00:24<03:14, 85.09it/s] \n"
|
|||
|
|
]
|
|||
|
|
},
|
|||
|
|
{
|
|||
|
|
"ename": "KeyboardInterrupt",
|
|||
|
|
"evalue": "",
|
|||
|
|
"output_type": "error",
|
|||
|
|
"traceback": [
|
|||
|
|
"\u001b[31m---------------------------------------------------------------------------\u001b[39m",
|
|||
|
|
"\u001b[31mKeyboardInterrupt\u001b[39m Traceback (most recent call last)",
|
|||
|
|
"\u001b[36mCell\u001b[39m\u001b[36m \u001b[39m\u001b[32mIn[3]\u001b[39m\u001b[32m, line 145\u001b[39m\n\u001b[32m 138\u001b[39m \u001b[38;5;28;01mreturn\u001b[39;00m code_changes\n\u001b[32m 141\u001b[39m \u001b[38;5;66;03m# ============================================================\u001b[39;00m\n\u001b[32m 142\u001b[39m \u001b[38;5;66;03m# RUN\u001b[39;00m\n\u001b[32m 143\u001b[39m \u001b[38;5;66;03m# ============================================================\u001b[39;00m\n\u001b[32m--> \u001b[39m\u001b[32m145\u001b[39m code_changes = \u001b[43mdetect_code_changes_fast\u001b[49m\u001b[43m(\u001b[49m\n\u001b[32m 146\u001b[39m \u001b[43m \u001b[49m\u001b[43mportfolio\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 147\u001b[39m \u001b[43m \u001b[49m\u001b[43mflows_matrix\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 148\u001b[39m \u001b[43m \u001b[49m\u001b[43mruptures\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 149\u001b[39m \u001b[43m \u001b[49m\u001b[43mscore_timeline\u001b[49m\n\u001b[32m 150\u001b[39m \u001b[43m)\u001b[49m\n",
|
|||
|
|
"\u001b[36mCell\u001b[39m\u001b[36m \u001b[39m\u001b[32mIn[3]\u001b[39m\u001b[32m, line 69\u001b[39m, in \u001b[36mdetect_code_changes_fast\u001b[39m\u001b[34m(portfolio, flows_matrix, ruptures, score_timeline)\u001b[39m\n\u001b[32m 66\u001b[39m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28mlen\u001b[39m(held_isins) == \u001b[32m0\u001b[39m:\n\u001b[32m 67\u001b[39m \u001b[38;5;28;01mcontinue\u001b[39;00m\n\u001b[32m---> \u001b[39m\u001b[32m69\u001b[39m candidate_mask = \u001b[43mprev_port\u001b[49m\u001b[43m[\u001b[49m\u001b[43mheld_isins\u001b[49m\u001b[43m]\u001b[49m\u001b[43m.\u001b[49m\u001b[43msum\u001b[49m\u001b[43m(\u001b[49m\u001b[43maxis\u001b[49m\u001b[43m=\u001b[49m\u001b[32;43m1\u001b[39;49m\u001b[43m)\u001b[49m > \u001b[32m0\u001b[39m\n\u001b[32m 71\u001b[39m candidates = prev_port.index[candidate_mask]\n\u001b[32m 73\u001b[39m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28mlen\u001b[39m(candidates) == \u001b[32m0\u001b[39m:\n",
|
|||
|
|
"\u001b[36mFile \u001b[39m\u001b[32m/opt/python/lib/python3.13/site-packages/pandas/core/frame.py:11697\u001b[39m, in \u001b[36mDataFrame.sum\u001b[39m\u001b[34m(self, axis, skipna, numeric_only, min_count, **kwargs)\u001b[39m\n\u001b[32m 11688\u001b[39m \u001b[38;5;129m@doc\u001b[39m(make_doc(\u001b[33m\"\u001b[39m\u001b[33msum\u001b[39m\u001b[33m\"\u001b[39m, ndim=\u001b[32m2\u001b[39m))\n\u001b[32m 11689\u001b[39m \u001b[38;5;28;01mdef\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[34msum\u001b[39m(\n\u001b[32m 11690\u001b[39m \u001b[38;5;28mself\u001b[39m,\n\u001b[32m (...)\u001b[39m\u001b[32m 11695\u001b[39m **kwargs,\n\u001b[32m 11696\u001b[39m ):\n\u001b[32m> \u001b[39m\u001b[32m11697\u001b[39m result = \u001b[38;5;28;43msuper\u001b[39;49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m\u001b[43m.\u001b[49m\u001b[43msum\u001b[49m\u001b[43m(\u001b[49m\u001b[43maxis\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mskipna\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mnumeric_only\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mmin_count\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43m*\u001b[49m\u001b[43m*\u001b[49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n\u001b[32m 11698\u001b[39m \u001b[38;5;28;01mreturn\u001b[39;00m result.__finalize__(\u001b[38;5;28mself\u001b[39m, method=\u001b[33m\"\u001b[39m\u001b[33msum\u001b[39m\u001b[33m\"\u001b[39m)\n",
|
|||
|
|
"\u001b[36mFile \u001b[39m\u001b[32m/opt/python/lib/python3.13/site-packages/pandas/core/generic.py:12571\u001b[39m, in \u001b[36mNDFrame.sum\u001b[39m\u001b[34m(self, axis, skipna, numeric_only, min_count, **kwargs)\u001b[39m\n\u001b[32m 12563\u001b[39m \u001b[38;5;28;01mdef\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[34msum\u001b[39m(\n\u001b[32m 12564\u001b[39m \u001b[38;5;28mself\u001b[39m,\n\u001b[32m 12565\u001b[39m axis: Axis | \u001b[38;5;28;01mNone\u001b[39;00m = \u001b[32m0\u001b[39m,\n\u001b[32m (...)\u001b[39m\u001b[32m 12569\u001b[39m **kwargs,\n\u001b[32m 12570\u001b[39m ):\n\u001b[32m> \u001b[39m\u001b[32m12571\u001b[39m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28;43mself\u001b[39;49m\u001b[43m.\u001b[49m\u001b[43m_min_count_stat_function\u001b[49m\u001b[43m(\u001b[49m\n\u001b[32m 12572\u001b[39m \u001b[43m \u001b[49m\u001b[33;43m\"\u001b[39;49m\u001b[33;43msum\u001b[39;49m\u001b[33;43m\"\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mnanops\u001b[49m\u001b[43m.\u001b[49m\u001b[43mnansum\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43maxis\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mskipna\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mnumeric_only\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mmin_count\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43m*\u001b[49m\u001b[43m*\u001b[49m\u001b[43mkwargs\u001b[49m\n\u001b[32m 12573\u001b[39m \u001b[43m \u001b[49m\u001b[43m)\u001b[49m\n",
|
|||
|
|
"\u001b[36mFile \u001b[39m\u001b[32m/opt/python/lib/python3.13/site-packages/pandas/core/generic.py:12554\u001b[39m, in \u001b[36mNDFrame._min_count_stat_function\u001b[39m\u001b[34m(self, name, func, axis, skipna, numeric_only, min_count, **kwargs)\u001b[39m\n\u001b[32m 12551\u001b[39m \u001b[38;5;28;01melif\u001b[39;00m axis \u001b[38;5;129;01mis\u001b[39;00m lib.no_default:\n\u001b[32m 12552\u001b[39m axis = \u001b[32m0\u001b[39m\n\u001b[32m> \u001b[39m\u001b[32m12554\u001b[39m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28;43mself\u001b[39;49m\u001b[43m.\u001b[49m\u001b[43m_reduce\u001b[49m\u001b[43m(\u001b[49m\n\u001b[32m 12555\u001b[39m \u001b[43m \u001b[49m\u001b[43mfunc\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 12556\u001b[39m \u001b[43m \u001b[49m\u001b[43mname\u001b[49m\u001b[43m=\u001b[49m\u001b[43mname\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 12557\u001b[39m \u001b[43m \u001b[49m\u001b[43maxis\u001b[49m\u001b[43m=\u001b[49m\u001b[43maxis\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 12558\u001b[39m \u001b[43m \u001b[49m\u001b[43mskipna\u001b[49m\u001b[43m=\u001b[49m\u001b[43mskipna\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 12559\u001b[39m \u001b[43m \u001b[49m\u001b[43mnumeric_only\u001b[49m\u001b[43m=\u001b[49m\u001b[43mnumeric_only\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 12560\u001b[39m \u001b[43m \u001b[49m\u001b[43mmin_count\u001b[49m\u001b[43m=\u001b[49m\u001b[43mmin_count\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 12561\u001b[39m \u001b[43m\u001b[49m\u001b[43m)\u001b[49m\n",
|
|||
|
|
"\u001b[36mFile \u001b[39m\u001b[32m/opt/python/lib/python3.13/site-packages/pandas/core/frame.py:11593\u001b[39m, in \u001b[36mDataFrame._reduce\u001b[39m\u001b[34m(self, op, name, axis, skipna, numeric_only, filter_type, **kwds)\u001b[39m\n\u001b[32m 11591\u001b[39m \u001b[38;5;28;01mif\u001b[39;00m out_dtype \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m \u001b[38;5;129;01mand\u001b[39;00m out.dtype != \u001b[33m\"\u001b[39m\u001b[33mboolean\u001b[39m\u001b[33m\"\u001b[39m:\n\u001b[32m 11592\u001b[39m out = out.astype(out_dtype)\n\u001b[32m> \u001b[39m\u001b[32m11593\u001b[39m \u001b[38;5;28;01melif\u001b[39;00m (\u001b[43mdf\u001b[49m\u001b[43m.\u001b[49m\u001b[43m_mgr\u001b[49m\u001b[43m.\u001b[49m\u001b[43mget_dtypes\u001b[49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m == \u001b[38;5;28mobject\u001b[39m).any() \u001b[38;5;129;01mand\u001b[39;00m name \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;129;01min\u001b[39;00m [\u001b[33m\"\u001b[39m\u001b[33many\u001b[39m\u001b[33m\"\u001b[39m, \u001b[33m\"\u001b[39m\u001b[33mall\u001b[39m\u001b[33m\"\u001b[39m]:\n\u001b[32m 11594\u001b[39m out = out.astype(\u001b[38;5;28mobject\u001b[39m)\n\u001b[32m 11595\u001b[39m \u001b[38;5;28;01melif\u001b[39;00m \u001b[38;5;28mlen\u001b[39m(\u001b[38;5;28mself\u001b[39m) == \u001b[32m0\u001b[39m \u001b[38;5;129;01mand\u001b[39;00m out.dtype == \u001b[38;5;28mobject\u001b[39m \u001b[38;5;129;01mand\u001b[39;00m name \u001b[38;5;129;01min\u001b[39;00m (\u001b[33m\"\u001b[39m\u001b[33msum\u001b[39m\u001b[33m\"\u001b[39m, \u001b[33m\"\u001b[39m\u001b[33mprod\u001b[39m\u001b[33m\"\u001b[39m):\n\u001b[32m 11596\u001b[39m \u001b[38;5;66;03m# Even if we are object dtype, follow numpy and return\u001b[39;00m\n\u001b[32m 11597\u001b[39m \u001b[38;5;66;03m# float64, see test_apply_funcs_over_empty\u001b[39;00m\n",
|
|||
|
|
"\u001b[36mFile \u001b[39m\u001b[32m/opt/python/lib/python3.13/site-packages/pandas/core/internals/managers.py:289\u001b[39m, in \u001b[36mBaseBlockManager.get_dtypes\u001b[39m\u001b[34m(self)\u001b[39m\n\u001b[32m 287\u001b[39m \u001b[38;5;28;01mdef\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[34mget_dtypes\u001b[39m(\u001b[38;5;28mself\u001b[39m) -> npt.NDArray[np.object_]:\n\u001b[32m 288\u001b[39m dtypes = np.array([blk.dtype \u001b[38;5;28;01mfor\u001b[39;00m blk \u001b[38;5;129;01min\u001b[39;00m \u001b[38;5;28mself\u001b[39m.blocks], dtype=\u001b[38;5;28mobject\u001b[39m)\n\u001b[32m--> \u001b[39m\u001b[32m289\u001b[39m \u001b[38;5;28;01mreturn\u001b[39;00m dtypes.take(\u001b[38;5;28;43mself\u001b[39;49m\u001b[43m.\u001b[49m\u001b[43mblknos\u001b[49m)\n",
|
|||
|
|
"\u001b[36mFile \u001b[39m\u001b[32m/opt/python/lib/python3.13/site-packages/pandas/core/internals/managers.py:192\u001b[39m, in \u001b[36mBaseBlockManager.blknos\u001b[39m\u001b[34m(self)\u001b[39m\n\u001b[32m 182\u001b[39m \u001b[38;5;250m\u001b[39m\u001b[33;03m\"\"\"\u001b[39;00m\n\u001b[32m 183\u001b[39m \u001b[33;03mSuppose we want to find the array corresponding to our i'th column.\u001b[39;00m\n\u001b[32m 184\u001b[39m \n\u001b[32m (...)\u001b[39m\u001b[32m 188\u001b[39m \u001b[33;03mself.blocks[self.blknos[i]]\u001b[39;00m\n\u001b[32m 189\u001b[39m \u001b[33;03m\"\"\"\u001b[39;00m\n\u001b[32m 190\u001b[39m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28mself\u001b[39m._blknos \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m:\n\u001b[32m 191\u001b[39m \u001b[38;5;66;03m# Note: these can be altered by other BlockManager methods.\u001b[39;00m\n\u001b[32m--> \u001b[39m\u001b[32m192\u001b[39m \u001b[38;5;28;43mself\u001b[39;49m\u001b[43m.\u001b[49m\u001b[43m_rebuild_blknos_and_blklocs\u001b[49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m\n\u001b[32m 194\u001b[39m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28mself\u001b[39m._blknos\n",
|
|||
|
|
"\u001b[31mKeyboardInterrupt\u001b[39m: "
|
|||
|
|
]
|
|||
|
|
}
|
|||
|
|
],
|
|||
|
|
"source": [
|
|||
|
|
"# ============================================================\n",
|
|||
|
|
"# RUN\n",
|
|||
|
|
"# ============================================================\n",
|
|||
|
|
"\n",
|
|||
|
|
"code_changes = detect_code_changes_fast(\n",
|
|||
|
|
" portfolio,\n",
|
|||
|
|
" flows_matrix,\n",
|
|||
|
|
" ruptures,\n",
|
|||
|
|
" score_timeline\n",
|
|||
|
|
")"
|
|||
|
|
]
|
|||
|
|
},
|
|||
|
|
{
|
|||
|
|
"cell_type": "code",
|
|||
|
|
"execution_count": 5,
|
|||
|
|
"id": "3054206e-13b8-4931-a1d8-e3cbb97eab7a",
|
|||
|
|
"metadata": {},
|
|||
|
|
"outputs": [
|
|||
|
|
{
|
|||
|
|
"name": "stdout",
|
|||
|
|
"output_type": "stream",
|
|||
|
|
"text": [
|
|||
|
|
"\n",
|
|||
|
|
"====================================\n",
|
|||
|
|
"RUNNING DATA CHALLENGE ON: raw\n",
|
|||
|
|
"====================================\n"
|
|||
|
|
]
|
|||
|
|
},
|
|||
|
|
{
|
|||
|
|
"name": "stderr",
|
|||
|
|
"output_type": "stream",
|
|||
|
|
"text": [
|
|||
|
|
"100%|██████████| 18582/18582 [02:27<00:00, 126.15it/s]\n"
|
|||
|
|
]
|
|||
|
|
},
|
|||
|
|
{
|
|||
|
|
"name": "stdout",
|
|||
|
|
"output_type": "stream",
|
|||
|
|
"text": [
|
|||
|
|
"\n",
|
|||
|
|
"Detected distributor code changes:\n",
|
|||
|
|
" date old_code new_code portfolio_error\n",
|
|||
|
|
"0 2015-02-28 200001285 200001992 0.090028\n",
|
|||
|
|
"1 2015-02-28 200001771 420304 0.049979\n",
|
|||
|
|
"2 2015-02-28 200001894 366541 0.215146\n",
|
|||
|
|
"3 2015-02-28 200002064 412736 0.210827\n",
|
|||
|
|
"4 2015-02-28 200002109 406337 0.173276\n",
|
|||
|
|
"\n",
|
|||
|
|
"====================================\n",
|
|||
|
|
"RUNNING DATA CHALLENGE ON: clean\n",
|
|||
|
|
"====================================\n"
|
|||
|
|
]
|
|||
|
|
},
|
|||
|
|
{
|
|||
|
|
"name": "stderr",
|
|||
|
|
"output_type": "stream",
|
|||
|
|
"text": [
|
|||
|
|
"100%|██████████| 18521/18521 [02:34<00:00, 119.82it/s]\n"
|
|||
|
|
]
|
|||
|
|
},
|
|||
|
|
{
|
|||
|
|
"name": "stdout",
|
|||
|
|
"output_type": "stream",
|
|||
|
|
"text": [
|
|||
|
|
"\n",
|
|||
|
|
"Detected distributor code changes:\n",
|
|||
|
|
" date old_code new_code portfolio_error\n",
|
|||
|
|
"0 2015-02-28 200001285 200001992 0.090028\n",
|
|||
|
|
"1 2015-02-28 200001771 420304 0.049979\n",
|
|||
|
|
"2 2015-02-28 200001894 366541 0.215146\n",
|
|||
|
|
"3 2015-02-28 200002064 412736 0.210827\n",
|
|||
|
|
"4 2015-02-28 200002109 406337 0.173276\n",
|
|||
|
|
"\n",
|
|||
|
|
"==============================\n",
|
|||
|
|
"RAW VS CLEAN SCORE COMPARISON\n",
|
|||
|
|
"==============================\n",
|
|||
|
|
" date raw_scores clean_scores improvement relative_improvement\n",
|
|||
|
|
"0 2015-01-31 5639.521248 5639.524696 0.003448 6.113170e-07\n",
|
|||
|
|
"1 2015-02-28 5639.521248 5639.524696 0.003448 6.113170e-07\n",
|
|||
|
|
"2 2015-03-31 5656.707028 5656.722266 0.015238 2.693820e-06\n",
|
|||
|
|
"3 2015-04-30 5701.063923 5701.084181 0.020258 3.553334e-06\n",
|
|||
|
|
"4 2015-05-31 5728.963044 5728.994764 0.031720 5.536759e-06\n",
|
|||
|
|
" date raw_scores clean_scores improvement relative_improvement\n",
|
|||
|
|
"125 2025-06-30 11703.280129 11703.402856 0.122727 0.000010\n",
|
|||
|
|
"126 2025-07-31 11855.613232 11855.796565 0.183333 0.000015\n",
|
|||
|
|
"127 2025-08-31 11962.226141 11962.553334 0.327193 0.000027\n",
|
|||
|
|
"128 2025-09-30 12046.634770 12046.966956 0.332185 0.000028\n",
|
|||
|
|
"129 2025-10-31 12189.843400 12190.351082 0.507683 0.000042\n"
|
|||
|
|
]
|
|||
|
|
}
|
|||
|
|
],
|
|||
|
|
"source": [
|
|||
|
|
"# ============================================================\n",
|
|||
|
|
"# RUN DATA CHALLENGE ON RAW AND CLEAN DATASETS\n",
|
|||
|
|
"# ============================================================\n",
|
|||
|
|
"\n",
|
|||
|
|
"DATASETS = {\n",
|
|||
|
|
" \"raw\": \"stocks.csv\",\n",
|
|||
|
|
" \"clean\": \"stock_repaired.csv\"\n",
|
|||
|
|
"}\n",
|
|||
|
|
"\n",
|
|||
|
|
"results = {}\n",
|
|||
|
|
"code_changes_results = {}\n",
|
|||
|
|
"\n",
|
|||
|
|
"for name, file in DATASETS.items():\n",
|
|||
|
|
"\n",
|
|||
|
|
" print(\"\\n====================================\")\n",
|
|||
|
|
" print(\"RUNNING DATA CHALLENGE ON:\", name)\n",
|
|||
|
|
" print(\"====================================\")\n",
|
|||
|
|
"\n",
|
|||
|
|
" # ============================================================\n",
|
|||
|
|
" # LOAD DATA\n",
|
|||
|
|
" # ============================================================\n",
|
|||
|
|
"\n",
|
|||
|
|
" stocks = pd.read_csv(file, low_memory=False)\n",
|
|||
|
|
" flows = pd.read_csv(\"flows.csv\", low_memory=False)\n",
|
|||
|
|
"\n",
|
|||
|
|
" stocks[\"Centralisation Date\"] = pd.to_datetime(stocks[\"Centralisation Date\"])\n",
|
|||
|
|
" flows[\"Centralisation Date\"] = pd.to_datetime(flows[\"Centralisation Date\"])\n",
|
|||
|
|
"\n",
|
|||
|
|
" stocks[\"Registrar Account - ID\"] = stocks[\"Registrar Account - ID\"].astype(str).str.strip()\n",
|
|||
|
|
" flows[\"Registrar Account - ID\"] = flows[\"Registrar Account - ID\"].astype(str).str.strip()\n",
|
|||
|
|
"\n",
|
|||
|
|
" stocks[\"Product - Isin\"] = stocks[\"Product - Isin\"].astype(str)\n",
|
|||
|
|
" flows[\"Product - Isin\"] = flows[\"Product - Isin\"].astype(str)\n",
|
|||
|
|
"\n",
|
|||
|
|
" # ============================================================\n",
|
|||
|
|
" # BUILD DATASET\n",
|
|||
|
|
" # ============================================================\n",
|
|||
|
|
"\n",
|
|||
|
|
" df = stocks.merge(\n",
|
|||
|
|
" flows,\n",
|
|||
|
|
" on=[\"Registrar Account - ID\",\"Product - Isin\",\"Centralisation Date\"],\n",
|
|||
|
|
" how=\"left\"\n",
|
|||
|
|
" )\n",
|
|||
|
|
"\n",
|
|||
|
|
" df[\"Quantity - NetFlows\"] = df[\"Quantity - NetFlows\"].fillna(0)\n",
|
|||
|
|
"\n",
|
|||
|
|
" df = df.sort_values(\n",
|
|||
|
|
" [\"Registrar Account - ID\",\"Product - Isin\",\"Centralisation Date\"]\n",
|
|||
|
|
" )\n",
|
|||
|
|
"\n",
|
|||
|
|
" # ============================================================\n",
|
|||
|
|
" # ACCOUNTING IDENTITY\n",
|
|||
|
|
" # ============================================================\n",
|
|||
|
|
"\n",
|
|||
|
|
" df[\"prev_aum\"] = (\n",
|
|||
|
|
" df.groupby([\"Registrar Account - ID\",\"Product - Isin\"])\n",
|
|||
|
|
" [\"Quantity - AUM\"]\n",
|
|||
|
|
" .shift(1)\n",
|
|||
|
|
" )\n",
|
|||
|
|
"\n",
|
|||
|
|
" df[\"flow_lag\"] = (\n",
|
|||
|
|
" df.groupby([\"Registrar Account - ID\",\"Product - Isin\"])\n",
|
|||
|
|
" [\"Quantity - NetFlows\"]\n",
|
|||
|
|
" .shift(1)\n",
|
|||
|
|
" ).fillna(0)\n",
|
|||
|
|
"\n",
|
|||
|
|
" df[\"expected_aum\"] = df[\"prev_aum\"] + df[\"flow_lag\"]\n",
|
|||
|
|
"\n",
|
|||
|
|
" df[\"gap\"] = df[\"Quantity - AUM\"] - df[\"expected_aum\"]\n",
|
|||
|
|
"\n",
|
|||
|
|
" df[\"rel_error\"] = (\n",
|
|||
|
|
" df[\"gap\"].abs() /\n",
|
|||
|
|
" df[\"expected_aum\"].abs().clip(lower=1)\n",
|
|||
|
|
" )\n",
|
|||
|
|
"\n",
|
|||
|
|
" # ============================================================\n",
|
|||
|
|
" # ACCOUNT LEVEL ERROR\n",
|
|||
|
|
" # ============================================================\n",
|
|||
|
|
"\n",
|
|||
|
|
" account_error = (\n",
|
|||
|
|
" df.groupby([\"Centralisation Date\",\"Registrar Account - ID\"])\n",
|
|||
|
|
" [\"rel_error\"]\n",
|
|||
|
|
" .mean()\n",
|
|||
|
|
" .reset_index()\n",
|
|||
|
|
" )\n",
|
|||
|
|
"\n",
|
|||
|
|
" RUPTURE_THRESHOLD = 0.5\n",
|
|||
|
|
"\n",
|
|||
|
|
" ruptures = account_error[\n",
|
|||
|
|
" account_error[\"rel_error\"] > RUPTURE_THRESHOLD\n",
|
|||
|
|
" ]\n",
|
|||
|
|
"\n",
|
|||
|
|
" # ============================================================\n",
|
|||
|
|
" # BUILD PORTFOLIO MATRICES\n",
|
|||
|
|
" # ============================================================\n",
|
|||
|
|
"\n",
|
|||
|
|
" portfolio = (\n",
|
|||
|
|
" stocks\n",
|
|||
|
|
" .pivot_table(\n",
|
|||
|
|
" index=[\"Centralisation Date\",\"Registrar Account - ID\"],\n",
|
|||
|
|
" columns=\"Product - Isin\",\n",
|
|||
|
|
" values=\"Quantity - AUM\",\n",
|
|||
|
|
" aggfunc=\"sum\"\n",
|
|||
|
|
" )\n",
|
|||
|
|
" .fillna(0)\n",
|
|||
|
|
" )\n",
|
|||
|
|
"\n",
|
|||
|
|
" flows_matrix = (\n",
|
|||
|
|
" flows\n",
|
|||
|
|
" .pivot_table(\n",
|
|||
|
|
" index=[\"Centralisation Date\",\"Registrar Account - ID\"],\n",
|
|||
|
|
" columns=\"Product - Isin\",\n",
|
|||
|
|
" values=\"Quantity - NetFlows\",\n",
|
|||
|
|
" aggfunc=\"sum\"\n",
|
|||
|
|
" )\n",
|
|||
|
|
" .fillna(0)\n",
|
|||
|
|
" )\n",
|
|||
|
|
"\n",
|
|||
|
|
" # ============================================================\n",
|
|||
|
|
" # SCORE COMPUTATION\n",
|
|||
|
|
" # ============================================================\n",
|
|||
|
|
"\n",
|
|||
|
|
" error_matrix = (\n",
|
|||
|
|
" account_error\n",
|
|||
|
|
" .pivot(\n",
|
|||
|
|
" index=\"Centralisation Date\",\n",
|
|||
|
|
" columns=\"Registrar Account - ID\",\n",
|
|||
|
|
" values=\"rel_error\"\n",
|
|||
|
|
" )\n",
|
|||
|
|
" .fillna(0)\n",
|
|||
|
|
" )\n",
|
|||
|
|
"\n",
|
|||
|
|
" dates = error_matrix.index.values\n",
|
|||
|
|
" errors = error_matrix.values\n",
|
|||
|
|
"\n",
|
|||
|
|
" scores = np.ones(errors.shape[1])\n",
|
|||
|
|
"\n",
|
|||
|
|
" score_history = []\n",
|
|||
|
|
"\n",
|
|||
|
|
" for i in range(len(dates)-1, -1, -1):\n",
|
|||
|
|
"\n",
|
|||
|
|
" quality = np.exp(-5 * errors[i])\n",
|
|||
|
|
"\n",
|
|||
|
|
" scores = scores * quality\n",
|
|||
|
|
"\n",
|
|||
|
|
" score_history.append({\n",
|
|||
|
|
" \"date\": dates[i],\n",
|
|||
|
|
" \"sum_scores\": scores.sum()\n",
|
|||
|
|
" })\n",
|
|||
|
|
"\n",
|
|||
|
|
" score_timeline = (\n",
|
|||
|
|
" pd.DataFrame(score_history)\n",
|
|||
|
|
" .sort_values(\"date\")\n",
|
|||
|
|
" )\n",
|
|||
|
|
"\n",
|
|||
|
|
" initial_score = score_timeline[\"sum_scores\"].iloc[-1]\n",
|
|||
|
|
"\n",
|
|||
|
|
" score_timeline[\"score_retention\"] = (\n",
|
|||
|
|
" score_timeline[\"sum_scores\"] / initial_score\n",
|
|||
|
|
" )\n",
|
|||
|
|
"\n",
|
|||
|
|
" results[name] = score_timeline.copy()\n",
|
|||
|
|
"\n",
|
|||
|
|
" # ============================================================\n",
|
|||
|
|
" # CODE SURGERY\n",
|
|||
|
|
" # ============================================================\n",
|
|||
|
|
"\n",
|
|||
|
|
" code_changes = detect_code_changes_fast(\n",
|
|||
|
|
" portfolio,\n",
|
|||
|
|
" flows_matrix,\n",
|
|||
|
|
" ruptures,\n",
|
|||
|
|
" score_timeline\n",
|
|||
|
|
" )\n",
|
|||
|
|
"\n",
|
|||
|
|
" code_changes_results[name] = code_changes\n",
|
|||
|
|
"\n",
|
|||
|
|
"\n",
|
|||
|
|
"# ============================================================\n",
|
|||
|
|
"# BUILD COMPARISON TABLE\n",
|
|||
|
|
"# ============================================================\n",
|
|||
|
|
"\n",
|
|||
|
|
"comparison = (\n",
|
|||
|
|
" results[\"raw\"][[\"date\", \"sum_scores\"]]\n",
|
|||
|
|
" .rename(columns={\"sum_scores\": \"raw_scores\"})\n",
|
|||
|
|
" .merge(\n",
|
|||
|
|
" results[\"clean\"][[\"date\", \"sum_scores\"]]\n",
|
|||
|
|
" .rename(columns={\"sum_scores\": \"clean_scores\"}),\n",
|
|||
|
|
" on=\"date\",\n",
|
|||
|
|
" how=\"outer\"\n",
|
|||
|
|
" )\n",
|
|||
|
|
" .sort_values(\"date\")\n",
|
|||
|
|
")\n",
|
|||
|
|
"\n",
|
|||
|
|
"comparison[\"improvement\"] = (\n",
|
|||
|
|
" comparison[\"clean_scores\"] - comparison[\"raw_scores\"]\n",
|
|||
|
|
")\n",
|
|||
|
|
"\n",
|
|||
|
|
"comparison[\"relative_improvement\"] = (\n",
|
|||
|
|
" comparison[\"improvement\"] /\n",
|
|||
|
|
" comparison[\"raw_scores\"].replace(0, np.nan)\n",
|
|||
|
|
")\n",
|
|||
|
|
"\n",
|
|||
|
|
"print(\"\\n==============================\")\n",
|
|||
|
|
"print(\"RAW VS CLEAN SCORE COMPARISON\")\n",
|
|||
|
|
"print(\"==============================\")\n",
|
|||
|
|
"\n",
|
|||
|
|
"print(comparison.head())\n",
|
|||
|
|
"print(comparison.tail())"
|
|||
|
|
]
|
|||
|
|
},
|
|||
|
|
{
|
|||
|
|
"cell_type": "code",
|
|||
|
|
"execution_count": 7,
|
|||
|
|
"id": "15c60063-b8e5-43f0-a662-67a2f5601408",
|
|||
|
|
"metadata": {},
|
|||
|
|
"outputs": [
|
|||
|
|
{
|
|||
|
|
"name": "stdout",
|
|||
|
|
"output_type": "stream",
|
|||
|
|
"text": [
|
|||
|
|
"TOTAL RAW SCORE : 1102753.964620689\n",
|
|||
|
|
"TOTAL CLEAN SCORE : 1102778.1794964853\n",
|
|||
|
|
"ABSOLUTE IMPROVEMENT : 24.21487579634413\n",
|
|||
|
|
"RELATIVE IMPROVEMENT : 2.1958547938363796e-05\n"
|
|||
|
|
]
|
|||
|
|
}
|
|||
|
|
],
|
|||
|
|
"source": [
|
|||
|
|
"total_raw = comparison[\"raw_scores\"].sum()\n",
|
|||
|
|
"total_clean = comparison[\"clean_scores\"].sum()\n",
|
|||
|
|
"\n",
|
|||
|
|
"absolute_gain = total_clean - total_raw\n",
|
|||
|
|
"relative_gain = absolute_gain / total_raw\n",
|
|||
|
|
"\n",
|
|||
|
|
"print(\"TOTAL RAW SCORE :\", total_raw)\n",
|
|||
|
|
"print(\"TOTAL CLEAN SCORE :\", total_clean)\n",
|
|||
|
|
"\n",
|
|||
|
|
"print(\"ABSOLUTE IMPROVEMENT :\", absolute_gain)\n",
|
|||
|
|
"print(\"RELATIVE IMPROVEMENT :\", relative_gain)"
|
|||
|
|
]
|
|||
|
|
},
|
|||
|
|
{
|
|||
|
|
"cell_type": "code",
|
|||
|
|
"execution_count": 8,
|
|||
|
|
"id": "28f02823-483c-4ed8-bf24-912c5ca7f7d3",
|
|||
|
|
"metadata": {},
|
|||
|
|
"outputs": [
|
|||
|
|
{
|
|||
|
|
"data": {
|
|||
|
|
"image/png": "iVBORw0KGgoAAAANSUhEUgAAA1AAAAHDCAYAAAAqdvv1AAAAOnRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjEwLjgsIGh0dHBzOi8vbWF0cGxvdGxpYi5vcmcvwVt1zgAAAAlwSFlzAAAPYQAAD2EBqD+naQAAf1BJREFUeJzs3Xd4FXXaxvHvnHQCSQikQgihJ7RAKAm9BCJNUCwoqwgoroKKWLGgWBfWio3XXQVUUMFVVFQggvRI772EThIgpELqmfePmCMxgEETTsr9uS6va8/M78x5ZvKcbG5m5jeGaZomIiIiIiIi8qcs9i5ARERERESkolCAEhERERERKSEFKBERERERkRJSgBIRERERESkhBSgREREREZESUoASEREREREpIQUoERERERGRElKAEhERERERKSEFKBERERERkRJSgBIRkVJx1113Ub9+fXuXIX9Tjx496NGjh73LKBWHDx/GMAxmzpz5l94/c+ZMDMPg8OHDpVqXiFRsClAiYnfbt2/npptuIjg4GFdXV+rUqUOfPn1455137F2aiMhfMmfOHN566y17lyEiZUABSkTsas2aNbRr146tW7dyzz338O6773L33XdjsVh4++237V2eXIX//Oc/7N27195liJQLClAilZejvQsQkart5ZdfxtPTk/Xr1+Pl5VVkXVJS0jWt5fz581SrVu2afmZlkJmZibu7O05OTvYupVSoD0RE5Ep0BkpE7OrgwYM0b968WHgC8PX1Lbbss88+o0OHDlSrVo2aNWvSrVs3Fi9eXGTM+++/T/PmzXFxcSEwMJCxY8eSkpJSZEyPHj1o0aIFGzdupFu3blSrVo2nnnoKgOzsbJ577jkaNWqEi4sLQUFBPP7442RnZ5don9auXUv//v2pWbMm7u7utGrVqtjZtKVLl9K1a1fc3d3x8vJi8ODB7N69u8iY559/HsMw2LdvH//4xz/w9PTEx8eHZ599FtM0OXbsGIMHD8bDwwN/f39ef/31Iu9ftmwZhmHw5Zdf8tRTT+Hv74+7uzvXX389x44dKzJ25cqV3HzzzdSrV8+2zw8//DAXLlwoMu6uu+6ievXqHDx4kP79+1OjRg2GDx9uW/fHe6C++OILIiIiqFGjBh4eHrRs2bLYsTh06BA333wz3t7eVKtWjcjISH744YdL7svcuXN5+eWXqVu3Lq6urvTu3ZsDBw4UGXv+/Hn27NnDmTNnrvBTKnClPvj2228ZMGAAgYGBuLi40LBhQ1588UXy8/Nt7582bRoODg5F+uv111/HMAwmTJhgW5afn0+NGjV44oknLlvLwIEDadCgwSXXRUVF0a5dO9vr2NhYunTpgpeXF9WrV6dp06a2uv9MSb5Df1TS78SMGTPo1asXvr6+uLi4EBYWxgcffFBse/Xr12fgwIGsWrWKDh064OrqSoMGDfjkk0+KjU1JSWH8+PEEBQXh4uJCo0aNmDJlClartdi4u+66C09PT7y8vBgxYkSx7/2V7Ny5k169euHm5kbdunV56aWXin0GlKwvevTowQ8//MCRI0cwDAPDMGzfjZycHCZNmkRERASenp64u7vTtWtXfvnllxLXKiL2pTNQImJXwcHBxMXFsWPHDlq0aHHFsZMnT+b555+nU6dOvPDCCzg7O7N27VqWLl1K3759gYLQMXnyZKKjo7nvvvvYu3cvH3zwAevXr2f16tVFzpKcPXuWfv36MWzYMP7xj3/g5+eH1Wrl+uuvZ9WqVYwZM4bQ0FC2b9/Om2++yb59+5g/f/4Va4yNjWXgwIEEBATw0EMP4e/vz+7du1mwYAEPPfQQAD///DP9+vWjQYMGPP/881y4cIF33nmHzp07s2nTpmIh5NZbbyU0NJR//etf/PDDD7z00kt4e3vzf//3f/Tq1YspU6Ywe/ZsHn30Udq3b0+3bt2KvP/ll1/GMAyeeOIJkpKSeOutt4iOjmbLli24ubkBMG/ePM6fP899991HrVq1WLduHe+88w7Hjx9n3rx5RbaXl5dHTEwMXbp04bXXXrvs2ZrY2Fhuu+02evfuzZQpUwDYvXs3q1evth2LxMREOnXqxPnz53nwwQepVasWs2bN4vrrr+err77ihhtuKLLNf/3rX1gsFh599FFSU1OZOnUqw4cPZ+3atbYx69ato2fPnjz33HM8//zzV/x5waX7AAomEKhevToTJkygevXqLF26lEmTJpGWlsa///1vALp27YrVamXVqlUMHDgQKAijFouFlStX2j5j8+bNZGRkFPvZXOzWW2/lzjvvZP369bRv3962/MiRI/z666+2z9y5cycDBw6kVatWvPDCC7i4uHDgwAFWr179p/taku/QH13Nd+KDDz6gefPmXH/99Tg6OvL9999z//33Y7VaGTt2bJHtHjhwgJtuuonRo0czYsQIPv74Y+666y4iIiJo3rw5UBCGu3fvzokTJ7j33nupV68ea9asYeLEiZw6dcp2iZxpmgwePJhVq1bxz3/+k9DQUL755htGjBjxp8cEICEhgZ49e5KXl8eTTz6Ju7s7H374oe37cbGS9MXTTz9Namoqx48f58033wSgevXqAKSlpfHf//6X2267jXvuuYf09HQ++ugjYmJiWLduHeHh4SWqWUTsyBQRsaPFixebDg4OpoODgxkVFWU+/vjj5qJFi8ycnJwi4/bv329aLBbzhhtuMPPz84uss1qtpmmaZlJSkuns7Gz27du3yJh3333XBMyPP/7Ytqx79+4mYE6fPr3Itj799FPTYrGYK1euLLJ8+vTpJmCuXr36svuSl5dnhoSEmMHBwea5c+cuWaNpmmZ4eLjp6+trnj171rZs69atpsViMe+8807bsueee84EzDFjxhT5jLp165qGYZj/+te/bMvPnTtnurm5mSNGjLAt++WXX0zArFOnjpmWlmZbPnfuXBMw3377bduy8+fPF9ufV1991TQMwzxy5Iht2YgRI0zAfPLJJ4uNHzFihBkcHGx7/dBDD5keHh5mXl5esbGFxo8fbwJFjnd6eroZEhJi1q9f3/ZzLNyX0NBQMzs72zb27bffNgFz+/btxfb7ueeeu+znFrpcH5jmpY/Jvffea1arVs3MysoyTdM08/PzTQ8PD/Pxxx83TbPg51yrVi3z5ptvNh0cHMz09HTTNE3zjTfeMC0WS7G+uFhqaqrp4uJiPvLII0WWT506tcjP4c033zQB8/Tp03+6fxcryXfINAuOSffu3W2vr+Y7caljFhMTYzZo0KDIsuDgYBMwV6xYYVuWlJRUbP9ffPFF093d3dy3b1+R9z/55JOmg4ODefToUdM0TXP+/PkmYE6dOtU2Ji8vz+zatasJmDNmzLjcYTFN8/c+XLt2bZF6PD09TcCMj4+/4j7+sS9M0zQHDBhQ5PtwcV0X97BpFnx//fz8zFGjRl2xThEpH3QJn4jYVZ8+fYiLi+P6669n69atTJ06lZiYGOrUqcN3331nGzd//nysViuTJk3CYin6q8swDKDgzE5OTg7jx48vMuaee+7Bw8Oj2GVhLi4ujBw5ssiyefPmERoaSrNmzThz5oztv169egFc8TKbzZs3Ex8fz/jx44tdklhY46lTp9iyZQt33XUX3t7etvWtWrWiT58+/Pjjj8W2e/fdd9v+t4ODA+3atcM0TUaPHm1b7uXlRdOmTTl06FCx9995553UqFHD9vqmm24iICCgyGdd/C/tmZmZnDlzhk6dOmGaJps3by62zfvuu++yx+HimjIzM4mNjb3smB9//JEOHTrQpUsX27Lq1aszZswYDh8+zK5du4qMHzlyJM7OzrbXXbt2BSiy3z169MA0zRKdfYJL9wEUPSbp6emcOXOGrl272i4RBLBYLHTq1IkVK1YABWfYzp49y5NPPolpmsTFxQEFZ6VatGhxyUtVC3l4eNCvXz/mzp2LaZq25V9++SWRkZHUq1cPwLaNb7/99pKXmF1OSb5Dl3I134mLj1lqaipnzpyhe/fuHDp0iNTU1CLbDQsLs/38AHx8fIr18Lx58+jatSs1a9Ys8tnR0dHk5+fbjvuPP/6Io6Njkb5
|
|||
|
|
"text/plain": [
|
|||
|
|
"<Figure size 1000x500 with 1 Axes>"
|
|||
|
|
]
|
|||
|
|
},
|
|||
|
|
"metadata": {},
|
|||
|
|
"output_type": "display_data"
|
|||
|
|
}
|
|||
|
|
],
|
|||
|
|
"source": [
|
|||
|
|
"plt.figure(figsize=(10,5))\n",
|
|||
|
|
"\n",
|
|||
|
|
"plt.plot(comparison[\"date\"], comparison[\"raw_scores\"], label=\"Raw\")\n",
|
|||
|
|
"plt.plot(comparison[\"date\"], comparison[\"clean_scores\"], label=\"Clean\")\n",
|
|||
|
|
"\n",
|
|||
|
|
"plt.legend()\n",
|
|||
|
|
"plt.title(\"Score comparison: raw vs cleaned data\")\n",
|
|||
|
|
"plt.grid(alpha=0.3)\n",
|
|||
|
|
"plt.show()"
|
|||
|
|
]
|
|||
|
|
},
|
|||
|
|
{
|
|||
|
|
"cell_type": "code",
|
|||
|
|
"execution_count": 14,
|
|||
|
|
"id": "a002234f-deec-4283-8245-ffac74f1930b",
|
|||
|
|
"metadata": {},
|
|||
|
|
"outputs": [
|
|||
|
|
{
|
|||
|
|
"data": {
|
|||
|
|
"text/plain": [
|
|||
|
|
"[None]"
|
|||
|
|
]
|
|||
|
|
},
|
|||
|
|
"execution_count": 14,
|
|||
|
|
"metadata": {},
|
|||
|
|
"output_type": "execute_result"
|
|||
|
|
}
|
|||
|
|
],
|
|||
|
|
"source": [
|
|||
|
|
"import os\n",
|
|||
|
|
"import s3fs\n",
|
|||
|
|
"os.environ[\"AWS_ACCESS_KEY_ID\"] = 'N0C5PK75FDX2WXI8OVP1'\n",
|
|||
|
|
"os.environ[\"AWS_SECRET_ACCESS_KEY\"] = 'nZvC2urUkG7EvhDsFDyaOslqr160aoWMs+5MP3Ft'\n",
|
|||
|
|
"os.environ[\"AWS_SESSION_TOKEN\"] = 'eyJhbGciOiJIUzUxMiIsInR5cCI6IkpXVCJ9.eyJhY2Nlc3NLZXkiOiJOMEM1UEs3NUZEWDJXWEk4T1ZQMSIsImFjciI6IjAiLCJhbGxvd2VkLW9yaWdpbnMiOlsiKiJdLCJhdWQiOlsibWluaW8iLCJhY2NvdW50Il0sImF1dGhfdGltZSI6MTc3MzIyNzI3OCwiYXpwIjoib255eGlhLW1pbmlvIiwiZW1haWwiOiJzYXJhaC50aG91bXlyZUBlbnNhZS5mciIsImVtYWlsX3ZlcmlmaWVkIjp0cnVlLCJleHAiOjE3NzQ0MzY4OTksImZhbWlseV9uYW1lIjoiVEhPVU1ZUkUiLCJnaXZlbl9uYW1lIjoiU2FyYWgiLCJncm91cHMiOlsiYmRjLWRhdGEiLCJiZGMtY2FybWlnbmFjLWczIl0sImlhdCI6MTc3MzIyNzI5OSwiaXNzIjoiaHR0cHM6Ly9hdXRoLmdyb3VwZS1nZW5lcy5mci9yZWFsbXMvZ2VuZXMiLCJqdGkiOiI5Mjc0ODgyMy04OTgzLTQzYjktYTZhNy0xYjhlNDdiOTRjNTUiLCJuYW1lIjoiU2FyYWggVEhPVU1ZUkUiLCJwb2xpY3kiOiJzdHNvbmx5IiwicHJlZmVycmVkX3VzZXJuYW1lIjoic3Rob3VteXJlLWVuc2FlIiwicmVhbG1fYWNjZXNzIjp7InJvbGVzIjpbIm9mZmxpbmVfYWNjZXNzIiwiZGVmYXVsdC1yb2xlcy1nZW5lcyIsInVtYV9hdXRob3JpemF0aW9uIl19LCJyZXNvdXJjZV9hY2Nlc3MiOnsiYWNjb3VudCI6eyJyb2xlcyI6WyJtYW5hZ2UtYWNjb3VudCIsIm1hbmFnZS1hY2NvdW50LWxpbmtzIiwidmlldy1wcm9maWxlIl19fSwic2NvcGUiOiJvcGVuaWQgcHJvZmlsZSBlbWFpbCIsInNpZCI6IjRkODM3NWVmLTQwY2QtNDYyMi05NzIyLTI4YjhjZTQ2MWQ5YyIsInN1YiI6ImVhYWVkN2QyLWM4MjYtNGIxNC05MzczLTYwYjNhODhlMWFiNiIsInR5cCI6IkJlYXJlciJ9.hl_SekvaH9A22PMb3W0VQBSNO67LnaneIuLC-X5XBnzOO5GLV61aocDRfYC6hvVVhdzyewSTtD2kvdyJdeu6qA'\n",
|
|||
|
|
"os.environ[\"AWS_DEFAULT_REGION\"] = 'us-east-1'\n",
|
|||
|
|
"fs = s3fs.S3FileSystem(\n",
|
|||
|
|
" client_kwargs={'endpoint_url': 'https://'+'minio-simple.lab.groupe-genes.fr'},\n",
|
|||
|
|
" key = os.environ[\"AWS_ACCESS_KEY_ID\"], \n",
|
|||
|
|
" secret = os.environ[\"AWS_SECRET_ACCESS_KEY\"], \n",
|
|||
|
|
" token = os.environ[\"AWS_SESSION_TOKEN\"])\n",
|
|||
|
|
"\n",
|
|||
|
|
"# 3️⃣ Upload du CSV\n",
|
|||
|
|
"local_file = \"stock_repaired.csv\"\n",
|
|||
|
|
"s3_path = \"projet-bdc-carmignac-g3\"\n",
|
|||
|
|
"\n",
|
|||
|
|
"fs.put(local_file, s3_path)"
|
|||
|
|
]
|
|||
|
|
},
|
|||
|
|
{
|
|||
|
|
"cell_type": "code",
|
|||
|
|
"execution_count": 16,
|
|||
|
|
"id": "eeb8f32c-c717-4d48-85c5-248661b5a945",
|
|||
|
|
"metadata": {},
|
|||
|
|
"outputs": [
|
|||
|
|
{
|
|||
|
|
"name": "stdout",
|
|||
|
|
"output_type": "stream",
|
|||
|
|
"text": [
|
|||
|
|
"\n",
|
|||
|
|
"====================================\n",
|
|||
|
|
"RUNNING COHERENCE SCORE ON: raw\n",
|
|||
|
|
"====================================\n",
|
|||
|
|
"GLOBAL SCORE: 0.756699778421513\n",
|
|||
|
|
"\n",
|
|||
|
|
"====================================\n",
|
|||
|
|
"RUNNING COHERENCE SCORE ON: clean\n",
|
|||
|
|
"====================================\n"
|
|||
|
|
]
|
|||
|
|
},
|
|||
|
|
{
|
|||
|
|
"ename": "KeyError",
|
|||
|
|
"evalue": "'Centralisation Date'",
|
|||
|
|
"output_type": "error",
|
|||
|
|
"traceback": [
|
|||
|
|
"\u001b[31m---------------------------------------------------------------------------\u001b[39m",
|
|||
|
|
"\u001b[31mKeyError\u001b[39m Traceback (most recent call last)",
|
|||
|
|
"\u001b[36mFile \u001b[39m\u001b[32m/opt/python/lib/python3.13/site-packages/pandas/core/indexes/base.py:3812\u001b[39m, in \u001b[36mIndex.get_loc\u001b[39m\u001b[34m(self, key)\u001b[39m\n\u001b[32m 3811\u001b[39m \u001b[38;5;28;01mtry\u001b[39;00m:\n\u001b[32m-> \u001b[39m\u001b[32m3812\u001b[39m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28;43mself\u001b[39;49m\u001b[43m.\u001b[49m\u001b[43m_engine\u001b[49m\u001b[43m.\u001b[49m\u001b[43mget_loc\u001b[49m\u001b[43m(\u001b[49m\u001b[43mcasted_key\u001b[49m\u001b[43m)\u001b[49m\n\u001b[32m 3813\u001b[39m \u001b[38;5;28;01mexcept\u001b[39;00m \u001b[38;5;167;01mKeyError\u001b[39;00m \u001b[38;5;28;01mas\u001b[39;00m err:\n",
|
|||
|
|
"\u001b[36mFile \u001b[39m\u001b[32mpandas/_libs/index.pyx:167\u001b[39m, in \u001b[36mpandas._libs.index.IndexEngine.get_loc\u001b[39m\u001b[34m()\u001b[39m\n",
|
|||
|
|
"\u001b[36mFile \u001b[39m\u001b[32mpandas/_libs/index.pyx:196\u001b[39m, in \u001b[36mpandas._libs.index.IndexEngine.get_loc\u001b[39m\u001b[34m()\u001b[39m\n",
|
|||
|
|
"\u001b[36mFile \u001b[39m\u001b[32mpandas/_libs/hashtable_class_helper.pxi:7088\u001b[39m, in \u001b[36mpandas._libs.hashtable.PyObjectHashTable.get_item\u001b[39m\u001b[34m()\u001b[39m\n",
|
|||
|
|
"\u001b[36mFile \u001b[39m\u001b[32mpandas/_libs/hashtable_class_helper.pxi:7096\u001b[39m, in \u001b[36mpandas._libs.hashtable.PyObjectHashTable.get_item\u001b[39m\u001b[34m()\u001b[39m\n",
|
|||
|
|
"\u001b[31mKeyError\u001b[39m: 'Centralisation Date'",
|
|||
|
|
"\nThe above exception was the direct cause of the following exception:\n",
|
|||
|
|
"\u001b[31mKeyError\u001b[39m Traceback (most recent call last)",
|
|||
|
|
"\u001b[36mCell\u001b[39m\u001b[36m \u001b[39m\u001b[32mIn[16]\u001b[39m\u001b[32m, line 45\u001b[39m\n\u001b[32m 39\u001b[39m \u001b[38;5;66;03m# --------------------------------------------------------\u001b[39;00m\n\u001b[32m 40\u001b[39m \u001b[38;5;66;03m# LOAD STOCKS\u001b[39;00m\n\u001b[32m 41\u001b[39m \u001b[38;5;66;03m# --------------------------------------------------------\u001b[39;00m\n\u001b[32m 43\u001b[39m stocks = pd.read_csv(stock_file, low_memory=\u001b[38;5;28;01mFalse\u001b[39;00m)\n\u001b[32m---> \u001b[39m\u001b[32m45\u001b[39m stocks[\u001b[33m\"\u001b[39m\u001b[33mCentralisation Date\u001b[39m\u001b[33m\"\u001b[39m] = pd.to_datetime(\u001b[43mstocks\u001b[49m\u001b[43m[\u001b[49m\u001b[33;43m\"\u001b[39;49m\u001b[33;43mCentralisation Date\u001b[39;49m\u001b[33;43m\"\u001b[39;49m\u001b[43m]\u001b[49m)\n\u001b[32m 46\u001b[39m stocks[\u001b[33m\"\u001b[39m\u001b[33mRegistrar Account - ID\u001b[39m\u001b[33m\"\u001b[39m] = stocks[\u001b[33m\"\u001b[39m\u001b[33mRegistrar Account - ID\u001b[39m\u001b[33m\"\u001b[39m].astype(\u001b[38;5;28mstr\u001b[39m).str.strip()\n\u001b[32m 47\u001b[39m stocks[\u001b[33m\"\u001b[39m\u001b[33mProduct - Isin\u001b[39m\u001b[33m\"\u001b[39m] = stocks[\u001b[33m\"\u001b[39m\u001b[33mProduct - Isin\u001b[39m\u001b[33m\"\u001b[39m].astype(\u001b[38;5;28mstr\u001b[39m)\n",
|
|||
|
|
"\u001b[36mFile \u001b[39m\u001b[32m/opt/python/lib/python3.13/site-packages/pandas/core/frame.py:4113\u001b[39m, in \u001b[36mDataFrame.__getitem__\u001b[39m\u001b[34m(self, key)\u001b[39m\n\u001b[32m 4111\u001b[39m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28mself\u001b[39m.columns.nlevels > \u001b[32m1\u001b[39m:\n\u001b[32m 4112\u001b[39m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28mself\u001b[39m._getitem_multilevel(key)\n\u001b[32m-> \u001b[39m\u001b[32m4113\u001b[39m indexer = \u001b[38;5;28;43mself\u001b[39;49m\u001b[43m.\u001b[49m\u001b[43mcolumns\u001b[49m\u001b[43m.\u001b[49m\u001b[43mget_loc\u001b[49m\u001b[43m(\u001b[49m\u001b[43mkey\u001b[49m\u001b[43m)\u001b[49m\n\u001b[32m 4114\u001b[39m \u001b[38;5;28;01mif\u001b[39;00m is_integer(indexer):\n\u001b[32m 4115\u001b[39m indexer = [indexer]\n",
|
|||
|
|
"\u001b[36mFile \u001b[39m\u001b[32m/opt/python/lib/python3.13/site-packages/pandas/core/indexes/base.py:3819\u001b[39m, in \u001b[36mIndex.get_loc\u001b[39m\u001b[34m(self, key)\u001b[39m\n\u001b[32m 3814\u001b[39m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28misinstance\u001b[39m(casted_key, \u001b[38;5;28mslice\u001b[39m) \u001b[38;5;129;01mor\u001b[39;00m (\n\u001b[32m 3815\u001b[39m \u001b[38;5;28misinstance\u001b[39m(casted_key, abc.Iterable)\n\u001b[32m 3816\u001b[39m \u001b[38;5;129;01mand\u001b[39;00m \u001b[38;5;28many\u001b[39m(\u001b[38;5;28misinstance\u001b[39m(x, \u001b[38;5;28mslice\u001b[39m) \u001b[38;5;28;01mfor\u001b[39;00m x \u001b[38;5;129;01min\u001b[39;00m casted_key)\n\u001b[32m 3817\u001b[39m ):\n\u001b[32m 3818\u001b[39m \u001b[38;5;28;01mraise\u001b[39;00m InvalidIndexError(key)\n\u001b[32m-> \u001b[39m\u001b[32m3819\u001b[39m \u001b[38;5;28;01mraise\u001b[39;00m \u001b[38;5;167;01mKeyError\u001b[39;00m(key) \u001b[38;5;28;01mfrom\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[34;01merr\u001b[39;00m\n\u001b[32m 3820\u001b[39m \u001b[38;5;28;01mexcept\u001b[39;00m \u001b[38;5;167;01mTypeError\u001b[39;00m:\n\u001b[32m 3821\u001b[39m \u001b[38;5;66;03m# If we have a listlike key, _check_indexing_error will raise\u001b[39;00m\n\u001b[32m 3822\u001b[39m \u001b[38;5;66;03m# InvalidIndexError. Otherwise we fall through and re-raise\u001b[39;00m\n\u001b[32m 3823\u001b[39m \u001b[38;5;66;03m# the TypeError.\u001b[39;00m\n\u001b[32m 3824\u001b[39m \u001b[38;5;28mself\u001b[39m._check_indexing_error(key)\n",
|
|||
|
|
"\u001b[31mKeyError\u001b[39m: 'Centralisation Date'"
|
|||
|
|
]
|
|||
|
|
}
|
|||
|
|
],
|
|||
|
|
"source": [
|
|||
|
|
"# ============================================================\n",
|
|||
|
|
"# DATASETS\n",
|
|||
|
|
"# ============================================================\n",
|
|||
|
|
"\n",
|
|||
|
|
"DATASETS = {\n",
|
|||
|
|
" \"raw\": \"stocks.csv\",\n",
|
|||
|
|
" \"clean\": \"stock_repaired.csv\"\n",
|
|||
|
|
"}\n",
|
|||
|
|
"\n",
|
|||
|
|
"flows_file = \"flows.csv\"\n",
|
|||
|
|
"\n",
|
|||
|
|
"results = {}\n",
|
|||
|
|
"\n",
|
|||
|
|
"# ============================================================\n",
|
|||
|
|
"# LOAD FLOWS\n",
|
|||
|
|
"# ============================================================\n",
|
|||
|
|
"\n",
|
|||
|
|
"flows = pd.read_csv(flows_file, low_memory=False)\n",
|
|||
|
|
"\n",
|
|||
|
|
"flows[\"Centralisation Date\"] = pd.to_datetime(flows[\"Centralisation Date\"])\n",
|
|||
|
|
"flows[\"Registrar Account - ID\"] = flows[\"Registrar Account - ID\"].astype(str).str.strip()\n",
|
|||
|
|
"flows[\"Product - Isin\"] = flows[\"Product - Isin\"].astype(str)\n",
|
|||
|
|
"\n",
|
|||
|
|
"flows[\"Quantity - NetFlows\"] = flows[\"Quantity - NetFlows\"].fillna(0)\n",
|
|||
|
|
"\n",
|
|||
|
|
"# ============================================================\n",
|
|||
|
|
"# LOOP OVER DATASETS\n",
|
|||
|
|
"# ============================================================\n",
|
|||
|
|
"\n",
|
|||
|
|
"for name, stock_file in DATASETS.items():\n",
|
|||
|
|
"\n",
|
|||
|
|
" print(\"\\n====================================\")\n",
|
|||
|
|
" print(\"RUNNING COHERENCE SCORE ON:\", name)\n",
|
|||
|
|
" print(\"====================================\")\n",
|
|||
|
|
"\n",
|
|||
|
|
" # --------------------------------------------------------\n",
|
|||
|
|
" # LOAD STOCKS\n",
|
|||
|
|
" # --------------------------------------------------------\n",
|
|||
|
|
"\n",
|
|||
|
|
" stocks = pd.read_csv(stock_file, low_memory=False)\n",
|
|||
|
|
"\n",
|
|||
|
|
" stocks[\"Centralisation Date\"] = pd.to_datetime(stocks[\"Centralisation Date\"])\n",
|
|||
|
|
" stocks[\"Registrar Account - ID\"] = stocks[\"Registrar Account - ID\"].astype(str).str.strip()\n",
|
|||
|
|
" stocks[\"Product - Isin\"] = stocks[\"Product - Isin\"].astype(str)\n",
|
|||
|
|
"\n",
|
|||
|
|
" # --------------------------------------------------------\n",
|
|||
|
|
" # MERGE FLOWS\n",
|
|||
|
|
" # --------------------------------------------------------\n",
|
|||
|
|
"\n",
|
|||
|
|
" df = stocks.merge(\n",
|
|||
|
|
" flows,\n",
|
|||
|
|
" on=[\"Registrar Account - ID\",\"Product - Isin\",\"Centralisation Date\"],\n",
|
|||
|
|
" how=\"left\"\n",
|
|||
|
|
" )\n",
|
|||
|
|
"\n",
|
|||
|
|
" df[\"Quantity - NetFlows\"] = df[\"Quantity - NetFlows\"].fillna(0)\n",
|
|||
|
|
"\n",
|
|||
|
|
" df = df.sort_values(\n",
|
|||
|
|
" [\"Registrar Account - ID\",\"Product - Isin\",\"Centralisation Date\"]\n",
|
|||
|
|
" )\n",
|
|||
|
|
"\n",
|
|||
|
|
" # --------------------------------------------------------\n",
|
|||
|
|
" # ACCOUNTING RELATION\n",
|
|||
|
|
" # --------------------------------------------------------\n",
|
|||
|
|
"\n",
|
|||
|
|
" df[\"prev_aum\"] = (\n",
|
|||
|
|
" df.groupby([\"Registrar Account - ID\",\"Product - Isin\"])\n",
|
|||
|
|
" [\"Quantity - AUM\"]\n",
|
|||
|
|
" .shift(1)\n",
|
|||
|
|
" )\n",
|
|||
|
|
"\n",
|
|||
|
|
" df[\"flow_lag\"] = (\n",
|
|||
|
|
" df.groupby([\"Registrar Account - ID\",\"Product - Isin\"])\n",
|
|||
|
|
" [\"Quantity - NetFlows\"]\n",
|
|||
|
|
" .shift(1)\n",
|
|||
|
|
" ).fillna(0)\n",
|
|||
|
|
"\n",
|
|||
|
|
" df[\"expected_aum\"] = df[\"prev_aum\"] + df[\"flow_lag\"]\n",
|
|||
|
|
"\n",
|
|||
|
|
" df[\"error\"] = df[\"Quantity - AUM\"] - df[\"expected_aum\"]\n",
|
|||
|
|
"\n",
|
|||
|
|
" # --------------------------------------------------------\n",
|
|||
|
|
" # NORMALIZED ERROR\n",
|
|||
|
|
" # --------------------------------------------------------\n",
|
|||
|
|
"\n",
|
|||
|
|
" df[\"scale\"] = df[\"prev_aum\"].abs().clip(lower=1)\n",
|
|||
|
|
"\n",
|
|||
|
|
" df[\"normalized_error\"] = df[\"error\"].abs() / df[\"scale\"]\n",
|
|||
|
|
"\n",
|
|||
|
|
" # --------------------------------------------------------\n",
|
|||
|
|
" # OBSERVATION SCORE\n",
|
|||
|
|
" # --------------------------------------------------------\n",
|
|||
|
|
"\n",
|
|||
|
|
" df[\"score_obs\"] = np.exp(-5 * df[\"normalized_error\"])\n",
|
|||
|
|
"\n",
|
|||
|
|
" # --------------------------------------------------------\n",
|
|||
|
|
" # ACCOUNT SCORE\n",
|
|||
|
|
" # --------------------------------------------------------\n",
|
|||
|
|
"\n",
|
|||
|
|
" account_score = (\n",
|
|||
|
|
" df.groupby(\"Registrar Account - ID\")[\"score_obs\"]\n",
|
|||
|
|
" .mean()\n",
|
|||
|
|
" )\n",
|
|||
|
|
"\n",
|
|||
|
|
" # --------------------------------------------------------\n",
|
|||
|
|
" # ACCOUNT WEIGHTS (31/10/2025)\n",
|
|||
|
|
" # --------------------------------------------------------\n",
|
|||
|
|
"\n",
|
|||
|
|
" last_date = stocks[\"Centralisation Date\"].max()\n",
|
|||
|
|
"\n",
|
|||
|
|
" aum_last = (\n",
|
|||
|
|
" stocks[stocks[\"Centralisation Date\"] == last_date]\n",
|
|||
|
|
" .groupby(\"Registrar Account - ID\")[\"Quantity - AUM\"]\n",
|
|||
|
|
" .sum()\n",
|
|||
|
|
" )\n",
|
|||
|
|
"\n",
|
|||
|
|
" weights = aum_last / aum_last.sum()\n",
|
|||
|
|
"\n",
|
|||
|
|
" # --------------------------------------------------------\n",
|
|||
|
|
" # ALIGN\n",
|
|||
|
|
" # --------------------------------------------------------\n",
|
|||
|
|
"\n",
|
|||
|
|
" combined = pd.concat([account_score, weights], axis=1)\n",
|
|||
|
|
" combined.columns = [\"score\", \"weight\"]\n",
|
|||
|
|
"\n",
|
|||
|
|
" combined = combined.fillna(0)\n",
|
|||
|
|
"\n",
|
|||
|
|
" # --------------------------------------------------------\n",
|
|||
|
|
" # GLOBAL SCORE\n",
|
|||
|
|
" # --------------------------------------------------------\n",
|
|||
|
|
"\n",
|
|||
|
|
" combined[\"weighted_score\"] = combined[\"score\"] * combined[\"weight\"]\n",
|
|||
|
|
"\n",
|
|||
|
|
" global_score = combined[\"weighted_score\"].sum()\n",
|
|||
|
|
"\n",
|
|||
|
|
" print(\"GLOBAL SCORE:\", global_score)\n",
|
|||
|
|
"\n",
|
|||
|
|
" results[name] = {\n",
|
|||
|
|
" \"score\": global_score,\n",
|
|||
|
|
" \"details\": combined\n",
|
|||
|
|
" }\n",
|
|||
|
|
"\n",
|
|||
|
|
"# ============================================================\n",
|
|||
|
|
"# COMPARISON\n",
|
|||
|
|
"# ============================================================\n",
|
|||
|
|
"\n",
|
|||
|
|
"print(\"\\n====================================\")\n",
|
|||
|
|
"print(\"RAW VS CLEAN COMPARISON\")\n",
|
|||
|
|
"print(\"====================================\")\n",
|
|||
|
|
"\n",
|
|||
|
|
"raw_score = results[\"raw\"][\"score\"]\n",
|
|||
|
|
"clean_score = results[\"clean\"][\"score\"]\n",
|
|||
|
|
"\n",
|
|||
|
|
"improvement = clean_score - raw_score\n",
|
|||
|
|
"relative = improvement / raw_score\n",
|
|||
|
|
"\n",
|
|||
|
|
"print(\"RAW SCORE :\", raw_score)\n",
|
|||
|
|
"print(\"CLEAN SCORE :\", clean_score)\n",
|
|||
|
|
"print(\"IMPROVEMENT :\", improvement)\n",
|
|||
|
|
"print(\"RELATIVE IMPROVEMENT :\", relative)"
|
|||
|
|
]
|
|||
|
|
},
|
|||
|
|
{
|
|||
|
|
"cell_type": "code",
|
|||
|
|
"execution_count": null,
|
|||
|
|
"id": "4ce0bfea-7714-4fd7-ba29-17ce8d976ab6",
|
|||
|
|
"metadata": {},
|
|||
|
|
"outputs": [],
|
|||
|
|
"source": [
|
|||
|
|
"import pandas as pd\n",
|
|||
|
|
"import numpy as np\n",
|
|||
|
|
"\n",
|
|||
|
|
"# ============================================================\n",
|
|||
|
|
"# DATASETS\n",
|
|||
|
|
"# ============================================================\n",
|
|||
|
|
"\n",
|
|||
|
|
"DATASETS = {\n",
|
|||
|
|
" \"clean\": \"stock_repaired.csv\"\n",
|
|||
|
|
"}\n",
|
|||
|
|
"\n",
|
|||
|
|
"flows_file = \"flows.csv\"\n",
|
|||
|
|
"\n",
|
|||
|
|
"results = {}\n",
|
|||
|
|
"\n",
|
|||
|
|
"# ============================================================\n",
|
|||
|
|
"# LOAD FLOWS\n",
|
|||
|
|
"# ============================================================\n",
|
|||
|
|
"\n",
|
|||
|
|
"flows = pd.read_csv(flows_file, sep=\";\", low_memory=False)\n",
|
|||
|
|
"\n",
|
|||
|
|
"flows[\"Centralisation Date\"] = pd.to_datetime(flows[\"Centralisation Date\"])\n",
|
|||
|
|
"flows[\"Registrar Account - ID\"] = flows[\"Registrar Account - ID\"].astype(str).str.strip()\n",
|
|||
|
|
"flows[\"Product - Isin\"] = flows[\"Product - Isin\"].astype(str)\n",
|
|||
|
|
"\n",
|
|||
|
|
"flows[\"Quantity - NetFlows\"] = flows[\"Quantity - NetFlows\"].fillna(0)\n",
|
|||
|
|
"\n",
|
|||
|
|
"# ============================================================\n",
|
|||
|
|
"# LOOP OVER DATASETS\n",
|
|||
|
|
"# ============================================================\n",
|
|||
|
|
"\n",
|
|||
|
|
"for name, stock_file in DATASETS.items():\n",
|
|||
|
|
"\n",
|
|||
|
|
" print(\"\\n====================================\")\n",
|
|||
|
|
" print(\"RUNNING COHERENCE SCORE ON:\", name)\n",
|
|||
|
|
" print(\"====================================\")\n",
|
|||
|
|
"\n",
|
|||
|
|
" # --------------------------------------------------------\n",
|
|||
|
|
" # LOAD STOCKS\n",
|
|||
|
|
" # --------------------------------------------------------\n",
|
|||
|
|
"\n",
|
|||
|
|
" stocks = pd.read_csv(stock_file, sep=\";\", low_memory=False)\n",
|
|||
|
|
"\n",
|
|||
|
|
" stocks[\"Centralisation Date\"] = pd.to_datetime(stocks[\"Centralisation Date\"])\n",
|
|||
|
|
" stocks[\"Registrar Account - ID\"] = stocks[\"Registrar Account - ID\"].astype(str).str.strip()\n",
|
|||
|
|
" stocks[\"Product - Isin\"] = stocks[\"Product - Isin\"].astype(str)\n",
|
|||
|
|
"\n",
|
|||
|
|
" # --------------------------------------------------------\n",
|
|||
|
|
" # MERGE FLOWS\n",
|
|||
|
|
" # --------------------------------------------------------\n",
|
|||
|
|
"\n",
|
|||
|
|
" df = stocks.merge(\n",
|
|||
|
|
" flows,\n",
|
|||
|
|
" on=[\"Registrar Account - ID\",\"Product - Isin\",\"Centralisation Date\"],\n",
|
|||
|
|
" how=\"left\"\n",
|
|||
|
|
" )\n",
|
|||
|
|
"\n",
|
|||
|
|
" df[\"Quantity - NetFlows\"] = df[\"Quantity - NetFlows\"].fillna(0)\n",
|
|||
|
|
"\n",
|
|||
|
|
" df = df.sort_values(\n",
|
|||
|
|
" [\"Registrar Account - ID\",\"Product - Isin\",\"Centralisation Date\"]\n",
|
|||
|
|
" )\n",
|
|||
|
|
"\n",
|
|||
|
|
" # --------------------------------------------------------\n",
|
|||
|
|
" # ACCOUNTING RELATION\n",
|
|||
|
|
" # --------------------------------------------------------\n",
|
|||
|
|
"\n",
|
|||
|
|
" df[\"prev_aum\"] = (\n",
|
|||
|
|
" df.groupby([\"Registrar Account - ID\",\"Product - Isin\"])\n",
|
|||
|
|
" [\"Quantity - AUM\"]\n",
|
|||
|
|
" .shift(1)\n",
|
|||
|
|
" )\n",
|
|||
|
|
"\n",
|
|||
|
|
" df[\"flow_lag\"] = (\n",
|
|||
|
|
" df.groupby([\"Registrar Account - ID\",\"Product - Isin\"])\n",
|
|||
|
|
" [\"Quantity - NetFlows\"]\n",
|
|||
|
|
" .shift(1)\n",
|
|||
|
|
" ).fillna(0)\n",
|
|||
|
|
"\n",
|
|||
|
|
" df[\"expected_aum\"] = df[\"prev_aum\"] + df[\"flow_lag\"]\n",
|
|||
|
|
"\n",
|
|||
|
|
" df[\"error\"] = df[\"Quantity - AUM\"] - df[\"expected_aum\"]\n",
|
|||
|
|
"\n",
|
|||
|
|
" # --------------------------------------------------------\n",
|
|||
|
|
" # NORMALIZED ERROR\n",
|
|||
|
|
" # --------------------------------------------------------\n",
|
|||
|
|
"\n",
|
|||
|
|
" df[\"scale\"] = df[\"prev_aum\"].abs().clip(lower=1)\n",
|
|||
|
|
"\n",
|
|||
|
|
" df[\"normalized_error\"] = df[\"error\"].abs() / df[\"scale\"]\n",
|
|||
|
|
"\n",
|
|||
|
|
" # --------------------------------------------------------\n",
|
|||
|
|
" # OBSERVATION SCORE\n",
|
|||
|
|
" # --------------------------------------------------------\n",
|
|||
|
|
"\n",
|
|||
|
|
" df[\"score_obs\"] = np.exp(-5 * df[\"normalized_error\"])\n",
|
|||
|
|
"\n",
|
|||
|
|
" # --------------------------------------------------------\n",
|
|||
|
|
" # ACCOUNT SCORE\n",
|
|||
|
|
" # --------------------------------------------------------\n",
|
|||
|
|
"\n",
|
|||
|
|
" account_score = (\n",
|
|||
|
|
" df.groupby(\"Registrar Account - ID\")[\"score_obs\"]\n",
|
|||
|
|
" .mean()\n",
|
|||
|
|
" )\n",
|
|||
|
|
"\n",
|
|||
|
|
" # --------------------------------------------------------\n",
|
|||
|
|
" # ACCOUNT WEIGHTS (31/10/2025)\n",
|
|||
|
|
" # --------------------------------------------------------\n",
|
|||
|
|
"\n",
|
|||
|
|
" last_date = stocks[\"Centralisation Date\"].max()\n",
|
|||
|
|
"\n",
|
|||
|
|
" aum_last = (\n",
|
|||
|
|
" stocks[stocks[\"Centralisation Date\"] == last_date]\n",
|
|||
|
|
" .groupby(\"Registrar Account - ID\")[\"Quantity - AUM\"]\n",
|
|||
|
|
" .sum()\n",
|
|||
|
|
" )\n",
|
|||
|
|
"\n",
|
|||
|
|
" weights = aum_last / aum_last.sum()\n",
|
|||
|
|
"\n",
|
|||
|
|
" combined = pd.concat([account_score, weights], axis=1)\n",
|
|||
|
|
" combined.columns = [\"score\", \"weight\"]\n",
|
|||
|
|
" combined = combined.fillna(0)\n",
|
|||
|
|
"\n",
|
|||
|
|
" combined[\"weighted_score\"] = combined[\"score\"] * combined[\"weight\"]\n",
|
|||
|
|
"\n",
|
|||
|
|
" global_score = combined[\"weighted_score\"].sum()\n",
|
|||
|
|
"\n",
|
|||
|
|
" print(\"GLOBAL SCORE:\", global_score)\n",
|
|||
|
|
"\n",
|
|||
|
|
" results[name] = global_score\n",
|
|||
|
|
"\n",
|
|||
|
|
"# ============================================================\n",
|
|||
|
|
"# COMPARISON\n",
|
|||
|
|
"# ============================================================\n",
|
|||
|
|
"\n",
|
|||
|
|
"print(\"\\n====================================\")\n",
|
|||
|
|
"print(\"RAW VS CLEAN COMPARISON\")\n",
|
|||
|
|
"print(\"====================================\")\n",
|
|||
|
|
"\n",
|
|||
|
|
"raw_score = results[\"raw\"]\n",
|
|||
|
|
"clean_score = results[\"clean\"]\n",
|
|||
|
|
"\n",
|
|||
|
|
"improvement = clean_score - raw_score\n",
|
|||
|
|
"relative = improvement / raw_score\n",
|
|||
|
|
"\n",
|
|||
|
|
"print(\"RAW SCORE :\", raw_score)\n",
|
|||
|
|
"print(\"CLEAN SCORE :\", clean_score)\n",
|
|||
|
|
"print(\"IMPROVEMENT :\", improvement)\n",
|
|||
|
|
"print(\"RELATIVE IMPROVEMENT :\", relative)"
|
|||
|
|
]
|
|||
|
|
}
|
|||
|
|
],
|
|||
|
|
"metadata": {
|
|||
|
|
"kernelspec": {
|
|||
|
|
"display_name": "Python 3 (ipykernel)",
|
|||
|
|
"language": "python",
|
|||
|
|
"name": "python3"
|
|||
|
|
},
|
|||
|
|
"language_info": {
|
|||
|
|
"codemirror_mode": {
|
|||
|
|
"name": "ipython",
|
|||
|
|
"version": 3
|
|||
|
|
},
|
|||
|
|
"file_extension": ".py",
|
|||
|
|
"mimetype": "text/x-python",
|
|||
|
|
"name": "python",
|
|||
|
|
"nbconvert_exporter": "python",
|
|||
|
|
"pygments_lexer": "ipython3",
|
|||
|
|
"version": "3.13.11"
|
|||
|
|
}
|
|||
|
|
},
|
|||
|
|
"nbformat": 4,
|
|||
|
|
"nbformat_minor": 5
|
|||
|
|
}
|