Project_Carmignac/dataloader.ipynb
2026-01-30 15:35:28 +00:00

1392 lines
67 KiB
Plaintext
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"id": "2e8cf88b-cecf-409f-9c2d-c3762b233f05",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Requirement already satisfied: openpyxl in /opt/python/lib/python3.13/site-packages (3.1.5)\n",
"Requirement already satisfied: et-xmlfile in /opt/python/lib/python3.13/site-packages (from openpyxl) (2.0.0)\n",
"\n",
"\u001b[1m[\u001b[0m\u001b[34;49mnotice\u001b[0m\u001b[1;39;49m]\u001b[0m\u001b[39;49m A new release of pip is available: \u001b[0m\u001b[31;49m25.2\u001b[0m\u001b[39;49m -> \u001b[0m\u001b[32;49m25.3\u001b[0m\n",
"\u001b[1m[\u001b[0m\u001b[34;49mnotice\u001b[0m\u001b[1;39;49m]\u001b[0m\u001b[39;49m To update, run: \u001b[0m\u001b[32;49mpip install --upgrade pip\u001b[0m\n"
]
}
],
"source": [
"# %pip installs into the environment of the running kernel; the version is pinned so that\n",
"# re-running the notebook later resolves the same openpyxl release.\n",
"%pip install openpyxl==3.1.5"
]
},
{
"cell_type": "code",
"execution_count": 2,
"id": "126c8a80-d9ad-4816-84f0-0c3d580f62c8",
"metadata": {},
"outputs": [],
"source": [
"import pandas as pd\n",
"import numpy as np\n",
"import matplotlib.pyplot as plt"
]
},
{
"cell_type": "code",
"execution_count": 3,
"id": "ff2261fb-9516-4410-b42d-3acc8dc1a460",
"metadata": {},
"outputs": [],
"source": [
"import os\n",
"import s3fs\n",
"\n",
"# S3 credentials are read from the environment and must never be written in the notebook:\n",
"# anything stored in a cell is readable by everyone who can open the file.\n",
"# NOTE(review): credentials that were committed in earlier versions of this cell should be\n",
"# treated as compromised and revoked.\n",
"_S3_CREDENTIAL_VARS = (\"AWS_ACCESS_KEY_ID\", \"AWS_SECRET_ACCESS_KEY\", \"AWS_SESSION_TOKEN\")\n",
"_missing_vars = [name for name in _S3_CREDENTIAL_VARS if not os.environ.get(name)]\n",
"if _missing_vars:\n",
"    # Fail here with a clear message instead of a later 403 from the object store.\n",
"    raise RuntimeError(\n",
"        \"Missing S3 credentials in the environment: \" + \", \".join(_missing_vars)\n",
"    )\n",
"\n",
"os.environ[\"AWS_DEFAULT_REGION\"] = 'us-east-1'\n",
"\n",
"# Filesystem handle used by the loading cells below.\n",
"fs = s3fs.S3FileSystem(\n",
"    client_kwargs={'endpoint_url': 'https://'+'minio-simple.lab.groupe-genes.fr'},\n",
"    key=os.environ[\"AWS_ACCESS_KEY_ID\"],\n",
"    secret=os.environ[\"AWS_SECRET_ACCESS_KEY\"],\n",
"    token=os.environ[\"AWS_SESSION_TOKEN\"])"
]
},
{
"cell_type": "markdown",
"id": "3d36f3f0-bd40-4a83-96d1-b46d75f5a4c5",
"metadata": {},
"source": [
"# Data exploration"
]
},
{
"cell_type": "markdown",
"id": "eaf5c5a0-eb1c-4242-b893-7600e6def109",
"metadata": {},
"source": [
"Fonctions utiles"
]
},
{
"cell_type": "code",
"execution_count": 4,
"id": "60e2035c-c2f0-4c51-97df-102e67ba96ee",
"metadata": {},
"outputs": [],
"source": [
"def plot_account(account_id, isin=None, data=None):\n",
"    \"\"\"\n",
"    Plot the evolution of \"Quantity - AUM\" over time for one Registrar Account.\n",
"\n",
"    Parameters\n",
"    ----------\n",
"    account_id\n",
"        Value compared against the \"Registrar Account - ID\" column.\n",
"    isin : optional\n",
"        When given, only rows whose \"Product - Isin\" equals this value are kept.\n",
"    data : pandas.DataFrame, optional\n",
"        Frame to read from. Defaults to the notebook-level `merged` frame, which must\n",
"        then already exist. It needs the columns \"Registrar Account - ID\",\n",
"        \"Product - Isin\", \"Centralisation Date\" and \"Quantity - AUM\".\n",
"\n",
"    Returns\n",
"    -------\n",
"    None\n",
"        Shows a matplotlib figure, or prints a message when no row matches.\n",
"    \"\"\"\n",
"    frame = merged if data is None else data\n",
"\n",
"    # Build one boolean mask instead of copying the whole frame before filtering.\n",
"    mask = frame[\"Registrar Account - ID\"] == account_id\n",
"    if isin is not None:\n",
"        mask = mask & (frame[\"Product - Isin\"] == isin)\n",
"    selection = frame.loc[mask]\n",
"\n",
"    if selection.empty:\n",
"        print(f\"No data found for account {account_id}\")\n",
"        return\n",
"\n",
"    # One point per date: quantities of all matching rows are summed.\n",
"    per_date = (\n",
"        selection.groupby(\"Centralisation Date\")[\"Quantity - AUM\"]\n",
"        .sum()\n",
"        .reset_index()\n",
"        .sort_values(\"Centralisation Date\")\n",
"    )\n",
"\n",
"    # Plot\n",
"    plt.figure(figsize=(12, 4))\n",
"    plt.plot(per_date[\"Centralisation Date\"], per_date[\"Quantity - AUM\"], marker='o')\n",
"    plt.title(f\"Stock Evolution for Account {account_id}\", fontsize=14)\n",
"    plt.xlabel(\"Date\")\n",
"    plt.ylabel(\"Total AUM\")\n",
"    plt.grid(True)\n",
"    plt.show()\n"
]
},
{
"cell_type": "code",
"execution_count": 5,
"id": "37e008b1-32d4-44be-9d23-1b90a5a26f89",
"metadata": {},
"outputs": [],
"source": [
"# 2. BASIC INSPECTION\n",
"\n",
"def quick_info(df, name):\n",
"    \"\"\"\n",
"    Print a plain-text overview of a DataFrame.\n",
"\n",
"    Prints, in this order: a banner with `name`, the shape, the column list, the dtypes,\n",
"    the percentage of missing values per column (descending), the first 5 rows and the\n",
"    number of distinct values per column (descending). Returns None.\n",
"    \"\"\"\n",
"    banner = \"=\" * 80\n",
"    print(\"\\n\" + banner)\n",
"    print(f\"DATASET : {name}\")\n",
"    print(banner)\n",
"\n",
"    # Each section is computed right before it is printed, in display order.\n",
"    sections = (\n",
"        (\"\\nShape :\", lambda: df.shape),\n",
"        (\"\\nColumns :\", lambda: df.columns.tolist()),\n",
"        (\"\\nDtypes :\\n\", lambda: df.dtypes),\n",
"        (\"\\nMissing values (%) :\\n\", lambda: df.isna().mean().sort_values(ascending=False)*100),\n",
"        (\"\\nSample rows:\\n\", lambda: df.head(5)),\n",
"        (\"\\nUnique values per column:\\n\", lambda: df.nunique().sort_values(ascending=False)),\n",
"    )\n",
"    for label, compute in sections:\n",
"        print(label, compute())"
]
},
{
"cell_type": "code",
"execution_count": 6,
"id": "e104a416-4cfd-43b9-b9ec-6af1fce700da",
"metadata": {},
"outputs": [],
"source": [
"import os\n",
"import s3fs\n",
"\n",
"# (Re)create the S3 filesystem handle from credentials found in the environment.\n",
"# Secrets must not be pasted into this cell: refresh the environment variables and\n",
"# re-run the cell when a session token has expired.\n",
"# NOTE(review): credentials that were committed in earlier versions of this cell should be\n",
"# treated as compromised and revoked.\n",
"_required_s3_vars = (\"AWS_ACCESS_KEY_ID\", \"AWS_SECRET_ACCESS_KEY\", \"AWS_SESSION_TOKEN\")\n",
"_absent_s3_vars = [name for name in _required_s3_vars if not os.environ.get(name)]\n",
"if _absent_s3_vars:\n",
"    raise RuntimeError(\n",
"        \"Missing S3 credentials in the environment: \" + \", \".join(_absent_s3_vars)\n",
"    )\n",
"\n",
"os.environ[\"AWS_DEFAULT_REGION\"] = 'us-east-1'\n",
"fs = s3fs.S3FileSystem(\n",
"    client_kwargs={'endpoint_url': 'https://'+'minio-simple.lab.groupe-genes.fr'},\n",
"    key=os.environ[\"AWS_ACCESS_KEY_ID\"],\n",
"    secret=os.environ[\"AWS_SECRET_ACCESS_KEY\"],\n",
"    token=os.environ[\"AWS_SESSION_TOKEN\"])"
]
},
{
"cell_type": "code",
"execution_count": 8,
"id": "e67a99ea-ddf4-4627-8f48-ec183c671acb",
"metadata": {},
"outputs": [
{
"ename": "PermissionError",
"evalue": "Forbidden",
"output_type": "error",
"traceback": [
"\u001b[31m---------------------------------------------------------------------------\u001b[39m",
"\u001b[31mClientError\u001b[39m Traceback (most recent call last)",
"\u001b[36mFile \u001b[39m\u001b[32m/opt/python/lib/python3.13/site-packages/s3fs/core.py:114\u001b[39m, in \u001b[36m_error_wrapper\u001b[39m\u001b[34m(func, args, kwargs, retries)\u001b[39m\n\u001b[32m 113\u001b[39m \u001b[38;5;28;01mtry\u001b[39;00m:\n\u001b[32m--> \u001b[39m\u001b[32m114\u001b[39m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28;01mawait\u001b[39;00m func(*args, **kwargs)\n\u001b[32m 115\u001b[39m \u001b[38;5;28;01mexcept\u001b[39;00m S3_RETRYABLE_ERRORS \u001b[38;5;28;01mas\u001b[39;00m e:\n",
"\u001b[36mFile \u001b[39m\u001b[32m/opt/python/lib/python3.13/site-packages/aiobotocore/context.py:36\u001b[39m, in \u001b[36mwith_current_context.<locals>.decorator.<locals>.wrapper\u001b[39m\u001b[34m(*args, **kwargs)\u001b[39m\n\u001b[32m 35\u001b[39m \u001b[38;5;28;01mawait\u001b[39;00m resolve_awaitable(hook())\n\u001b[32m---> \u001b[39m\u001b[32m36\u001b[39m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28;01mawait\u001b[39;00m func(*args, **kwargs)\n",
"\u001b[36mFile \u001b[39m\u001b[32m/opt/python/lib/python3.13/site-packages/aiobotocore/client.py:424\u001b[39m, in \u001b[36mAioBaseClient._make_api_call\u001b[39m\u001b[34m(self, operation_name, api_params)\u001b[39m\n\u001b[32m 423\u001b[39m error_class = \u001b[38;5;28mself\u001b[39m.exceptions.from_code(error_code)\n\u001b[32m--> \u001b[39m\u001b[32m424\u001b[39m \u001b[38;5;28;01mraise\u001b[39;00m error_class(parsed_response, operation_name)\n\u001b[32m 425\u001b[39m \u001b[38;5;28;01melse\u001b[39;00m:\n",
"\u001b[31mClientError\u001b[39m: An error occurred (403) when calling the HeadObject operation: Forbidden",
"\nThe above exception was the direct cause of the following exception:\n",
"\u001b[31mPermissionError\u001b[39m Traceback (most recent call last)",
"\u001b[36mCell\u001b[39m\u001b[36m \u001b[39m\u001b[32mIn[8]\u001b[39m\u001b[32m, line 9\u001b[39m\n\u001b[32m 1\u001b[39m \u001b[33;03m\"\"\"\u001b[39;00m\n\u001b[32m 2\u001b[39m \u001b[33;03mwith fs.open(\u001b[39;00m\n\u001b[32m 3\u001b[39m \u001b[33;03m \"projet-bdc-data/carmignac/Flows ENSAE V2 -20251105.csv\",\u001b[39;00m\n\u001b[32m (...)\u001b[39m\u001b[32m 6\u001b[39m \u001b[33;03m flows = pd.read_csv(f, sep=\";\")\u001b[39;00m\n\u001b[32m 7\u001b[39m \u001b[33;03m\"\"\"\u001b[39;00m\n\u001b[32m----> \u001b[39m\u001b[32m9\u001b[39m \u001b[38;5;28;01mwith\u001b[39;00m \u001b[43mfs\u001b[49m\u001b[43m.\u001b[49m\u001b[43mopen\u001b[49m\u001b[43m(\u001b[49m\u001b[33;43m'\u001b[39;49m\u001b[33;43mprojet-bdc-data/carmignac/Monthly AUM and NAV since 2010.xlsx\u001b[39;49m\u001b[33;43m'\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[33;43m'\u001b[39;49m\u001b[33;43mrb\u001b[39;49m\u001b[33;43m'\u001b[39;49m\u001b[43m)\u001b[49m \u001b[38;5;28;01mas\u001b[39;00m f:\n\u001b[32m 10\u001b[39m nav_raw = pd.read_excel(f, header=\u001b[38;5;28;01mNone\u001b[39;00m, engine=\u001b[33m\"\u001b[39m\u001b[33mopenpyxl\u001b[39m\u001b[33m\"\u001b[39m)\n\u001b[32m 11\u001b[39m nav = nav_raw[\u001b[32m0\u001b[39m].str.split(\u001b[33m\"\u001b[39m\u001b[33m,\u001b[39m\u001b[33m\"\u001b[39m, expand=\u001b[38;5;28;01mTrue\u001b[39;00m)\n",
"\u001b[36mFile \u001b[39m\u001b[32m/opt/python/lib/python3.13/site-packages/fsspec/spec.py:1338\u001b[39m, in \u001b[36mAbstractFileSystem.open\u001b[39m\u001b[34m(self, path, mode, block_size, cache_options, compression, **kwargs)\u001b[39m\n\u001b[32m 1336\u001b[39m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[32m 1337\u001b[39m ac = kwargs.pop(\u001b[33m\"\u001b[39m\u001b[33mautocommit\u001b[39m\u001b[33m\"\u001b[39m, \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;28mself\u001b[39m._intrans)\n\u001b[32m-> \u001b[39m\u001b[32m1338\u001b[39m f = \u001b[38;5;28;43mself\u001b[39;49m\u001b[43m.\u001b[49m\u001b[43m_open\u001b[49m\u001b[43m(\u001b[49m\n\u001b[32m 1339\u001b[39m \u001b[43m \u001b[49m\u001b[43mpath\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 1340\u001b[39m \u001b[43m \u001b[49m\u001b[43mmode\u001b[49m\u001b[43m=\u001b[49m\u001b[43mmode\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 1341\u001b[39m \u001b[43m \u001b[49m\u001b[43mblock_size\u001b[49m\u001b[43m=\u001b[49m\u001b[43mblock_size\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 1342\u001b[39m \u001b[43m \u001b[49m\u001b[43mautocommit\u001b[49m\u001b[43m=\u001b[49m\u001b[43mac\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 1343\u001b[39m \u001b[43m \u001b[49m\u001b[43mcache_options\u001b[49m\u001b[43m=\u001b[49m\u001b[43mcache_options\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 1344\u001b[39m \u001b[43m \u001b[49m\u001b[43m*\u001b[49m\u001b[43m*\u001b[49m\u001b[43mkwargs\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 1345\u001b[39m \u001b[43m \u001b[49m\u001b[43m)\u001b[49m\n\u001b[32m 1346\u001b[39m \u001b[38;5;28;01mif\u001b[39;00m compression \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m:\n\u001b[32m 1347\u001b[39m \u001b[38;5;28;01mfrom\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[34;01mfsspec\u001b[39;00m\u001b[34;01m.\u001b[39;00m\u001b[34;01mcompression\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[38;5;28;01mimport\u001b[39;00m compr\n",
"\u001b[36mFile \u001b[39m\u001b[32m/opt/python/lib/python3.13/site-packages/s3fs/core.py:720\u001b[39m, in \u001b[36mS3FileSystem._open\u001b[39m\u001b[34m(self, path, mode, block_size, acl, version_id, fill_cache, cache_type, autocommit, size, requester_pays, cache_options, **kwargs)\u001b[39m\n\u001b[32m 717\u001b[39m \u001b[38;5;28;01mif\u001b[39;00m cache_type \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m:\n\u001b[32m 718\u001b[39m cache_type = \u001b[38;5;28mself\u001b[39m.default_cache_type\n\u001b[32m--> \u001b[39m\u001b[32m720\u001b[39m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43mS3File\u001b[49m\u001b[43m(\u001b[49m\n\u001b[32m 721\u001b[39m \u001b[43m \u001b[49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[43m,\u001b[49m\n\u001b[32m 722\u001b[39m \u001b[43m \u001b[49m\u001b[43mpath\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 723\u001b[39m \u001b[43m \u001b[49m\u001b[43mmode\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 724\u001b[39m \u001b[43m \u001b[49m\u001b[43mblock_size\u001b[49m\u001b[43m=\u001b[49m\u001b[43mblock_size\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 725\u001b[39m \u001b[43m \u001b[49m\u001b[43macl\u001b[49m\u001b[43m=\u001b[49m\u001b[43macl\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 726\u001b[39m \u001b[43m \u001b[49m\u001b[43mversion_id\u001b[49m\u001b[43m=\u001b[49m\u001b[43mversion_id\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 727\u001b[39m \u001b[43m \u001b[49m\u001b[43mfill_cache\u001b[49m\u001b[43m=\u001b[49m\u001b[43mfill_cache\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 728\u001b[39m \u001b[43m \u001b[49m\u001b[43ms3_additional_kwargs\u001b[49m\u001b[43m=\u001b[49m\u001b[43mkw\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 729\u001b[39m \u001b[43m \u001b[49m\u001b[43mcache_type\u001b[49m\u001b[43m=\u001b[49m\u001b[43mcache_type\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 730\u001b[39m \u001b[43m 
\u001b[49m\u001b[43mautocommit\u001b[49m\u001b[43m=\u001b[49m\u001b[43mautocommit\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 731\u001b[39m \u001b[43m \u001b[49m\u001b[43mrequester_pays\u001b[49m\u001b[43m=\u001b[49m\u001b[43mrequester_pays\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 732\u001b[39m \u001b[43m \u001b[49m\u001b[43mcache_options\u001b[49m\u001b[43m=\u001b[49m\u001b[43mcache_options\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 733\u001b[39m \u001b[43m \u001b[49m\u001b[43msize\u001b[49m\u001b[43m=\u001b[49m\u001b[43msize\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 734\u001b[39m \u001b[43m\u001b[49m\u001b[43m)\u001b[49m\n",
"\u001b[36mFile \u001b[39m\u001b[32m/opt/python/lib/python3.13/site-packages/s3fs/core.py:2257\u001b[39m, in \u001b[36mS3File.__init__\u001b[39m\u001b[34m(self, s3, path, mode, block_size, acl, version_id, fill_cache, s3_additional_kwargs, autocommit, cache_type, requester_pays, cache_options, size)\u001b[39m\n\u001b[32m 2255\u001b[39m \u001b[38;5;28mself\u001b[39m.details = s3.info(path)\n\u001b[32m 2256\u001b[39m \u001b[38;5;28mself\u001b[39m.version_id = \u001b[38;5;28mself\u001b[39m.details.get(\u001b[33m\"\u001b[39m\u001b[33mVersionId\u001b[39m\u001b[33m\"\u001b[39m)\n\u001b[32m-> \u001b[39m\u001b[32m2257\u001b[39m \u001b[38;5;28;43msuper\u001b[39;49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m\u001b[43m.\u001b[49m\u001b[34;43m__init__\u001b[39;49m\u001b[43m(\u001b[49m\n\u001b[32m 2258\u001b[39m \u001b[43m \u001b[49m\u001b[43ms3\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 2259\u001b[39m \u001b[43m \u001b[49m\u001b[43mpath\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 2260\u001b[39m \u001b[43m \u001b[49m\u001b[43mmode\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 2261\u001b[39m \u001b[43m \u001b[49m\u001b[43mblock_size\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 2262\u001b[39m \u001b[43m \u001b[49m\u001b[43mautocommit\u001b[49m\u001b[43m=\u001b[49m\u001b[43mautocommit\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 2263\u001b[39m \u001b[43m \u001b[49m\u001b[43mcache_type\u001b[49m\u001b[43m=\u001b[49m\u001b[43mcache_type\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 2264\u001b[39m \u001b[43m \u001b[49m\u001b[43mcache_options\u001b[49m\u001b[43m=\u001b[49m\u001b[43mcache_options\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 2265\u001b[39m \u001b[43m \u001b[49m\u001b[43msize\u001b[49m\u001b[43m=\u001b[49m\u001b[43msize\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 2266\u001b[39m \u001b[43m\u001b[49m\u001b[43m)\u001b[49m\n\u001b[32m 2267\u001b[39m \u001b[38;5;28mself\u001b[39m.s3 = \u001b[38;5;28mself\u001b[39m.fs \u001b[38;5;66;03m# compatibility\u001b[39;00m\n\u001b[32m 2269\u001b[39m 
\u001b[38;5;66;03m# when not using autocommit we want to have transactional state to manage\u001b[39;00m\n",
"\u001b[36mFile \u001b[39m\u001b[32m/opt/python/lib/python3.13/site-packages/fsspec/spec.py:1912\u001b[39m, in \u001b[36mAbstractBufferedFile.__init__\u001b[39m\u001b[34m(self, fs, path, mode, block_size, autocommit, cache_type, cache_options, size, **kwargs)\u001b[39m\n\u001b[32m 1910\u001b[39m \u001b[38;5;28mself\u001b[39m.size = size\n\u001b[32m 1911\u001b[39m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[32m-> \u001b[39m\u001b[32m1912\u001b[39m \u001b[38;5;28mself\u001b[39m.size = \u001b[38;5;28;43mself\u001b[39;49m\u001b[43m.\u001b[49m\u001b[43mdetails\u001b[49m[\u001b[33m\"\u001b[39m\u001b[33msize\u001b[39m\u001b[33m\"\u001b[39m]\n\u001b[32m 1913\u001b[39m \u001b[38;5;28mself\u001b[39m.cache = caches[cache_type](\n\u001b[32m 1914\u001b[39m \u001b[38;5;28mself\u001b[39m.blocksize, \u001b[38;5;28mself\u001b[39m._fetch_range, \u001b[38;5;28mself\u001b[39m.size, **cache_options\n\u001b[32m 1915\u001b[39m )\n\u001b[32m 1916\u001b[39m \u001b[38;5;28;01melse\u001b[39;00m:\n",
"\u001b[36mFile \u001b[39m\u001b[32m/opt/python/lib/python3.13/site-packages/fsspec/spec.py:1925\u001b[39m, in \u001b[36mAbstractBufferedFile.details\u001b[39m\u001b[34m(self)\u001b[39m\n\u001b[32m 1922\u001b[39m \u001b[38;5;129m@property\u001b[39m\n\u001b[32m 1923\u001b[39m \u001b[38;5;28;01mdef\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[34mdetails\u001b[39m(\u001b[38;5;28mself\u001b[39m):\n\u001b[32m 1924\u001b[39m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28mself\u001b[39m._details \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m:\n\u001b[32m-> \u001b[39m\u001b[32m1925\u001b[39m \u001b[38;5;28mself\u001b[39m._details = \u001b[38;5;28;43mself\u001b[39;49m\u001b[43m.\u001b[49m\u001b[43mfs\u001b[49m\u001b[43m.\u001b[49m\u001b[43minfo\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[43m.\u001b[49m\u001b[43mpath\u001b[49m\u001b[43m)\u001b[49m\n\u001b[32m 1926\u001b[39m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28mself\u001b[39m._details\n",
"\u001b[36mFile \u001b[39m\u001b[32m/opt/python/lib/python3.13/site-packages/fsspec/asyn.py:118\u001b[39m, in \u001b[36msync_wrapper.<locals>.wrapper\u001b[39m\u001b[34m(*args, **kwargs)\u001b[39m\n\u001b[32m 115\u001b[39m \u001b[38;5;129m@functools\u001b[39m.wraps(func)\n\u001b[32m 116\u001b[39m \u001b[38;5;28;01mdef\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[34mwrapper\u001b[39m(*args, **kwargs):\n\u001b[32m 117\u001b[39m \u001b[38;5;28mself\u001b[39m = obj \u001b[38;5;129;01mor\u001b[39;00m args[\u001b[32m0\u001b[39m]\n\u001b[32m--> \u001b[39m\u001b[32m118\u001b[39m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43msync\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[43m.\u001b[49m\u001b[43mloop\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mfunc\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43m*\u001b[49m\u001b[43margs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43m*\u001b[49m\u001b[43m*\u001b[49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n",
"\u001b[36mFile \u001b[39m\u001b[32m/opt/python/lib/python3.13/site-packages/fsspec/asyn.py:103\u001b[39m, in \u001b[36msync\u001b[39m\u001b[34m(loop, func, timeout, *args, **kwargs)\u001b[39m\n\u001b[32m 101\u001b[39m \u001b[38;5;28;01mraise\u001b[39;00m FSTimeoutError \u001b[38;5;28;01mfrom\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[34;01mreturn_result\u001b[39;00m\n\u001b[32m 102\u001b[39m \u001b[38;5;28;01melif\u001b[39;00m \u001b[38;5;28misinstance\u001b[39m(return_result, \u001b[38;5;167;01mBaseException\u001b[39;00m):\n\u001b[32m--> \u001b[39m\u001b[32m103\u001b[39m \u001b[38;5;28;01mraise\u001b[39;00m return_result\n\u001b[32m 104\u001b[39m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[32m 105\u001b[39m \u001b[38;5;28;01mreturn\u001b[39;00m return_result\n",
"\u001b[36mFile \u001b[39m\u001b[32m/opt/python/lib/python3.13/site-packages/fsspec/asyn.py:56\u001b[39m, in \u001b[36m_runner\u001b[39m\u001b[34m(event, coro, result, timeout)\u001b[39m\n\u001b[32m 54\u001b[39m coro = asyncio.wait_for(coro, timeout=timeout)\n\u001b[32m 55\u001b[39m \u001b[38;5;28;01mtry\u001b[39;00m:\n\u001b[32m---> \u001b[39m\u001b[32m56\u001b[39m result[\u001b[32m0\u001b[39m] = \u001b[38;5;28;01mawait\u001b[39;00m coro\n\u001b[32m 57\u001b[39m \u001b[38;5;28;01mexcept\u001b[39;00m \u001b[38;5;167;01mException\u001b[39;00m \u001b[38;5;28;01mas\u001b[39;00m ex:\n\u001b[32m 58\u001b[39m result[\u001b[32m0\u001b[39m] = ex\n",
"\u001b[36mFile \u001b[39m\u001b[32m/opt/python/lib/python3.13/site-packages/s3fs/core.py:1445\u001b[39m, in \u001b[36mS3FileSystem._info\u001b[39m\u001b[34m(self, path, bucket, key, refresh, version_id)\u001b[39m\n\u001b[32m 1443\u001b[39m \u001b[38;5;28;01mif\u001b[39;00m key:\n\u001b[32m 1444\u001b[39m \u001b[38;5;28;01mtry\u001b[39;00m:\n\u001b[32m-> \u001b[39m\u001b[32m1445\u001b[39m out = \u001b[38;5;28;01mawait\u001b[39;00m \u001b[38;5;28mself\u001b[39m._call_s3(\n\u001b[32m 1446\u001b[39m \u001b[33m\"\u001b[39m\u001b[33mhead_object\u001b[39m\u001b[33m\"\u001b[39m,\n\u001b[32m 1447\u001b[39m \u001b[38;5;28mself\u001b[39m.kwargs,\n\u001b[32m 1448\u001b[39m Bucket=bucket,\n\u001b[32m 1449\u001b[39m Key=key,\n\u001b[32m 1450\u001b[39m **version_id_kw(version_id),\n\u001b[32m 1451\u001b[39m **\u001b[38;5;28mself\u001b[39m.req_kw,\n\u001b[32m 1452\u001b[39m )\n\u001b[32m 1453\u001b[39m \u001b[38;5;28;01mreturn\u001b[39;00m {\n\u001b[32m 1454\u001b[39m \u001b[33m\"\u001b[39m\u001b[33mETag\u001b[39m\u001b[33m\"\u001b[39m: out.get(\u001b[33m\"\u001b[39m\u001b[33mETag\u001b[39m\u001b[33m\"\u001b[39m, \u001b[33m\"\u001b[39m\u001b[33m\"\u001b[39m),\n\u001b[32m 1455\u001b[39m \u001b[33m\"\u001b[39m\u001b[33mLastModified\u001b[39m\u001b[33m\"\u001b[39m: out.get(\u001b[33m\"\u001b[39m\u001b[33mLastModified\u001b[39m\u001b[33m\"\u001b[39m, \u001b[33m\"\u001b[39m\u001b[33m\"\u001b[39m),\n\u001b[32m (...)\u001b[39m\u001b[32m 1461\u001b[39m \u001b[33m\"\u001b[39m\u001b[33mContentType\u001b[39m\u001b[33m\"\u001b[39m: out.get(\u001b[33m\"\u001b[39m\u001b[33mContentType\u001b[39m\u001b[33m\"\u001b[39m),\n\u001b[32m 1462\u001b[39m }\n\u001b[32m 1463\u001b[39m \u001b[38;5;28;01mexcept\u001b[39;00m \u001b[38;5;167;01mFileNotFoundError\u001b[39;00m:\n",
"\u001b[36mFile \u001b[39m\u001b[32m/opt/python/lib/python3.13/site-packages/s3fs/core.py:371\u001b[39m, in \u001b[36mS3FileSystem._call_s3\u001b[39m\u001b[34m(self, method, *akwarglist, **kwargs)\u001b[39m\n\u001b[32m 369\u001b[39m logger.debug(\u001b[33m\"\u001b[39m\u001b[33mCALL: \u001b[39m\u001b[38;5;132;01m%s\u001b[39;00m\u001b[33m - \u001b[39m\u001b[38;5;132;01m%s\u001b[39;00m\u001b[33m - \u001b[39m\u001b[38;5;132;01m%s\u001b[39;00m\u001b[33m\"\u001b[39m, method.\u001b[34m__name__\u001b[39m, akwarglist, kw2)\n\u001b[32m 370\u001b[39m additional_kwargs = \u001b[38;5;28mself\u001b[39m._get_s3_method_kwargs(method, *akwarglist, **kwargs)\n\u001b[32m--> \u001b[39m\u001b[32m371\u001b[39m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28;01mawait\u001b[39;00m _error_wrapper(\n\u001b[32m 372\u001b[39m method, kwargs=additional_kwargs, retries=\u001b[38;5;28mself\u001b[39m.retries\n\u001b[32m 373\u001b[39m )\n",
"\u001b[36mFile \u001b[39m\u001b[32m/opt/python/lib/python3.13/site-packages/s3fs/core.py:146\u001b[39m, in \u001b[36m_error_wrapper\u001b[39m\u001b[34m(func, args, kwargs, retries)\u001b[39m\n\u001b[32m 144\u001b[39m err = e\n\u001b[32m 145\u001b[39m err = translate_boto_error(err)\n\u001b[32m--> \u001b[39m\u001b[32m146\u001b[39m \u001b[38;5;28;01mraise\u001b[39;00m err\n",
"\u001b[31mPermissionError\u001b[39m: Forbidden"
]
}
],
"source": [
"# Load the three raw inputs from the S3 bucket through the `fs` handle.\n",
"\n",
"# `flows` has to be loaded here: quick_info(flows, ...) below and the following cells\n",
"# use it, and on a fresh kernel nothing else defines it.\n",
"with fs.open(\n",
"    \"projet-bdc-data/carmignac/Flows ENSAE V2 -20251105.csv\",\n",
"    \"rb\"\n",
") as f:\n",
"    flows = pd.read_csv(f, sep=\";\")\n",
"\n",
"with fs.open('projet-bdc-data/carmignac/AUM ENSAE V2 -20251105.csv', 'rb') as f:\n",
"    stocks = pd.read_csv(f, sep=\";\")\n",
"\n",
"with fs.open('projet-bdc-data/carmignac/Monthly AUM and NAV since 2010.xlsx', 'rb') as f:\n",
"    nav_raw = pd.read_excel(f, header=None, engine=\"openpyxl\")\n",
"\n",
"# Column 0 of the workbook holds comma-separated text: split it into columns,\n",
"# use the first row as header and drop that row from the data.\n",
"nav = nav_raw[0].str.split(\",\", expand=True)\n",
"nav.columns = nav.iloc[0]\n",
"nav = nav[1:].reset_index(drop=True)\n",
"\n",
"quick_info(stocks, \"STOCKS\")\n",
"quick_info(flows, \"FLOWS\")\n",
"quick_info(nav, \"NAV/PRICES\")"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "9bc92c9f-216c-475e-bfb8-edc1a4e839f6",
"metadata": {},
"outputs": [],
"source": [
"# 1. CLEAN DATES (the files use different formats)\n",
"\n",
"# NOTE(review): no explicit format is given for stocks/flows, so pandas infers it, and\n",
"# errors=\"coerce\" turns every value it cannot parse into NaT - confirm the expected format.\n",
"stocks[\"Centralisation Date\"] = pd.to_datetime(stocks[\"Centralisation Date\"], errors=\"coerce\")\n",
"flows[\"Centralisation Date\"] = pd.to_datetime(flows[\"Centralisation Date\"], errors=\"coerce\")\n",
"nav[\"NavDate\"] = pd.to_datetime(nav[\"NavDate\"], format=\"%d/%m/%Y\", errors=\"coerce\")\n",
"\n",
"print(\"Date conversion done.\")\n",
"\n",
"# 2. CLEAN NUMERIC COLUMNS FOR NAV FILE\n",
"\n",
"num_cols = [\"PortfolioAum_Eur\",\"ShareClassPrice\",\"NumberOfShares\",\n",
"            \"ShareClassAumLocalCur\",\"ShareClassAum_EUR\"]\n",
"\n",
"for col in num_cols:\n",
"    as_text = (\n",
"        nav[col]\n",
"        .astype(str)\n",
"        .str.replace(\",\", \".\", regex=False)\n",
"        # \\s also matches non-breaking spaces, which may be used as thousands separators\n",
"        .str.replace(r\"\\s+\", \"\", regex=True)\n",
"    )\n",
"    # Missing or non-numeric text (\"None\", \"\", ...) becomes NaN instead of raising ValueError;\n",
"    # the final astype keeps the column float even when every value is an integer.\n",
"    nav[col] = pd.to_numeric(as_text, errors=\"coerce\").astype(float)\n",
"\n",
"print(\"NAV numeric conversion done.\")\n",
"\n",
"# 3. STANDARDIZE STRINGS FOR JOIN KEYS\n",
"\n",
"def norm(df):\n",
"    \"\"\"\n",
"    Strip and upper-case every text column of `df`, in place.\n",
"\n",
"    Columns with object or pandas string dtype are converted to text, stripped of\n",
"    surrounding whitespace and upper-cased. Missing values are left missing: they are\n",
"    not turned into the literal text \"NAN\"/\"NONE\", which would otherwise behave like a\n",
"    real key in joins and set comparisons. Other columns are untouched.\n",
"\n",
"    Returns the same DataFrame object that was passed in.\n",
"    \"\"\"\n",
"    for col in df.columns:\n",
"        column = df[col]\n",
"        is_text = column.dtype == \"object\" or isinstance(column.dtype, pd.StringDtype)\n",
"        if is_text:\n",
"            cleaned = column.astype(str).str.strip().str.upper()\n",
"            # Keep the cleaned text only where the original value was present.\n",
"            df[col] = cleaned.where(column.notna(), column)\n",
"    return df\n",
"\n",
"stocks = norm(stocks)\n",
"flows = norm(flows)\n",
"nav = norm(nav)\n",
"\n",
"print(\"String normalization done.\")\n",
"\n",
"\n",
"# 4. ANALYSE RELATIONS ACROSS FILES\n",
"\n",
"# Unique sets. Missing values are dropped so that a missing identifier is never\n",
"# counted as an ISIN in the comparisons below (or in cells reusing these sets).\n",
"isin_stocks = set(stocks[\"Product - Isin\"].dropna().unique())\n",
"isin_flows = set(flows[\"Product - Isin\"].dropna().unique())\n",
"isin_nav = set(nav[\"ShareClassIsin\"].dropna().unique())\n",
"\n",
"print(\"\\nISIN missing in FLOWS but present in STOCKS :\", len(isin_stocks - isin_flows))\n",
"print(\"\\nISIN missing in STOCKS but present in FLOWS :\", len(isin_flows - isin_stocks))\n",
"print(\"\\nISIN missing in NAV but present in FLOWS :\", len(isin_flows - isin_nav))\n",
"print(\"\\nISIN missing in NAV but present in STOCKS :\", len(isin_stocks - isin_nav))\n",
"\n",
"\n",
"# 5. CLIENTS: STOCKS VS FLOWS\n",
"\n",
"acc_stocks = set(stocks[\"Registrar Account - ID\"].dropna().unique())\n",
"acc_flows = set(flows[\"Registrar Account - ID\"].dropna().unique())\n",
"\n",
"print(\"\\nAccounts in STOCKS but NEVER in FLOWS :\", len(acc_stocks - acc_flows))\n",
"print(\"\\nAccounts in FLOWS but NEVER in STOCKS :\", len(acc_flows - acc_stocks))\n",
"\n",
"\n",
"# 6. CLIENT ACTIVITY METRICS (DETAILED)\n",
"\n",
"# One row per account: number of distinct flow dates, number of non-missing net-flow\n",
"# rows, and sum / mean / standard deviation of the flow quantities.\n",
"client_behavior = flows.groupby(\"Registrar Account - ID\").agg(\n",
"    n_days=(\"Centralisation Date\", \"nunique\"),\n",
"    n_transactions=(\"Quantity - NetFlows\", \"count\"),\n",
"    total_netflows=(\"Quantity - NetFlows\", \"sum\"),\n",
"    mean_flow=(\"Quantity - NetFlows\", \"mean\"),\n",
"    std_flow=(\"Quantity - NetFlows\", \"std\"),\n",
"    total_subscription=(\"Quantity - Subscription\", \"sum\"),\n",
"    total_redemption=(\"Quantity - Redemption\", \"sum\")\n",
").reset_index()\n",
"\n",
"# Add churn metric: redemptions over subscriptions; the small constant avoids a\n",
"# division by zero for accounts without any subscription.\n",
"client_behavior[\"churn_ratio\"] = (\n",
"    client_behavior[\"total_redemption\"] /\n",
"    (client_behavior[\"total_subscription\"] + 1e-9)\n",
")\n",
"\n",
"print(\"\\nCLIENT BEHAVIOR (first 5 rows):\\n\", client_behavior.head())\n",
"\n",
"\n",
"# 7. FUNDS ACTIVITY METRICS\n",
"\n",
"fund_behavior = flows.groupby(\"Product - Isin\").agg(\n",
"    n_accounts=(\"Registrar Account - ID\", \"nunique\"),\n",
"    n_days=(\"Centralisation Date\", \"nunique\"),\n",
"    total_netflows=(\"Quantity - NetFlows\", \"sum\"),\n",
"    vol_flows=(\"Quantity - NetFlows\", \"std\")\n",
").reset_index()\n",
"\n",
"print(\"\\nFUND BEHAVIOR (first 5 rows):\\n\", fund_behavior.head())\n",
"\n",
"\n",
"# 8. SAVE INTERMEDIATE\n",
"\n",
"client_behavior.to_csv(\"client_behavior.csv\", index=False)\n",
"fund_behavior.to_csv(\"fund_behavior.csv\", index=False)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "afb51598-3a7b-41f2-8d25-5b4b8bfb1c8a",
"metadata": {},
"outputs": [],
"source": [
"# Partition the ISINs according to the files they appear in.\n",
"valid_full = isin_stocks & isin_flows & isin_nav\n",
"stocks_only = isin_stocks - isin_flows\n",
"flows_only = isin_flows - isin_stocks\n",
"missing_nav = (isin_stocks | isin_flows) - isin_nav\n",
"\n",
"print(\"FULL usable ISIN :\", len(valid_full))\n",
"print(\"Stocks only ISIN :\", len(stocks_only))\n",
"print(\"Flows only ISIN :\", len(flows_only))\n",
"print(\"Missing NAV :\", len(missing_nav))\n",
"\n",
"# Sets have no stable iteration order, so the rows are sorted to make the exported\n",
"# files identical from one run to the next; key=str keeps the sort working even if a\n",
"# set contains a non-string member.\n",
"isin_groups = {\n",
"    \"isin_full.csv\": valid_full,\n",
"    \"isin_stocks_only.csv\": stocks_only,\n",
"    \"isin_flows_only.csv\": flows_only,\n",
"    \"isin_missing_nav.csv\": missing_nav,\n",
"}\n",
"for filename, members in isin_groups.items():\n",
"    pd.DataFrame({\"isin\": sorted(members, key=str)}).to_csv(filename, index=False)\n",
"\n",
"print(\"All ISIN groups saved into 4 separate files.\")\n",
"\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "61e0c71a-a1c6-4ed8-ba15-b7a9badc4d4a",
"metadata": {},
"outputs": [],
"source": [
"# Small constant added to the churn-ratio denominator.\n",
"eps = 1e-6\n",
"\n",
"# Number of distinct dates present in the flows file, used to scale n_days.\n",
"n_flow_dates = flows[\"Centralisation Date\"].nunique()\n",
"\n",
"# Derived per-account features, all computed from the aggregated columns.\n",
"client_behavior = client_behavior.assign(\n",
"    churn_ratio=lambda d: d[\"total_redemption\"] / (d[\"total_subscription\"] + eps),\n",
"    churn_flag=lambda d: (d[\"total_redemption\"] > d[\"total_subscription\"]).astype(int),\n",
"    activity_score=lambda d: np.log1p(d[\"n_transactions\"]),\n",
"    flow_volatility=lambda d: d[\"std_flow\"].fillna(0),\n",
"    inertia_ratio=lambda d: 1 - d[\"n_days\"] / n_flow_dates,\n",
")\n",
"\n",
"print(client_behavior.head())\n",
"\n",
"client_behavior.to_csv(\"client_behavior_clean.csv\", index=False)\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "8ee7e911-eb73-4846-b545-661140411c1b",
"metadata": {},
"outputs": [],
"source": [
"# Diversification per account: distinct ISINs / funds / asset types / strategies held,\n",
"# plus the sum and the median of the AUM values over the account's rows.\n",
"account_div = stocks.groupby(\"Registrar Account - ID\").agg(\n",
"    n_isin_held=(\"Product - Isin\", \"nunique\"),\n",
"    n_funds_held=(\"Product - Fund\", \"nunique\"),\n",
"    n_asset_types=(\"Product - Asset Type\", \"nunique\"),\n",
"    n_strategies=(\"Product - Strategy\", \"nunique\"),\n",
"    total_aum=(\"Value - AUM €\", \"sum\"),\n",
"    median_aum=(\"Value - AUM €\", \"median\")\n",
").reset_index()\n",
"\n",
"# Concentration ratio per account: AUM of the account's largest fund divided by the\n",
"# account's AUM summed over all its funds.\n",
"aum_by_account_fund = stocks.groupby(\n",
"    [\"Registrar Account - ID\", \"Product - Fund\"]\n",
")[\"Value - AUM €\"].sum().reset_index()\n",
"\n",
"# Built-in max/sum reductions give the same values as a per-group Python lambda\n",
"# through groupby.apply, without calling Python code once per account.\n",
"aum_per_account = aum_by_account_fund.groupby(\"Registrar Account - ID\")[\"Value - AUM €\"]\n",
"concentration = (\n",
"    aum_per_account.max() / aum_per_account.sum()\n",
").reset_index(name=\"concentration_ratio\")\n",
"\n",
"# Merge diversification + concentration\n",
"account_static = account_div.merge(concentration, on=\"Registrar Account - ID\", how=\"left\")\n",
"\n",
"print(account_static.head())\n",
"print(account_static.describe())\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "76f6fa0d-9d7a-4145-af1c-986d83947f91",
"metadata": {},
"outputs": [],
"source": [
"def _most_frequent(values):\n",
"    \"\"\"Return the most frequent non-null value of a Series, or NaN when there is none.\"\"\"\n",
"    modes = values.mode()\n",
"    # mode() ignores nulls, so an all-null group yields an empty result and mode()[0] would raise.\n",
"    return modes.iloc[0] if not modes.empty else np.nan\n",
"\n",
"\n",
"# Geographic info per account: most frequent country and region over the account's stock rows.\n",
"# Accounts without any known value get NaN here.\n",
"geo = stocks.groupby(\"Registrar Account - ID\").agg(\n",
"    country=(\"RegistrarAccount - Country\", _most_frequent),\n",
"    region=(\"Registrar Account - Region\", _most_frequent)\n",
").reset_index()\n",
"\n",
"print(geo.head())\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "e9bb67ab-9029-4ace-b960-b3d6e0b8683c",
"metadata": {},
"outputs": [],
"source": [
"# Portfolio-breadth columns coming from the stocks side of the join.\n",
"diversification_cols = [\"n_isin_held\", \"n_funds_held\", \"n_asset_types\", \"n_strategies\"]\n",
"\n",
"# 1-2. Left-join the static diversification metrics, then the geography, onto the flow behaviour\n",
"#      so that every row of client_behavior is kept.\n",
"client_master = (\n",
"    client_behavior\n",
"    .merge(account_static, on=\"Registrar Account - ID\", how=\"left\")\n",
"    .merge(geo, on=\"Registrar Account - ID\", how=\"left\")\n",
")\n",
"\n",
"# 3. Engineered features: log1p of the AUM columns, negative values clipped to 0 first.\n",
"for aum_col in (\"total_aum\", \"median_aum\"):\n",
"    client_master[f\"log_{aum_col}\"] = np.log1p(client_master[aum_col].clip(lower=0))\n",
"\n",
"# 4. Missing flow volatility becomes 0.\n",
"client_master[\"flow_volatility\"] = client_master[\"flow_volatility\"].fillna(0)\n",
"\n",
"# 5. Missing diversification metrics become 0 (accounts with no match in account_static).\n",
"client_master[diversification_cols] = client_master[diversification_cols].fillna(0)\n",
"\n",
"# 6. Missing geography becomes \"UNKNOWN\".\n",
"for geo_col in (\"country\", \"region\"):\n",
"    client_master[geo_col] = client_master[geo_col].fillna(\"UNKNOWN\")\n",
"\n",
"# 7. Export\n",
"client_master.to_csv(\"client_master.csv\", index=False)\n",
"\n",
"print(client_master.head())\n",
"print(client_master.describe(include='all'))\n"
]
},
{
"cell_type": "markdown",
"id": "fb1e98a5-6ab4-4371-ba45-6558ff38c839",
"metadata": {},
"source": [
"Détection des ruptures"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "6bdd8077-c8e0-451d-a7b8-15a2705ad196",
"metadata": {},
"outputs": [],
"source": [
"# Columns identifying one position (an account holding one ISIN) and one dated observation of it.\n",
"position_keys = [\"Registrar Account - ID\", \"Product - Isin\"]\n",
"observation_keys = position_keys + [\"Centralisation Date\"]\n",
"\n",
"# --- 1. PREPARE STOCKS ---\n",
"stocks_clean = stocks[observation_keys + [\"Quantity - AUM\"]].copy()\n",
"stocks_clean[\"Centralisation Date\"] = pd.to_datetime(stocks_clean[\"Centralisation Date\"])\n",
"stocks_clean = stocks_clean.sort_values(observation_keys)\n",
"\n",
"# --- 2. PREPARE FLOWS ---\n",
"flows_clean = flows[observation_keys + [\"Quantity - NetFlows\"]].copy()\n",
"flows_clean[\"Centralisation Date\"] = pd.to_datetime(flows_clean[\"Centralisation Date\"])\n",
"\n",
"# One summed flow per (account, ISIN, date), so the merge below cannot duplicate stock rows.\n",
"flows_clean = (\n",
"    flows_clean.groupby(observation_keys)[\"Quantity - NetFlows\"]\n",
"    .sum()\n",
"    .reset_index()\n",
")\n",
"\n",
"# --- 3. MERGE STOCKS WITH FLOWS ---\n",
"# Left join: every stock observation is kept; dates without a flow get 0.\n",
"merged = stocks_clean.merge(flows_clean, on=observation_keys, how=\"left\")\n",
"merged[\"Quantity - NetFlows\"] = merged[\"Quantity - NetFlows\"].fillna(0)\n",
"\n",
"# --- 4. PREVIOUS OBSERVATION OF EACH POSITION ---\n",
"by_position = merged.groupby(position_keys)\n",
"prev_stock = by_position[\"Quantity - AUM\"].shift(1)\n",
"prev_netflows = by_position[\"Quantity - NetFlows\"].shift(1).fillna(0)\n",
"merged[\"prev_stock\"] = prev_stock\n",
"merged[\"prev_netflows\"] = prev_netflows\n",
"\n",
"# Expected stock = previous stock + net flows recorded on the previous date.\n",
"merged[\"expected_stock\"] = merged[\"prev_stock\"] + merged[\"prev_netflows\"]\n",
"\n",
"# --- 5. GAP BETWEEN OBSERVED AND EXPECTED STOCK ---\n",
"merged[\"gap\"] = merged[\"Quantity - AUM\"] - merged[\"expected_stock\"]\n",
"\n",
"# Tolerance for numerical noise; the first observation of a position (no previous stock) is never flagged.\n",
"TOL = 1e-6\n",
"merged[\"rupture_flag\"] = merged[\"prev_stock\"].notna() & merged[\"gap\"].abs().gt(TOL)\n",
"\n",
"# --- 6. PER-ACCOUNT SUMMARY, WORST RUPTURE RATIO FIRST ---\n",
"rupture_summary = (\n",
"    merged.groupby(\"Registrar Account - ID\")\n",
"    .agg(\n",
"        n_ruptures=(\"rupture_flag\", \"sum\"),\n",
"        total_obs=(\"rupture_flag\", \"count\"),\n",
"        rupture_ratio=(\"rupture_flag\", \"mean\"),\n",
"        max_gap=(\"gap\", lambda x: x.abs().max()),\n",
"    )\n",
"    .reset_index()\n",
"    .sort_values(\"rupture_ratio\", ascending=False)\n",
")\n",
"\n",
"# Same summary in ascending order, written to disk.\n",
"rupture_summary_asc = rupture_summary.sort_values(\"rupture_ratio\", ascending=True)\n",
"rupture_summary_asc.to_csv('rupture.csv')"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "9e32fd6b-4754-4196-9487-ffdc0bb4fc06",
"metadata": {},
"outputs": [],
"source": [
"# Write the merged stock/flow table to merged.csv (the DataFrame index is written as the first column).\n",
"merged.to_csv('merged.csv')"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "71cd67aa-f4b9-489e-b928-defeca459cb6",
"metadata": {},
"outputs": [],
"source": [
"# Display the per-account rupture summary sorted by ascending rupture_ratio.\n",
"rupture_summary_asc"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "72332a7e-0ab0-474b-aac7-b52ebbba7a8b",
"metadata": {},
"outputs": [],
"source": [
"# Plot six hand-picked accounts, one call per account ID.\n",
"# NOTE(review): plot_account is not defined in this part of the notebook - presumably in an earlier cell; confirm.\n",
"plot_account('200001928')\n",
"plot_account('366351')\n",
"plot_account('365966')\n",
"plot_account('365568')\n",
"plot_account('200129601')\n",
"plot_account('402410')\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "31407450-a833-4fce-8b0b-dba1b1de585f",
"metadata": {},
"outputs": [],
"source": [
"# Key columns: one position is an (account, ISIN) pair; one observation adds the date.\n",
"isin_position_keys = [\"Registrar Account - ID\", \"Product - Isin\"]\n",
"isin_observation_keys = isin_position_keys + [\"Centralisation Date\"]\n",
"\n",
"# 1. Stock quantities, dates parsed, ordered chronologically inside each position\n",
"stocks_isin = stocks[isin_observation_keys + [\"Quantity - AUM\"]].copy()\n",
"stocks_isin[\"Centralisation Date\"] = pd.to_datetime(stocks_isin[\"Centralisation Date\"])\n",
"stocks_isin = stocks_isin.sort_values(isin_observation_keys)\n",
"\n",
"# 2. Net flows, dates parsed, summed per (account, ISIN, date)\n",
"flows_isin = flows[isin_observation_keys + [\"Quantity - NetFlows\"]].copy()\n",
"flows_isin[\"Centralisation Date\"] = pd.to_datetime(flows_isin[\"Centralisation Date\"])\n",
"flows_isin = (\n",
"    flows_isin.groupby(isin_observation_keys)[\"Quantity - NetFlows\"]\n",
"    .sum()\n",
"    .reset_index()\n",
")\n",
"\n",
"# 3. Left-join flows onto stocks; observations without a flow get 0\n",
"merged_isin = stocks_isin.merge(flows_isin, on=isin_observation_keys, how=\"left\")\n",
"merged_isin[\"Quantity - NetFlows\"] = merged_isin[\"Quantity - NetFlows\"].fillna(0)\n",
"\n",
"# 4. Expected stock = previous stock + net flows of the previous observation, per position\n",
"per_position = merged_isin.groupby(isin_position_keys)\n",
"lagged_stock = per_position[\"Quantity - AUM\"].shift(1)\n",
"lagged_flows = per_position[\"Quantity - NetFlows\"].shift(1).fillna(0)\n",
"merged_isin[\"prev_stock\"] = lagged_stock\n",
"merged_isin[\"prev_netflows\"] = lagged_flows\n",
"merged_isin[\"expected_stock\"] = merged_isin[\"prev_stock\"] + merged_isin[\"prev_netflows\"]\n",
"\n",
"# 5. A rupture is an observation with a previous stock whose gap exceeds the tolerance\n",
"TOL = 1e-6\n",
"merged_isin[\"gap\"] = merged_isin[\"Quantity - AUM\"] - merged_isin[\"expected_stock\"]\n",
"merged_isin[\"rupture_flag\"] = merged_isin[\"prev_stock\"].notna() & merged_isin[\"gap\"].abs().gt(TOL)\n",
"\n",
"# 6. Summary per (account, ISIN), worst rupture ratio first\n",
"rupture_isin_summary = (\n",
"    merged_isin.groupby(isin_position_keys)\n",
"    .agg(\n",
"        n_ruptures=(\"rupture_flag\", \"sum\"),\n",
"        obs=(\"rupture_flag\", \"count\"),\n",
"        rupture_ratio=(\"rupture_flag\", \"mean\"),\n",
"        max_gap=(\"gap\", lambda x: x.abs().max()),\n",
"    )\n",
"    .reset_index()\n",
"    .sort_values(\"rupture_ratio\", ascending=False)\n",
")\n",
"\n",
"rupture_isin_summary.head(20)\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "baa4b6cd-887d-45a6-af27-253a9aa8710f",
"metadata": {},
"outputs": [],
"source": [
"# Strong ruptures: rupture ratio above 0.5 OR maximum absolute gap above 50000.\n",
"high_ratio = rupture_isin_summary[\"rupture_ratio\"].gt(0.5)\n",
"huge_gap = rupture_isin_summary[\"max_gap\"].gt(50000)\n",
"strong = rupture_isin_summary.loc[high_ratio | huge_gap]\n",
"\n",
"def find_successors(account_id, isin, window_days=15, data=None):\n",
"    \"\"\"List the other accounts showing a positive gap on `isin` close to a rupture of `account_id`.\n",
"\n",
"    Parameters\n",
"    ----------\n",
"    account_id : value compared against the \"Registrar Account - ID\" column\n",
"    isin : value compared against the \"Product - Isin\" column\n",
"    window_days : int, default 15\n",
"        Half-width, in days, of the window searched around each rupture date (bounds included).\n",
"    data : DataFrame, optional\n",
"        Table with the columns \"Registrar Account - ID\", \"Product - Isin\",\n",
"        \"Centralisation Date\", \"gap\" and \"rupture_flag\". Defaults to the notebook-level\n",
"        `merged_isin`.\n",
"\n",
"    Returns\n",
"    -------\n",
"    list\n",
"        Distinct account IDs other than `account_id`, in order of first appearance;\n",
"        empty when `account_id` has no flagged rupture on `isin`.\n",
"    \"\"\"\n",
"    if data is None:\n",
"        data = merged_isin\n",
"\n",
"    # Restrict to the ISIN once instead of re-filtering the full table for every rupture date.\n",
"    isin_rows = data[data[\"Product - Isin\"] == isin]\n",
"    is_self = isin_rows[\"Registrar Account - ID\"] == account_id\n",
"\n",
"    rupture_dates = isin_rows.loc[is_self & isin_rows[\"rupture_flag\"], \"Centralisation Date\"].unique()\n",
"    if len(rupture_dates) == 0:\n",
"        return []\n",
"\n",
"    # Candidate rows: positive gaps booked on the same ISIN by any other account.\n",
"    jumps = isin_rows[(isin_rows[\"gap\"] > 0) & ~is_self]\n",
"    window = pd.Timedelta(days=window_days)\n",
"\n",
"    # A dict keeps insertion order, which makes the result deterministic (a set does not).\n",
"    successors = {}\n",
"    for rupture_date in pd.to_datetime(rupture_dates):\n",
"        in_window = jumps[\"Centralisation Date\"].between(rupture_date - window, rupture_date + window)\n",
"        successors.update(dict.fromkeys(jumps.loc[in_window, \"Registrar Account - ID\"]))\n",
"\n",
"    return list(successors)\n",
"\n",
"# Example lookup for one (account, ISIN) pair; the returned list is shown as the cell output.\n",
"find_successors(\"200129601\", \"FR0010135103\")\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "0b834da2-f781-476d-84a6-aebb38fb8dac",
"metadata": {},
"outputs": [],
"source": [
"import seaborn as sns\n",
"import matplotlib.pyplot as plt\n",
"\n",
"# Copy of merged_isin with calendar columns added.\n",
"df = merged_isin.assign(\n",
"    year=lambda d: d[\"Centralisation Date\"].dt.year,\n",
"    month=lambda d: d[\"Centralisation Date\"].dt.month,\n",
")\n",
"\n",
"calendar_keys = [\"year\", \"month\"]\n",
"\n",
"# 1. Total number of rows per (year, month)\n",
"total = df.groupby(calendar_keys).size().reset_index(name=\"total_lines\")\n",
"\n",
"# 2. Number of flagged ruptures per (year, month)\n",
"ruptures = df.loc[df[\"rupture_flag\"]].groupby(calendar_keys).size().reset_index(name=\"n_ruptures\")\n",
"\n",
"# 3. Combine both; months without any rupture get 0\n",
"ratio = total.merge(ruptures, on=calendar_keys, how=\"left\")\n",
"ratio[\"n_ruptures\"] = ratio[\"n_ruptures\"].fillna(0)\n",
"\n",
"# 4. Proportion of ruptures (fraction, formatted as % in the plot)\n",
"ratio[\"rupture_ratio\"] = ratio[\"n_ruptures\"].div(ratio[\"total_lines\"])\n",
"\n",
"# 5. Year x month grid for the heatmap; (year, month) pairs absent from the data are shown as 0\n",
"heatmap_ratio = ratio.pivot(index=\"year\", columns=\"month\", values=\"rupture_ratio\").fillna(0)\n",
"\n",
"# 6. Plot\n",
"_, ax = plt.subplots(figsize=(14, 7))\n",
"sns.heatmap(\n",
"    heatmap_ratio,\n",
"    cmap=\"Reds\",\n",
"    linewidths=.3,\n",
"    linecolor=\"grey\",\n",
"    annot=True,\n",
"    fmt=\".2%\",\n",
"    cbar_kws={'label': 'Proportion de ruptures'},\n",
"    ax=ax,\n",
")\n",
"ax.set_title(\"Heatmap de la proportion de ruptures (par année et mois)\", fontsize=16)\n",
"ax.set_xlabel(\"Mois\")\n",
"ax.set_ylabel(\"Année\")\n",
"plt.show()\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "aa5862ab-ec8e-47f8-8cb0-cd51503efed8",
"metadata": {},
"outputs": [],
"source": [
"# Copy of merged_isin with calendar columns and the account's country attached (left join on account ID).\n",
"df = (\n",
"    merged_isin\n",
"    .assign(\n",
"        year=lambda d: d[\"Centralisation Date\"].dt.year,\n",
"        month=lambda d: d[\"Centralisation Date\"].dt.month,\n",
"    )\n",
"    .merge(\n",
"        geo[[\"Registrar Account - ID\", \"country\"]],\n",
"        on=\"Registrar Account - ID\",\n",
"        how=\"left\",\n",
"    )\n",
")\n",
"\n",
"# Accounts without a country are grouped under \"UNKNOWN\".\n",
"df[\"country\"] = df[\"country\"].fillna(\"UNKNOWN\")\n",
"\n",
"# Number of rows per country\n",
"total_country = df.groupby(\"country\").size().reset_index(name=\"total_obs\")\n",
"\n",
"# Number of flagged ruptures per country\n",
"rupt_country = (\n",
"    df.loc[df[\"rupture_flag\"]]\n",
"    .groupby(\"country\")\n",
"    .size()\n",
"    .reset_index(name=\"ruptures\")\n",
")\n",
"\n",
"# Join both counts, fill countries without ruptures with 0, derive the ratio, sort descending.\n",
"country_stats = (\n",
"    total_country.merge(rupt_country, on=\"country\", how=\"left\")\n",
"    .assign(ruptures=lambda d: d[\"ruptures\"].fillna(0))\n",
"    .assign(rupture_ratio=lambda d: d[\"ruptures\"] / d[\"total_obs\"])\n",
"    .sort_values(\"rupture_ratio\", ascending=False)\n",
")"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "86d2a91c-d8d8-416c-8dc4-dc3f4ae7ca90",
"metadata": {},
"outputs": [],
"source": [
"# Imported here so the cell runs on a fresh kernel: the only visible import of px is in a later cell.\n",
"import plotly.express as px\n",
"\n",
"# Add a percentage column used only for display in the hover box.\n",
"country_stats_plot = country_stats.copy()\n",
"country_stats_plot[\"rupture_pct\"] = country_stats_plot[\"rupture_ratio\"] * 100\n",
"\n",
"# Countries ordered by decreasing proportion of ruptures.\n",
"country_stats_plot = country_stats_plot.sort_values(\"rupture_ratio\", ascending=False)\n",
"\n",
"fig = px.bar(\n",
"    country_stats_plot,\n",
"    x=\"country\",\n",
"    y=\"rupture_ratio\",\n",
"    hover_data={\n",
"        \"rupture_pct\": ':.2f',\n",
"        \"ruptures\": True,\n",
"        \"total_obs\": True,\n",
"        \"rupture_ratio\": False,   # hide the decimal version, the percentage is shown instead\n",
"    },\n",
"    labels={\n",
"        \"country\": \"Pays\",\n",
"        \"rupture_ratio\": \"Proportion de ruptures\",\n",
"        \"rupture_pct\": \"% de ruptures\",\n",
"        \"ruptures\": \"Nb de ruptures\",\n",
"        \"total_obs\": \"Nb d'observations\"\n",
"    },\n",
"    title=\"Proportion de ruptures par pays (avec volumes au survol)\"\n",
")\n",
"\n",
"# Show the y axis as percentages.\n",
"fig.update_yaxes(tickformat=\".1%\")\n",
"\n",
"fig.update_layout(\n",
"    xaxis_tickangle=-45,\n",
"    bargap=0.2\n",
")\n",
"\n",
"fig.show()\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "e1c114db-5fbd-4cd3-a897-b9d4c96053fd",
"metadata": {},
"outputs": [],
"source": [
"# Write the rows of df whose country equals 'JAPAN' to Japan.csv.\n",
"df[df['country']=='JAPAN'].to_csv('Japan.csv')"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "9fec57f0-dd80-47bc-aacb-518c0ac0a4f6",
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": null,
"id": "95bc353d-e883-4989-aaca-1b3c9b51ee5a",
"metadata": {},
"outputs": [],
"source": [
"rs = rupture_summary.copy()\n",
"\n",
"# 1. Standard numeric statistics\n",
"print(\"\\n=== BASIC NUMERIC STATS ===\")\n",
"print(rs[\"rupture_ratio\"].describe(percentiles=[0.01, 0.05, 0.10, 0.25, 0.5, 0.75, 0.90, 0.95, 0.99]))\n",
"\n",
"\n",
"# 2. Distribution by bucket\n",
"# The labels spell out each range with an ASCII hyphen: the original labels had lost their\n",
"# dash (\"15%\" stood for 1-5%), which made the buckets unreadable.\n",
"bucket_edges = [0, 0.001, 0.01, 0.05, 0.10, 0.25, 0.50, 1.01]\n",
"bucket_labels = [\"0-0.1%\", \"0.1-1%\", \"1-5%\", \"5-10%\", \"10-25%\", \"25-50%\", \"50-100%\"]\n",
"\n",
"rs[\"rupture_bucket\"] = pd.cut(\n",
"    rs[\"rupture_ratio\"],\n",
"    bins=bucket_edges,\n",
"    labels=bucket_labels,\n",
"    include_lowest=True\n",
")\n",
"\n",
"# Add a dedicated \"0%\" category, placed first so sort_index() lists the buckets in increasing order.\n",
"rs[\"rupture_bucket\"] = rs[\"rupture_bucket\"].cat.set_categories([\"0%\"] + bucket_labels, ordered=True)\n",
"\n",
"# Exact zeros fall into the first bin because of include_lowest=True; move them to \"0%\".\n",
"rs.loc[rs[\"rupture_ratio\"] == 0, \"rupture_bucket\"] = \"0%\"\n",
"\n",
"bucket_counts = rs[\"rupture_bucket\"].value_counts().sort_index()\n",
"print(bucket_counts)\n",
"\n",
"\n",
"# 3. Percentages\n",
"bucket_percent = (bucket_counts / len(rs) * 100).round(2)\n",
"\n",
"print(\"\\n=== DISTRIBUTION (PERCENT) ===\")\n",
"print(bucket_percent)\n",
"\n",
"\n",
"# 4. Accounts without any rupture\n",
"no_rupture = (rs[\"n_ruptures\"] == 0).sum()\n",
"print(f\"\\nComptes avec 0 rupture = {no_rupture} ({no_rupture/len(rs)*100:.2f}%)\")\n",
"\n",
"# 5. Heavily affected accounts\n",
"severe = (rs[\"rupture_ratio\"] > 0.75).sum()\n",
"print(f\"Comptes avec rupture_ratio > 75% = {severe} ({severe/len(rs)*100:.2f}%)\")\n",
"\n",
"medium = (rs[\"rupture_ratio\"] > 0.10).sum()\n",
"print(f\"Comptes avec rupture_ratio > 10% = {medium} ({medium/len(rs)*100:.2f}%)\")\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "86d8fe0e-fa6c-46df-bb4c-054d1a677b38",
"metadata": {},
"outputs": [],
"source": [
"import plotly.express as px\n",
"\n",
"# Histogram of the per-account rupture ratio: 50 bins, thin gap between bars.\n",
"histogram_options = {\n",
"    \"x\": \"rupture_ratio\",\n",
"    \"nbins\": 50,\n",
"    \"title\": \"Distribution du rupture_ratio\",\n",
"    \"labels\": {\"rupture_ratio\": \"Rupture Ratio\"},\n",
"}\n",
"fig = px.histogram(rs, **histogram_options)\n",
"fig.update_layout(bargap=0.05)\n",
"fig.show()\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "425b36d0-c92a-4405-be28-35b1fc292fec",
"metadata": {},
"outputs": [],
"source": [
"# --- 1. Base filters ---\n",
"merged[\"year\"] = merged[\"Centralisation Date\"].dt.year\n",
"\n",
"# Keep only the flagged ruptures observed in 2021\n",
"ruptures_2021 = merged[(merged[\"year\"] == 2021) & merged[\"rupture_flag\"]].copy()\n",
"\n",
"print(\"Nombre total de ruptures en 2021 :\", len(ruptures_2021))\n",
"\n",
"# --- 2. Sign of the gap (a flagged rupture always has a non-zero gap) ---\n",
"ruptures_2021[\"gap_type\"] = np.where(ruptures_2021[\"gap\"] > 0, \"positive\", \"negative\")\n",
"\n",
"# --- 3. Global statistics ---\n",
"gap_counts = ruptures_2021[\"gap_type\"].value_counts()\n",
"gap_percent = ruptures_2021[\"gap_type\"].value_counts(normalize=True) * 100\n",
"\n",
"print(\"\\n=== RUPTURES 2021 — POSITIVES vs NEGATIVES ===\")\n",
"print(gap_counts)\n",
"print(\"\\n(%)\")\n",
"print(gap_percent.map(lambda x: f\"{x:.2f}%\"))\n",
"\n",
"# --- 4. Magnitude of the gaps ---\n",
"intensity_stats = ruptures_2021.groupby(\"gap_type\")[\"gap\"].describe()\n",
"print(\"\\n=== STATISTIQUES DES GAPS ===\")\n",
"print(intensity_stats)\n",
"\n",
"# --- 5. Quick visualisation ---\n",
"import seaborn as sns\n",
"import matplotlib.pyplot as plt\n",
"\n",
"plt.figure(figsize=(10,5))\n",
"sns.histplot(data=ruptures_2021, x=\"gap\", hue=\"gap_type\", bins=80, kde=True)\n",
"\n",
"# Symmetric x-range taken from the plotted 2021 gaps (not from the whole table), so one extreme\n",
"# gap in another year cannot squeeze the histogram; skipped when there is nothing to plot.\n",
"gap_limit = ruptures_2021[\"gap\"].abs().max()\n",
"if pd.notna(gap_limit) and gap_limit > 0:\n",
"    plt.xlim(-gap_limit, gap_limit)\n",
"\n",
"plt.title(\"Distribution des gaps de rupture en 2021\")\n",
"# The minus sign was missing from the label; gap is computed as observed minus expected stock.\n",
"plt.xlabel(\"Gap (AUM_{t} - Expected AUM_{t})\")\n",
"plt.show()\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "df9e0005-93f2-4885-baef-2e54921a42f4",
"metadata": {},
"outputs": [],
"source": [
"# --- 1. ADD YEAR ---\n",
"merged[\"year\"] = merged[\"Centralisation Date\"].dt.year\n",
"\n",
"# --- 2. DEFINE PERIODS ---\n",
"conditions = [\n",
"    merged[\"year\"] < 2021,\n",
"    merged[\"year\"] == 2021,\n",
"    merged[\"year\"] > 2021\n",
"]\n",
"\n",
"period_labels = [\"before_2021\", \"during_2021\", \"after_2021\"]\n",
"\n",
"# Rows matching none of the conditions (e.g. a missing date) are labelled \"unknown\".\n",
"merged[\"period\"] = np.select(\n",
"    conditions,\n",
"    period_labels,\n",
"    default=\"unknown\"\n",
")\n",
"\n",
"# --- 3. CREATE GAP TYPE & FILTER ONLY RUPTURES ---\n",
"merged[\"gap_type\"] = np.where(\n",
"    merged[\"gap\"] > 0, \"positive\",\n",
"    np.where(merged[\"gap\"] < 0, \"negative\", \"zero\")\n",
")\n",
"\n",
"ruptures = merged[merged[\"rupture_flag\"]].copy()\n",
"\n",
"# --- 4. TOTAL OBS PER PERIOD ---\n",
"total_obs = merged.groupby(\"period\").size().rename(\"total_obs\")\n",
"\n",
"# --- 5. TOTAL RUPTURES PER PERIOD ---\n",
"# Reindexed on every observed period so a period without ruptures reports 0 instead of NaN.\n",
"rupture_counts = (\n",
"    ruptures.groupby(\"period\").size()\n",
"    .reindex(total_obs.index, fill_value=0)\n",
"    .rename(\"rupture_count\")\n",
")\n",
"\n",
"# --- 6. PROPORTION OF RUPTURES ---\n",
"rupture_ratio = (rupture_counts / total_obs).rename(\"rupture_ratio\")\n",
"\n",
"# --- 7. POSITIVE / NEGATIVE GAPS (% among the ruptures of each period) ---\n",
"# transform(\"sum\") keeps the (period, gap_type) index as is; groupby(level=0).apply would\n",
"# prepend a duplicate \"period\" level on pandas >= 2.0.\n",
"gap_type_counts = ruptures.groupby([\"period\", \"gap_type\"]).size()\n",
"gap_dist = gap_type_counts / gap_type_counts.groupby(level=\"period\").transform(\"sum\") * 100\n",
"\n",
"\n",
"# --- 8. MERGE AND DISPLAY ---\n",
"summary = pd.concat([total_obs, rupture_counts, rupture_ratio], axis=1)\n",
"summary[\"rupture_ratio\"] = (summary[\"rupture_ratio\"] * 100).round(2)\n",
"\n",
"print(\"\\n=== RUPTURE SUMMARY (in %) ===\")\n",
"print(summary)\n",
"\n",
"print(\"\\n=== GAP POSITIVE / NEGATIVE DISTRIBUTION (in %) ===\")\n",
"print(gap_dist)\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "195205bd-d817-41f9-a0fd-18d8b804515f",
"metadata": {},
"outputs": [],
"source": [
"# Imported here so the cell runs on a fresh kernel: make_subplots has no visible import in the\n",
"# notebook and go is only imported in a later cell.\n",
"import plotly.graph_objects as go\n",
"from plotly.subplots import make_subplots\n",
"\n",
"# --- 1. DEFINE PERIODS ---\n",
"period_order = [\"Before Sep 2021\", \"After Sep 2021\"]\n",
"merged[\"period2\"] = np.where(\n",
"    merged[\"Centralisation Date\"] < pd.Timestamp(\"2021-09-01\"),\n",
"    \"Before Sep 2021\",\n",
"    \"After Sep 2021\"\n",
")\n",
"\n",
"ruptures = merged[merged[\"rupture_flag\"]].copy()\n",
"\n",
"# --- 2. Sign of the gap ---\n",
"# Computed here rather than read from merged[\"gap_type\"], so the cell does not depend on another\n",
"# cell having run first. A flagged rupture has |gap| > TOL, so the gap is never zero or missing.\n",
"ruptures[\"gap_type\"] = np.where(ruptures[\"gap\"] > 0, \"positive\", \"negative\")\n",
"\n",
"# --- 3. Compute gap counts ---\n",
"# reindex guarantees both periods and both signs exist (filled with 0), in a fixed order.\n",
"gap_counts = (\n",
"    ruptures.groupby([\"period2\", \"gap_type\"])\n",
"    .size()\n",
"    .unstack(fill_value=0)\n",
"    .reindex(index=period_order, columns=[\"positive\", \"negative\"], fill_value=0)\n",
")\n",
"\n",
"# --- 4. Extract values (order: positive, negative) ---\n",
"before_vals = gap_counts.loc[\"Before Sep 2021\"].values\n",
"after_vals = gap_counts.loc[\"After Sep 2021\"].values\n",
"\n",
"# --- 5. MAKE TWO DONUT CHARTS ---\n",
"# Labels and colours follow the column order of gap_counts (positive first). The previous\n",
"# version listed \"Negative gaps\" first, which swapped the two slices' captions.\n",
"pie_labels = [\"Positive gaps\", \"Negative gaps\"]\n",
"pie_colors = [\"#3498DB\", \"#E67E22\"]   # positive = blue, negative = orange\n",
"\n",
"fig = make_subplots(\n",
"    rows=1, cols=2,\n",
"    specs=[[{\"type\": \"pie\"}, {\"type\": \"pie\"}]],\n",
"    subplot_titles=(\"Before Sep 2021\", \"After Sep 2021\")\n",
")\n",
"\n",
"for subplot_col, period_vals in enumerate((before_vals, after_vals), start=1):\n",
"    fig.add_trace(\n",
"        go.Pie(\n",
"            labels=pie_labels,\n",
"            values=period_vals,\n",
"            marker_colors=pie_colors,\n",
"            hole=0.45,\n",
"            textinfo=\"label+percent\"\n",
"        ),\n",
"        row=1, col=subplot_col\n",
"    )\n",
"\n",
"# Plotly breaks title lines on <br>, not on a newline character.\n",
"fig.update_layout(\n",
"    title=\"Nature des ruptures (positive / negative)<br>Avant vs Après Septembre 2021\",\n",
"    showlegend=True\n",
")\n",
"\n",
"fig.show()\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "9583f188-8601-425b-908f-61c2ee1f8da2",
"metadata": {},
"outputs": [],
"source": [
"import plotly.graph_objects as go\n",
"# make_subplots has no visible import in the notebook; imported here so the cell runs on a fresh kernel.\n",
"from plotly.subplots import make_subplots\n",
"\n",
"# --- 1. Compute gap counts by period ---\n",
"# Uses the ruptures table (columns period2 and gap_type) built by the preceding cells.\n",
"# reindex guarantees both periods and both signs exist (filled with 0), in a fixed order.\n",
"gap_counts = (\n",
"    ruptures.groupby([\"period2\", \"gap_type\"])\n",
"    .size()\n",
"    .unstack(fill_value=0)\n",
"    .reindex(\n",
"        index=[\"Before Sep 2021\", \"After Sep 2021\"],\n",
"        columns=[\"positive\", \"negative\"],\n",
"        fill_value=0,\n",
"    )\n",
")\n",
"\n",
"# --- 2. Extract values (order: positive, negative) ---\n",
"before_vals = gap_counts.loc[\"Before Sep 2021\"].values\n",
"after_vals = gap_counts.loc[\"After Sep 2021\"].values\n",
"\n",
"# --- 3. Plot : TWO PIE CHARTS side by side ---\n",
"# Labels and colours follow the column order of gap_counts (positive first). The previous\n",
"# version listed \"Negative gaps\" first, which swapped the two slices' captions.\n",
"pie_labels = [\"Positive gaps\", \"Negative gaps\"]\n",
"pie_colors = [\"#3498DB\", \"#E67E22\"]   # positive = blue, negative = orange\n",
"\n",
"# The split is at 1 September 2021, so the subplot titles name that cutoff.\n",
"fig = make_subplots(\n",
"    rows=1, cols=2,\n",
"    specs=[[{\"type\": \"pie\"}, {\"type\": \"pie\"}]],\n",
"    subplot_titles=(\"Before Sep 2021\", \"After Sep 2021\")\n",
")\n",
"\n",
"for subplot_col, period_vals in enumerate((before_vals, after_vals), start=1):\n",
"    fig.add_trace(\n",
"        go.Pie(\n",
"            labels=pie_labels,\n",
"            values=period_vals,\n",
"            marker_colors=pie_colors,\n",
"            hole=0.35\n",
"        ),\n",
"        row=1, col=subplot_col\n",
"    )\n",
"\n",
"# Plotly breaks title lines on <br>, not on a newline character.\n",
"fig.update_layout(\n",
"    title=\"Répartition des ruptures (positive / negative)<br>Avant vs Après Septembre 2021\"\n",
")\n",
"\n",
"fig.show()\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "f4e29536-eeed-4b91-a59c-b373cf14a5fc",
"metadata": {},
"outputs": [],
"source": [
"import plotly.graph_objects as go\n",
"\n",
"# --- 1. Tag every observation as before / after 1 September 2021 ---\n",
"sept_2021 = pd.Timestamp(\"2021-09-01\")\n",
"is_before_sept_2021 = merged[\"Centralisation Date\"] < sept_2021\n",
"merged[\"period2\"] = np.where(is_before_sept_2021, \"Before Sep 2021\", \"After Sep 2021\")\n",
"\n",
"# --- 2. Keep only the flagged ruptures ---\n",
"ruptures = merged.loc[merged[\"rupture_flag\"] == True].copy()\n",
"\n",
"# --- 3. Ruptures per period, in a fixed order; a period without ruptures counts 0 ---\n",
"period_order = [\"Before Sep 2021\", \"After Sep 2021\"]\n",
"rupture_counts = ruptures[\"period2\"].value_counts().reindex(period_order).fillna(0)\n",
"\n",
"# --- 4. Donut chart ---\n",
"period_pie = go.Pie(\n",
"    labels=rupture_counts.index,\n",
"    values=rupture_counts.values,\n",
"    hole=0.45,\n",
"    marker_colors=[\"#2ECC71\", \"#E74C3C\"],\n",
"    textinfo=\"percent+value\",\n",
")\n",
"fig = go.Figure(data=[period_pie])\n",
"fig.update_layout(title=\"Répartition des ruptures\")\n",
"fig.show()\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "553f91fa-5017-4685-ab31-afe2aa247e13",
"metadata": {},
"outputs": [],
"source": [
"import pandas as pd\n",
"import numpy as np\n",
"\n",
"# 1. Observations dated on or after 1 September 2021\n",
"cutoff = pd.Timestamp(\"2021-09-01\")\n",
"post = merged.loc[merged[\"Centralisation Date\"].ge(cutoff)].copy()\n",
"\n",
"# 2. Keep only the flagged ruptures\n",
"post_rupt = post.loc[post[\"rupture_flag\"] == True].copy()\n",
"\n",
"# 3. Absolute gap, and gap relative to the observed stock (NaN when the stock is 0)\n",
"post_rupt[\"gap_abs\"] = post_rupt[\"gap\"].abs()\n",
"nonzero_stock = post_rupt[\"Quantity - AUM\"].replace(0, np.nan)\n",
"post_rupt[\"gap_rel\"] = post_rupt[\"gap_abs\"].div(nonzero_stock)\n",
"\n",
"# 4. Global percentiles of the absolute gap\n",
"p90, p95, p99 = (post_rupt[\"gap_abs\"].quantile(q) for q in (0.90, 0.95, 0.99))\n",
"\n",
"# 5. Classification automatique\n",
"def classify_gap(gap, gap_rel, acct, p95_threshold=None, p99_threshold=None):\n",
"    \"\"\"Classify a single rupture gap as \"reset\", \"spike\" or unclassified (None).\n",
"\n",
"    Parameters\n",
"    ----------\n",
"    gap : number\n",
"        Signed gap; its absolute value is compared to the thresholds.\n",
"    gap_rel : number\n",
"        Absolute gap relative to the stock; NaN never triggers the relative rule.\n",
"    acct : any\n",
"        Account identifier. Currently unused, kept for signature compatibility.\n",
"    p95_threshold, p99_threshold : number, optional\n",
"        Absolute-gap thresholds. Default to the notebook-level `p95` and `p99`.\n",
"\n",
"    Returns\n",
"    -------\n",
"    \"reset\" when |gap| >= p99 threshold or gap_rel >= 0.90, \"spike\" when |gap| >= p95\n",
"    threshold, otherwise None (the shift / micro distinction is not made here).\n",
"    \"\"\"\n",
"    if p95_threshold is None:\n",
"        p95_threshold = p95\n",
"    if p99_threshold is None:\n",
"        p99_threshold = p99\n",
"\n",
"    # The previous body read an undefined name `gap_abs`, so any call raised NameError.\n",
"    gap_abs = abs(gap)\n",
"\n",
"    # RESET: huge (technical) shock\n",
"    if gap_abs >= p99_threshold or gap_rel >= 0.90:\n",
"        return \"reset\"\n",
"\n",
"    # SPIKE: very large gap\n",
"    if gap_abs >= p95_threshold:\n",
"        return \"spike\"\n",
"\n",
"    # SHIFT (permanent offset) would need the account's mean gap, which is not available here.\n",
"    return None\n",
"\n",
"def _gap_type_share(frame, key):\n",
"    \"\"\"Percentage of each gap_type2 value inside every group of `key`, as a flat table.\"\"\"\n",
"    shares = frame.groupby(key)[\"gap_type2\"].value_counts(normalize=True)\n",
"    return shares.rename(\"ratio\").mul(100).reset_index()\n",
"\n",
"\n",
"# Directional offset: mean signed gap of each account, attached to every one of its ruptures\n",
"shift_info = post_rupt.groupby(\"Registrar Account - ID\")[\"gap\"].mean().rename(\"avg_gap\")\n",
"post_rupt = post_rupt.merge(shift_info, on=\"Registrar Account - ID\", how=\"left\")\n",
"\n",
"# Rules are tried in order and the first match wins; rows matching none are \"micro\"\n",
"median_gap_abs = post_rupt[\"gap_abs\"].median()\n",
"gap_type_rules = [\n",
"    post_rupt[\"gap_abs\"] >= p99,\n",
"    post_rupt[\"gap_abs\"] >= p95,\n",
"    post_rupt[\"avg_gap\"].abs() > median_gap_abs,\n",
"]\n",
"post_rupt[\"gap_type2\"] = np.select(gap_type_rules, [\"reset\", \"spike\", \"shift\"], default=\"micro\")\n",
"\n",
"# 6. Global statistics (in %)\n",
"stats = post_rupt[\"gap_type2\"].value_counts(normalize=True).round(3).mul(100)\n",
"print(\"\\n=== DISTRIBUTION DES TYPES DE GAPS POST-2021 ===\")\n",
"print(stats)\n",
"\n",
"# 7. Share of each gap type per client\n",
"client_stats = _gap_type_share(post_rupt, \"Registrar Account - ID\")\n",
"\n",
"# 8. Share of each gap type per ISIN\n",
"isin_stats = _gap_type_share(post_rupt, \"Product - Isin\")\n",
"\n",
"top_reset_isins = isin_stats[isin_stats[\"gap_type2\"]==\"reset\"].sort_values(\"ratio\", ascending=False)\n",
"print(\"\\n=== TOP ISIN PAR RESET ===\")\n",
"print(top_reset_isins.head(10))\n",
"\n",
"top_reset_clients = client_stats[client_stats[\"gap_type2\"]==\"reset\"].sort_values(\"ratio\", ascending=False)\n",
"print(\"\\n=== TOP CLIENTS PAR RESET ===\")\n",
"print(top_reset_clients.head(10))\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "f4396f2c-a8b6-4ea9-8292-093f900bf260",
"metadata": {},
"outputs": [],
"source": [
"import plotly.graph_objects as go\n",
"\n",
"# --- Shares (in %) written by hand; they are not recomputed from the data in this cell ---\n",
"gap_type_shares = {\n",
"    \"Micro-ruptures\": 50.4,\n",
"    \"Décalage\": 44.6,\n",
"    \"Anomalies ponctuelles\": 4.0,\n",
"    \"Remise à zéro\": 1.0,\n",
"}\n",
"labels = list(gap_type_shares)\n",
"values = list(gap_type_shares.values())\n",
"\n",
"# --- Donut chart (hole=0.35) showing percentages only ---\n",
"typology_pie = go.Pie(\n",
"    labels=labels,\n",
"    values=values,\n",
"    hole=0.35,\n",
"    textinfo='percent',\n",
"    marker={\"colors\": [\"#3498DB\", \"#E67E22\", \"#9B59B6\", \"#E74C3C\"]},\n",
")\n",
"fig = go.Figure(data=[typology_pie])\n",
"fig.update_layout(\n",
"    title=\"Typologie des ruptures depuis Septembre 2021\",\n",
"    legend_title=\"Type de gap\",\n",
")\n",
"fig.show()\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "3df1f839-44d4-4894-bdfa-6851971d1983",
"metadata": {},
"outputs": [],
"source": [
"merged[\"year\"] = merged[\"Centralisation Date\"].dt.year\n",
"\n",
"# Per year: number of rows with a computed gap, number of flagged ruptures, and their ratio.\n",
"yearly_stats = (\n",
"    merged.groupby(\"year\")\n",
"    .agg(\n",
"        total_obs=(\"gap\", \"count\"),\n",
"        ruptures=(\"rupture_flag\", \"sum\"),\n",
"    )\n",
"    .reset_index()\n",
"    .assign(rupture_rate=lambda d: d[\"ruptures\"] / d[\"total_obs\"])\n",
")\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "f312684d-0815-439c-a632-cadd1cbb779c",
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.13.8"
}
},
"nbformat": 4,
"nbformat_minor": 5
}