1392 lines
67 KiB
Plaintext
1392 lines
67 KiB
Plaintext
{
|
||
"cells": [
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 1,
|
||
"id": "2e8cf88b-cecf-409f-9c2d-c3762b233f05",
|
||
"metadata": {},
|
||
"outputs": [
|
||
{
|
||
"name": "stdout",
|
||
"output_type": "stream",
|
||
"text": [
|
||
"Requirement already satisfied: openpyxl in /opt/python/lib/python3.13/site-packages (3.1.5)\n",
|
||
"Requirement already satisfied: et-xmlfile in /opt/python/lib/python3.13/site-packages (from openpyxl) (2.0.0)\n",
|
||
"\n",
|
||
"\u001b[1m[\u001b[0m\u001b[34;49mnotice\u001b[0m\u001b[1;39;49m]\u001b[0m\u001b[39;49m A new release of pip is available: \u001b[0m\u001b[31;49m25.2\u001b[0m\u001b[39;49m -> \u001b[0m\u001b[32;49m25.3\u001b[0m\n",
|
||
"\u001b[1m[\u001b[0m\u001b[34;49mnotice\u001b[0m\u001b[1;39;49m]\u001b[0m\u001b[39;49m To update, run: \u001b[0m\u001b[32;49mpip install --upgrade pip\u001b[0m\n"
|
||
]
|
||
}
|
||
],
|
||
"source": [
|
||
"!pip install openpyxl"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 2,
|
||
"id": "126c8a80-d9ad-4816-84f0-0c3d580f62c8",
|
||
"metadata": {},
|
||
"outputs": [],
|
||
"source": [
|
||
"import pandas as pd\n",
|
||
"import numpy as np\n",
|
||
"import matplotlib.pyplot as plt"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 3,
|
||
"id": "ff2261fb-9516-4410-b42d-3acc8dc1a460",
|
||
"metadata": {},
|
||
"outputs": [],
|
||
"source": [
|
||
"import os\n",
|
||
"import s3fs\n",
|
||
"os.environ[\"AWS_ACCESS_KEY_ID\"] = 'N1DBJCHI7YTK9AVMG6XT'\n",
|
||
"os.environ[\"AWS_SECRET_ACCESS_KEY\"] = 'SRCPMh8a1eQxX6Z09GeDxZoD55MBpnkJzyBctLII'\n",
|
||
"os.environ[\"AWS_SESSION_TOKEN\"] = 'eyJhbGciOiJIUzUxMiIsInR5cCI6IkpXVCJ9.eyJhY2Nlc3NLZXkiOiJOMURCSkNISTdZVEs5QVZNRzZYVCIsImFjciI6IjAiLCJhbGxvd2VkLW9yaWdpbnMiOlsiKiJdLCJhdWQiOlsibWluaW8iLCJhY2NvdW50Il0sImF1dGhfdGltZSI6MTc2MzEzMTgzNiwiYXpwIjoib255eGlhLW1pbmlvIiwiZW1haWwiOiJzYXJhaC50aG91bXlyZUBlbnNhZS5mciIsImVtYWlsX3ZlcmlmaWVkIjp0cnVlLCJleHAiOjE3NjQzNDE0MzksImZhbWlseV9uYW1lIjoiVEhPVU1ZUkUiLCJnaXZlbl9uYW1lIjoiU2FyYWgiLCJncm91cHMiOlsiYmRjLWRhdGEiLCJiZGMtY2FybWlnbmFjLWczIl0sImlhdCI6MTc2MzEzMTgzOCwiaXNzIjoiaHR0cHM6Ly9hdXRoLmdyb3VwZS1nZW5lcy5mci9yZWFsbXMvZ2VuZXMiLCJqdGkiOiJkY2I2MWJiZi1lZjU4LTRhMTItOGYyZS1jYTI0ZmUyNTA2YzEiLCJuYW1lIjoiU2FyYWggVEhPVU1ZUkUiLCJwb2xpY3kiOiJzdHNvbmx5IiwicHJlZmVycmVkX3VzZXJuYW1lIjoic3Rob3VteXJlLWVuc2FlIiwicmVhbG1fYWNjZXNzIjp7InJvbGVzIjpbIm9mZmxpbmVfYWNjZXNzIiwiZGVmYXVsdC1yb2xlcy1nZW5lcyIsInVtYV9hdXRob3JpemF0aW9uIl19LCJyZXNvdXJjZV9hY2Nlc3MiOnsiYWNjb3VudCI6eyJyb2xlcyI6WyJtYW5hZ2UtYWNjb3VudCIsIm1hbmFnZS1hY2NvdW50LWxpbmtzIiwidmlldy1wcm9maWxlIl19fSwic2NvcGUiOiJvcGVuaWQgcHJvZmlsZSBlbWFpbCIsInNpZCI6ImQxMDI0NGVlLWE3ZDMtNDA5MC04ZDA3LWNlOWY3YjM5MDRkNCIsInN1YiI6ImVhYWVkN2QyLWM4MjYtNGIxNC05MzczLTYwYjNhODhlMWFiNiIsInR5cCI6IkJlYXJlciJ9.sLXOE8w930_dXU0yNWroaDvaTvcUUCONMcbgbKeMEduQebXQjOS7gEQxo-I7Q2oqLFb_dhg1zBlwx5VpNjyTMA'\n",
|
||
"os.environ[\"AWS_DEFAULT_REGION\"] = 'us-east-1'\n",
|
||
"fs = s3fs.S3FileSystem(\n",
|
||
" client_kwargs={'endpoint_url': 'https://'+'minio-simple.lab.groupe-genes.fr'},\n",
|
||
" key = os.environ[\"AWS_ACCESS_KEY_ID\"], \n",
|
||
" secret = os.environ[\"AWS_SECRET_ACCESS_KEY\"], \n",
|
||
" token = os.environ[\"AWS_SESSION_TOKEN\"])"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "markdown",
|
||
"id": "3d36f3f0-bd40-4a83-96d1-b46d75f5a4c5",
|
||
"metadata": {},
|
||
"source": [
|
||
"# data exploration"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "markdown",
|
||
"id": "eaf5c5a0-eb1c-4242-b893-7600e6def109",
|
||
"metadata": {},
|
||
"source": [
|
||
"Fonctions utiles"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 4,
|
||
"id": "60e2035c-c2f0-4c51-97df-102e67ba96ee",
|
||
"metadata": {},
|
||
"outputs": [],
|
||
"source": [
|
||
"def plot_account(account_id, isin=None):\n",
|
||
" \"\"\"\n",
|
||
" Plots the stock (Quantity - AUM) evolution for a given Registrar Account.\n",
|
||
" Optionally, only for one ISIN.\n",
|
||
" \"\"\"\n",
|
||
"\n",
|
||
" df = merged.copy()\n",
|
||
"\n",
|
||
" # Filter by account\n",
|
||
" df = df[df[\"Registrar Account - ID\"] == account_id]\n",
|
||
"\n",
|
||
" if isin is not None:\n",
|
||
" df = df[df[\"Product - Isin\"] == isin]\n",
|
||
"\n",
|
||
" if df.empty:\n",
|
||
" print(f\"No data found for account {account_id}\")\n",
|
||
" return\n",
|
||
"\n",
|
||
" df_plot = df.groupby(\"Centralisation Date\")[\"Quantity - AUM\"].sum().reset_index()\n",
|
||
"\n",
|
||
" df_plot = df_plot.sort_values(\"Centralisation Date\")\n",
|
||
"\n",
|
||
" # Plot\n",
|
||
" plt.figure(figsize=(12, 4))\n",
|
||
" plt.plot(df_plot[\"Centralisation Date\"], df_plot[\"Quantity - AUM\"], marker='o')\n",
|
||
" plt.title(f\"Stock Evolution for Account {account_id}\", fontsize=14)\n",
|
||
" plt.xlabel(\"Date\")\n",
|
||
" plt.ylabel(\"Total AUM\")\n",
|
||
" plt.grid(True)\n",
|
||
" plt.show()\n"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 5,
|
||
"id": "37e008b1-32d4-44be-9d23-1b90a5a26f89",
|
||
"metadata": {},
|
||
"outputs": [],
|
||
"source": [
|
||
"# 2. BASIC INSPECTION\n",
|
||
"\n",
|
||
"def quick_info(df, name):\n",
|
||
" print(\"\\n\" + \"=\"*80)\n",
|
||
" print(f\"DATASET : {name}\")\n",
|
||
" print(\"=\"*80)\n",
|
||
" print(\"\\nShape :\", df.shape)\n",
|
||
" print(\"\\nColumns :\", df.columns.tolist())\n",
|
||
" print(\"\\nDtypes :\\n\", df.dtypes)\n",
|
||
" print(\"\\nMissing values (%) :\\n\", df.isna().mean().sort_values(ascending=False)*100)\n",
|
||
" print(\"\\nSample rows:\\n\", df.head(5))\n",
|
||
" print(\"\\nUnique values per column:\\n\", df.nunique().sort_values(ascending=False))"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 6,
|
||
"id": "e104a416-4cfd-43b9-b9ec-6af1fce700da",
|
||
"metadata": {},
|
||
"outputs": [],
|
||
"source": [
|
||
"import os\n",
|
||
"import s3fs\n",
|
||
"os.environ[\"AWS_ACCESS_KEY_ID\"] = 'R9J6QLRZDYWLSWKBJ7IA'\n",
|
||
"os.environ[\"AWS_SECRET_ACCESS_KEY\"] = 'sFi4J78xigTJEXdY02bQL2i5KNwyYL7VZpMe0XJs'\n",
|
||
"os.environ[\"AWS_SESSION_TOKEN\"] = 'eyJhbGciOiJIUzUxMiIsInR5cCI6IkpXVCJ9.eyJhY2Nlc3NLZXkiOiJSOUo2UUxSWkRZV0xTV0tCSjdJQSIsImFjciI6IjAiLCJhbGxvd2VkLW9yaWdpbnMiOlsiKiJdLCJhdWQiOlsibWluaW8iLCJhY2NvdW50Il0sImF1dGhfdGltZSI6MTc2NDM0Mzc0MSwiYXpwIjoib255eGlhLW1pbmlvIiwiZW1haWwiOiJzYXJhaC50aG91bXlyZUBlbnNhZS5mciIsImVtYWlsX3ZlcmlmaWVkIjp0cnVlLCJleHAiOjE3NjU1NTM4NjYsImZhbWlseV9uYW1lIjoiVEhPVU1ZUkUiLCJnaXZlbl9uYW1lIjoiU2FyYWgiLCJncm91cHMiOlsiYmRjLWRhdGEiLCJiZGMtY2FybWlnbmFjLWczIl0sImlhdCI6MTc2NDM0NDI2NiwiaXNzIjoiaHR0cHM6Ly9hdXRoLmdyb3VwZS1nZW5lcy5mci9yZWFsbXMvZ2VuZXMiLCJqdGkiOiIxZWEzOTgwMi0zZGNiLTRlMzEtYTkwMS1jNGE4M2Q5ZjQyYzQiLCJuYW1lIjoiU2FyYWggVEhPVU1ZUkUiLCJwb2xpY3kiOiJzdHNvbmx5IiwicHJlZmVycmVkX3VzZXJuYW1lIjoic3Rob3VteXJlLWVuc2FlIiwicmVhbG1fYWNjZXNzIjp7InJvbGVzIjpbIm9mZmxpbmVfYWNjZXNzIiwiZGVmYXVsdC1yb2xlcy1nZW5lcyIsInVtYV9hdXRob3JpemF0aW9uIl19LCJyZXNvdXJjZV9hY2Nlc3MiOnsiYWNjb3VudCI6eyJyb2xlcyI6WyJtYW5hZ2UtYWNjb3VudCIsIm1hbmFnZS1hY2NvdW50LWxpbmtzIiwidmlldy1wcm9maWxlIl19fSwic2NvcGUiOiJvcGVuaWQgcHJvZmlsZSBlbWFpbCIsInNpZCI6IjZlMDBhN2QxLTMxYWQtNGE3Ny04ZmE3LTBlYTc1Y2VhZTQwMCIsInN1YiI6ImVhYWVkN2QyLWM4MjYtNGIxNC05MzczLTYwYjNhODhlMWFiNiIsInR5cCI6IkJlYXJlciJ9.gUxGEq4iWA5eJVWYj-JyDZGYzTnWmQH92iYU-2_5P4n3erGFFZvz8wJe5keTAhcKKTycZKiWltnZpCNsRQ0vOg'\n",
|
||
"os.environ[\"AWS_DEFAULT_REGION\"] = 'us-east-1'\n",
|
||
"fs = s3fs.S3FileSystem(\n",
|
||
" client_kwargs={'endpoint_url': 'https://'+'minio-simple.lab.groupe-genes.fr'},\n",
|
||
" key = os.environ[\"AWS_ACCESS_KEY_ID\"], \n",
|
||
" secret = os.environ[\"AWS_SECRET_ACCESS_KEY\"], \n",
|
||
" token = os.environ[\"AWS_SESSION_TOKEN\"])"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 8,
|
||
"id": "e67a99ea-ddf4-4627-8f48-ec183c671acb",
|
||
"metadata": {},
|
||
"outputs": [
|
||
{
|
||
"ename": "PermissionError",
|
||
"evalue": "Forbidden",
|
||
"output_type": "error",
|
||
"traceback": [
|
||
"\u001b[31m---------------------------------------------------------------------------\u001b[39m",
|
||
"\u001b[31mClientError\u001b[39m Traceback (most recent call last)",
|
||
"\u001b[36mFile \u001b[39m\u001b[32m/opt/python/lib/python3.13/site-packages/s3fs/core.py:114\u001b[39m, in \u001b[36m_error_wrapper\u001b[39m\u001b[34m(func, args, kwargs, retries)\u001b[39m\n\u001b[32m 113\u001b[39m \u001b[38;5;28;01mtry\u001b[39;00m:\n\u001b[32m--> \u001b[39m\u001b[32m114\u001b[39m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28;01mawait\u001b[39;00m func(*args, **kwargs)\n\u001b[32m 115\u001b[39m \u001b[38;5;28;01mexcept\u001b[39;00m S3_RETRYABLE_ERRORS \u001b[38;5;28;01mas\u001b[39;00m e:\n",
|
||
"\u001b[36mFile \u001b[39m\u001b[32m/opt/python/lib/python3.13/site-packages/aiobotocore/context.py:36\u001b[39m, in \u001b[36mwith_current_context.<locals>.decorator.<locals>.wrapper\u001b[39m\u001b[34m(*args, **kwargs)\u001b[39m\n\u001b[32m 35\u001b[39m \u001b[38;5;28;01mawait\u001b[39;00m resolve_awaitable(hook())\n\u001b[32m---> \u001b[39m\u001b[32m36\u001b[39m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28;01mawait\u001b[39;00m func(*args, **kwargs)\n",
|
||
"\u001b[36mFile \u001b[39m\u001b[32m/opt/python/lib/python3.13/site-packages/aiobotocore/client.py:424\u001b[39m, in \u001b[36mAioBaseClient._make_api_call\u001b[39m\u001b[34m(self, operation_name, api_params)\u001b[39m\n\u001b[32m 423\u001b[39m error_class = \u001b[38;5;28mself\u001b[39m.exceptions.from_code(error_code)\n\u001b[32m--> \u001b[39m\u001b[32m424\u001b[39m \u001b[38;5;28;01mraise\u001b[39;00m error_class(parsed_response, operation_name)\n\u001b[32m 425\u001b[39m \u001b[38;5;28;01melse\u001b[39;00m:\n",
|
||
"\u001b[31mClientError\u001b[39m: An error occurred (403) when calling the HeadObject operation: Forbidden",
|
||
"\nThe above exception was the direct cause of the following exception:\n",
|
||
"\u001b[31mPermissionError\u001b[39m Traceback (most recent call last)",
|
||
"\u001b[36mCell\u001b[39m\u001b[36m \u001b[39m\u001b[32mIn[8]\u001b[39m\u001b[32m, line 9\u001b[39m\n\u001b[32m 1\u001b[39m \u001b[33;03m\"\"\"\u001b[39;00m\n\u001b[32m 2\u001b[39m \u001b[33;03mwith fs.open(\u001b[39;00m\n\u001b[32m 3\u001b[39m \u001b[33;03m \"projet-bdc-data/carmignac/Flows ENSAE V2 -20251105.csv\",\u001b[39;00m\n\u001b[32m (...)\u001b[39m\u001b[32m 6\u001b[39m \u001b[33;03m flows = pd.read_csv(f, sep=\";\")\u001b[39;00m\n\u001b[32m 7\u001b[39m \u001b[33;03m\"\"\"\u001b[39;00m\n\u001b[32m----> \u001b[39m\u001b[32m9\u001b[39m \u001b[38;5;28;01mwith\u001b[39;00m \u001b[43mfs\u001b[49m\u001b[43m.\u001b[49m\u001b[43mopen\u001b[49m\u001b[43m(\u001b[49m\u001b[33;43m'\u001b[39;49m\u001b[33;43mprojet-bdc-data/carmignac/Monthly AUM and NAV since 2010.xlsx\u001b[39;49m\u001b[33;43m'\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[33;43m'\u001b[39;49m\u001b[33;43mrb\u001b[39;49m\u001b[33;43m'\u001b[39;49m\u001b[43m)\u001b[49m \u001b[38;5;28;01mas\u001b[39;00m f:\n\u001b[32m 10\u001b[39m nav_raw = pd.read_excel(f, header=\u001b[38;5;28;01mNone\u001b[39;00m, engine=\u001b[33m\"\u001b[39m\u001b[33mopenpyxl\u001b[39m\u001b[33m\"\u001b[39m)\n\u001b[32m 11\u001b[39m nav = nav_raw[\u001b[32m0\u001b[39m].str.split(\u001b[33m\"\u001b[39m\u001b[33m,\u001b[39m\u001b[33m\"\u001b[39m, expand=\u001b[38;5;28;01mTrue\u001b[39;00m)\n",
|
||
"\u001b[36mFile \u001b[39m\u001b[32m/opt/python/lib/python3.13/site-packages/fsspec/spec.py:1338\u001b[39m, in \u001b[36mAbstractFileSystem.open\u001b[39m\u001b[34m(self, path, mode, block_size, cache_options, compression, **kwargs)\u001b[39m\n\u001b[32m 1336\u001b[39m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[32m 1337\u001b[39m ac = kwargs.pop(\u001b[33m\"\u001b[39m\u001b[33mautocommit\u001b[39m\u001b[33m\"\u001b[39m, \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;28mself\u001b[39m._intrans)\n\u001b[32m-> \u001b[39m\u001b[32m1338\u001b[39m f = \u001b[38;5;28;43mself\u001b[39;49m\u001b[43m.\u001b[49m\u001b[43m_open\u001b[49m\u001b[43m(\u001b[49m\n\u001b[32m 1339\u001b[39m \u001b[43m \u001b[49m\u001b[43mpath\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 1340\u001b[39m \u001b[43m \u001b[49m\u001b[43mmode\u001b[49m\u001b[43m=\u001b[49m\u001b[43mmode\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 1341\u001b[39m \u001b[43m \u001b[49m\u001b[43mblock_size\u001b[49m\u001b[43m=\u001b[49m\u001b[43mblock_size\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 1342\u001b[39m \u001b[43m \u001b[49m\u001b[43mautocommit\u001b[49m\u001b[43m=\u001b[49m\u001b[43mac\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 1343\u001b[39m \u001b[43m \u001b[49m\u001b[43mcache_options\u001b[49m\u001b[43m=\u001b[49m\u001b[43mcache_options\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 1344\u001b[39m \u001b[43m \u001b[49m\u001b[43m*\u001b[49m\u001b[43m*\u001b[49m\u001b[43mkwargs\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 1345\u001b[39m \u001b[43m \u001b[49m\u001b[43m)\u001b[49m\n\u001b[32m 1346\u001b[39m \u001b[38;5;28;01mif\u001b[39;00m compression \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m:\n\u001b[32m 1347\u001b[39m \u001b[38;5;28;01mfrom\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[34;01mfsspec\u001b[39;00m\u001b[34;01m.\u001b[39;00m\u001b[34;01mcompression\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[38;5;28;01mimport\u001b[39;00m compr\n",
|
||
"\u001b[36mFile \u001b[39m\u001b[32m/opt/python/lib/python3.13/site-packages/s3fs/core.py:720\u001b[39m, in \u001b[36mS3FileSystem._open\u001b[39m\u001b[34m(self, path, mode, block_size, acl, version_id, fill_cache, cache_type, autocommit, size, requester_pays, cache_options, **kwargs)\u001b[39m\n\u001b[32m 717\u001b[39m \u001b[38;5;28;01mif\u001b[39;00m cache_type \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m:\n\u001b[32m 718\u001b[39m cache_type = \u001b[38;5;28mself\u001b[39m.default_cache_type\n\u001b[32m--> \u001b[39m\u001b[32m720\u001b[39m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43mS3File\u001b[49m\u001b[43m(\u001b[49m\n\u001b[32m 721\u001b[39m \u001b[43m \u001b[49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[43m,\u001b[49m\n\u001b[32m 722\u001b[39m \u001b[43m \u001b[49m\u001b[43mpath\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 723\u001b[39m \u001b[43m \u001b[49m\u001b[43mmode\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 724\u001b[39m \u001b[43m \u001b[49m\u001b[43mblock_size\u001b[49m\u001b[43m=\u001b[49m\u001b[43mblock_size\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 725\u001b[39m \u001b[43m \u001b[49m\u001b[43macl\u001b[49m\u001b[43m=\u001b[49m\u001b[43macl\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 726\u001b[39m \u001b[43m \u001b[49m\u001b[43mversion_id\u001b[49m\u001b[43m=\u001b[49m\u001b[43mversion_id\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 727\u001b[39m \u001b[43m \u001b[49m\u001b[43mfill_cache\u001b[49m\u001b[43m=\u001b[49m\u001b[43mfill_cache\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 728\u001b[39m \u001b[43m \u001b[49m\u001b[43ms3_additional_kwargs\u001b[49m\u001b[43m=\u001b[49m\u001b[43mkw\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 729\u001b[39m \u001b[43m \u001b[49m\u001b[43mcache_type\u001b[49m\u001b[43m=\u001b[49m\u001b[43mcache_type\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 730\u001b[39m \u001b[43m 
\u001b[49m\u001b[43mautocommit\u001b[49m\u001b[43m=\u001b[49m\u001b[43mautocommit\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 731\u001b[39m \u001b[43m \u001b[49m\u001b[43mrequester_pays\u001b[49m\u001b[43m=\u001b[49m\u001b[43mrequester_pays\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 732\u001b[39m \u001b[43m \u001b[49m\u001b[43mcache_options\u001b[49m\u001b[43m=\u001b[49m\u001b[43mcache_options\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 733\u001b[39m \u001b[43m \u001b[49m\u001b[43msize\u001b[49m\u001b[43m=\u001b[49m\u001b[43msize\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 734\u001b[39m \u001b[43m\u001b[49m\u001b[43m)\u001b[49m\n",
|
||
"\u001b[36mFile \u001b[39m\u001b[32m/opt/python/lib/python3.13/site-packages/s3fs/core.py:2257\u001b[39m, in \u001b[36mS3File.__init__\u001b[39m\u001b[34m(self, s3, path, mode, block_size, acl, version_id, fill_cache, s3_additional_kwargs, autocommit, cache_type, requester_pays, cache_options, size)\u001b[39m\n\u001b[32m 2255\u001b[39m \u001b[38;5;28mself\u001b[39m.details = s3.info(path)\n\u001b[32m 2256\u001b[39m \u001b[38;5;28mself\u001b[39m.version_id = \u001b[38;5;28mself\u001b[39m.details.get(\u001b[33m\"\u001b[39m\u001b[33mVersionId\u001b[39m\u001b[33m\"\u001b[39m)\n\u001b[32m-> \u001b[39m\u001b[32m2257\u001b[39m \u001b[38;5;28;43msuper\u001b[39;49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m\u001b[43m.\u001b[49m\u001b[34;43m__init__\u001b[39;49m\u001b[43m(\u001b[49m\n\u001b[32m 2258\u001b[39m \u001b[43m \u001b[49m\u001b[43ms3\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 2259\u001b[39m \u001b[43m \u001b[49m\u001b[43mpath\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 2260\u001b[39m \u001b[43m \u001b[49m\u001b[43mmode\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 2261\u001b[39m \u001b[43m \u001b[49m\u001b[43mblock_size\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 2262\u001b[39m \u001b[43m \u001b[49m\u001b[43mautocommit\u001b[49m\u001b[43m=\u001b[49m\u001b[43mautocommit\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 2263\u001b[39m \u001b[43m \u001b[49m\u001b[43mcache_type\u001b[49m\u001b[43m=\u001b[49m\u001b[43mcache_type\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 2264\u001b[39m \u001b[43m \u001b[49m\u001b[43mcache_options\u001b[49m\u001b[43m=\u001b[49m\u001b[43mcache_options\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 2265\u001b[39m \u001b[43m \u001b[49m\u001b[43msize\u001b[49m\u001b[43m=\u001b[49m\u001b[43msize\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 2266\u001b[39m \u001b[43m\u001b[49m\u001b[43m)\u001b[49m\n\u001b[32m 2267\u001b[39m \u001b[38;5;28mself\u001b[39m.s3 = \u001b[38;5;28mself\u001b[39m.fs \u001b[38;5;66;03m# compatibility\u001b[39;00m\n\u001b[32m 2269\u001b[39m 
\u001b[38;5;66;03m# when not using autocommit we want to have transactional state to manage\u001b[39;00m\n",
|
||
"\u001b[36mFile \u001b[39m\u001b[32m/opt/python/lib/python3.13/site-packages/fsspec/spec.py:1912\u001b[39m, in \u001b[36mAbstractBufferedFile.__init__\u001b[39m\u001b[34m(self, fs, path, mode, block_size, autocommit, cache_type, cache_options, size, **kwargs)\u001b[39m\n\u001b[32m 1910\u001b[39m \u001b[38;5;28mself\u001b[39m.size = size\n\u001b[32m 1911\u001b[39m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[32m-> \u001b[39m\u001b[32m1912\u001b[39m \u001b[38;5;28mself\u001b[39m.size = \u001b[38;5;28;43mself\u001b[39;49m\u001b[43m.\u001b[49m\u001b[43mdetails\u001b[49m[\u001b[33m\"\u001b[39m\u001b[33msize\u001b[39m\u001b[33m\"\u001b[39m]\n\u001b[32m 1913\u001b[39m \u001b[38;5;28mself\u001b[39m.cache = caches[cache_type](\n\u001b[32m 1914\u001b[39m \u001b[38;5;28mself\u001b[39m.blocksize, \u001b[38;5;28mself\u001b[39m._fetch_range, \u001b[38;5;28mself\u001b[39m.size, **cache_options\n\u001b[32m 1915\u001b[39m )\n\u001b[32m 1916\u001b[39m \u001b[38;5;28;01melse\u001b[39;00m:\n",
|
||
"\u001b[36mFile \u001b[39m\u001b[32m/opt/python/lib/python3.13/site-packages/fsspec/spec.py:1925\u001b[39m, in \u001b[36mAbstractBufferedFile.details\u001b[39m\u001b[34m(self)\u001b[39m\n\u001b[32m 1922\u001b[39m \u001b[38;5;129m@property\u001b[39m\n\u001b[32m 1923\u001b[39m \u001b[38;5;28;01mdef\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[34mdetails\u001b[39m(\u001b[38;5;28mself\u001b[39m):\n\u001b[32m 1924\u001b[39m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28mself\u001b[39m._details \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m:\n\u001b[32m-> \u001b[39m\u001b[32m1925\u001b[39m \u001b[38;5;28mself\u001b[39m._details = \u001b[38;5;28;43mself\u001b[39;49m\u001b[43m.\u001b[49m\u001b[43mfs\u001b[49m\u001b[43m.\u001b[49m\u001b[43minfo\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[43m.\u001b[49m\u001b[43mpath\u001b[49m\u001b[43m)\u001b[49m\n\u001b[32m 1926\u001b[39m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28mself\u001b[39m._details\n",
|
||
"\u001b[36mFile \u001b[39m\u001b[32m/opt/python/lib/python3.13/site-packages/fsspec/asyn.py:118\u001b[39m, in \u001b[36msync_wrapper.<locals>.wrapper\u001b[39m\u001b[34m(*args, **kwargs)\u001b[39m\n\u001b[32m 115\u001b[39m \u001b[38;5;129m@functools\u001b[39m.wraps(func)\n\u001b[32m 116\u001b[39m \u001b[38;5;28;01mdef\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[34mwrapper\u001b[39m(*args, **kwargs):\n\u001b[32m 117\u001b[39m \u001b[38;5;28mself\u001b[39m = obj \u001b[38;5;129;01mor\u001b[39;00m args[\u001b[32m0\u001b[39m]\n\u001b[32m--> \u001b[39m\u001b[32m118\u001b[39m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43msync\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[43m.\u001b[49m\u001b[43mloop\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mfunc\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43m*\u001b[49m\u001b[43margs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43m*\u001b[49m\u001b[43m*\u001b[49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n",
|
||
"\u001b[36mFile \u001b[39m\u001b[32m/opt/python/lib/python3.13/site-packages/fsspec/asyn.py:103\u001b[39m, in \u001b[36msync\u001b[39m\u001b[34m(loop, func, timeout, *args, **kwargs)\u001b[39m\n\u001b[32m 101\u001b[39m \u001b[38;5;28;01mraise\u001b[39;00m FSTimeoutError \u001b[38;5;28;01mfrom\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[34;01mreturn_result\u001b[39;00m\n\u001b[32m 102\u001b[39m \u001b[38;5;28;01melif\u001b[39;00m \u001b[38;5;28misinstance\u001b[39m(return_result, \u001b[38;5;167;01mBaseException\u001b[39;00m):\n\u001b[32m--> \u001b[39m\u001b[32m103\u001b[39m \u001b[38;5;28;01mraise\u001b[39;00m return_result\n\u001b[32m 104\u001b[39m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[32m 105\u001b[39m \u001b[38;5;28;01mreturn\u001b[39;00m return_result\n",
|
||
"\u001b[36mFile \u001b[39m\u001b[32m/opt/python/lib/python3.13/site-packages/fsspec/asyn.py:56\u001b[39m, in \u001b[36m_runner\u001b[39m\u001b[34m(event, coro, result, timeout)\u001b[39m\n\u001b[32m 54\u001b[39m coro = asyncio.wait_for(coro, timeout=timeout)\n\u001b[32m 55\u001b[39m \u001b[38;5;28;01mtry\u001b[39;00m:\n\u001b[32m---> \u001b[39m\u001b[32m56\u001b[39m result[\u001b[32m0\u001b[39m] = \u001b[38;5;28;01mawait\u001b[39;00m coro\n\u001b[32m 57\u001b[39m \u001b[38;5;28;01mexcept\u001b[39;00m \u001b[38;5;167;01mException\u001b[39;00m \u001b[38;5;28;01mas\u001b[39;00m ex:\n\u001b[32m 58\u001b[39m result[\u001b[32m0\u001b[39m] = ex\n",
|
||
"\u001b[36mFile \u001b[39m\u001b[32m/opt/python/lib/python3.13/site-packages/s3fs/core.py:1445\u001b[39m, in \u001b[36mS3FileSystem._info\u001b[39m\u001b[34m(self, path, bucket, key, refresh, version_id)\u001b[39m\n\u001b[32m 1443\u001b[39m \u001b[38;5;28;01mif\u001b[39;00m key:\n\u001b[32m 1444\u001b[39m \u001b[38;5;28;01mtry\u001b[39;00m:\n\u001b[32m-> \u001b[39m\u001b[32m1445\u001b[39m out = \u001b[38;5;28;01mawait\u001b[39;00m \u001b[38;5;28mself\u001b[39m._call_s3(\n\u001b[32m 1446\u001b[39m \u001b[33m\"\u001b[39m\u001b[33mhead_object\u001b[39m\u001b[33m\"\u001b[39m,\n\u001b[32m 1447\u001b[39m \u001b[38;5;28mself\u001b[39m.kwargs,\n\u001b[32m 1448\u001b[39m Bucket=bucket,\n\u001b[32m 1449\u001b[39m Key=key,\n\u001b[32m 1450\u001b[39m **version_id_kw(version_id),\n\u001b[32m 1451\u001b[39m **\u001b[38;5;28mself\u001b[39m.req_kw,\n\u001b[32m 1452\u001b[39m )\n\u001b[32m 1453\u001b[39m \u001b[38;5;28;01mreturn\u001b[39;00m {\n\u001b[32m 1454\u001b[39m \u001b[33m\"\u001b[39m\u001b[33mETag\u001b[39m\u001b[33m\"\u001b[39m: out.get(\u001b[33m\"\u001b[39m\u001b[33mETag\u001b[39m\u001b[33m\"\u001b[39m, \u001b[33m\"\u001b[39m\u001b[33m\"\u001b[39m),\n\u001b[32m 1455\u001b[39m \u001b[33m\"\u001b[39m\u001b[33mLastModified\u001b[39m\u001b[33m\"\u001b[39m: out.get(\u001b[33m\"\u001b[39m\u001b[33mLastModified\u001b[39m\u001b[33m\"\u001b[39m, \u001b[33m\"\u001b[39m\u001b[33m\"\u001b[39m),\n\u001b[32m (...)\u001b[39m\u001b[32m 1461\u001b[39m \u001b[33m\"\u001b[39m\u001b[33mContentType\u001b[39m\u001b[33m\"\u001b[39m: out.get(\u001b[33m\"\u001b[39m\u001b[33mContentType\u001b[39m\u001b[33m\"\u001b[39m),\n\u001b[32m 1462\u001b[39m }\n\u001b[32m 1463\u001b[39m \u001b[38;5;28;01mexcept\u001b[39;00m \u001b[38;5;167;01mFileNotFoundError\u001b[39;00m:\n",
|
||
"\u001b[36mFile \u001b[39m\u001b[32m/opt/python/lib/python3.13/site-packages/s3fs/core.py:371\u001b[39m, in \u001b[36mS3FileSystem._call_s3\u001b[39m\u001b[34m(self, method, *akwarglist, **kwargs)\u001b[39m\n\u001b[32m 369\u001b[39m logger.debug(\u001b[33m\"\u001b[39m\u001b[33mCALL: \u001b[39m\u001b[38;5;132;01m%s\u001b[39;00m\u001b[33m - \u001b[39m\u001b[38;5;132;01m%s\u001b[39;00m\u001b[33m - \u001b[39m\u001b[38;5;132;01m%s\u001b[39;00m\u001b[33m\"\u001b[39m, method.\u001b[34m__name__\u001b[39m, akwarglist, kw2)\n\u001b[32m 370\u001b[39m additional_kwargs = \u001b[38;5;28mself\u001b[39m._get_s3_method_kwargs(method, *akwarglist, **kwargs)\n\u001b[32m--> \u001b[39m\u001b[32m371\u001b[39m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28;01mawait\u001b[39;00m _error_wrapper(\n\u001b[32m 372\u001b[39m method, kwargs=additional_kwargs, retries=\u001b[38;5;28mself\u001b[39m.retries\n\u001b[32m 373\u001b[39m )\n",
|
||
"\u001b[36mFile \u001b[39m\u001b[32m/opt/python/lib/python3.13/site-packages/s3fs/core.py:146\u001b[39m, in \u001b[36m_error_wrapper\u001b[39m\u001b[34m(func, args, kwargs, retries)\u001b[39m\n\u001b[32m 144\u001b[39m err = e\n\u001b[32m 145\u001b[39m err = translate_boto_error(err)\n\u001b[32m--> \u001b[39m\u001b[32m146\u001b[39m \u001b[38;5;28;01mraise\u001b[39;00m err\n",
|
||
"\u001b[31mPermissionError\u001b[39m: Forbidden"
|
||
]
|
||
}
|
||
],
|
||
"source": [
|
||
"\"\"\"\n",
|
||
"with fs.open(\n",
|
||
" \"projet-bdc-data/carmignac/Flows ENSAE V2 -20251105.csv\",\n",
|
||
" \"rb\"\n",
|
||
") as f:\n",
|
||
" flows = pd.read_csv(f, sep=\";\")\n",
|
||
"\"\"\"\n",
|
||
"\n",
|
||
"with fs.open('projet-bdc-data/carmignac/Monthly AUM and NAV since 2010.xlsx', 'rb') as f:\n",
|
||
" nav_raw = pd.read_excel(f, header=None, engine=\"openpyxl\")\n",
|
||
"nav = nav_raw[0].str.split(\",\", expand=True)\n",
|
||
"nav.columns = nav.iloc[0]\n",
|
||
"with fs.open('projet-bdc-data/carmignac/AUM ENSAE V2 -20251105.csv', 'rb') as f:\n",
|
||
" stocks = pd.read_csv(f, sep=\";\")\n",
|
||
"\n",
|
||
"nav = nav[1:].reset_index(drop=True)\n",
|
||
"\n",
|
||
"quick_info(stocks, \"STOCKS\")\n",
|
||
"quick_info(flows, \"FLOWS\")\n",
|
||
"quick_info(nav, \"NAV/PRICES\")"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": null,
|
||
"id": "9bc92c9f-216c-475e-bfb8-edc1a4e839f6",
|
||
"metadata": {},
|
||
"outputs": [],
|
||
"source": [
|
||
"# 1. CLEAN DATES (formats différents)\n",
|
||
"\n",
|
||
"stocks[\"Centralisation Date\"] = pd.to_datetime(stocks[\"Centralisation Date\"], errors=\"coerce\")\n",
|
||
"flows[\"Centralisation Date\"] = pd.to_datetime(flows[\"Centralisation Date\"], errors=\"coerce\")\n",
|
||
"nav[\"NavDate\"] = pd.to_datetime(nav[\"NavDate\"], format=\"%d/%m/%Y\", errors=\"coerce\")\n",
|
||
"\n",
|
||
"print(\"Date conversion done.\")\n",
|
||
"\n",
|
||
"# 2. CLEAN NUMERIC COLUMNS FOR NAV FILE\n",
|
||
"\n",
|
||
"num_cols = [\"PortfolioAum_Eur\",\"ShareClassPrice\",\"NumberOfShares\",\n",
|
||
" \"ShareClassAumLocalCur\",\"ShareClassAum_EUR\"]\n",
|
||
"\n",
|
||
"for col in num_cols:\n",
|
||
" nav[col] = (\n",
|
||
" nav[col]\n",
|
||
" .astype(str)\n",
|
||
" .str.replace(\",\", \".\", regex=False)\n",
|
||
" .str.replace(\" \", \"\")\n",
|
||
" .astype(float)\n",
|
||
" )\n",
|
||
"\n",
|
||
"print(\"NAV numeric conversion done.\")\n",
|
||
"\n",
|
||
"# 3. STANDARDIZE STRINGS FOR JOIN KEYS\n",
|
||
"\n",
|
||
"def norm(df):\n",
|
||
" for col in df.columns:\n",
|
||
" if df[col].dtype == \"object\":\n",
|
||
" df[col] = df[col].astype(str).str.strip().str.upper()\n",
|
||
" return df\n",
|
||
"\n",
|
||
"stocks = norm(stocks)\n",
|
||
"flows = norm(flows)\n",
|
||
"nav = norm(nav)\n",
|
||
"\n",
|
||
"print(\"String normalization done.\")\n",
|
||
"\n",
|
||
"\n",
|
||
"# 4. ANALYSE RELATIONS ACROSS FILES\n",
|
||
"\n",
|
||
"# Unique sets\n",
|
||
"isin_stocks = set(stocks[\"Product - Isin\"].unique())\n",
|
||
"isin_flows = set(flows[\"Product - Isin\"].unique())\n",
|
||
"isin_nav = set(nav[\"ShareClassIsin\"].unique())\n",
|
||
"\n",
|
||
"print(\"\\nISIN missing in FLOWS but present in STOCKS :\", len(isin_stocks - isin_flows))\n",
|
||
"print(\"\\nISIN missing in STOCKS but present in FLOWS :\", len(isin_flows - isin_stocks))\n",
|
||
"print(\"\\nISIN missing in NAV but present in FLOWS :\", len(isin_flows - isin_nav))\n",
|
||
"print(\"\\nISIN missing in NAV but present in STOCKS :\", len(isin_stocks - isin_nav))\n",
|
||
"\n",
|
||
"\n",
|
||
"# 5. CLIENTS: STOCKS VS FLOWS\n",
|
||
"\n",
|
||
"acc_stocks = set(stocks[\"Registrar Account - ID\"].unique())\n",
|
||
"acc_flows = set(flows[\"Registrar Account - ID\"].unique())\n",
|
||
"\n",
|
||
"print(\"\\nAccounts in STOCKS but NEVER in FLOWS :\", len(acc_stocks - acc_flows))\n",
|
||
"print(\"\\nAccounts in FLOWS but NEVER in STOCKS :\", len(acc_flows - acc_stocks))\n",
|
||
"\n",
|
||
"\n",
|
||
"# 6. CLIENT ACTIVITY METRICS (DETAILED)\n",
|
||
"\n",
|
||
"client_behavior = flows.groupby(\"Registrar Account - ID\").agg(\n",
|
||
" n_days=(\"Centralisation Date\", lambda x: x.nunique()),\n",
|
||
" n_transactions=(\"Quantity - NetFlows\", \"count\"),\n",
|
||
" total_netflows=(\"Quantity - NetFlows\", \"sum\"),\n",
|
||
" mean_flow=(\"Quantity - NetFlows\", \"mean\"),\n",
|
||
" std_flow=(\"Quantity - NetFlows\", \"std\"),\n",
|
||
" total_subscription=(\"Quantity - Subscription\", \"sum\"),\n",
|
||
" total_redemption=(\"Quantity - Redemption\", \"sum\")\n",
|
||
").reset_index()\n",
|
||
"\n",
|
||
"# Add churn metric\n",
|
||
"client_behavior[\"churn_ratio\"] = (\n",
|
||
" client_behavior[\"total_redemption\"] /\n",
|
||
" (client_behavior[\"total_subscription\"] + 1e-9)\n",
|
||
")\n",
|
||
"\n",
|
||
"print(\"\\nCLIENT BEHAVIOR (first 5 rows):\\n\", client_behavior.head())\n",
|
||
"\n",
|
||
"\n",
|
||
"# 7. FUNDS ACTIVITY METRICS\n",
|
||
"\n",
|
||
"fund_behavior = flows.groupby(\"Product - Isin\").agg(\n",
|
||
" n_accounts=(\"Registrar Account - ID\", \"nunique\"),\n",
|
||
" n_days=(\"Centralisation Date\", lambda x: x.nunique()),\n",
|
||
" total_netflows=(\"Quantity - NetFlows\", \"sum\"),\n",
|
||
" vol_flows=(\"Quantity - NetFlows\", \"std\")\n",
|
||
").reset_index()\n",
|
||
"\n",
|
||
"print(\"\\nFUND BEHAVIOR (first 5 rows):\\n\", fund_behavior.head())\n",
|
||
"\n",
|
||
"\n",
|
||
"# 8. SAVE INTERMEDIATE\n",
|
||
"\n",
|
||
"client_behavior.to_csv(\"client_behavior.csv\", index=False)\n",
|
||
"fund_behavior.to_csv(\"fund_behavior.csv\", index=False)"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": null,
|
||
"id": "afb51598-3a7b-41f2-8d25-5b4b8bfb1c8a",
|
||
"metadata": {},
|
||
"outputs": [],
|
||
"source": [
|
||
"valid_full = isin_stocks & isin_flows & isin_nav\n",
|
||
"stocks_only = isin_stocks - isin_flows\n",
|
||
"flows_only = isin_flows - isin_stocks\n",
|
||
"missing_nav = (isin_stocks | isin_flows) - isin_nav\n",
|
||
"\n",
|
||
"print(\"FULL usable ISIN :\", len(valid_full))\n",
|
||
"print(\"Stocks only ISIN :\", len(stocks_only))\n",
|
||
"print(\"Flows only ISIN :\", len(flows_only))\n",
|
||
"print(\"Missing NAV :\", len(missing_nav))\n",
|
||
"\n",
|
||
"pd.DataFrame({\"isin\": list(valid_full)}).to_csv(\"isin_full.csv\", index=False)\n",
|
||
"pd.DataFrame({\"isin\": list(stocks_only)}).to_csv(\"isin_stocks_only.csv\", index=False)\n",
|
||
"pd.DataFrame({\"isin\": list(flows_only)}).to_csv(\"isin_flows_only.csv\", index=False)\n",
|
||
"pd.DataFrame({\"isin\": list(missing_nav)}).to_csv(\"isin_missing_nav.csv\", index=False)\n",
|
||
"\n",
|
||
"print(\"All ISIN groups saved into 4 separate files.\")\n",
|
||
"\n"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": null,
|
||
"id": "61e0c71a-a1c6-4ed8-ba15-b7a9badc4d4a",
|
||
"metadata": {},
|
||
"outputs": [],
|
||
"source": [
|
"# Small constant keeping the ratio finite when total_subscription is zero.\n",
"eps = 1e-6\n",
"\n",
"redeemed = client_behavior[\"total_redemption\"]\n",
"subscribed = client_behavior[\"total_subscription\"]\n",
"\n",
"# Redemptions relative to subscriptions, plus a 0/1 flag for accounts that redeemed more than they subscribed.\n",
"client_behavior[\"churn_ratio\"] = redeemed.div(subscribed.add(eps))\n",
"client_behavior[\"churn_flag\"] = redeemed.gt(subscribed).astype(int)\n",
"\n",
"# Log-compressed transaction count.\n",
"client_behavior[\"activity_score\"] = np.log1p(client_behavior[\"n_transactions\"])\n",
"\n",
"# A missing standard deviation is treated as zero volatility.\n",
"client_behavior[\"flow_volatility\"] = client_behavior[\"std_flow\"].fillna(0)\n",
"\n",
"# One minus n_days divided by the number of distinct centralisation dates in the flows table.\n",
"n_flow_dates = flows[\"Centralisation Date\"].nunique()\n",
"client_behavior[\"inertia_ratio\"] = 1 - client_behavior[\"n_days\"] / n_flow_dates\n",
"\n",
"print(client_behavior.head())\n",
"\n",
"client_behavior.to_csv(\"client_behavior_clean.csv\", index=False)\n"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": null,
|
||
"id": "8ee7e911-eb73-4846-b545-661140411c1b",
|
||
"metadata": {},
|
||
"outputs": [],
|
||
"source": [
|
"# Diversification per account: number of distinct products held and AUM totals.\n",
"account_div = stocks.groupby(\"Registrar Account - ID\").agg(\n",
"    n_isin_held=(\"Product - Isin\", \"nunique\"),\n",
"    n_funds_held=(\"Product - Fund\", \"nunique\"),\n",
"    n_asset_types=(\"Product - Asset Type\", \"nunique\"),\n",
"    n_strategies=(\"Product - Strategy\", \"nunique\"),\n",
"    total_aum=(\"Value - AUM €\", \"sum\"),\n",
"    median_aum=(\"Value - AUM €\", \"median\")\n",
").reset_index()\n",
"\n",
"# Concentration ratio per account: AUM of the largest fund divided by the account's total AUM.\n",
"aum_by_account_fund = stocks.groupby(\n",
"    [\"Registrar Account - ID\", \"Product - Fund\"]\n",
")[\"Value - AUM €\"].sum().reset_index()\n",
"\n",
"# Vectorised max / sum instead of groupby.apply with a Python lambda: same values,\n",
"# no per-group Python call and no pandas DeprecationWarning about grouping columns.\n",
"fund_aum_by_account = aum_by_account_fund.groupby(\"Registrar Account - ID\")[\"Value - AUM €\"]\n",
"concentration = (\n",
"    fund_aum_by_account.max() / fund_aum_by_account.sum()\n",
").reset_index(name=\"concentration_ratio\")\n",
"\n",
"# Merge diversification + concentration\n",
"account_static = account_div.merge(concentration, on=\"Registrar Account - ID\", how=\"left\")\n",
"\n",
"print(account_static.head())\n",
"print(account_static.describe())\n"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": null,
|
||
"id": "76f6fa0d-9d7a-4145-af1c-986d83947f91",
|
||
"metadata": {},
|
||
"outputs": [],
|
||
"source": [
|
"def _most_frequent(values):\n",
"    \"\"\"Return the most frequent non-null value of a Series, or NaN if it has none.\n",
"\n",
"    Series.mode() is empty when every value is missing, so indexing it with [0]\n",
"    would raise; returning NaN lets those accounts flow through to later fillna steps.\n",
"    \"\"\"\n",
"    modes = values.mode()\n",
"    return modes.iloc[0] if not modes.empty else np.nan\n",
"\n",
"\n",
"# Geographic info per account: most frequent country / region over the account's rows.\n",
"geo = stocks.groupby(\"Registrar Account - ID\").agg(\n",
"    country=(\"RegistrarAccount - Country\", _most_frequent),\n",
"    region=(\"Registrar Account - Region\", _most_frequent)\n",
").reset_index()\n",
"\n",
"print(geo.head())\n"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": null,
|
||
"id": "e9bb67ab-9029-4ace-b960-b3d6e0b8683c",
|
||
"metadata": {},
|
||
"outputs": [],
|
||
"source": [
|
"# 1-2. Left-join flow behaviour with the stock-based diversification metrics, then with geography.\n",
"client_master = (\n",
"    client_behavior\n",
"    .merge(account_static, on=\"Registrar Account - ID\", how=\"left\")\n",
"    .merge(geo, on=\"Registrar Account - ID\", how=\"left\")\n",
")\n",
"\n",
"# 3. Log-scaled AUM features (negative values are clipped to 0 before log1p).\n",
"for aum_col in (\"total_aum\", \"median_aum\"):\n",
"    client_master[f\"log_{aum_col}\"] = np.log1p(client_master[aum_col].clip(lower=0))\n",
"\n",
"# 4-6. Default values for accounts with missing volatility, no stock rows, or no geography.\n",
"missing_defaults = {\n",
"    \"flow_volatility\": 0,\n",
"    \"n_isin_held\": 0,\n",
"    \"n_funds_held\": 0,\n",
"    \"n_asset_types\": 0,\n",
"    \"n_strategies\": 0,\n",
"    \"country\": \"UNKNOWN\",\n",
"    \"region\": \"UNKNOWN\",\n",
"}\n",
"client_master = client_master.fillna(value=missing_defaults)\n",
"\n",
"# 7. Export, then show a preview and summary statistics.\n",
"client_master.to_csv(\"client_master.csv\", index=False)\n",
"\n",
"print(client_master.head())\n",
"print(client_master.describe(include='all'))\n"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "markdown",
|
||
"id": "fb1e98a5-6ab4-4371-ba45-6558ff38c839",
|
||
"metadata": {},
|
||
"source": [
|
"## Détection des ruptures"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": null,
|
||
"id": "6bdd8077-c8e0-451d-a7b8-15a2705ad196",
|
||
"metadata": {},
|
||
"outputs": [],
|
||
"source": [
|
"# Columns identifying one position (account x ISIN) and one dated observation of it.\n",
"position_keys = [\"Registrar Account - ID\", \"Product - Isin\"]\n",
"daily_keys = position_keys + [\"Centralisation Date\"]\n",
"\n",
"# --- 1. Stocks: keep the quantity held, parse dates, order each position chronologically ---\n",
"stocks_clean = stocks[daily_keys + [\"Quantity - AUM\"]].copy()\n",
"stocks_clean[\"Centralisation Date\"] = pd.to_datetime(stocks_clean[\"Centralisation Date\"])\n",
"stocks_clean = stocks_clean.sort_values(daily_keys)\n",
"\n",
"# --- 2. Flows: parse dates and sum net flows per position and day (removes duplicate keys) ---\n",
"flows_clean = flows[daily_keys + [\"Quantity - NetFlows\"]].copy()\n",
"flows_clean[\"Centralisation Date\"] = pd.to_datetime(flows_clean[\"Centralisation Date\"])\n",
"flows_clean = flows_clean.groupby(daily_keys)[\"Quantity - NetFlows\"].sum().reset_index()\n",
"\n",
"# --- 3. Attach flows to stock observations; a date without any flow counts as 0 ---\n",
"merged = stocks_clean.merge(flows_clean, on=daily_keys, how=\"left\")\n",
"merged[\"Quantity - NetFlows\"] = merged[\"Quantity - NetFlows\"].fillna(0)\n",
"\n",
"# --- 4. Previous observation of the same position (stock and net flows) ---\n",
"lagged = merged.groupby(position_keys)[[\"Quantity - AUM\", \"Quantity - NetFlows\"]].shift(1)\n",
"merged[\"prev_stock\"] = lagged[\"Quantity - AUM\"]\n",
"merged[\"prev_netflows\"] = lagged[\"Quantity - NetFlows\"].fillna(0)\n",
"\n",
"# Expected stock = previous stock + previous net flows\n",
"merged[\"expected_stock\"] = merged[\"prev_stock\"] + merged[\"prev_netflows\"]\n",
"\n",
"# --- 5. Gap between observed and expected stock ---\n",
"merged[\"gap\"] = merged[\"Quantity - AUM\"] - merged[\"expected_stock\"]\n",
"\n",
"# Tolerance for numerical noise; the first observation of a position (no previous stock) is never flagged.\n",
"TOL = 1e-6\n",
"merged[\"rupture_flag\"] = merged[\"gap\"].abs().gt(TOL) & merged[\"prev_stock\"].notna()\n",
"\n",
"# --- 6. Per-account summary of ruptures ---\n",
"rupture_summary = (\n",
"    merged.groupby(\"Registrar Account - ID\")\n",
"    .agg(\n",
"        n_ruptures=(\"rupture_flag\", \"sum\"),\n",
"        total_obs=(\"rupture_flag\", \"count\"),\n",
"        rupture_ratio=(\"rupture_flag\", \"mean\"),\n",
"        max_gap=(\"gap\", lambda gaps: gaps.abs().max()),\n",
"    )\n",
"    .reset_index()\n",
")\n",
"\n",
"# Highest rupture ratio first\n",
"rupture_summary = rupture_summary.sort_values(\"rupture_ratio\", ascending=False)\n",
"\n",
"# Same table, lowest rupture ratio first, written to disk\n",
"rupture_summary_asc = rupture_summary.sort_values(\"rupture_ratio\", ascending=True)\n",
"rupture_summary_asc.to_csv('rupture.csv')"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": null,
|
||
"id": "9e32fd6b-4754-4196-9487-ffdc0bb4fc06",
|
||
"metadata": {},
|
||
"outputs": [],
|
||
"source": [
|
"# Write the row-level stock/flow reconciliation table to disk (index included).\n",
"merged.to_csv(\"merged.csv\")"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": null,
|
||
"id": "71cd67aa-f4b9-489e-b928-defeca459cb6",
|
||
"metadata": {},
|
||
"outputs": [],
|
||
"source": [
|
"# Display the per-account rupture summary, lowest rupture ratio first.\n",
"rupture_summary_asc"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": null,
|
||
"id": "72332a7e-0ab0-474b-aac7-b52ebbba7a8b",
|
||
"metadata": {},
|
||
"outputs": [],
|
||
"source": [
|
"# Plot a hand-picked set of accounts, one call per account.\n",
"accounts_to_inspect = [\"200001928\", \"366351\", \"365966\", \"365568\", \"200129601\"]\n",
"for account_id in accounts_to_inspect:\n",
"    plot_account(account_id)\n",
"\n",
"# Kept as the cell's last expression so its return value (if any) is still displayed.\n",
"plot_account(\"402410\")\n"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": null,
|
||
"id": "31407450-a833-4fce-8b0b-dba1b1de585f",
|
||
"metadata": {},
|
||
"outputs": [],
|
||
"source": [
|
"# Columns identifying one (account, ISIN) trajectory and one dated point on it.\n",
"trajectory_keys = [\"Registrar Account - ID\", \"Product - Isin\"]\n",
"observation_keys = trajectory_keys + [\"Centralisation Date\"]\n",
"\n",
"# 1. Stock quantities per (account, ISIN, date), in chronological order\n",
"stocks_isin = stocks[observation_keys + [\"Quantity - AUM\"]].copy()\n",
"stocks_isin[\"Centralisation Date\"] = pd.to_datetime(stocks_isin[\"Centralisation Date\"])\n",
"stocks_isin = stocks_isin.sort_values(observation_keys)\n",
"\n",
"# 2. Net flows per (account, ISIN, date), summed so each key appears once\n",
"flows_isin = flows[observation_keys + [\"Quantity - NetFlows\"]].copy()\n",
"flows_isin[\"Centralisation Date\"] = pd.to_datetime(flows_isin[\"Centralisation Date\"])\n",
"flows_isin = flows_isin.groupby(observation_keys)[\"Quantity - NetFlows\"].sum().reset_index()\n",
"\n",
"# 3. Left-join flows onto stocks; dates without flows get a net flow of 0\n",
"merged_isin = stocks_isin.merge(flows_isin, on=observation_keys, how=\"left\")\n",
"merged_isin[\"Quantity - NetFlows\"] = merged_isin[\"Quantity - NetFlows\"].fillna(0)\n",
"\n",
"# 4. Expected stock = previous stock + previous net flows of the same trajectory\n",
"previous = merged_isin.groupby(trajectory_keys)[[\"Quantity - AUM\", \"Quantity - NetFlows\"]].shift(1)\n",
"merged_isin[\"prev_stock\"] = previous[\"Quantity - AUM\"]\n",
"merged_isin[\"prev_netflows\"] = previous[\"Quantity - NetFlows\"].fillna(0)\n",
"merged_isin[\"expected_stock\"] = merged_isin[\"prev_stock\"] + merged_isin[\"prev_netflows\"]\n",
"\n",
"# 5. A rupture is an observed stock that differs from the expected one by more than TOL\n",
"#    (rows without a previous stock are never flagged)\n",
"TOL = 1e-6\n",
"merged_isin[\"gap\"] = merged_isin[\"Quantity - AUM\"] - merged_isin[\"expected_stock\"]\n",
"merged_isin[\"rupture_flag\"] = merged_isin[\"gap\"].abs().gt(TOL) & merged_isin[\"prev_stock\"].notna()\n",
"\n",
"# 6. Rupture statistics per (account, ISIN)\n",
"rupture_isin_summary = (\n",
"    merged_isin.groupby(trajectory_keys)\n",
"    .agg(\n",
"        n_ruptures=(\"rupture_flag\", \"sum\"),\n",
"        obs=(\"rupture_flag\", \"count\"),\n",
"        rupture_ratio=(\"rupture_flag\", \"mean\"),\n",
"        max_gap=(\"gap\", lambda gaps: gaps.abs().max()),\n",
"    )\n",
"    .reset_index()\n",
")\n",
"\n",
"# Highest rupture ratio first\n",
"rupture_isin_summary = rupture_isin_summary.sort_values(\"rupture_ratio\", ascending=False)\n",
"\n",
"rupture_isin_summary.head(20)\n"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": null,
|
||
"id": "baa4b6cd-887d-45a6-af27-253a9aa8710f",
|
||
"metadata": {},
|
||
"outputs": [],
|
||
"source": [
|
"# Strong ruptures (ratio > 0.5 OR huge max_gap)\n",
"strong = rupture_isin_summary[\n",
"    (rupture_isin_summary[\"rupture_ratio\"] > 0.5)\n",
"    | (rupture_isin_summary[\"max_gap\"] > 50000)\n",
"]\n",
"\n",
"\n",
"def find_successors(account_id, isin, window_days=15):\n",
"    \"\"\"List other accounts showing a positive gap on `isin` near a rupture of `account_id`.\n",
"\n",
"    Parameters\n",
"    ----------\n",
"    account_id : value compared with the \"Registrar Account - ID\" column of merged_isin.\n",
"    isin : value compared with the \"Product - Isin\" column of merged_isin.\n",
"    window_days : int, number of days searched before and after each rupture date (inclusive).\n",
"\n",
"    Returns\n",
"    -------\n",
"    list\n",
"        Distinct account IDs (never `account_id` itself), ordered by their string form;\n",
"        empty when the (account, ISIN) pair has no flagged rupture.\n",
"    \"\"\"\n",
"    # Restrict to the ISIN once instead of rescanning the full table for every rupture date.\n",
"    isin_rows = merged_isin[merged_isin[\"Product - Isin\"] == isin]\n",
"\n",
"    own_ruptures = (isin_rows[\"Registrar Account - ID\"] == account_id) & isin_rows[\"rupture_flag\"]\n",
"    rupture_dates = isin_rows.loc[own_ruptures, \"Centralisation Date\"].unique()\n",
"    if len(rupture_dates) == 0:\n",
"        return []\n",
"\n",
"    # Candidate rows: positive gap (observed stock above expected) on any other account.\n",
"    jumps = isin_rows[\n",
"        (isin_rows[\"gap\"] > 0)\n",
"        & (isin_rows[\"Registrar Account - ID\"] != account_id)\n",
"    ]\n",
"\n",
"    window = pd.Timedelta(days=window_days)\n",
"    successors = set()\n",
"    for rupture_date in rupture_dates:\n",
"        centre = pd.Timestamp(rupture_date)\n",
"        in_window = jumps[\"Centralisation Date\"].between(centre - window, centre + window)\n",
"        successors.update(jumps.loc[in_window, \"Registrar Account - ID\"])\n",
"\n",
"    # Sorted so that repeated runs return the same order (set iteration order is arbitrary).\n",
"    return sorted(successors, key=str)\n",
"\n",
"\n",
"find_successors(\"200129601\", \"FR0010135103\")\n"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": null,
|
||
"id": "0b834da2-f781-476d-84a6-aebb38fb8dac",
|
||
"metadata": {},
|
||
"outputs": [],
|
||
"source": [
|
"import seaborn as sns\n",
"import matplotlib.pyplot as plt\n",
"\n",
"# Working copy with calendar year / month columns\n",
"df = merged_isin.assign(\n",
"    year=merged_isin[\"Centralisation Date\"].dt.year,\n",
"    month=merged_isin[\"Centralisation Date\"].dt.month,\n",
")\n",
"\n",
"calendar_keys = [\"year\", \"month\"]\n",
"\n",
"# 1. Total number of rows per month\n",
"total = df.groupby(calendar_keys).size().reset_index(name=\"total_lines\")\n",
"\n",
"# 2. Number of flagged ruptures per month\n",
"ruptures = df.loc[df[\"rupture_flag\"]].groupby(calendar_keys).size().reset_index(name=\"n_ruptures\")\n",
"\n",
"# 3. Combine both counts; months without any rupture get 0\n",
"ratio = total.merge(ruptures, on=calendar_keys, how=\"left\")\n",
"ratio[\"n_ruptures\"] = ratio[\"n_ruptures\"].fillna(0)\n",
"\n",
"# 4. Share of rows flagged as rupture (a fraction; formatted as % on the plot)\n",
"ratio[\"rupture_ratio\"] = ratio[\"n_ruptures\"] / ratio[\"total_lines\"]\n",
"\n",
"# 5. Year x month grid for the heatmap; (year, month) pairs without rows are shown as 0\n",
"heatmap_ratio = ratio.pivot(index=\"year\", columns=\"month\", values=\"rupture_ratio\").fillna(0)\n",
"\n",
"# 6. Plot\n",
"heatmap_style = dict(\n",
"    cmap=\"Reds\",\n",
"    linewidths=.3,\n",
"    linecolor=\"grey\",\n",
"    annot=True,\n",
"    fmt=\".2%\",\n",
"    cbar_kws={'label': 'Proportion de ruptures'},\n",
")\n",
"\n",
"plt.figure(figsize=(14, 7))\n",
"sns.heatmap(heatmap_ratio, **heatmap_style)\n",
"\n",
"plt.title(\"Heatmap de la proportion de ruptures (par année et mois)\", fontsize=16)\n",
"plt.xlabel(\"Mois\")\n",
"plt.ylabel(\"Année\")\n",
"plt.show()\n"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": null,
|
||
"id": "aa5862ab-ec8e-47f8-8cb0-cd51503efed8",
|
||
"metadata": {},
|
||
"outputs": [],
|
||
"source": [
|
"# Row-level table with calendar columns and each account's country attached\n",
"df = (\n",
"    merged_isin\n",
"    .assign(\n",
"        year=merged_isin[\"Centralisation Date\"].dt.year,\n",
"        month=merged_isin[\"Centralisation Date\"].dt.month,\n",
"    )\n",
"    .merge(\n",
"        geo[[\"Registrar Account - ID\", \"country\"]],\n",
"        on=\"Registrar Account - ID\",\n",
"        how=\"left\",\n",
"    )\n",
")\n",
"\n",
"# Accounts without a country are grouped under a single label\n",
"df[\"country\"] = df[\"country\"].fillna(\"UNKNOWN\")\n",
"\n",
"# Number of rows per country\n",
"total_country = df.groupby(\"country\").size().reset_index(name=\"total_obs\")\n",
"\n",
"# Number of flagged ruptures per country\n",
"rupt_country = df.loc[df[\"rupture_flag\"]].groupby(\"country\").size().reset_index(name=\"ruptures\")\n",
"\n",
"# Combine counts (countries without ruptures get 0) and compute the rupture share\n",
"country_stats = total_country.merge(rupt_country, on=\"country\", how=\"left\")\n",
"country_stats[\"ruptures\"] = country_stats[\"ruptures\"].fillna(0)\n",
"country_stats[\"rupture_ratio\"] = country_stats[\"ruptures\"] / country_stats[\"total_obs\"]\n",
"\n",
"# Highest rupture ratio first\n",
"country_stats = country_stats.sort_values(\"rupture_ratio\", ascending=False)"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": null,
|
||
"id": "86d2a91c-d8d8-416c-8dc4-dc3f4ae7ca90",
|
||
"metadata": {},
|
||
"outputs": [],
|
||
"source": [
|
"# Imported in this cell so it runs on a fresh kernel: in the cells shown, the\n",
"# other import of plotly.express only appears further down the notebook.\n",
"import plotly.express as px\n",
"\n",
"# Add a percentage column used only in the hover tooltip\n",
"country_stats_plot = country_stats.copy()\n",
"country_stats_plot[\"rupture_pct\"] = country_stats_plot[\"rupture_ratio\"] * 100\n",
"\n",
"# Sort by decreasing rupture share so bars appear from worst to best\n",
"country_stats_plot = country_stats_plot.sort_values(\"rupture_ratio\", ascending=False)\n",
"\n",
"fig = px.bar(\n",
"    country_stats_plot,\n",
"    x=\"country\",\n",
"    y=\"rupture_ratio\",\n",
"    hover_data={\n",
"        \"rupture_pct\": ':.2f',\n",
"        \"ruptures\": True,\n",
"        \"total_obs\": True,\n",
"        \"rupture_ratio\": False,  # hide the raw decimal version in the tooltip\n",
"    },\n",
"    labels={\n",
"        \"country\": \"Pays\",\n",
"        \"rupture_ratio\": \"Proportion de ruptures\",\n",
"        \"rupture_pct\": \"% de ruptures\",\n",
"        \"ruptures\": \"Nb de ruptures\",\n",
"        \"total_obs\": \"Nb d'observations\"\n",
"    },\n",
"    title=\"Proportion de ruptures par pays (avec volumes au survol)\"\n",
")\n",
"\n",
"# Show the y axis as percentages\n",
"fig.update_yaxes(tickformat=\".1%\")\n",
"\n",
"fig.update_layout(\n",
"    xaxis_tickangle=-45,\n",
"    bargap=0.2\n",
")\n",
"\n",
"fig.show()\n"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": null,
|
||
"id": "e1c114db-5fbd-4cd3-a897-b9d4c96053fd",
|
||
"metadata": {},
|
||
"outputs": [],
|
||
"source": [
|
"# Export the rows whose country is JAPAN (index included).\n",
"df.loc[df[\"country\"] == \"JAPAN\"].to_csv(\"Japan.csv\")"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": null,
|
||
"id": "9fec57f0-dd80-47bc-aacb-518c0ac0a4f6",
|
||
"metadata": {},
|
||
"outputs": [],
|
||
"source": []
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": null,
|
||
"id": "95bc353d-e883-4989-aaca-1b3c9b51ee5a",
|
||
"metadata": {},
|
||
"outputs": [],
|
||
"source": [
|
"rs = rupture_summary.copy()\n",
"\n",
"# 1. Descriptive statistics of the per-account rupture ratio\n",
"report_percentiles = [0.01, 0.05, 0.10, 0.25, 0.5, 0.75, 0.90, 0.95, 0.99]\n",
"print(\"\\n=== BASIC NUMERIC STATS ===\")\n",
"print(rs[\"rupture_ratio\"].describe(percentiles=report_percentiles))\n",
"\n",
"# 2. Distribution by class: bin edges and the label of each bin\n",
"bucket_edges = [0, 0.001, 0.01, 0.05, 0.10, 0.25, 0.50, 1.01]\n",
"bucket_labels = [\n",
"    \"0–0.1%\",\n",
"    \"0.1–1%\",\n",
"    \"1–5%\",\n",
"    \"5–10%\",\n",
"    \"10–25%\",\n",
"    \"25–50%\",\n",
"    \"50–100%\",\n",
"]\n",
"rs[\"rupture_bucket\"] = pd.cut(\n",
"    rs[\"rupture_ratio\"],\n",
"    bins=bucket_edges,\n",
"    labels=bucket_labels,\n",
"    include_lowest=True,\n",
")\n",
"\n",
"# Exact zeros first land in the lowest bin; move them to a dedicated \"0%\" category\n",
"rs[\"rupture_bucket\"] = rs[\"rupture_bucket\"].cat.add_categories(\"0%\")\n",
"rs.loc[rs[\"rupture_ratio\"].eq(0), \"rupture_bucket\"] = \"0%\"\n",
"\n",
"bucket_counts = rs[\"rupture_bucket\"].value_counts().sort_index()\n",
"print(bucket_counts)\n",
"\n",
"# 3. Same distribution as percentages of all accounts\n",
"bucket_percent = (bucket_counts / len(rs) * 100).round(2)\n",
"\n",
"print(\"\\n=== DISTRIBUTION (PERCENT) ===\")\n",
"print(bucket_percent)\n",
"\n",
"# 4. Accounts without a single rupture\n",
"no_rupture = rs[\"n_ruptures\"].eq(0).sum()\n",
"print(f\"\\nComptes avec 0 rupture = {no_rupture} ({no_rupture/len(rs)*100:.2f}%)\")\n",
"\n",
"# 5. Accounts above the 75% and 10% rupture-ratio thresholds\n",
"severe = rs[\"rupture_ratio\"].gt(0.75).sum()\n",
"print(f\"Comptes avec rupture_ratio > 75% = {severe} ({severe/len(rs)*100:.2f}%)\")\n",
"\n",
"medium = rs[\"rupture_ratio\"].gt(0.10).sum()\n",
"print(f\"Comptes avec rupture_ratio > 10% = {medium} ({medium/len(rs)*100:.2f}%)\")\n"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": null,
|
||
"id": "86d8fe0e-fa6c-46df-bb4c-054d1a677b38",
|
||
"metadata": {},
|
||
"outputs": [],
|
||
"source": [
|
"import plotly.express as px\n",
"\n",
"# Histogram of the per-account rupture ratio (50 bins)\n",
"histogram_options = dict(\n",
"    x=\"rupture_ratio\",\n",
"    nbins=50,\n",
"    title=\"Distribution du rupture_ratio\",\n",
"    labels={\"rupture_ratio\": \"Rupture Ratio\"},\n",
")\n",
"fig = px.histogram(rs, **histogram_options)\n",
"fig.update_layout(bargap=0.05)\n",
"fig.show()\n"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": null,
|
||
"id": "425b36d0-c92a-4405-be28-35b1fc292fec",
|
||
"metadata": {},
|
||
"outputs": [],
|
||
"source": [
|
"import seaborn as sns\n",
"import matplotlib.pyplot as plt\n",
"\n",
"# --- 1. Keep only the ruptures observed in 2021 ---\n",
"merged[\"year\"] = merged[\"Centralisation Date\"].dt.year\n",
"\n",
"in_2021 = merged[\"year\"] == 2021\n",
"ruptures_2021 = merged[in_2021 & merged[\"rupture_flag\"]].copy()\n",
"\n",
"print(\"Nombre total de ruptures en 2021 :\", len(ruptures_2021))\n",
"\n",
"# --- 2. Sign of the gap (anything that is not strictly positive is labelled negative) ---\n",
"ruptures_2021[\"gap_type\"] = np.where(ruptures_2021[\"gap\"] > 0, \"positive\", \"negative\")\n",
"\n",
"# --- 3. Counts and shares of positive vs negative gaps ---\n",
"gap_counts = ruptures_2021[\"gap_type\"].value_counts()\n",
"gap_percent = ruptures_2021[\"gap_type\"].value_counts(normalize=True) * 100\n",
"\n",
"print(\"\\n=== RUPTURES 2021 — POSITIVES vs NEGATIVES ===\")\n",
"print(gap_counts)\n",
"print(\"\\n(%)\")\n",
"print(gap_percent.map(lambda share: f\"{share:.2f}%\"))\n",
"\n",
"# --- 4. Descriptive statistics of the gap, per sign ---\n",
"intensity_stats = ruptures_2021.groupby(\"gap_type\")[\"gap\"].describe()\n",
"print(\"\\n=== STATISTIQUES DES GAPS ===\")\n",
"print(intensity_stats)\n",
"\n",
"# --- 5. Histogram of the 2021 gaps; x axis is symmetric around 0 and scaled\n",
"#        by the largest absolute gap of the whole merged table ---\n",
"largest_abs_gap = merged[\"gap\"].abs().max()\n",
"\n",
"plt.figure(figsize=(10,5))\n",
"sns.histplot(data=ruptures_2021, x=\"gap\", hue=\"gap_type\", bins=80, kde=True)\n",
"plt.xlim(-largest_abs_gap, largest_abs_gap)\n",
"plt.title(\"Distribution des gaps de rupture en 2021\")\n",
"plt.xlabel(\"Gap (AUM_{t} − Expected AUM_{t})\")\n",
"plt.show()\n"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": null,
|
||
"id": "df9e0005-93f2-4885-baef-2e54921a42f4",
|
||
"metadata": {},
|
||
"outputs": [],
|
||
"source": [
|
"# --- 1. ADD YEAR ---\n",
"merged[\"year\"] = merged[\"Centralisation Date\"].dt.year\n",
"\n",
"# --- 2. DEFINE PERIODS (rows matching no condition, e.g. missing dates, become \"unknown\") ---\n",
"conditions = [\n",
"    merged[\"year\"] < 2021,\n",
"    merged[\"year\"] == 2021,\n",
"    merged[\"year\"] > 2021\n",
"]\n",
"\n",
"period_labels = [\"before_2021\", \"during_2021\", \"after_2021\"]\n",
"\n",
"merged[\"period\"] = np.select(\n",
"    conditions,\n",
"    period_labels,\n",
"    default=\"unknown\"\n",
")\n",
"\n",
"# --- 3. CREATE GAP TYPE & FILTER ONLY RUPTURES ---\n",
"# Rows whose gap is exactly 0 or missing are labelled \"zero\".\n",
"merged[\"gap_type\"] = np.where(\n",
"    merged[\"gap\"] > 0, \"positive\",\n",
"    np.where(merged[\"gap\"] < 0, \"negative\", \"zero\")\n",
")\n",
"\n",
"ruptures = merged[merged[\"rupture_flag\"]].copy()\n",
"\n",
"# --- 4. TOTAL OBS PER PERIOD ---\n",
"total_obs = merged.groupby(\"period\").size().rename(\"total_obs\")\n",
"\n",
"# --- 5. TOTAL RUPTURES PER PERIOD ---\n",
"# Reindexed on total_obs so a period without any rupture reports 0 instead of NaN.\n",
"rupture_counts = (\n",
"    ruptures.groupby(\"period\").size()\n",
"    .reindex(total_obs.index, fill_value=0)\n",
"    .rename(\"rupture_count\")\n",
")\n",
"\n",
"# --- 6. PROPORTION OF RUPTURES ---\n",
"rupture_ratio = (rupture_counts / total_obs).rename(\"rupture_ratio\")\n",
"\n",
"# --- 7. POSITIVE / NEGATIVE GAPS (% among ruptures, per period) ---\n",
"# transform keeps the original (period, gap_type) index; groupby(...).apply would\n",
"# prepend the group key again and print a duplicated \"period\" level.\n",
"gap_type_counts = ruptures.groupby([\"period\", \"gap_type\"]).size()\n",
"gap_dist = gap_type_counts / gap_type_counts.groupby(level=\"period\").transform(\"sum\") * 100\n",
"\n",
"# --- 8. MERGE AND DISPLAY ---\n",
"summary = pd.concat([total_obs, rupture_counts, rupture_ratio], axis=1)\n",
"summary[\"rupture_ratio\"] = (summary[\"rupture_ratio\"] * 100).round(2)\n",
"\n",
"print(\"\\n=== RUPTURE SUMMARY (in %) ===\")\n",
"print(summary)\n",
"\n",
"print(\"\\n=== GAP POSITIVE / NEGATIVE DISTRIBUTION (in %) ===\")\n",
"print(gap_dist)\n"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": null,
|
||
"id": "195205bd-d817-41f9-a0fd-18d8b804515f",
|
||
"metadata": {},
|
||
"outputs": [],
|
||
"source": [
|
"# Imported here so the cell runs on a fresh kernel (both names are used below).\n",
"import plotly.graph_objects as go\n",
"from plotly.subplots import make_subplots\n",
"\n",
"# --- 1. DEFINE PERIODS (split at 1 September 2021) ---\n",
"merged[\"period2\"] = np.where(\n",
"    merged[\"Centralisation Date\"] < pd.Timestamp(\"2021-09-01\"),\n",
"    \"Before Sep 2021\",\n",
"    \"After Sep 2021\"\n",
")\n",
"\n",
"ruptures = merged[merged[\"rupture_flag\"]].copy()\n",
"\n",
"# --- 2. Sign of each rupture ---\n",
"# Derived from the gap itself so this cell does not depend on a gap_type column\n",
"# created by another cell.\n",
"ruptures[\"gap_type\"] = np.where(ruptures[\"gap\"] > 0, \"positive\", \"negative\")\n",
"\n",
"# --- 3. Compute gap counts ---\n",
"# reindex guarantees both periods and both signs exist, filled with 0 when absent.\n",
"gap_counts = (\n",
"    ruptures.groupby([\"period2\", \"gap_type\"])\n",
"    .size()\n",
"    .unstack(fill_value=0)\n",
"    .reindex(\n",
"        index=[\"Before Sep 2021\", \"After Sep 2021\"],\n",
"        columns=[\"positive\", \"negative\"],\n",
"        fill_value=0,\n",
"    )\n",
")\n",
"\n",
"# --- 4. Extract values in the SAME order as the pie labels (negative first) ---\n",
"pie_labels = [\"Negative gaps\", \"Positive gaps\"]\n",
"pie_columns = [\"negative\", \"positive\"]\n",
"before_vals = gap_counts.loc[\"Before Sep 2021\", pie_columns].values\n",
"after_vals = gap_counts.loc[\"After Sep 2021\", pie_columns].values\n",
"\n",
"# --- 5. MAKE TWO DONUT CHARTS ---\n",
"fig = make_subplots(\n",
"    rows=1, cols=2,\n",
"    specs=[[{\"type\": \"pie\"}, {\"type\": \"pie\"}]],\n",
"    subplot_titles=(\"Before Sep 2021\", \"After Sep 2021\")\n",
")\n",
"\n",
"for subplot_col, period_vals in enumerate([before_vals, after_vals], start=1):\n",
"    fig.add_trace(\n",
"        go.Pie(\n",
"            labels=pie_labels,\n",
"            values=period_vals,\n",
"            marker_colors=[\"#E67E22\", \"#3498DB\"],\n",
"            hole=0.45,\n",
"            textinfo=\"label+percent\"\n",
"        ),\n",
"        row=1, col=subplot_col\n",
"    )\n",
"\n",
"fig.update_layout(\n",
"    title=\"Nature des ruptures (positive / negative)\\nAvant vs Après Septembre 2021\",\n",
"    showlegend=True\n",
")\n",
"\n",
"fig.show()\n"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": null,
|
||
"id": "9583f188-8601-425b-908f-61c2ee1f8da2",
|
||
"metadata": {},
|
||
"outputs": [],
|
||
"source": [
|
"import plotly.graph_objects as go\n",
"from plotly.subplots import make_subplots  # used below; not imported in any cell shown\n",
"\n",
"# --- 1. Compute gap counts by period ---\n",
"# reindex guarantees both periods and both signs exist, filled with 0 when absent.\n",
"gap_counts = (\n",
"    ruptures.groupby([\"period2\", \"gap_type\"])\n",
"    .size()\n",
"    .unstack(fill_value=0)\n",
"    .reindex(\n",
"        index=[\"Before Sep 2021\", \"After Sep 2021\"],\n",
"        columns=[\"positive\", \"negative\"],\n",
"        fill_value=0,\n",
"    )\n",
")\n",
"\n",
"# --- 2. Extract values in the SAME order as the pie labels (negative first) ---\n",
"pie_labels = [\"Negative gaps\", \"Positive gaps\"]\n",
"pie_columns = [\"negative\", \"positive\"]\n",
"before_vals = gap_counts.loc[\"Before Sep 2021\", pie_columns].values\n",
"after_vals = gap_counts.loc[\"After Sep 2021\", pie_columns].values\n",
"\n",
"# --- 3. Plot : TWO PIE CHARTS side by side ---\n",
"# Titles name the actual split point (September 2021) used by the period2 column.\n",
"fig = make_subplots(\n",
"    rows=1, cols=2,\n",
"    specs=[[{\"type\": \"pie\"}, {\"type\": \"pie\"}]],\n",
"    subplot_titles=(\"Before Sep 2021\", \"After Sep 2021\")\n",
")\n",
"\n",
"for subplot_col, period_vals in enumerate([before_vals, after_vals], start=1):\n",
"    fig.add_trace(\n",
"        go.Pie(\n",
"            labels=pie_labels,\n",
"            values=period_vals,\n",
"            marker_colors=[\"#E67E22\", \"#3498DB\"],\n",
"            hole=0.35\n",
"        ),\n",
"        row=1, col=subplot_col\n",
"    )\n",
"\n",
"fig.update_layout(\n",
"    title=\"Répartition des ruptures (positive / negative)\\nAvant vs Après Septembre 2021\"\n",
")\n",
"\n",
"fig.show()\n"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": null,
|
||
"id": "f4e29536-eeed-4b91-a59c-b373cf14a5fc",
|
||
"metadata": {},
|
||
"outputs": [],
|
||
"source": [
|
"import plotly.graph_objects as go\n",
"\n",
"# --- 1. Tag every row as before / after 1 September 2021 ---\n",
"is_before_cutoff = merged[\"Centralisation Date\"] < pd.Timestamp(\"2021-09-01\")\n",
"merged[\"period2\"] = np.where(is_before_cutoff, \"Before Sep 2021\", \"After Sep 2021\")\n",
"\n",
"# --- 2. Keep only the rows flagged as ruptures ---\n",
"ruptures = merged[merged[\"rupture_flag\"]].copy()\n",
"\n",
"# --- 3. Number of ruptures per period, in fixed display order (0 when a period has none) ---\n",
"period_order = [\"Before Sep 2021\", \"After Sep 2021\"]\n",
"rupture_counts = ruptures[\"period2\"].value_counts().reindex(period_order).fillna(0)\n",
"\n",
"# --- 4. Donut chart of the split ---\n",
"period_pie = go.Pie(\n",
"    labels=rupture_counts.index,\n",
"    values=rupture_counts.values,\n",
"    hole=0.45,\n",
"    marker_colors=[\"#2ECC71\", \"#E74C3C\"],\n",
"    textinfo=\"percent+value\",\n",
")\n",
"fig = go.Figure(data=[period_pie])\n",
"\n",
"fig.update_layout(title=\"Répartition des ruptures\")\n",
"\n",
"fig.show()\n"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": null,
|
||
"id": "553f91fa-5017-4685-ab31-afe2aa247e13",
|
||
"metadata": {},
|
||
"outputs": [],
|
||
"source": [
|
||
"import pandas as pd\n",
|
||
"import numpy as np\n",
|
||
"\n",
|
||
"# 1. Filtre sur la période post-Sept 2021\n",
|
||
"cutoff = pd.Timestamp(\"2021-09-01\")\n",
|
||
"post = merged[merged[\"Centralisation Date\"] >= cutoff].copy()\n",
|
||
"\n",
|
||
"# 2. On ne garde que les ruptures\n",
|
||
"post_rupt = post[post[\"rupture_flag\"] == True].copy()\n",
|
||
"\n",
|
||
"# 3. Gap absolu + gap relatif (% du stock)\n",
|
||
"post_rupt[\"gap_abs\"] = post_rupt[\"gap\"].abs()\n",
|
||
"post_rupt[\"gap_rel\"] = post_rupt[\"gap_abs\"] / post_rupt[\"Quantity - AUM\"].replace(0, np.nan)\n",
|
||
"\n",
|
||
"# 4. Percentiles globaux\n",
|
||
"p90 = post_rupt[\"gap_abs\"].quantile(0.90)\n",
|
||
"p95 = post_rupt[\"gap_abs\"].quantile(0.95)\n",
|
||
"p99 = post_rupt[\"gap_abs\"].quantile(0.99)\n",
|
||
"\n",
|
||
"# 5. Classification automatique\n",
|
||
"def classify_gap(gap, gap_rel, acct):\n",
|
||
"    \"\"\"Label a single rupture as 'reset' or 'spike'; return None otherwise.\n",
|
||
"\n",
|
||
"    Parameters\n",
|
||
"    ----------\n",
|
||
"    gap : signed gap of the observation; only its magnitude is used.\n",
|
||
"    gap_rel : relative gap, expected to be built like the `gap_rel` column\n",
|
||
"        (absolute gap divided by the stock). NaN never triggers a reset.\n",
|
||
"    acct : account identifier. Not used by this function.\n",
|
||
"\n",
|
||
"    Reads the thresholds `p95` and `p99` defined earlier in this cell.\n",
|
||
"    \"\"\"\n",
|
||
"    # The thresholds are percentiles of the absolute gap, so compare magnitudes.\n",
|
||
"    gap_abs = abs(gap)\n",
|
||
"\n",
|
||
"    # RESET: huge (technical) shock\n",
|
||
"    if gap_abs >= p99 or gap_rel >= 0.90:\n",
|
||
"        return \"reset\"\n",
|
||
"\n",
|
||
"    # SPIKE: very large gap\n",
|
||
"    if gap_abs >= p95:\n",
|
||
"        return \"spike\"\n",
|
||
"\n",
|
||
"    # SHIFT is an account-level property (mean gap of the account) and is not\n",
|
||
"    # decided here; the vectorised labelling later in this cell handles it.\n",
|
||
"    return None\n",
|
||
"\n",
|
||
"# Mean signed gap per account, attached to every rupture row as \"avg_gap\"\n",
|
||
"shift_info = (\n",
|
||
"    post_rupt.groupby(\"Registrar Account - ID\")[\"gap\"]\n",
|
||
"    .mean()\n",
|
||
"    .rename(\"avg_gap\")\n",
|
||
")\n",
|
||
"post_rupt = post_rupt.merge(shift_info, on=\"Registrar Account - ID\", how=\"left\")\n",
|
||
"\n",
|
||
"# First matching rule wins: reset (>= p99), spike (>= p95), shift (account mean gap\n",
|
||
"# larger in magnitude than the median absolute gap), otherwise micro.\n",
|
||
"post_rupt[\"gap_type2\"] = np.select(\n",
|
||
"    [\n",
|
||
"        post_rupt[\"gap_abs\"] >= p99,\n",
|
||
"        post_rupt[\"gap_abs\"] >= p95,\n",
|
||
"        post_rupt[\"avg_gap\"].abs() > post_rupt[\"gap_abs\"].median(),\n",
|
||
"    ],\n",
|
||
"    [\"reset\", \"spike\", \"shift\"],\n",
|
||
"    default=\"micro\",\n",
|
||
")\n",
|
||
"\n",
|
||
"# 6. Overall distribution of gap types, in percent\n",
|
||
"stats = post_rupt[\"gap_type2\"].value_counts(normalize=True).round(3).mul(100)\n",
|
||
"print(\"\\n=== DISTRIBUTION DES TYPES DE GAPS POST-2021 ===\")\n",
|
||
"print(stats)\n",
|
||
"\n",
|
||
"# 7. Share of each gap type per account, in percent\n",
|
||
"client_stats = (\n",
|
||
"    post_rupt.groupby(\"Registrar Account - ID\")[\"gap_type2\"]\n",
|
||
"    .value_counts(normalize=True)\n",
|
||
"    .mul(100)\n",
|
||
"    .rename(\"ratio\")\n",
|
||
"    .reset_index()\n",
|
||
")\n",
|
||
"\n",
|
||
"# 8. Share of each gap type per ISIN, in percent\n",
|
||
"isin_stats = (\n",
|
||
"    post_rupt.groupby(\"Product - Isin\")[\"gap_type2\"]\n",
|
||
"    .value_counts(normalize=True)\n",
|
||
"    .mul(100)\n",
|
||
"    .rename(\"ratio\")\n",
|
||
"    .reset_index()\n",
|
||
")\n",
|
||
"\n",
|
||
"print(\"\\n=== TOP ISIN PAR RESET ===\")\n",
|
||
"print(isin_stats.loc[isin_stats[\"gap_type2\"].eq(\"reset\")].sort_values(\"ratio\", ascending=False).head(10))\n",
|
||
"\n",
|
||
"print(\"\\n=== TOP CLIENTS PAR RESET ===\")\n",
|
||
"print(client_stats.loc[client_stats[\"gap_type2\"].eq(\"reset\")].sort_values(\"ratio\", ascending=False).head(10))\n"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": null,
|
||
"id": "f4396f2c-a8b6-4ea9-8292-093f900bf260",
|
||
"metadata": {},
|
||
"outputs": [],
|
||
"source": [
|
||
"import plotly.graph_objects as go\n",
|
||
"\n",
|
||
"# --- Shares per gap type, read from `stats` (computed in the previous cell) ---\n",
|
||
"# The percentages are taken from the data instead of being typed in by hand,\n",
|
||
"# so the chart cannot drift from the classification it illustrates.\n",
|
||
"type_labels = {\n",
|
||
"    \"micro\": \"Micro-ruptures\",\n",
|
||
"    \"shift\": \"Décalage\",\n",
|
||
"    \"spike\": \"Anomalies ponctuelles\",\n",
|
||
"    \"reset\": \"Remise à zéro\",\n",
|
||
"}\n",
|
||
"labels = list(type_labels.values())\n",
|
||
"# reindex fixes the category order (it must match the colour list below);\n",
|
||
"# a category absent from `stats` is drawn with a share of 0.\n",
|
||
"values = stats.reindex(list(type_labels)).fillna(0).tolist()\n",
|
||
"\n",
|
||
"# --- Pie chart ---\n",
|
||
"fig = go.Figure(\n",
|
||
"    data=[go.Pie(\n",
|
||
"        labels=labels,\n",
|
||
"        values=values,\n",
|
||
"        hole=0.35,   # donut style (easier to read)\n",
|
||
"        textinfo='percent',\n",
|
||
"        marker=dict(colors=[\"#3498DB\", \"#E67E22\", \"#9B59B6\", \"#E74C3C\"])\n",
|
||
"    )]\n",
|
||
")\n",
|
||
"\n",
|
||
"fig.update_layout(\n",
|
||
"    title=\"Typologie des ruptures depuis Septembre 2021\",\n",
|
||
"    legend_title=\"Type de gap\",\n",
|
||
")\n",
|
||
"\n",
|
||
"fig.show()\n"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": null,
|
||
"id": "3df1f839-44d4-4894-bdfa-6851971d1983",
|
||
"metadata": {},
|
||
"outputs": [],
|
||
"source": [
|
||
"# Calendar year of each observation (adds a \"year\" column to `merged`)\n",
|
||
"merged[\"year\"] = merged[\"Centralisation Date\"].dt.year\n",
|
||
"\n",
|
||
"# Per year: count of non-null gaps, sum of rupture flags, and their ratio\n",
|
||
"yearly_stats = (\n",
|
||
"    merged.groupby(\"year\")\n",
|
||
"    .agg(\n",
|
||
"        total_obs=(\"gap\", \"count\"),\n",
|
||
"        ruptures=(\"rupture_flag\", \"sum\"),\n",
|
||
"    )\n",
|
||
"    .reset_index()\n",
|
||
"    .assign(rupture_rate=lambda yearly: yearly[\"ruptures\"] / yearly[\"total_obs\"])\n",
|
||
")\n"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": null,
|
||
"id": "f312684d-0815-439c-a632-cadd1cbb779c",
|
||
"metadata": {},
|
||
"outputs": [],
|
||
"source": []
|
||
}
|
||
],
|
||
"metadata": {
|
||
"kernelspec": {
|
||
"display_name": "Python 3 (ipykernel)",
|
||
"language": "python",
|
||
"name": "python3"
|
||
},
|
||
"language_info": {
|
||
"codemirror_mode": {
|
||
"name": "ipython",
|
||
"version": 3
|
||
},
|
||
"file_extension": ".py",
|
||
"mimetype": "text/x-python",
|
||
"name": "python",
|
||
"nbconvert_exporter": "python",
|
||
"pygments_lexer": "ipython3",
|
||
"version": "3.13.8"
|
||
}
|
||
},
|
||
"nbformat": 4,
|
||
"nbformat_minor": 5
|
||
}
|