{ "cells": [ { "cell_type": "code", "execution_count": 1, "id": "2e8cf88b-cecf-409f-9c2d-c3762b233f05", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Requirement already satisfied: openpyxl in /opt/python/lib/python3.13/site-packages (3.1.5)\n", "Requirement already satisfied: et-xmlfile in /opt/python/lib/python3.13/site-packages (from openpyxl) (2.0.0)\n", "\n", "\u001b[1m[\u001b[0m\u001b[34;49mnotice\u001b[0m\u001b[1;39;49m]\u001b[0m\u001b[39;49m A new release of pip is available: \u001b[0m\u001b[31;49m25.2\u001b[0m\u001b[39;49m -> \u001b[0m\u001b[32;49m25.3\u001b[0m\n", "\u001b[1m[\u001b[0m\u001b[34;49mnotice\u001b[0m\u001b[1;39;49m]\u001b[0m\u001b[39;49m To update, run: \u001b[0m\u001b[32;49mpip install --upgrade pip\u001b[0m\n" ] } ], "source": [ "!pip install openpyxl" ] }, { "cell_type": "code", "execution_count": 2, "id": "126c8a80-d9ad-4816-84f0-0c3d580f62c8", "metadata": {}, "outputs": [], "source": [ "import pandas as pd\n", "import numpy as np\n", "import matplotlib.pyplot as plt" ] }, { "cell_type": "code", "execution_count": 3, "id": "ff2261fb-9516-4410-b42d-3acc8dc1a460", "metadata": {}, "outputs": [], "source": [ "import os\n", "import s3fs\n", "os.environ[\"AWS_ACCESS_KEY_ID\"] = 'N1DBJCHI7YTK9AVMG6XT'\n", "os.environ[\"AWS_SECRET_ACCESS_KEY\"] = 'SRCPMh8a1eQxX6Z09GeDxZoD55MBpnkJzyBctLII'\n", "os.environ[\"AWS_SESSION_TOKEN\"] = 'eyJhbGciOiJIUzUxMiIsInR5cCI6IkpXVCJ9.eyJhY2Nlc3NLZXkiOiJOMURCSkNISTdZVEs5QVZNRzZYVCIsImFjciI6IjAiLCJhbGxvd2VkLW9yaWdpbnMiOlsiKiJdLCJhdWQiOlsibWluaW8iLCJhY2NvdW50Il0sImF1dGhfdGltZSI6MTc2MzEzMTgzNiwiYXpwIjoib255eGlhLW1pbmlvIiwiZW1haWwiOiJzYXJhaC50aG91bXlyZUBlbnNhZS5mciIsImVtYWlsX3ZlcmlmaWVkIjp0cnVlLCJleHAiOjE3NjQzNDE0MzksImZhbWlseV9uYW1lIjoiVEhPVU1ZUkUiLCJnaXZlbl9uYW1lIjoiU2FyYWgiLCJncm91cHMiOlsiYmRjLWRhdGEiLCJiZGMtY2FybWlnbmFjLWczIl0sImlhdCI6MTc2MzEzMTgzOCwiaXNzIjoiaHR0cHM6Ly9hdXRoLmdyb3VwZS1nZW5lcy5mci9yZWFsbXMvZ2VuZXMiLCJqdGkiOiJkY2I2MWJiZi1lZjU4LTRhMTItOGYyZS1jYTI0ZmUyNTA2YzEiLCJuYW1lIjoiU2FyYWggVEhPVU1ZUkUiLCJwb2xpY3kiOiJzdHNvbmx5IiwicHJlZmVycmVkX3VzZXJuYW1lIjoic3Rob3VteXJlLWVuc2FlIiwicmVhbG1fYWNjZXNzIjp7InJvbGVzIjpbIm9mZmxpbmVfYWNjZXNzIiwiZGVmYXVsdC1yb2xlcy1nZW5lcyIsInVtYV9hdXRob3JpemF0aW9uIl19LCJyZXNvdXJjZV9hY2Nlc3MiOnsiYWNjb3VudCI6eyJyb2xlcyI6WyJtYW5hZ2UtYWNjb3VudCIsIm1hbmFnZS1hY2NvdW50LWxpbmtzIiwidmlldy1wcm9maWxlIl19fSwic2NvcGUiOiJvcGVuaWQgcHJvZmlsZSBlbWFpbCIsInNpZCI6ImQxMDI0NGVlLWE3ZDMtNDA5MC04ZDA3LWNlOWY3YjM5MDRkNCIsInN1YiI6ImVhYWVkN2QyLWM4MjYtNGIxNC05MzczLTYwYjNhODhlMWFiNiIsInR5cCI6IkJlYXJlciJ9.sLXOE8w930_dXU0yNWroaDvaTvcUUCONMcbgbKeMEduQebXQjOS7gEQxo-I7Q2oqLFb_dhg1zBlwx5VpNjyTMA'\n", "os.environ[\"AWS_DEFAULT_REGION\"] = 'us-east-1'\n", "fs = s3fs.S3FileSystem(\n", " client_kwargs={'endpoint_url': 'https://'+'minio-simple.lab.groupe-genes.fr'},\n", " key = os.environ[\"AWS_ACCESS_KEY_ID\"], \n", " secret = os.environ[\"AWS_SECRET_ACCESS_KEY\"], \n", " token = os.environ[\"AWS_SESSION_TOKEN\"])" ] }, { "cell_type": "markdown", "id": "3d36f3f0-bd40-4a83-96d1-b46d75f5a4c5", "metadata": {}, "source": [ "# data exploration" ] }, { "cell_type": "markdown", "id": "eaf5c5a0-eb1c-4242-b893-7600e6def109", "metadata": {}, "source": [ "Fonctions utiles" ] }, { "cell_type": "code", "execution_count": 4, "id": "60e2035c-c2f0-4c51-97df-102e67ba96ee", "metadata": {}, "outputs": [], "source": [ "def plot_account(account_id, isin=None):\n", " \"\"\"\n", " Plots the stock (Quantity - AUM) evolution for a given Registrar Account.\n", " Optionally, only for one ISIN.\n", " \"\"\"\n", "\n", " df = merged.copy()\n", "\n", " # Filter by account\n", " df = df[df[\"Registrar Account - ID\"] == account_id]\n", "\n", " if isin is not None:\n", " df = df[df[\"Product - Isin\"] == isin]\n", "\n", " if df.empty:\n", " print(f\"No data found for account {account_id}\")\n", " return\n", "\n", " df_plot = df.groupby(\"Centralisation Date\")[\"Quantity - AUM\"].sum().reset_index()\n", "\n", " df_plot = df_plot.sort_values(\"Centralisation Date\")\n", "\n", " # Plot\n", " plt.figure(figsize=(12, 4))\n", " plt.plot(df_plot[\"Centralisation Date\"], df_plot[\"Quantity - AUM\"], marker='o')\n", " plt.title(f\"Stock Evolution for Account {account_id}\", fontsize=14)\n", " plt.xlabel(\"Date\")\n", " plt.ylabel(\"Total AUM\")\n", " plt.grid(True)\n", " plt.show()\n" ] }, { "cell_type": "code", "execution_count": 5, "id": "37e008b1-32d4-44be-9d23-1b90a5a26f89", "metadata": {}, "outputs": [], "source": [ "# 2. BASIC INSPECTION\n", "\n", "def quick_info(df, name):\n", " print(\"\\n\" + \"=\"*80)\n", " print(f\"DATASET : {name}\")\n", " print(\"=\"*80)\n", " print(\"\\nShape :\", df.shape)\n", " print(\"\\nColumns :\", df.columns.tolist())\n", " print(\"\\nDtypes :\\n\", df.dtypes)\n", " print(\"\\nMissing values (%) :\\n\", df.isna().mean().sort_values(ascending=False)*100)\n", " print(\"\\nSample rows:\\n\", df.head(5))\n", " print(\"\\nUnique values per column:\\n\", df.nunique().sort_values(ascending=False))" ] }, { "cell_type": "code", "execution_count": 6, "id": "e104a416-4cfd-43b9-b9ec-6af1fce700da", "metadata": {}, "outputs": [], "source": [ "import os\n", "import s3fs\n", "os.environ[\"AWS_ACCESS_KEY_ID\"] = 'R9J6QLRZDYWLSWKBJ7IA'\n", "os.environ[\"AWS_SECRET_ACCESS_KEY\"] = 'sFi4J78xigTJEXdY02bQL2i5KNwyYL7VZpMe0XJs'\n", "os.environ[\"AWS_SESSION_TOKEN\"] = 'eyJhbGciOiJIUzUxMiIsInR5cCI6IkpXVCJ9.eyJhY2Nlc3NLZXkiOiJSOUo2UUxSWkRZV0xTV0tCSjdJQSIsImFjciI6IjAiLCJhbGxvd2VkLW9yaWdpbnMiOlsiKiJdLCJhdWQiOlsibWluaW8iLCJhY2NvdW50Il0sImF1dGhfdGltZSI6MTc2NDM0Mzc0MSwiYXpwIjoib255eGlhLW1pbmlvIiwiZW1haWwiOiJzYXJhaC50aG91bXlyZUBlbnNhZS5mciIsImVtYWlsX3ZlcmlmaWVkIjp0cnVlLCJleHAiOjE3NjU1NTM4NjYsImZhbWlseV9uYW1lIjoiVEhPVU1ZUkUiLCJnaXZlbl9uYW1lIjoiU2FyYWgiLCJncm91cHMiOlsiYmRjLWRhdGEiLCJiZGMtY2FybWlnbmFjLWczIl0sImlhdCI6MTc2NDM0NDI2NiwiaXNzIjoiaHR0cHM6Ly9hdXRoLmdyb3VwZS1nZW5lcy5mci9yZWFsbXMvZ2VuZXMiLCJqdGkiOiIxZWEzOTgwMi0zZGNiLTRlMzEtYTkwMS1jNGE4M2Q5ZjQyYzQiLCJuYW1lIjoiU2FyYWggVEhPVU1ZUkUiLCJwb2xpY3kiOiJzdHNvbmx5IiwicHJlZmVycmVkX3VzZXJuYW1lIjoic3Rob3VteXJlLWVuc2FlIiwicmVhbG1fYWNjZXNzIjp7InJvbGVzIjpbIm9mZmxpbmVfYWNjZXNzIiwiZGVmYXVsdC1yb2xlcy1nZW5lcyIsInVtYV9hdXRob3JpemF0aW9uIl19LCJyZXNvdXJjZV9hY2Nlc3MiOnsiYWNjb3VudCI6eyJyb2xlcyI6WyJtYW5hZ2UtYWNjb3VudCIsIm1hbmFnZS1hY2NvdW50LWxpbmtzIiwidmlldy1wcm9maWxlIl19fSwic2NvcGUiOiJvcGVuaWQgcHJvZmlsZSBlbWFpbCIsInNpZCI6IjZlMDBhN2QxLTMxYWQtNGE3Ny04ZmE3LTBlYTc1Y2VhZTQwMCIsInN1YiI6ImVhYWVkN2QyLWM4MjYtNGIxNC05MzczLTYwYjNhODhlMWFiNiIsInR5cCI6IkJlYXJlciJ9.gUxGEq4iWA5eJVWYj-JyDZGYzTnWmQH92iYU-2_5P4n3erGFFZvz8wJe5keTAhcKKTycZKiWltnZpCNsRQ0vOg'\n", "os.environ[\"AWS_DEFAULT_REGION\"] = 'us-east-1'\n", "fs = s3fs.S3FileSystem(\n", " client_kwargs={'endpoint_url': 'https://'+'minio-simple.lab.groupe-genes.fr'},\n", " key = os.environ[\"AWS_ACCESS_KEY_ID\"], \n", " secret = os.environ[\"AWS_SECRET_ACCESS_KEY\"], \n", " token = os.environ[\"AWS_SESSION_TOKEN\"])" ] }, { "cell_type": "code", "execution_count": 8, "id": "e67a99ea-ddf4-4627-8f48-ec183c671acb", "metadata": {}, "outputs": [ { "ename": "PermissionError", "evalue": "Forbidden", "output_type": "error", "traceback": [ "\u001b[31m---------------------------------------------------------------------------\u001b[39m", "\u001b[31mClientError\u001b[39m Traceback (most recent call last)", "\u001b[36mFile \u001b[39m\u001b[32m/opt/python/lib/python3.13/site-packages/s3fs/core.py:114\u001b[39m, in \u001b[36m_error_wrapper\u001b[39m\u001b[34m(func, args, kwargs, retries)\u001b[39m\n\u001b[32m 113\u001b[39m \u001b[38;5;28;01mtry\u001b[39;00m:\n\u001b[32m--> \u001b[39m\u001b[32m114\u001b[39m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28;01mawait\u001b[39;00m func(*args, **kwargs)\n\u001b[32m 115\u001b[39m \u001b[38;5;28;01mexcept\u001b[39;00m S3_RETRYABLE_ERRORS \u001b[38;5;28;01mas\u001b[39;00m e:\n", "\u001b[36mFile \u001b[39m\u001b[32m/opt/python/lib/python3.13/site-packages/aiobotocore/context.py:36\u001b[39m, in \u001b[36mwith_current_context..decorator..wrapper\u001b[39m\u001b[34m(*args, **kwargs)\u001b[39m\n\u001b[32m 35\u001b[39m \u001b[38;5;28;01mawait\u001b[39;00m resolve_awaitable(hook())\n\u001b[32m---> \u001b[39m\u001b[32m36\u001b[39m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28;01mawait\u001b[39;00m func(*args, **kwargs)\n", "\u001b[36mFile \u001b[39m\u001b[32m/opt/python/lib/python3.13/site-packages/aiobotocore/client.py:424\u001b[39m, in \u001b[36mAioBaseClient._make_api_call\u001b[39m\u001b[34m(self, operation_name, api_params)\u001b[39m\n\u001b[32m 423\u001b[39m error_class = \u001b[38;5;28mself\u001b[39m.exceptions.from_code(error_code)\n\u001b[32m--> \u001b[39m\u001b[32m424\u001b[39m \u001b[38;5;28;01mraise\u001b[39;00m error_class(parsed_response, operation_name)\n\u001b[32m 425\u001b[39m \u001b[38;5;28;01melse\u001b[39;00m:\n", "\u001b[31mClientError\u001b[39m: An error occurred (403) when calling the HeadObject operation: Forbidden", "\nThe above exception was the direct cause of the following exception:\n", "\u001b[31mPermissionError\u001b[39m Traceback (most recent call last)", "\u001b[36mCell\u001b[39m\u001b[36m \u001b[39m\u001b[32mIn[8]\u001b[39m\u001b[32m, line 9\u001b[39m\n\u001b[32m 1\u001b[39m \u001b[33;03m\"\"\"\u001b[39;00m\n\u001b[32m 2\u001b[39m \u001b[33;03mwith fs.open(\u001b[39;00m\n\u001b[32m 3\u001b[39m \u001b[33;03m \"projet-bdc-data/carmignac/Flows ENSAE V2 -20251105.csv\",\u001b[39;00m\n\u001b[32m (...)\u001b[39m\u001b[32m 6\u001b[39m \u001b[33;03m flows = pd.read_csv(f, sep=\";\")\u001b[39;00m\n\u001b[32m 7\u001b[39m \u001b[33;03m\"\"\"\u001b[39;00m\n\u001b[32m----> \u001b[39m\u001b[32m9\u001b[39m \u001b[38;5;28;01mwith\u001b[39;00m \u001b[43mfs\u001b[49m\u001b[43m.\u001b[49m\u001b[43mopen\u001b[49m\u001b[43m(\u001b[49m\u001b[33;43m'\u001b[39;49m\u001b[33;43mprojet-bdc-data/carmignac/Monthly AUM and NAV since 2010.xlsx\u001b[39;49m\u001b[33;43m'\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[33;43m'\u001b[39;49m\u001b[33;43mrb\u001b[39;49m\u001b[33;43m'\u001b[39;49m\u001b[43m)\u001b[49m \u001b[38;5;28;01mas\u001b[39;00m f:\n\u001b[32m 10\u001b[39m nav_raw = pd.read_excel(f, header=\u001b[38;5;28;01mNone\u001b[39;00m, engine=\u001b[33m\"\u001b[39m\u001b[33mopenpyxl\u001b[39m\u001b[33m\"\u001b[39m)\n\u001b[32m 11\u001b[39m nav = nav_raw[\u001b[32m0\u001b[39m].str.split(\u001b[33m\"\u001b[39m\u001b[33m,\u001b[39m\u001b[33m\"\u001b[39m, expand=\u001b[38;5;28;01mTrue\u001b[39;00m)\n", "\u001b[36mFile \u001b[39m\u001b[32m/opt/python/lib/python3.13/site-packages/fsspec/spec.py:1338\u001b[39m, in \u001b[36mAbstractFileSystem.open\u001b[39m\u001b[34m(self, path, mode, block_size, cache_options, compression, **kwargs)\u001b[39m\n\u001b[32m 1336\u001b[39m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[32m 1337\u001b[39m ac = kwargs.pop(\u001b[33m\"\u001b[39m\u001b[33mautocommit\u001b[39m\u001b[33m\"\u001b[39m, \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;28mself\u001b[39m._intrans)\n\u001b[32m-> \u001b[39m\u001b[32m1338\u001b[39m f = \u001b[38;5;28;43mself\u001b[39;49m\u001b[43m.\u001b[49m\u001b[43m_open\u001b[49m\u001b[43m(\u001b[49m\n\u001b[32m 1339\u001b[39m \u001b[43m \u001b[49m\u001b[43mpath\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 1340\u001b[39m \u001b[43m \u001b[49m\u001b[43mmode\u001b[49m\u001b[43m=\u001b[49m\u001b[43mmode\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 1341\u001b[39m \u001b[43m \u001b[49m\u001b[43mblock_size\u001b[49m\u001b[43m=\u001b[49m\u001b[43mblock_size\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 1342\u001b[39m \u001b[43m \u001b[49m\u001b[43mautocommit\u001b[49m\u001b[43m=\u001b[49m\u001b[43mac\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 1343\u001b[39m \u001b[43m \u001b[49m\u001b[43mcache_options\u001b[49m\u001b[43m=\u001b[49m\u001b[43mcache_options\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 1344\u001b[39m \u001b[43m \u001b[49m\u001b[43m*\u001b[49m\u001b[43m*\u001b[49m\u001b[43mkwargs\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 1345\u001b[39m \u001b[43m \u001b[49m\u001b[43m)\u001b[49m\n\u001b[32m 1346\u001b[39m \u001b[38;5;28;01mif\u001b[39;00m compression \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m:\n\u001b[32m 1347\u001b[39m \u001b[38;5;28;01mfrom\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[34;01mfsspec\u001b[39;00m\u001b[34;01m.\u001b[39;00m\u001b[34;01mcompression\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[38;5;28;01mimport\u001b[39;00m compr\n", "\u001b[36mFile \u001b[39m\u001b[32m/opt/python/lib/python3.13/site-packages/s3fs/core.py:720\u001b[39m, in \u001b[36mS3FileSystem._open\u001b[39m\u001b[34m(self, path, mode, block_size, acl, version_id, fill_cache, cache_type, autocommit, size, requester_pays, cache_options, **kwargs)\u001b[39m\n\u001b[32m 717\u001b[39m \u001b[38;5;28;01mif\u001b[39;00m cache_type \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m:\n\u001b[32m 718\u001b[39m cache_type = \u001b[38;5;28mself\u001b[39m.default_cache_type\n\u001b[32m--> \u001b[39m\u001b[32m720\u001b[39m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43mS3File\u001b[49m\u001b[43m(\u001b[49m\n\u001b[32m 721\u001b[39m \u001b[43m \u001b[49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[43m,\u001b[49m\n\u001b[32m 722\u001b[39m \u001b[43m \u001b[49m\u001b[43mpath\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 723\u001b[39m \u001b[43m \u001b[49m\u001b[43mmode\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 724\u001b[39m \u001b[43m \u001b[49m\u001b[43mblock_size\u001b[49m\u001b[43m=\u001b[49m\u001b[43mblock_size\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 725\u001b[39m \u001b[43m \u001b[49m\u001b[43macl\u001b[49m\u001b[43m=\u001b[49m\u001b[43macl\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 726\u001b[39m \u001b[43m \u001b[49m\u001b[43mversion_id\u001b[49m\u001b[43m=\u001b[49m\u001b[43mversion_id\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 727\u001b[39m \u001b[43m \u001b[49m\u001b[43mfill_cache\u001b[49m\u001b[43m=\u001b[49m\u001b[43mfill_cache\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 728\u001b[39m \u001b[43m \u001b[49m\u001b[43ms3_additional_kwargs\u001b[49m\u001b[43m=\u001b[49m\u001b[43mkw\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 729\u001b[39m \u001b[43m \u001b[49m\u001b[43mcache_type\u001b[49m\u001b[43m=\u001b[49m\u001b[43mcache_type\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 730\u001b[39m \u001b[43m \u001b[49m\u001b[43mautocommit\u001b[49m\u001b[43m=\u001b[49m\u001b[43mautocommit\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 731\u001b[39m \u001b[43m \u001b[49m\u001b[43mrequester_pays\u001b[49m\u001b[43m=\u001b[49m\u001b[43mrequester_pays\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 732\u001b[39m \u001b[43m \u001b[49m\u001b[43mcache_options\u001b[49m\u001b[43m=\u001b[49m\u001b[43mcache_options\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 733\u001b[39m \u001b[43m \u001b[49m\u001b[43msize\u001b[49m\u001b[43m=\u001b[49m\u001b[43msize\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 734\u001b[39m \u001b[43m\u001b[49m\u001b[43m)\u001b[49m\n", "\u001b[36mFile \u001b[39m\u001b[32m/opt/python/lib/python3.13/site-packages/s3fs/core.py:2257\u001b[39m, in \u001b[36mS3File.__init__\u001b[39m\u001b[34m(self, s3, path, mode, block_size, acl, version_id, fill_cache, s3_additional_kwargs, autocommit, cache_type, requester_pays, cache_options, size)\u001b[39m\n\u001b[32m 2255\u001b[39m \u001b[38;5;28mself\u001b[39m.details = s3.info(path)\n\u001b[32m 2256\u001b[39m \u001b[38;5;28mself\u001b[39m.version_id = \u001b[38;5;28mself\u001b[39m.details.get(\u001b[33m\"\u001b[39m\u001b[33mVersionId\u001b[39m\u001b[33m\"\u001b[39m)\n\u001b[32m-> \u001b[39m\u001b[32m2257\u001b[39m \u001b[38;5;28;43msuper\u001b[39;49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m\u001b[43m.\u001b[49m\u001b[34;43m__init__\u001b[39;49m\u001b[43m(\u001b[49m\n\u001b[32m 2258\u001b[39m \u001b[43m \u001b[49m\u001b[43ms3\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 2259\u001b[39m \u001b[43m \u001b[49m\u001b[43mpath\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 2260\u001b[39m \u001b[43m \u001b[49m\u001b[43mmode\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 2261\u001b[39m \u001b[43m \u001b[49m\u001b[43mblock_size\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 2262\u001b[39m \u001b[43m \u001b[49m\u001b[43mautocommit\u001b[49m\u001b[43m=\u001b[49m\u001b[43mautocommit\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 2263\u001b[39m \u001b[43m \u001b[49m\u001b[43mcache_type\u001b[49m\u001b[43m=\u001b[49m\u001b[43mcache_type\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 2264\u001b[39m \u001b[43m \u001b[49m\u001b[43mcache_options\u001b[49m\u001b[43m=\u001b[49m\u001b[43mcache_options\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 2265\u001b[39m \u001b[43m \u001b[49m\u001b[43msize\u001b[49m\u001b[43m=\u001b[49m\u001b[43msize\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 2266\u001b[39m \u001b[43m\u001b[49m\u001b[43m)\u001b[49m\n\u001b[32m 2267\u001b[39m \u001b[38;5;28mself\u001b[39m.s3 = \u001b[38;5;28mself\u001b[39m.fs \u001b[38;5;66;03m# compatibility\u001b[39;00m\n\u001b[32m 2269\u001b[39m \u001b[38;5;66;03m# when not using autocommit we want to have transactional state to manage\u001b[39;00m\n", "\u001b[36mFile \u001b[39m\u001b[32m/opt/python/lib/python3.13/site-packages/fsspec/spec.py:1912\u001b[39m, in \u001b[36mAbstractBufferedFile.__init__\u001b[39m\u001b[34m(self, fs, path, mode, block_size, autocommit, cache_type, cache_options, size, **kwargs)\u001b[39m\n\u001b[32m 1910\u001b[39m \u001b[38;5;28mself\u001b[39m.size = size\n\u001b[32m 1911\u001b[39m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[32m-> \u001b[39m\u001b[32m1912\u001b[39m \u001b[38;5;28mself\u001b[39m.size = \u001b[38;5;28;43mself\u001b[39;49m\u001b[43m.\u001b[49m\u001b[43mdetails\u001b[49m[\u001b[33m\"\u001b[39m\u001b[33msize\u001b[39m\u001b[33m\"\u001b[39m]\n\u001b[32m 1913\u001b[39m \u001b[38;5;28mself\u001b[39m.cache = caches[cache_type](\n\u001b[32m 1914\u001b[39m \u001b[38;5;28mself\u001b[39m.blocksize, \u001b[38;5;28mself\u001b[39m._fetch_range, \u001b[38;5;28mself\u001b[39m.size, **cache_options\n\u001b[32m 1915\u001b[39m )\n\u001b[32m 1916\u001b[39m \u001b[38;5;28;01melse\u001b[39;00m:\n", "\u001b[36mFile \u001b[39m\u001b[32m/opt/python/lib/python3.13/site-packages/fsspec/spec.py:1925\u001b[39m, in \u001b[36mAbstractBufferedFile.details\u001b[39m\u001b[34m(self)\u001b[39m\n\u001b[32m 1922\u001b[39m \u001b[38;5;129m@property\u001b[39m\n\u001b[32m 1923\u001b[39m \u001b[38;5;28;01mdef\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[34mdetails\u001b[39m(\u001b[38;5;28mself\u001b[39m):\n\u001b[32m 1924\u001b[39m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28mself\u001b[39m._details \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m:\n\u001b[32m-> \u001b[39m\u001b[32m1925\u001b[39m \u001b[38;5;28mself\u001b[39m._details = \u001b[38;5;28;43mself\u001b[39;49m\u001b[43m.\u001b[49m\u001b[43mfs\u001b[49m\u001b[43m.\u001b[49m\u001b[43minfo\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[43m.\u001b[49m\u001b[43mpath\u001b[49m\u001b[43m)\u001b[49m\n\u001b[32m 1926\u001b[39m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28mself\u001b[39m._details\n", "\u001b[36mFile \u001b[39m\u001b[32m/opt/python/lib/python3.13/site-packages/fsspec/asyn.py:118\u001b[39m, in \u001b[36msync_wrapper..wrapper\u001b[39m\u001b[34m(*args, **kwargs)\u001b[39m\n\u001b[32m 115\u001b[39m \u001b[38;5;129m@functools\u001b[39m.wraps(func)\n\u001b[32m 116\u001b[39m \u001b[38;5;28;01mdef\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[34mwrapper\u001b[39m(*args, **kwargs):\n\u001b[32m 117\u001b[39m \u001b[38;5;28mself\u001b[39m = obj \u001b[38;5;129;01mor\u001b[39;00m args[\u001b[32m0\u001b[39m]\n\u001b[32m--> \u001b[39m\u001b[32m118\u001b[39m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43msync\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[43m.\u001b[49m\u001b[43mloop\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mfunc\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43m*\u001b[49m\u001b[43margs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43m*\u001b[49m\u001b[43m*\u001b[49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n", "\u001b[36mFile \u001b[39m\u001b[32m/opt/python/lib/python3.13/site-packages/fsspec/asyn.py:103\u001b[39m, in \u001b[36msync\u001b[39m\u001b[34m(loop, func, timeout, *args, **kwargs)\u001b[39m\n\u001b[32m 101\u001b[39m \u001b[38;5;28;01mraise\u001b[39;00m FSTimeoutError \u001b[38;5;28;01mfrom\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[34;01mreturn_result\u001b[39;00m\n\u001b[32m 102\u001b[39m \u001b[38;5;28;01melif\u001b[39;00m \u001b[38;5;28misinstance\u001b[39m(return_result, \u001b[38;5;167;01mBaseException\u001b[39;00m):\n\u001b[32m--> \u001b[39m\u001b[32m103\u001b[39m \u001b[38;5;28;01mraise\u001b[39;00m return_result\n\u001b[32m 104\u001b[39m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[32m 105\u001b[39m \u001b[38;5;28;01mreturn\u001b[39;00m return_result\n", "\u001b[36mFile \u001b[39m\u001b[32m/opt/python/lib/python3.13/site-packages/fsspec/asyn.py:56\u001b[39m, in \u001b[36m_runner\u001b[39m\u001b[34m(event, coro, result, timeout)\u001b[39m\n\u001b[32m 54\u001b[39m coro = asyncio.wait_for(coro, timeout=timeout)\n\u001b[32m 55\u001b[39m \u001b[38;5;28;01mtry\u001b[39;00m:\n\u001b[32m---> \u001b[39m\u001b[32m56\u001b[39m result[\u001b[32m0\u001b[39m] = \u001b[38;5;28;01mawait\u001b[39;00m coro\n\u001b[32m 57\u001b[39m \u001b[38;5;28;01mexcept\u001b[39;00m \u001b[38;5;167;01mException\u001b[39;00m \u001b[38;5;28;01mas\u001b[39;00m ex:\n\u001b[32m 58\u001b[39m result[\u001b[32m0\u001b[39m] = ex\n", "\u001b[36mFile \u001b[39m\u001b[32m/opt/python/lib/python3.13/site-packages/s3fs/core.py:1445\u001b[39m, in \u001b[36mS3FileSystem._info\u001b[39m\u001b[34m(self, path, bucket, key, refresh, version_id)\u001b[39m\n\u001b[32m 1443\u001b[39m \u001b[38;5;28;01mif\u001b[39;00m key:\n\u001b[32m 1444\u001b[39m \u001b[38;5;28;01mtry\u001b[39;00m:\n\u001b[32m-> \u001b[39m\u001b[32m1445\u001b[39m out = \u001b[38;5;28;01mawait\u001b[39;00m \u001b[38;5;28mself\u001b[39m._call_s3(\n\u001b[32m 1446\u001b[39m \u001b[33m\"\u001b[39m\u001b[33mhead_object\u001b[39m\u001b[33m\"\u001b[39m,\n\u001b[32m 1447\u001b[39m \u001b[38;5;28mself\u001b[39m.kwargs,\n\u001b[32m 1448\u001b[39m Bucket=bucket,\n\u001b[32m 1449\u001b[39m Key=key,\n\u001b[32m 1450\u001b[39m **version_id_kw(version_id),\n\u001b[32m 1451\u001b[39m **\u001b[38;5;28mself\u001b[39m.req_kw,\n\u001b[32m 1452\u001b[39m )\n\u001b[32m 1453\u001b[39m \u001b[38;5;28;01mreturn\u001b[39;00m {\n\u001b[32m 1454\u001b[39m \u001b[33m\"\u001b[39m\u001b[33mETag\u001b[39m\u001b[33m\"\u001b[39m: out.get(\u001b[33m\"\u001b[39m\u001b[33mETag\u001b[39m\u001b[33m\"\u001b[39m, \u001b[33m\"\u001b[39m\u001b[33m\"\u001b[39m),\n\u001b[32m 1455\u001b[39m \u001b[33m\"\u001b[39m\u001b[33mLastModified\u001b[39m\u001b[33m\"\u001b[39m: out.get(\u001b[33m\"\u001b[39m\u001b[33mLastModified\u001b[39m\u001b[33m\"\u001b[39m, \u001b[33m\"\u001b[39m\u001b[33m\"\u001b[39m),\n\u001b[32m (...)\u001b[39m\u001b[32m 1461\u001b[39m \u001b[33m\"\u001b[39m\u001b[33mContentType\u001b[39m\u001b[33m\"\u001b[39m: out.get(\u001b[33m\"\u001b[39m\u001b[33mContentType\u001b[39m\u001b[33m\"\u001b[39m),\n\u001b[32m 1462\u001b[39m }\n\u001b[32m 1463\u001b[39m \u001b[38;5;28;01mexcept\u001b[39;00m \u001b[38;5;167;01mFileNotFoundError\u001b[39;00m:\n", "\u001b[36mFile \u001b[39m\u001b[32m/opt/python/lib/python3.13/site-packages/s3fs/core.py:371\u001b[39m, in \u001b[36mS3FileSystem._call_s3\u001b[39m\u001b[34m(self, method, *akwarglist, **kwargs)\u001b[39m\n\u001b[32m 369\u001b[39m logger.debug(\u001b[33m\"\u001b[39m\u001b[33mCALL: \u001b[39m\u001b[38;5;132;01m%s\u001b[39;00m\u001b[33m - \u001b[39m\u001b[38;5;132;01m%s\u001b[39;00m\u001b[33m - \u001b[39m\u001b[38;5;132;01m%s\u001b[39;00m\u001b[33m\"\u001b[39m, method.\u001b[34m__name__\u001b[39m, akwarglist, kw2)\n\u001b[32m 370\u001b[39m additional_kwargs = \u001b[38;5;28mself\u001b[39m._get_s3_method_kwargs(method, *akwarglist, **kwargs)\n\u001b[32m--> \u001b[39m\u001b[32m371\u001b[39m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28;01mawait\u001b[39;00m _error_wrapper(\n\u001b[32m 372\u001b[39m method, kwargs=additional_kwargs, retries=\u001b[38;5;28mself\u001b[39m.retries\n\u001b[32m 373\u001b[39m )\n", "\u001b[36mFile \u001b[39m\u001b[32m/opt/python/lib/python3.13/site-packages/s3fs/core.py:146\u001b[39m, in \u001b[36m_error_wrapper\u001b[39m\u001b[34m(func, args, kwargs, retries)\u001b[39m\n\u001b[32m 144\u001b[39m err = e\n\u001b[32m 145\u001b[39m err = translate_boto_error(err)\n\u001b[32m--> \u001b[39m\u001b[32m146\u001b[39m \u001b[38;5;28;01mraise\u001b[39;00m err\n", "\u001b[31mPermissionError\u001b[39m: Forbidden" ] } ], "source": [ "\"\"\"\n", "with fs.open(\n", " \"projet-bdc-data/carmignac/Flows ENSAE V2 -20251105.csv\",\n", " \"rb\"\n", ") as f:\n", " flows = pd.read_csv(f, sep=\";\")\n", "\"\"\"\n", "\n", "with fs.open('projet-bdc-data/carmignac/Monthly AUM and NAV since 2010.xlsx', 'rb') as f:\n", " nav_raw = pd.read_excel(f, header=None, engine=\"openpyxl\")\n", "nav = nav_raw[0].str.split(\",\", expand=True)\n", "nav.columns = nav.iloc[0]\n", "with fs.open('projet-bdc-data/carmignac/AUM ENSAE V2 -20251105.csv', 'rb') as f:\n", " stocks = pd.read_csv(f, sep=\";\")\n", "\n", "nav = nav[1:].reset_index(drop=True)\n", "\n", "quick_info(stocks, \"STOCKS\")\n", "quick_info(flows, \"FLOWS\")\n", "quick_info(nav, \"NAV/PRICES\")" ] }, { "cell_type": "code", "execution_count": null, "id": "9bc92c9f-216c-475e-bfb8-edc1a4e839f6", "metadata": {}, "outputs": [], "source": [ "# 1. CLEAN DATES (formats différents)\n", "\n", "stocks[\"Centralisation Date\"] = pd.to_datetime(stocks[\"Centralisation Date\"], errors=\"coerce\")\n", "flows[\"Centralisation Date\"] = pd.to_datetime(flows[\"Centralisation Date\"], errors=\"coerce\")\n", "nav[\"NavDate\"] = pd.to_datetime(nav[\"NavDate\"], format=\"%d/%m/%Y\", errors=\"coerce\")\n", "\n", "print(\"Date conversion done.\")\n", "\n", "# 2. CLEAN NUMERIC COLUMNS FOR NAV FILE\n", "\n", "num_cols = [\"PortfolioAum_Eur\",\"ShareClassPrice\",\"NumberOfShares\",\n", " \"ShareClassAumLocalCur\",\"ShareClassAum_EUR\"]\n", "\n", "for col in num_cols:\n", " nav[col] = (\n", " nav[col]\n", " .astype(str)\n", " .str.replace(\",\", \".\", regex=False)\n", " .str.replace(\" \", \"\")\n", " .astype(float)\n", " )\n", "\n", "print(\"NAV numeric conversion done.\")\n", "\n", "# 3. STANDARDIZE STRINGS FOR JOIN KEYS\n", "\n", "def norm(df):\n", " for col in df.columns:\n", " if df[col].dtype == \"object\":\n", " df[col] = df[col].astype(str).str.strip().str.upper()\n", " return df\n", "\n", "stocks = norm(stocks)\n", "flows = norm(flows)\n", "nav = norm(nav)\n", "\n", "print(\"String normalization done.\")\n", "\n", "\n", "# 4. ANALYSE RELATIONS ACROSS FILES\n", "\n", "# Unique sets\n", "isin_stocks = set(stocks[\"Product - Isin\"].unique())\n", "isin_flows = set(flows[\"Product - Isin\"].unique())\n", "isin_nav = set(nav[\"ShareClassIsin\"].unique())\n", "\n", "print(\"\\nISIN missing in FLOWS but present in STOCKS :\", len(isin_stocks - isin_flows))\n", "print(\"\\nISIN missing in STOCKS but present in FLOWS :\", len(isin_flows - isin_stocks))\n", "print(\"\\nISIN missing in NAV but present in FLOWS :\", len(isin_flows - isin_nav))\n", "print(\"\\nISIN missing in NAV but present in STOCKS :\", len(isin_stocks - isin_nav))\n", "\n", "\n", "# 5. CLIENTS: STOCKS VS FLOWS\n", "\n", "acc_stocks = set(stocks[\"Registrar Account - ID\"].unique())\n", "acc_flows = set(flows[\"Registrar Account - ID\"].unique())\n", "\n", "print(\"\\nAccounts in STOCKS but NEVER in FLOWS :\", len(acc_stocks - acc_flows))\n", "print(\"\\nAccounts in FLOWS but NEVER in STOCKS :\", len(acc_flows - acc_stocks))\n", "\n", "\n", "# 6. CLIENT ACTIVITY METRICS (DETAILED)\n", "\n", "client_behavior = flows.groupby(\"Registrar Account - ID\").agg(\n", " n_days=(\"Centralisation Date\", lambda x: x.nunique()),\n", " n_transactions=(\"Quantity - NetFlows\", \"count\"),\n", " total_netflows=(\"Quantity - NetFlows\", \"sum\"),\n", " mean_flow=(\"Quantity - NetFlows\", \"mean\"),\n", " std_flow=(\"Quantity - NetFlows\", \"std\"),\n", " total_subscription=(\"Quantity - Subscription\", \"sum\"),\n", " total_redemption=(\"Quantity - Redemption\", \"sum\")\n", ").reset_index()\n", "\n", "# Add churn metric\n", "client_behavior[\"churn_ratio\"] = (\n", " client_behavior[\"total_redemption\"] /\n", " (client_behavior[\"total_subscription\"] + 1e-9)\n", ")\n", "\n", "print(\"\\nCLIENT BEHAVIOR (first 5 rows):\\n\", client_behavior.head())\n", "\n", "\n", "# 7. FUNDS ACTIVITY METRICS\n", "\n", "fund_behavior = flows.groupby(\"Product - Isin\").agg(\n", " n_accounts=(\"Registrar Account - ID\", \"nunique\"),\n", " n_days=(\"Centralisation Date\", lambda x: x.nunique()),\n", " total_netflows=(\"Quantity - NetFlows\", \"sum\"),\n", " vol_flows=(\"Quantity - NetFlows\", \"std\")\n", ").reset_index()\n", "\n", "print(\"\\nFUND BEHAVIOR (first 5 rows):\\n\", fund_behavior.head())\n", "\n", "\n", "# 8. SAVE INTERMEDIATE\n", "\n", "client_behavior.to_csv(\"client_behavior.csv\", index=False)\n", "fund_behavior.to_csv(\"fund_behavior.csv\", index=False)" ] }, { "cell_type": "code", "execution_count": null, "id": "afb51598-3a7b-41f2-8d25-5b4b8bfb1c8a", "metadata": {}, "outputs": [], "source": [ "valid_full = isin_stocks & isin_flows & isin_nav\n", "stocks_only = isin_stocks - isin_flows\n", "flows_only = isin_flows - isin_stocks\n", "missing_nav = (isin_stocks | isin_flows) - isin_nav\n", "\n", "print(\"FULL usable ISIN :\", len(valid_full))\n", "print(\"Stocks only ISIN :\", len(stocks_only))\n", "print(\"Flows only ISIN :\", len(flows_only))\n", "print(\"Missing NAV :\", len(missing_nav))\n", "\n", "pd.DataFrame({\"isin\": list(valid_full)}).to_csv(\"isin_full.csv\", index=False)\n", "pd.DataFrame({\"isin\": list(stocks_only)}).to_csv(\"isin_stocks_only.csv\", index=False)\n", "pd.DataFrame({\"isin\": list(flows_only)}).to_csv(\"isin_flows_only.csv\", index=False)\n", "pd.DataFrame({\"isin\": list(missing_nav)}).to_csv(\"isin_missing_nav.csv\", index=False)\n", "\n", "print(\"All ISIN groups saved into 4 separate files.\")\n", "\n" ] }, { "cell_type": "code", "execution_count": null, "id": "61e0c71a-a1c6-4ed8-ba15-b7a9badc4d4a", "metadata": {}, "outputs": [], "source": [ "eps = 1e-6\n", "\n", "client_behavior[\"churn_ratio\"] = (\n", " client_behavior[\"total_redemption\"] /\n", " (client_behavior[\"total_subscription\"] + eps)\n", ")\n", "\n", "client_behavior[\"churn_flag\"] = (\n", " client_behavior[\"total_redemption\"] > client_behavior[\"total_subscription\"]\n", ").astype(int)\n", "\n", "client_behavior[\"activity_score\"] = np.log1p(client_behavior[\"n_transactions\"])\n", "\n", "client_behavior[\"flow_volatility\"] = client_behavior[\"std_flow\"].fillna(0)\n", "\n", "client_behavior[\"inertia_ratio\"] = (\n", " 1 - client_behavior[\"n_days\"] / flows[\"Centralisation Date\"].nunique()\n", ")\n", "\n", "print(client_behavior.head())\n", "\n", "client_behavior.to_csv(\"client_behavior_clean.csv\", index=False)\n" ] }, { "cell_type": "code", "execution_count": null, "id": "8ee7e911-eb73-4846-b545-661140411c1b", "metadata": {}, "outputs": [], "source": [ "# Diversification per account\n", "account_div = stocks.groupby(\"Registrar Account - ID\").agg(\n", " n_isin_held=(\"Product - Isin\", \"nunique\"),\n", " n_funds_held=(\"Product - Fund\", \"nunique\"),\n", " n_asset_types=(\"Product - Asset Type\", \"nunique\"),\n", " n_strategies=(\"Product - Strategy\", \"nunique\"),\n", " total_aum=(\"Value - AUM €\", \"sum\"),\n", " median_aum=(\"Value - AUM €\", \"median\")\n", ").reset_index()\n", "\n", "# Concentration ratio per account\n", "aum_by_account_fund = stocks.groupby(\n", " [\"Registrar Account - ID\", \"Product - Fund\"]\n", ")[\"Value - AUM €\"].sum().reset_index()\n", "\n", "concentration = aum_by_account_fund.groupby(\"Registrar Account - ID\") \\\n", " .apply(lambda x: x[\"Value - AUM €\"].max() / x[\"Value - AUM €\"].sum()) \\\n", " .reset_index(name=\"concentration_ratio\")\n", "\n", "# Merge diversification + concentration\n", "account_static = account_div.merge(concentration, on=\"Registrar Account - ID\", how=\"left\")\n", "\n", "print(account_static.head())\n", "print(account_static.describe())\n" ] }, { "cell_type": "code", "execution_count": null, "id": "76f6fa0d-9d7a-4145-af1c-986d83947f91", "metadata": {}, "outputs": [], "source": [ "# Geographic info per account\n", "geo = stocks.groupby(\"Registrar Account - ID\").agg(\n", " country=(\"RegistrarAccount - Country\", lambda x: x.mode()[0]),\n", " region=(\"Registrar Account - Region\", lambda x: x.mode()[0])\n", ").reset_index()\n", "\n", "print(geo.head())\n" ] }, { "cell_type": "code", "execution_count": null, "id": "e9bb67ab-9029-4ace-b960-b3d6e0b8683c", "metadata": {}, "outputs": [], "source": [ "# 1. Merge behavior (flows) with static diversification (stocks)\n", "client_master = client_behavior.merge(\n", " account_static,\n", " on=\"Registrar Account - ID\",\n", " how=\"left\"\n", ")\n", "\n", "# 2. Add geographic info\n", "client_master = client_master.merge(\n", " geo,\n", " on=\"Registrar Account - ID\",\n", " how=\"left\"\n", ")\n", "\n", "# 3. Create additional engineered features\n", "client_master[\"log_total_aum\"] = np.log1p(client_master[\"total_aum\"].clip(lower=0))\n", "client_master[\"log_median_aum\"] = np.log1p(client_master[\"median_aum\"].clip(lower=0))\n", "\n", "\n", "# 4. Replace NaN flow volatility with 0 (inactive accounts)\n", "client_master[\"flow_volatility\"] = client_master[\"flow_volatility\"].fillna(0)\n", "\n", "# 5. Fill missing diversification metrics with 0 (for accounts without stocks)\n", "client_master[[\"n_isin_held\",\"n_funds_held\",\"n_asset_types\",\"n_strategies\"]] = \\\n", " client_master[[\"n_isin_held\",\"n_funds_held\",\"n_asset_types\",\"n_strategies\"]].fillna(0)\n", "\n", "# 6. Fill missing geography as “UNKNOWN”\n", "client_master[\"country\"] = client_master[\"country\"].fillna(\"UNKNOWN\")\n", "client_master[\"region\"] = client_master[\"region\"].fillna(\"UNKNOWN\")\n", "\n", "# 7. Export\n", "client_master.to_csv(\"client_master.csv\", index=False)\n", "\n", "print(client_master.head())\n", "print(client_master.describe(include='all'))\n" ] }, { "cell_type": "markdown", "id": "fb1e98a5-6ab4-4371-ba45-6558ff38c839", "metadata": {}, "source": [ "Détection des ruptures" ] }, { "cell_type": "code", "execution_count": null, "id": "6bdd8077-c8e0-451d-a7b8-15a2705ad196", "metadata": {}, "outputs": [], "source": [ "# --- 1. PREPARE STOCKS ---\n", "stocks_clean = stocks[[\n", " \"Registrar Account - ID\", \"Product - Isin\", \n", " \"Centralisation Date\", \"Quantity - AUM\"\n", "]].copy()\n", "\n", "stocks_clean[\"Centralisation Date\"] = pd.to_datetime(stocks_clean[\"Centralisation Date\"])\n", "stocks_clean = stocks_clean.sort_values([\"Registrar Account - ID\", \"Product - Isin\", \"Centralisation Date\"])\n", "\n", "# --- 2. PREPARE FLOWS ---\n", "flows_clean = flows[[\n", " \"Registrar Account - ID\", \"Product - Isin\", \n", " \"Centralisation Date\", \"Quantity - NetFlows\"\n", "]].copy()\n", "\n", "flows_clean[\"Centralisation Date\"] = pd.to_datetime(flows_clean[\"Centralisation Date\"])\n", "\n", "# Aggregate flows per day to avoid duplicates\n", "flows_clean = flows_clean.groupby(\n", " [\"Registrar Account - ID\", \"Product - Isin\", \"Centralisation Date\"]\n", ")[\"Quantity - NetFlows\"].sum().reset_index()\n", "\n", "# --- 3. MERGE STOCKS WITH FLOWS ---\n", "merged = stocks_clean.merge(\n", " flows_clean,\n", " on=[\"Registrar Account - ID\", \"Product - Isin\", \"Centralisation Date\"],\n", " how=\"left\"\n", ")\n", "\n", "merged[\"Quantity - NetFlows\"] = merged[\"Quantity - NetFlows\"].fillna(0)\n", "\n", "# --- 4. SHIFT STOCKS TO COMPARE t vs t+1 ---\n", "merged[\"prev_stock\"] = merged.groupby(\n", " [\"Registrar Account - ID\", \"Product - Isin\"]\n", ")[\"Quantity - AUM\"].shift(1)\n", "\n", "# SHIFT NET FLOWS FROM PREVIOUS DATE\n", "merged[\"prev_netflows\"] = merged.groupby(\n", " [\"Registrar Account - ID\", \"Product - Isin\"]\n", ")[\"Quantity - NetFlows\"].shift(1).fillna(0)\n", "\n", "# Expected stock\n", "merged[\"expected_stock\"] = merged[\"prev_stock\"] + merged[\"prev_netflows\"]\n", "\n", "# --- 5. COMPUTE GAP BETWEEN EXPECTED AND REAL ---\n", "merged[\"gap\"] = merged[\"Quantity - AUM\"] - merged[\"expected_stock\"]\n", "\n", "# tolerance for numerical noise\n", "TOL = 1e-6\n", "merged[\"rupture_flag\"] = (merged[\"prev_stock\"].notna()) & (merged[\"gap\"].abs() > TOL)\n", "\n", "# --- 6. AGGREGATE BY CLIENT TO DETECT BIG ISSUES ---\n", "rupture_summary = merged.groupby(\"Registrar Account - ID\").agg(\n", " n_ruptures=(\"rupture_flag\", \"sum\"),\n", " total_obs=(\"rupture_flag\", \"count\"),\n", " rupture_ratio=(\"rupture_flag\", \"mean\"),\n", " max_gap=(\"gap\", lambda x: x.abs().max())\n", ").reset_index()\n", "\n", "# Sort by biggest anomalies\n", "rupture_summary = rupture_summary.sort_values(\"rupture_ratio\", ascending=False)\n", "\n", "rupture_summary.head(10)\n", "\n", "rupture_summary_asc = rupture_summary.sort_values(\"rupture_ratio\", ascending=True)\n", "rupture_summary_asc.to_csv('rupture.csv')" ] }, { "cell_type": "code", "execution_count": null, "id": "9e32fd6b-4754-4196-9487-ffdc0bb4fc06", "metadata": {}, "outputs": [], "source": [ "merged.to_csv('merged.csv')" ] }, { "cell_type": "code", "execution_count": null, "id": "71cd67aa-f4b9-489e-b928-defeca459cb6", "metadata": {}, "outputs": [], "source": [ "rupture_summary_asc" ] }, { "cell_type": "code", "execution_count": null, "id": "72332a7e-0ab0-474b-aac7-b52ebbba7a8b", "metadata": {}, "outputs": [], "source": [ "plot_account('200001928')\n", "plot_account('366351')\n", "plot_account('365966')\n", "plot_account('365568')\n", "plot_account('200129601')\n", "plot_account('402410')\n" ] }, { "cell_type": "code", "execution_count": null, "id": "31407450-a833-4fce-8b0b-dba1b1de585f", "metadata": {}, "outputs": [], "source": [ "# 1. Prepare stock dataset ISIN-by-ISIN\n", "stocks_isin = stocks[[\n", " \"Registrar Account - ID\", \"Product - Isin\",\n", " \"Centralisation Date\", \"Quantity - AUM\"\n", "]].copy()\n", "\n", "stocks_isin[\"Centralisation Date\"] = pd.to_datetime(stocks_isin[\"Centralisation Date\"])\n", "stocks_isin = stocks_isin.sort_values(\n", " [\"Registrar Account - ID\", \"Product - Isin\", \"Centralisation Date\"]\n", ")\n", "\n", "# 2. Prepare flows dataset ISIN-by-ISIN\n", "flows_isin = flows[[\n", " \"Registrar Account - ID\", \"Product - Isin\",\n", " \"Centralisation Date\", \"Quantity - NetFlows\"\n", "]].copy()\n", "\n", "flows_isin[\"Centralisation Date\"] = pd.to_datetime(flows_isin[\"Centralisation Date\"])\n", "\n", "flows_isin = flows_isin.groupby(\n", " [\"Registrar Account - ID\", \"Product - Isin\", \"Centralisation Date\"]\n", ")[\"Quantity - NetFlows\"].sum().reset_index()\n", "\n", "# 3. Merge stocks & flows ISIN-by-ISIN\n", "merged_isin = stocks_isin.merge(\n", " flows_isin,\n", " on=[\"Registrar Account - ID\", \"Product - Isin\", \"Centralisation Date\"],\n", " how=\"left\"\n", ")\n", "\n", "merged_isin[\"Quantity - NetFlows\"] = merged_isin[\"Quantity - NetFlows\"].fillna(0)\n", "\n", "# 4. Compute expected stock per ISIN for each account\n", "merged_isin[\"prev_stock\"] = merged_isin.groupby(\n", " [\"Registrar Account - ID\", \"Product - Isin\"]\n", ")[\"Quantity - AUM\"].shift(1)\n", "\n", "merged_isin[\"prev_netflows\"] = merged_isin.groupby(\n", " [\"Registrar Account - ID\", \"Product - Isin\"]\n", ")[\"Quantity - NetFlows\"].shift(1).fillna(0)\n", "\n", "merged_isin[\"expected_stock\"] = merged_isin[\"prev_stock\"] + merged_isin[\"prev_netflows\"]\n", "\n", "# 5. Detect ruptures ISIN-by-ISIN (no aggregation)\n", "TOL = 1e-6\n", "merged_isin[\"gap\"] = merged_isin[\"Quantity - AUM\"] - merged_isin[\"expected_stock\"]\n", "merged_isin[\"rupture_flag\"] = (\n", " merged_isin[\"prev_stock\"].notna()\n", " & (merged_isin[\"gap\"].abs() > TOL)\n", ")\n", "\n", "# 6. Summarize ruptures per (Account, ISIN)\n", "rupture_isin_summary = merged_isin.groupby(\n", " [\"Registrar Account - ID\", \"Product - Isin\"]\n", ").agg(\n", " n_ruptures=(\"rupture_flag\", \"sum\"),\n", " obs=(\"rupture_flag\", \"count\"),\n", " rupture_ratio=(\"rupture_flag\", \"mean\"),\n", " max_gap=(\"gap\", lambda x: x.abs().max())\n", ").reset_index()\n", "\n", "# Sort by worst ISIN trajectories\n", "rupture_isin_summary = rupture_isin_summary.sort_values(\n", " \"rupture_ratio\", ascending=False\n", ")\n", "\n", "rupture_isin_summary.head(20)\n" ] }, { "cell_type": "code", "execution_count": null, "id": "baa4b6cd-887d-45a6-af27-253a9aa8710f", "metadata": {}, "outputs": [], "source": [ "# Strong ruptures (ratio > 0.5 OR huge max_gap)\n", "strong = rupture_isin_summary[\n", " (rupture_isin_summary[\"rupture_ratio\"] > 0.5)\n", " | (rupture_isin_summary[\"max_gap\"] > 50000)\n", "]\n", "\n", "def find_successors(account_id, isin, window_days=15):\n", " # Extract rupture dates for (account_id, isin)\n", " ruptures = merged_isin[\n", " (merged_isin[\"Registrar Account - ID\"] == account_id)\n", " & (merged_isin[\"Product - Isin\"] == isin)\n", " & (merged_isin[\"rupture_flag\"])\n", " ][\"Centralisation Date\"].unique()\n", "\n", " if len(ruptures) == 0:\n", " return []\n", "\n", " candidates = []\n", "\n", " for rupture_date in ruptures:\n", " start = rupture_date - pd.Timedelta(days=window_days)\n", " end = rupture_date + pd.Timedelta(days=window_days)\n", "\n", " # Look for accounts with strong positive jump at the same time\n", " window_df = merged_isin[\n", " (merged_isin[\"Centralisation Date\"] >= start)\n", " & (merged_isin[\"Centralisation Date\"] <= end)\n", " & (merged_isin[\"Product - Isin\"] == isin)\n", " ]\n", "\n", " # Look for positive gap (jump)\n", " pos_jumps = window_df[window_df[\"gap\"] > 0]\n", "\n", " candidates.extend(pos_jumps[\"Registrar Account - ID\"].unique())\n", "\n", " # Remove self\n", " candidates = [c for c in candidates if c != account_id]\n", "\n", " return list(set(candidates))\n", "\n", "find_successors(\"200129601\", \"FR0010135103\")\n" ] }, { "cell_type": "code", "execution_count": null, "id": "0b834da2-f781-476d-84a6-aebb38fb8dac", "metadata": {}, "outputs": [], "source": [ "import seaborn as sns\n", "import matplotlib.pyplot as plt\n", "\n", "df = merged_isin.copy()\n", "\n", "# Ajouter année / mois\n", "df[\"year\"] = df[\"Centralisation Date\"].dt.year\n", "df[\"month\"] = df[\"Centralisation Date\"].dt.month\n", "\n", "# 1. Nombre total de lignes par mois\n", "total = df.groupby([\"year\", \"month\"]).size().reset_index(name=\"total_lines\")\n", "\n", "# 2. Nombre de ruptures par mois\n", "ruptures = df[df[\"rupture_flag\"]].groupby([\"year\", \"month\"]).size().reset_index(name=\"n_ruptures\")\n", "\n", "# 3. Merge pour obtenir total + ruptures\n", "ratio = total.merge(ruptures, on=[\"year\",\"month\"], how=\"left\")\n", "ratio[\"n_ruptures\"] = ratio[\"n_ruptures\"].fillna(0)\n", "\n", "# 4. Proportion (en %)\n", "ratio[\"rupture_ratio\"] = ratio[\"n_ruptures\"] / ratio[\"total_lines\"]\n", "\n", "# 5. Pivot pour heatmap\n", "heatmap_ratio = ratio.pivot(index=\"year\", columns=\"month\", values=\"rupture_ratio\").fillna(0)\n", "\n", "# 6. Plot\n", "plt.figure(figsize=(14, 7))\n", "sns.heatmap(\n", " heatmap_ratio, \n", " cmap=\"Reds\",\n", " linewidths=.3,\n", " linecolor=\"grey\",\n", " annot=True,\n", " fmt=\".2%\",\n", " cbar_kws={'label': 'Proportion de ruptures'}\n", ")\n", "\n", "plt.title(\"Heatmap de la proportion de ruptures (par année et mois)\", fontsize=16)\n", "plt.xlabel(\"Mois\")\n", "plt.ylabel(\"Année\")\n", "plt.show()\n" ] }, { "cell_type": "code", "execution_count": null, "id": "aa5862ab-ec8e-47f8-8cb0-cd51503efed8", "metadata": {}, "outputs": [], "source": [ "df = merged_isin.copy()\n", "\n", "# Ajouter year / month au cas où\n", "df[\"year\"] = df[\"Centralisation Date\"].dt.year\n", "df[\"month\"] = df[\"Centralisation Date\"].dt.month\n", "\n", "# Merge géographique\n", "df = df.merge(\n", " geo[[\"Registrar Account - ID\", \"country\"]],\n", " on=\"Registrar Account - ID\",\n", " how=\"left\"\n", ")\n", "\n", "df[\"country\"] = df[\"country\"].fillna(\"UNKNOWN\")\n", "\n", "# Total des lignes par pays\n", "total_country = df.groupby(\"country\").size().reset_index(name=\"total_obs\")\n", "\n", "# Nombre de ruptures\n", "rupt_country = (\n", " df[df[\"rupture_flag\"]]\n", " .groupby(\"country\")\n", " .size()\n", " .reset_index(name=\"ruptures\")\n", ")\n", "\n", "# Merge + ratios\n", "country_stats = total_country.merge(rupt_country, on=\"country\", how=\"left\")\n", "country_stats[\"ruptures\"] = country_stats[\"ruptures\"].fillna(0)\n", "country_stats[\"rupture_ratio\"] = country_stats[\"ruptures\"] / country_stats[\"total_obs\"]\n", "\n", "# Tri (rupture ratio décroissant)\n", "country_stats = country_stats.sort_values(\"rupture_ratio\", ascending=False)" ] }, { "cell_type": "code", "execution_count": null, "id": "86d2a91c-d8d8-416c-8dc4-dc3f4ae7ca90", "metadata": {}, "outputs": [], "source": [ "# On ajoute une colonne en % pour l’affichage\n", "country_stats_plot = country_stats.copy()\n", "country_stats_plot[\"rupture_pct\"] = country_stats_plot[\"rupture_ratio\"] * 100\n", "\n", "# Tri décroissant par proportion de ruptures\n", "country_stats_plot = country_stats_plot.sort_values(\"rupture_ratio\", ascending=False)\n", "\n", "fig = px.bar(\n", " country_stats_plot,\n", " x=\"country\",\n", " y=\"rupture_ratio\",\n", " hover_data={\n", " \"rupture_pct\": ':.2f',\n", " \"ruptures\": True,\n", " \"total_obs\": True,\n", " \"rupture_ratio\": False, # on cache la version décimale\n", " },\n", " labels={\n", " \"country\": \"Pays\",\n", " \"rupture_ratio\": \"Proportion de ruptures\",\n", " \"rupture_pct\": \"% de ruptures\",\n", " \"ruptures\": \"Nb de ruptures\",\n", " \"total_obs\": \"Nb d'observations\"\n", " },\n", " title=\"Proportion de ruptures par pays (avec volumes au survol)\"\n", ")\n", "\n", "# Format en %\n", "fig.update_yaxes(tickformat=\".1%\")\n", "\n", "fig.update_layout(\n", " xaxis_tickangle=-45,\n", " bargap=0.2\n", ")\n", "\n", "fig.show()\n" ] }, { "cell_type": "code", "execution_count": null, "id": "e1c114db-5fbd-4cd3-a897-b9d4c96053fd", "metadata": {}, "outputs": [], "source": [ "df[df['country']=='JAPAN'].to_csv('Japan.csv')" ] }, { "cell_type": "code", "execution_count": null, "id": "9fec57f0-dd80-47bc-aacb-518c0ac0a4f6", "metadata": {}, "outputs": [], "source": [] }, { "cell_type": "code", "execution_count": null, "id": "95bc353d-e883-4989-aaca-1b3c9b51ee5a", "metadata": {}, "outputs": [], "source": [ "rs = rupture_summary.copy()\n", "\n", "# 1. Stats numériques classiques\n", "print(\"\\n=== BASIC NUMERIC STATS ===\")\n", "print(rs[\"rupture_ratio\"].describe(percentiles=[0.01, 0.05, 0.10, 0.25, 0.5, 0.75, 0.90, 0.95, 0.99]))\n", "\n", "\n", "# 2. Distribution par classes (bins)\n", "\n", "rs[\"rupture_bucket\"] = pd.cut(\n", " rs[\"rupture_ratio\"],\n", " bins=[0, 0.001, 0.01, 0.05, 0.10, 0.25, 0.50, 1.01],\n", " labels=[\n", " \"0–0.1%\",\n", " \"0.1–1%\",\n", " \"1–5%\",\n", " \"5–10%\",\n", " \"10–25%\",\n", " \"25–50%\",\n", " \"50–100%\"\n", " ],\n", " include_lowest=True\n", ")\n", "\n", "# Ajouter la catégorie \"0%\"\n", "rs[\"rupture_bucket\"] = rs[\"rupture_bucket\"].cat.add_categories(\"0%\")\n", "\n", "# Remplacer les 0% exacts\n", "rs.loc[rs[\"rupture_ratio\"] == 0, \"rupture_bucket\"] = \"0%\"\n", "\n", "bucket_counts = rs[\"rupture_bucket\"].value_counts().sort_index()\n", "print(bucket_counts)\n", "\n", "\n", "# 3. Pourcentages\n", "bucket_percent = (bucket_counts / len(rs) * 100).round(2)\n", "\n", "print(\"\\n=== DISTRIBUTION (PERCENT) ===\")\n", "print(bucket_percent)\n", "\n", "\n", "# 4. Nombre de comptes totalement propres\n", "no_rupture = (rs[\"n_ruptures\"] == 0).sum()\n", "print(f\"\\nComptes avec 0 rupture = {no_rupture} ({no_rupture/len(rs)*100:.2f}%)\")\n", "\n", "# 5. Comptes extrêmement problématiques\n", "severe = (rs[\"rupture_ratio\"] > 0.75).sum()\n", "print(f\"Comptes avec rupture_ratio > 75% = {severe} ({severe/len(rs)*100:.2f}%)\")\n", "\n", "medium = (rs[\"rupture_ratio\"] > 0.10).sum()\n", "print(f\"Comptes avec rupture_ratio > 10% = {medium} ({medium/len(rs)*100:.2f}%)\")\n" ] }, { "cell_type": "code", "execution_count": null, "id": "86d8fe0e-fa6c-46df-bb4c-054d1a677b38", "metadata": {}, "outputs": [], "source": [ "import plotly.express as px\n", "\n", "fig = px.histogram(\n", " rs,\n", " x=\"rupture_ratio\",\n", " nbins=50,\n", " title=\"Distribution du rupture_ratio\",\n", " labels={\"rupture_ratio\": \"Rupture Ratio\"},\n", ")\n", "fig.update_layout(bargap=0.05)\n", "fig.show()\n" ] }, { "cell_type": "code", "execution_count": null, "id": "425b36d0-c92a-4405-be28-35b1fc292fec", "metadata": {}, "outputs": [], "source": [ "# --- 1. Filtres de base ---\n", "merged[\"year\"] = merged[\"Centralisation Date\"].dt.year\n", "\n", "# Filtrer uniquement l'année 2021\n", "ruptures_2021 = merged[(merged[\"year\"] == 2021) & (merged[\"rupture_flag\"] == True)].copy()\n", "\n", "print(\"Nombre total de ruptures en 2021 :\", len(ruptures_2021))\n", "\n", "# --- 2. Classification du type de gap ---\n", "ruptures_2021[\"gap_type\"] = np.where(ruptures_2021[\"gap\"] > 0, \"positive\", \"negative\")\n", "\n", "# --- 3. Statistiques globales ---\n", "gap_counts = ruptures_2021[\"gap_type\"].value_counts()\n", "gap_percent = ruptures_2021[\"gap_type\"].value_counts(normalize=True) * 100\n", "\n", "print(\"\\n=== RUPTURES 2021 — POSITIVES vs NEGATIVES ===\")\n", "print(gap_counts)\n", "print(\"\\n(%)\")\n", "print(gap_percent.map(lambda x: f\"{x:.2f}%\"))\n", "\n", "# --- 4. Intensité des écarts ---\n", "intensity_stats = ruptures_2021.groupby(\"gap_type\")[\"gap\"].describe()\n", "print(\"\\n=== STATISTIQUES DES GAPS ===\")\n", "print(intensity_stats)\n", "\n", "# --- 5. Visualisation rapide ---\n", "import seaborn as sns\n", "import matplotlib.pyplot as plt\n", "\n", "plt.figure(figsize=(10,5))\n", "sns.histplot(data=ruptures_2021, x=\"gap\", hue=\"gap_type\", bins=80, kde=True)\n", "plt.xlim(-merged[\"gap\"].abs().max(), merged[\"gap\"].abs().max())\n", "plt.title(\"Distribution des gaps de rupture en 2021\")\n", "plt.xlabel(\"Gap (AUM_{t} − Expected AUM_{t})\")\n", "plt.show()\n" ] }, { "cell_type": "code", "execution_count": null, "id": "df9e0005-93f2-4885-baef-2e54921a42f4", "metadata": {}, "outputs": [], "source": [ "# --- 1. ADD YEAR ---\n", "merged[\"year\"] = merged[\"Centralisation Date\"].dt.year\n", "\n", "# --- 2. DEFINE PERIODS ---\n", "conditions = [\n", " merged[\"year\"] < 2021,\n", " merged[\"year\"] == 2021,\n", " merged[\"year\"] > 2021\n", "]\n", "\n", "period_labels = [\"before_2021\", \"during_2021\", \"after_2021\"]\n", "\n", "merged[\"period\"] = np.select(\n", " conditions,\n", " period_labels,\n", " default=\"unknown\"\n", ")\n", "\n", "# --- 3. CREATE GAP TYPE & FILTER ONLY RUPTURES ---\n", "merged[\"gap_type\"] = np.where(\n", " merged[\"gap\"] > 0, \"positive\",\n", " np.where(merged[\"gap\"] < 0, \"negative\", \"zero\")\n", ")\n", "\n", "ruptures = merged[merged[\"rupture_flag\"] == True].copy()\n", "\n", "# --- 4. TOTAL OBS PER PERIOD ---\n", "total_obs = merged.groupby(\"period\").size().rename(\"total_obs\")\n", "\n", "# --- 5. TOTAL RUPTURES PER PERIOD ---\n", "rupture_counts = ruptures.groupby(\"period\").size().rename(\"rupture_count\")\n", "\n", "# --- 6. PROPORTION OF RUPTURES ---\n", "rupture_ratio = (rupture_counts / total_obs).rename(\"rupture_ratio\")\n", "\n", "# --- 7. POSITIVE / NEGATIVE GAPS (% among ruptures) ---\n", "gap_dist = (\n", " ruptures.groupby([\"period\", \"gap_type\"])\n", " .size()\n", " .groupby(level=0)\n", " .apply(lambda x: (x / x.sum()) * 100) # % par période\n", ")\n", "\n", "\n", "# --- 8. MERGE AND DISPLAY ---\n", "summary = pd.concat([total_obs, rupture_counts, rupture_ratio], axis=1)\n", "summary[\"rupture_ratio\"] = (summary[\"rupture_ratio\"] * 100).round(2)\n", "\n", "print(\"\\n=== RUPTURE SUMMARY (in %) ===\")\n", "print(summary)\n", "\n", "print(\"\\n=== GAP POSITIVE / NEGATIVE DISTRIBUTION (in %) ===\")\n", "print(gap_dist)\n" ] }, { "cell_type": "code", "execution_count": null, "id": "195205bd-d817-41f9-a0fd-18d8b804515f", "metadata": {}, "outputs": [], "source": [ "# --- 1. DEFINE PERIODS ---\n", "merged[\"period2\"] = np.where(\n", " merged[\"Centralisation Date\"] < pd.Timestamp(\"2021-09-01\"),\n", " \"Before Sep 2021\",\n", " \"After Sep 2021\"\n", ")\n", "\n", "ruptures = merged[merged[\"rupture_flag\"] == True].copy()\n", "\n", "# --- 2. Ensure gap_type exists + no missing categories ---\n", "ruptures[\"gap_type\"] = ruptures[\"gap_type\"].replace({\"zero\": \"positive\"}) # zero is equivalent to no-flow change\n", "\n", "# --- 3. Compute gap counts ---\n", "gap_counts = (\n", " ruptures.groupby([\"period2\", \"gap_type\"])\n", " .size()\n", " .unstack(fill_value=0)\n", ")\n", "\n", "# Ensure both columns exist\n", "for col in [\"positive\", \"negative\"]:\n", " if col not in gap_counts.columns:\n", " gap_counts[col] = 0\n", "\n", "gap_counts = gap_counts[[\"positive\", \"negative\"]]\n", "\n", "# --- 4. Extract values ---\n", "before_vals = gap_counts.loc[\"Before Sep 2021\"].values\n", "after_vals = gap_counts.loc[\"After Sep 2021\"].values\n", "\n", "# --- 5. MAKE TWO DONUT CHARTS ---\n", "fig = make_subplots(\n", " rows=1, cols=2,\n", " specs=[[{\"type\": \"pie\"}, {\"type\": \"pie\"}]],\n", " subplot_titles=(\"Before Sep 2021\", \"After Sep 2021\")\n", ")\n", "\n", "fig.add_trace(\n", " go.Pie(\n", " labels=[\"Negative gaps\", \"Positive gaps\"],\n", " values=before_vals,\n", " marker_colors=[\"#E67E22\", \"#3498DB\"],\n", " hole=0.45,\n", " textinfo=\"label+percent\"\n", " ),\n", " row=1, col=1\n", ")\n", "\n", "fig.add_trace(\n", " go.Pie(\n", " labels=[\"Negative gaps\", \"Positive gaps\"],\n", " values=after_vals,\n", " marker_colors=[\"#E67E22\", \"#3498DB\"],\n", " hole=0.45,\n", " textinfo=\"label+percent\"\n", " ),\n", " row=1, col=2\n", ")\n", "\n", "fig.update_layout(\n", " title=\"Nature des ruptures (positive / negative)\\nAvant vs Après Septembre 2021\",\n", " showlegend=True\n", ")\n", "\n", "fig.show()\n" ] }, { "cell_type": "code", "execution_count": null, "id": "9583f188-8601-425b-908f-61c2ee1f8da2", "metadata": {}, "outputs": [], "source": [ "import plotly.graph_objects as go\n", "\n", "# --- 1. Compute gap counts by period ---\n", "gap_counts = (\n", " ruptures.groupby([\"period2\", \"gap_type\"])\n", " .size()\n", " .unstack(fill_value=0)\n", ")\n", "\n", "# Ensure both columns exist\n", "for col in [\"positive\", \"negative\"]:\n", " if col not in gap_counts.columns:\n", " gap_counts[col] = 0\n", "\n", "gap_counts = gap_counts[[\"positive\", \"negative\"]]\n", "\n", "# --- 2. Extract values ---\n", "before_vals = gap_counts.loc[\"Before Sep 2021\"].values\n", "after_vals = gap_counts.loc[\"After Sep 2021\"].values\n", "\n", "# --- 3. Plot : TWO PIE CHARTS side by side ---\n", "fig = make_subplots(\n", " rows=1, cols=2,\n", " specs=[[{\"type\": \"pie\"}, {\"type\": \"pie\"}]],\n", " subplot_titles=(\"Before 2021\", \"After 2021\")\n", ")\n", "\n", "fig.add_trace(\n", " go.Pie(\n", " labels=[\"Negative gaps\", \"Positive gaps\"],\n", " values=before_vals,\n", " marker_colors=[\"#E67E22\", \"#3498DB\"],\n", " hole=0.35\n", " ),\n", " row=1, col=1\n", ")\n", "\n", "fig.add_trace(\n", " go.Pie(\n", " labels=[\"Negative gaps\", \"Positive gaps\"],\n", " values=after_vals,\n", " marker_colors=[\"#E67E22\", \"#3498DB\"],\n", " hole=0.35\n", " ),\n", " row=1, col=2\n", ")\n", "\n", "fig.update_layout(\n", " title=\"Répartition des ruptures (positive / negative)\\nAvant vs Après 2021\"\n", ")\n", "\n", "fig.show()\n" ] }, { "cell_type": "code", "execution_count": null, "id": "f4e29536-eeed-4b91-a59c-b373cf14a5fc", "metadata": {}, "outputs": [], "source": [ "import plotly.graph_objects as go\n", "\n", "# --- 1. Define periods ---\n", "merged[\"period2\"] = np.where(\n", " merged[\"Centralisation Date\"] < pd.Timestamp(\"2021-09-01\"),\n", " \"Before Sep 2021\",\n", " \"After Sep 2021\"\n", ")\n", "\n", "# --- 2. Keep only ruptures ---\n", "ruptures = merged[merged[\"rupture_flag\"] == True].copy()\n", "\n", "# --- 3. Count ruptures per period ---\n", "rupture_counts = ruptures[\"period2\"].value_counts().reindex(\n", " [\"Before Sep 2021\", \"After Sep 2021\"]\n", ").fillna(0)\n", "\n", "# --- 4. Pie chart ---\n", "fig = go.Figure(data=[\n", " go.Pie(\n", " labels=rupture_counts.index,\n", " values=rupture_counts.values,\n", " hole=0.45,\n", " marker_colors=[\"#2ECC71\", \"#E74C3C\"],\n", " textinfo=\"percent+value\",\n", " )\n", "])\n", "\n", "fig.update_layout(\n", " title=\"Répartition des ruptures\"\n", ")\n", "\n", "fig.show()\n" ] }, { "cell_type": "code", "execution_count": null, "id": "553f91fa-5017-4685-ab31-afe2aa247e13", "metadata": {}, "outputs": [], "source": [ "import pandas as pd\n", "import numpy as np\n", "\n", "# 1. Filtre sur la période post-Sept 2021\n", "cutoff = pd.Timestamp(\"2021-09-01\")\n", "post = merged[merged[\"Centralisation Date\"] >= cutoff].copy()\n", "\n", "# 2. On ne garde que les ruptures\n", "post_rupt = post[post[\"rupture_flag\"] == True].copy()\n", "\n", "# 3. Gap absolu + gap relatif (% du stock)\n", "post_rupt[\"gap_abs\"] = post_rupt[\"gap\"].abs()\n", "post_rupt[\"gap_rel\"] = post_rupt[\"gap_abs\"] / post_rupt[\"Quantity - AUM\"].replace(0, np.nan)\n", "\n", "# 4. Percentiles globaux\n", "p90 = post_rupt[\"gap_abs\"].quantile(0.90)\n", "p95 = post_rupt[\"gap_abs\"].quantile(0.95)\n", "p99 = post_rupt[\"gap_abs\"].quantile(0.99)\n", "\n", "# 5. Classification automatique\n", "def classify_gap(gap, gap_rel, acct):\n", " # RESET → énorme choc (technique)\n", " if gap_abs >= p99 or gap_rel >= 0.90:\n", " return \"reset\"\n", "\n", " # SPIKE → très gros gap mais isolé\n", " if gap_abs >= p95:\n", " return \"spike\"\n", "\n", " # SHIFT → décalage permanent\n", " # Test : moyenne des gaps du compte\n", " return None\n", "\n", "# Calcul du shift (décalage directionnel)\n", "shift_info = post_rupt.groupby(\"Registrar Account - ID\")[\"gap\"].mean().rename(\"avg_gap\")\n", "\n", "post_rupt = post_rupt.merge(shift_info, on=\"Registrar Account - ID\", how=\"left\")\n", "\n", "post_rupt[\"gap_type2\"] = np.where(\n", " post_rupt[\"gap_abs\"] >= p99, \"reset\",\n", " np.where(post_rupt[\"gap_abs\"] >= p95, \"spike\",\n", " np.where(post_rupt[\"avg_gap\"].abs() > post_rupt[\"gap_abs\"].median(), \"shift\", \"micro\")))\n", " \n", "# 6. Statistiques globales\n", "stats = post_rupt[\"gap_type2\"].value_counts(normalize=True).round(3) * 100\n", "print(\"\\n=== DISTRIBUTION DES TYPES DE GAPS POST-2021 ===\")\n", "print(stats)\n", "\n", "# 7. Stats par client\n", "client_stats = (\n", " post_rupt.groupby(\"Registrar Account - ID\")[\"gap_type2\"]\n", " .value_counts(normalize=True)\n", " .rename(\"ratio\")\n", " .mul(100)\n", " .reset_index()\n", ")\n", "\n", "# 8. Stats par ISIN\n", "isin_stats = (\n", " post_rupt.groupby(\"Product - Isin\")[\"gap_type2\"]\n", " .value_counts(normalize=True)\n", " .rename(\"ratio\")\n", " .mul(100)\n", " .reset_index()\n", ")\n", "\n", "print(\"\\n=== TOP ISIN PAR RESET ===\")\n", "print(isin_stats[isin_stats[\"gap_type2\"]==\"reset\"].sort_values(\"ratio\", ascending=False).head(10))\n", "\n", "print(\"\\n=== TOP CLIENTS PAR RESET ===\")\n", "print(client_stats[client_stats[\"gap_type2\"]==\"reset\"].sort_values(\"ratio\", ascending=False).head(10))\n" ] }, { "cell_type": "code", "execution_count": null, "id": "f4396f2c-a8b6-4ea9-8292-093f900bf260", "metadata": {}, "outputs": [], "source": [ "import plotly.graph_objects as go\n", "\n", "# --- Data from your output ---\n", "labels = [\"Micro-ruptures\", \"Décalage\", \"Anomalies ponctuelles\", \"Remise à zéro\"]\n", "values = [50.4, 44.6, 4.0, 1.0]\n", "\n", "# --- Pie chart ---\n", "fig = go.Figure(\n", " data=[go.Pie(\n", " labels=labels,\n", " values=values,\n", " hole=0.35, # donut style (plus lisible)\n", " textinfo='percent',\n", " marker=dict(colors=[\"#3498DB\", \"#E67E22\", \"#9B59B6\", \"#E74C3C\"])\n", " )]\n", ")\n", "\n", "fig.update_layout(\n", " title=\"Typologie des ruptures depuis Septembre 2021\",\n", " legend_title=\"Type de gap\",\n", ")\n", "\n", "fig.show()\n" ] }, { "cell_type": "code", "execution_count": null, "id": "3df1f839-44d4-4894-bdfa-6851971d1983", "metadata": {}, "outputs": [], "source": [ "merged[\"year\"] = merged[\"Centralisation Date\"].dt.year\n", "\n", "yearly_stats = merged.groupby(\"year\").agg(\n", " total_obs=(\"gap\", \"count\"),\n", " ruptures=(\"rupture_flag\", \"sum\")\n", ").reset_index()\n", "\n", "yearly_stats[\"rupture_rate\"] = (\n", " yearly_stats[\"ruptures\"] / yearly_stats[\"total_obs\"]\n", ")\n" ] }, { "cell_type": "code", "execution_count": null, "id": "f312684d-0815-439c-a632-cadd1cbb779c", "metadata": {}, "outputs": [], "source": [] } ], "metadata": { "kernelspec": { "display_name": "Python 3 (ipykernel)", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.13.8" } }, "nbformat": 4, "nbformat_minor": 5 }