Project_Carmignac/dataloader.ipynb

{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "id": "2e8cf88b-cecf-409f-9c2d-c3762b233f05",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Requirement already satisfied: openpyxl in /opt/python/lib/python3.13/site-packages (3.1.5)\n",
      "Requirement already satisfied: et-xmlfile in /opt/python/lib/python3.13/site-packages (from openpyxl) (2.0.0)\n",
      "\n",
      "\u001b[1m[\u001b[0m\u001b[34;49mnotice\u001b[0m\u001b[1;39;49m]\u001b[0m\u001b[39;49m A new release of pip is available: \u001b[0m\u001b[31;49m25.2\u001b[0m\u001b[39;49m -> \u001b[0m\u001b[32;49m25.3\u001b[0m\n",
      "\u001b[1m[\u001b[0m\u001b[34;49mnotice\u001b[0m\u001b[1;39;49m]\u001b[0m\u001b[39;49m To update, run: \u001b[0m\u001b[32;49mpip install --upgrade pip\u001b[0m\n"
     ]
    }
   ],
   "source": [
    "# %pip (rather than !pip) installs into the environment of the running kernel.\n",
    "# The version is pinned to the release reported by the last recorded run of this cell.\n",
    "%pip install -q openpyxl==3.1.5"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "id": "126c8a80-d9ad-4816-84f0-0c3d580f62c8",
   "metadata": {},
   "outputs": [],
   "source": [
    "import pandas as pd\n",
    "import numpy as np\n",
    "import matplotlib.pyplot as plt"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "id": "ff2261fb-9516-4410-b42d-3acc8dc1a460",
   "metadata": {},
   "outputs": [],
   "source": [
    "import os\n",
    "import s3fs\n",
    "\n",
    "# S3 credentials are read from the environment and must never be written into the notebook.\n",
    "# NOTE(review): the access key, secret key and session token that used to be hardcoded in\n",
    "# this cell should be treated as compromised and revoked. Export the three AWS_* variables\n",
    "# in the session that starts Jupyter instead.\n",
    "_required_vars = (\"AWS_ACCESS_KEY_ID\", \"AWS_SECRET_ACCESS_KEY\", \"AWS_SESSION_TOKEN\")\n",
    "_missing_vars = [var for var in _required_vars if not os.environ.get(var)]\n",
    "if _missing_vars:\n",
    "    # Fail here with a clear message rather than later with an opaque 403 from the object store.\n",
    "    raise RuntimeError(\n",
    "        \"Missing S3 credentials in the environment: \" + \", \".join(_missing_vars)\n",
    "    )\n",
    "\n",
    "os.environ[\"AWS_DEFAULT_REGION\"] = 'us-east-1'\n",
    "\n",
    "# `fs` is the filesystem handle used by the data-loading cells of this notebook.\n",
    "fs = s3fs.S3FileSystem(\n",
    "    client_kwargs={'endpoint_url': 'https://'+'minio-simple.lab.groupe-genes.fr'},\n",
    "    key=os.environ[\"AWS_ACCESS_KEY_ID\"],\n",
    "    secret=os.environ[\"AWS_SECRET_ACCESS_KEY\"],\n",
    "    token=os.environ[\"AWS_SESSION_TOKEN\"])"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "3d36f3f0-bd40-4a83-96d1-b46d75f5a4c5",
   "metadata": {},
   "source": [
    "# Data exploration"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "eaf5c5a0-eb1c-4242-b893-7600e6def109",
   "metadata": {},
   "source": [
    "## Helper functions"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "id": "60e2035c-c2f0-4c51-97df-102e67ba96ee",
   "metadata": {},
   "outputs": [],
   "source": [
    "def plot_account(account_id, isin=None, data=None):\n",
    "    \"\"\"\n",
    "    Plot the summed `Quantity - AUM` per `Centralisation Date` for one registrar account.\n",
    "\n",
    "    Parameters\n",
    "    ----------\n",
    "    account_id :\n",
    "        Value matched against the `Registrar Account - ID` column.\n",
    "    isin : optional\n",
    "        When given, rows are further restricted to this `Product - Isin` value.\n",
    "    data : pandas.DataFrame, optional\n",
    "        Frame to read from. When omitted, the notebook-level `merged` frame is used,\n",
    "        which must then already exist when the function is called.\n",
    "\n",
    "    Returns\n",
    "    -------\n",
    "    None\n",
    "        The figure is displayed with `plt.show()`. When no row matches the filters,\n",
    "        a message is printed and nothing is plotted.\n",
    "    \"\"\"\n",
    "    source = merged if data is None else data\n",
    "\n",
    "    # Boolean filtering already returns a new frame, so the source table is never\n",
    "    # modified and no upfront copy of the whole table is needed.\n",
    "    selection = source[source[\"Registrar Account - ID\"] == account_id]\n",
    "\n",
    "    if isin is not None:\n",
    "        selection = selection[selection[\"Product - Isin\"] == isin]\n",
    "\n",
    "    if selection.empty:\n",
    "        print(f\"No data found for account {account_id}\")\n",
    "        return\n",
    "\n",
    "    # One point per date: sum the quantity over all matching rows of that date.\n",
    "    series = (\n",
    "        selection.groupby(\"Centralisation Date\")[\"Quantity - AUM\"]\n",
    "        .sum()\n",
    "        .reset_index()\n",
    "        .sort_values(\"Centralisation Date\")\n",
    "    )\n",
    "\n",
    "    # Explicit figure/axes objects instead of the implicit pyplot state machine.\n",
    "    fig, ax = plt.subplots(figsize=(12, 4))\n",
    "    ax.plot(series[\"Centralisation Date\"], series[\"Quantity - AUM\"], marker='o')\n",
    "    ax.set_title(f\"Stock Evolution for Account {account_id}\", fontsize=14)\n",
    "    ax.set_xlabel(\"Date\")\n",
    "    ax.set_ylabel(\"Total AUM\")\n",
    "    ax.grid(True)\n",
    "    plt.show()\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "id": "37e008b1-32d4-44be-9d23-1b90a5a26f89",
   "metadata": {},
   "outputs": [],
   "source": [
    "#   2. BASIC INSPECTION\n",
    "\n",
    "def quick_info(df, name):\n",
    "    \"\"\"\n",
    "    Print a structural overview of `df` under a banner carrying `name`.\n",
    "\n",
    "    Printed in order: shape, column names, dtypes, percentage of missing values\n",
    "    per column (descending), the first five rows, and the number of distinct\n",
    "    values per column (descending). Returns None.\n",
    "    \"\"\"\n",
    "    banner_rule = \"=\"*80\n",
    "    print(\"\\n\" + banner_rule)\n",
    "    print(f\"DATASET : {name}\")\n",
    "    print(banner_rule)\n",
    "\n",
    "    print(\"\\nShape :\", df.shape)\n",
    "    print(\"\\nColumns :\", df.columns.tolist())\n",
    "    print(\"\\nDtypes :\\n\", df.dtypes)\n",
    "\n",
    "    # Share of missing cells per column, most incomplete column first.\n",
    "    missing_share = df.isna().mean().sort_values(ascending=False)\n",
    "    print(\"\\nMissing values (%) :\\n\", missing_share*100)\n",
    "\n",
    "    print(\"\\nSample rows:\\n\", df.head(5))\n",
    "\n",
    "    # Cardinality per column, highest first.\n",
    "    distinct_counts = df.nunique().sort_values(ascending=False)\n",
    "    print(\"\\nUnique values per column:\\n\", distinct_counts)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "id": "e104a416-4cfd-43b9-b9ec-6af1fce700da",
   "metadata": {},
   "outputs": [],
   "source": [
    "import os\n",
    "import s3fs\n",
    "\n",
    "# Rebuild the S3 filesystem handle from credentials found in the environment.\n",
    "# NOTE(review): the access key, secret key and session token that used to be hardcoded in\n",
    "# this cell should be treated as compromised and revoked. When temporary credentials expire,\n",
    "# refresh the AWS_* environment variables and re-run this cell; never paste them here.\n",
    "_required_vars = (\"AWS_ACCESS_KEY_ID\", \"AWS_SECRET_ACCESS_KEY\", \"AWS_SESSION_TOKEN\")\n",
    "_missing_vars = [var for var in _required_vars if not os.environ.get(var)]\n",
    "if _missing_vars:\n",
    "    # Fail here with a clear message rather than later with an opaque 403 from the object store.\n",
    "    raise RuntimeError(\n",
    "        \"Missing S3 credentials in the environment: \" + \", \".join(_missing_vars)\n",
    "    )\n",
    "\n",
    "os.environ[\"AWS_DEFAULT_REGION\"] = 'us-east-1'\n",
    "\n",
    "fs = s3fs.S3FileSystem(\n",
    "    client_kwargs={'endpoint_url': 'https://'+'minio-simple.lab.groupe-genes.fr'},\n",
    "    key=os.environ[\"AWS_ACCESS_KEY_ID\"],\n",
    "    secret=os.environ[\"AWS_SECRET_ACCESS_KEY\"],\n",
    "    token=os.environ[\"AWS_SESSION_TOKEN\"])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "id": "e67a99ea-ddf4-4627-8f48-ec183c671acb",
   "metadata": {},
   "outputs": [
    {
     "ename": "PermissionError",
     "evalue": "Forbidden",
     "output_type": "error",
     "traceback": [
      "\u001b[31m---------------------------------------------------------------------------\u001b[39m",
      "\u001b[31mClientError\u001b[39m                               Traceback (most recent call last)",
      "\u001b[36mFile \u001b[39m\u001b[32m/opt/python/lib/python3.13/site-packages/s3fs/core.py:114\u001b[39m, in \u001b[36m_error_wrapper\u001b[39m\u001b[34m(func, args, kwargs, retries)\u001b[39m\n\u001b[32m    113\u001b[39m \u001b[38;5;28;01mtry\u001b[39;00m:\n\u001b[32m--> \u001b[39m\u001b[32m114\u001b[39m     \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28;01mawait\u001b[39;00m func(*args, **kwargs)\n\u001b[32m    115\u001b[39m \u001b[38;5;28;01mexcept\u001b[39;00m S3_RETRYABLE_ERRORS \u001b[38;5;28;01mas\u001b[39;00m e:\n",
      "\u001b[36mFile \u001b[39m\u001b[32m/opt/python/lib/python3.13/site-packages/aiobotocore/context.py:36\u001b[39m, in \u001b[36mwith_current_context.<locals>.decorator.<locals>.wrapper\u001b[39m\u001b[34m(*args, **kwargs)\u001b[39m\n\u001b[32m     35\u001b[39m     \u001b[38;5;28;01mawait\u001b[39;00m resolve_awaitable(hook())\n\u001b[32m---> \u001b[39m\u001b[32m36\u001b[39m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28;01mawait\u001b[39;00m func(*args, **kwargs)\n",
      "\u001b[36mFile \u001b[39m\u001b[32m/opt/python/lib/python3.13/site-packages/aiobotocore/client.py:424\u001b[39m, in \u001b[36mAioBaseClient._make_api_call\u001b[39m\u001b[34m(self, operation_name, api_params)\u001b[39m\n\u001b[32m    423\u001b[39m     error_class = \u001b[38;5;28mself\u001b[39m.exceptions.from_code(error_code)\n\u001b[32m--> \u001b[39m\u001b[32m424\u001b[39m     \u001b[38;5;28;01mraise\u001b[39;00m error_class(parsed_response, operation_name)\n\u001b[32m    425\u001b[39m \u001b[38;5;28;01melse\u001b[39;00m:\n",
      "\u001b[31mClientError\u001b[39m: An error occurred (403) when calling the HeadObject operation: Forbidden",
      "\nThe above exception was the direct cause of the following exception:\n",
      "\u001b[31mPermissionError\u001b[39m                           Traceback (most recent call last)",
      "\u001b[36mCell\u001b[39m\u001b[36m \u001b[39m\u001b[32mIn[8]\u001b[39m\u001b[32m, line 9\u001b[39m\n\u001b[32m      1\u001b[39m \u001b[33;03m\"\"\"\u001b[39;00m\n\u001b[32m      2\u001b[39m \u001b[33;03mwith fs.open(\u001b[39;00m\n\u001b[32m      3\u001b[39m \u001b[33;03m    \"projet-bdc-data/carmignac/Flows ENSAE V2 -20251105.csv\",\u001b[39;00m\n\u001b[32m   (...)\u001b[39m\u001b[32m      6\u001b[39m \u001b[33;03m    flows = pd.read_csv(f, sep=\";\")\u001b[39;00m\n\u001b[32m      7\u001b[39m \u001b[33;03m\"\"\"\u001b[39;00m\n\u001b[32m----> \u001b[39m\u001b[32m9\u001b[39m \u001b[38;5;28;01mwith\u001b[39;00m \u001b[43mfs\u001b[49m\u001b[43m.\u001b[49m\u001b[43mopen\u001b[49m\u001b[43m(\u001b[49m\u001b[33;43m'\u001b[39;49m\u001b[33;43mprojet-bdc-data/carmignac/Monthly AUM and NAV since 2010.xlsx\u001b[39;49m\u001b[33;43m'\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[33;43m'\u001b[39;49m\u001b[33;43mrb\u001b[39;49m\u001b[33;43m'\u001b[39;49m\u001b[43m)\u001b[49m \u001b[38;5;28;01mas\u001b[39;00m f:\n\u001b[32m     10\u001b[39m     nav_raw = pd.read_excel(f, header=\u001b[38;5;28;01mNone\u001b[39;00m, engine=\u001b[33m\"\u001b[39m\u001b[33mopenpyxl\u001b[39m\u001b[33m\"\u001b[39m)\n\u001b[32m     11\u001b[39m nav = nav_raw[\u001b[32m0\u001b[39m].str.split(\u001b[33m\"\u001b[39m\u001b[33m,\u001b[39m\u001b[33m\"\u001b[39m, expand=\u001b[38;5;28;01mTrue\u001b[39;00m)\n",
      "\u001b[36mFile \u001b[39m\u001b[32m/opt/python/lib/python3.13/site-packages/fsspec/spec.py:1338\u001b[39m, in \u001b[36mAbstractFileSystem.open\u001b[39m\u001b[34m(self, path, mode, block_size, cache_options, compression, **kwargs)\u001b[39m\n\u001b[32m   1336\u001b[39m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[32m   1337\u001b[39m     ac = kwargs.pop(\u001b[33m\"\u001b[39m\u001b[33mautocommit\u001b[39m\u001b[33m\"\u001b[39m, \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;28mself\u001b[39m._intrans)\n\u001b[32m-> \u001b[39m\u001b[32m1338\u001b[39m     f = \u001b[38;5;28;43mself\u001b[39;49m\u001b[43m.\u001b[49m\u001b[43m_open\u001b[49m\u001b[43m(\u001b[49m\n\u001b[32m   1339\u001b[39m \u001b[43m        \u001b[49m\u001b[43mpath\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m   1340\u001b[39m \u001b[43m        \u001b[49m\u001b[43mmode\u001b[49m\u001b[43m=\u001b[49m\u001b[43mmode\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m   1341\u001b[39m \u001b[43m        \u001b[49m\u001b[43mblock_size\u001b[49m\u001b[43m=\u001b[49m\u001b[43mblock_size\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m   1342\u001b[39m \u001b[43m        \u001b[49m\u001b[43mautocommit\u001b[49m\u001b[43m=\u001b[49m\u001b[43mac\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m   1343\u001b[39m \u001b[43m        \u001b[49m\u001b[43mcache_options\u001b[49m\u001b[43m=\u001b[49m\u001b[43mcache_options\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m   1344\u001b[39m \u001b[43m        \u001b[49m\u001b[43m*\u001b[49m\u001b[43m*\u001b[49m\u001b[43mkwargs\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m   1345\u001b[39m \u001b[43m    \u001b[49m\u001b[43m)\u001b[49m\n\u001b[32m   1346\u001b[39m     \u001b[38;5;28;01mif\u001b[39;00m compression \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m:\n\u001b[32m   1347\u001b[39m         \u001b[38;5;28;01mfrom\u001b[39;00m\u001b[38;5;250m 
\u001b[39m\u001b[34;01mfsspec\u001b[39;00m\u001b[34;01m.\u001b[39;00m\u001b[34;01mcompression\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[38;5;28;01mimport\u001b[39;00m compr\n",
      "\u001b[36mFile \u001b[39m\u001b[32m/opt/python/lib/python3.13/site-packages/s3fs/core.py:720\u001b[39m, in \u001b[36mS3FileSystem._open\u001b[39m\u001b[34m(self, path, mode, block_size, acl, version_id, fill_cache, cache_type, autocommit, size, requester_pays, cache_options, **kwargs)\u001b[39m\n\u001b[32m    717\u001b[39m \u001b[38;5;28;01mif\u001b[39;00m cache_type \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m:\n\u001b[32m    718\u001b[39m     cache_type = \u001b[38;5;28mself\u001b[39m.default_cache_type\n\u001b[32m--> \u001b[39m\u001b[32m720\u001b[39m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43mS3File\u001b[49m\u001b[43m(\u001b[49m\n\u001b[32m    721\u001b[39m \u001b[43m    \u001b[49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[43m,\u001b[49m\n\u001b[32m    722\u001b[39m \u001b[43m    \u001b[49m\u001b[43mpath\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m    723\u001b[39m \u001b[43m    \u001b[49m\u001b[43mmode\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m    724\u001b[39m \u001b[43m    \u001b[49m\u001b[43mblock_size\u001b[49m\u001b[43m=\u001b[49m\u001b[43mblock_size\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m    725\u001b[39m \u001b[43m    \u001b[49m\u001b[43macl\u001b[49m\u001b[43m=\u001b[49m\u001b[43macl\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m    726\u001b[39m \u001b[43m    \u001b[49m\u001b[43mversion_id\u001b[49m\u001b[43m=\u001b[49m\u001b[43mversion_id\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m    727\u001b[39m \u001b[43m    \u001b[49m\u001b[43mfill_cache\u001b[49m\u001b[43m=\u001b[49m\u001b[43mfill_cache\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m    728\u001b[39m \u001b[43m    \u001b[49m\u001b[43ms3_additional_kwargs\u001b[49m\u001b[43m=\u001b[49m\u001b[43mkw\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m    729\u001b[39m \u001b[43m    \u001b[49m\u001b[43mcache_type\u001b[49m\u001b[43m=\u001b[49m\u001b[43mcache_type\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m    730\u001b[39m \u001b[43m    
\u001b[49m\u001b[43mautocommit\u001b[49m\u001b[43m=\u001b[49m\u001b[43mautocommit\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m    731\u001b[39m \u001b[43m    \u001b[49m\u001b[43mrequester_pays\u001b[49m\u001b[43m=\u001b[49m\u001b[43mrequester_pays\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m    732\u001b[39m \u001b[43m    \u001b[49m\u001b[43mcache_options\u001b[49m\u001b[43m=\u001b[49m\u001b[43mcache_options\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m    733\u001b[39m \u001b[43m    \u001b[49m\u001b[43msize\u001b[49m\u001b[43m=\u001b[49m\u001b[43msize\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m    734\u001b[39m \u001b[43m\u001b[49m\u001b[43m)\u001b[49m\n",
      "\u001b[36mFile \u001b[39m\u001b[32m/opt/python/lib/python3.13/site-packages/s3fs/core.py:2257\u001b[39m, in \u001b[36mS3File.__init__\u001b[39m\u001b[34m(self, s3, path, mode, block_size, acl, version_id, fill_cache, s3_additional_kwargs, autocommit, cache_type, requester_pays, cache_options, size)\u001b[39m\n\u001b[32m   2255\u001b[39m         \u001b[38;5;28mself\u001b[39m.details = s3.info(path)\n\u001b[32m   2256\u001b[39m         \u001b[38;5;28mself\u001b[39m.version_id = \u001b[38;5;28mself\u001b[39m.details.get(\u001b[33m\"\u001b[39m\u001b[33mVersionId\u001b[39m\u001b[33m\"\u001b[39m)\n\u001b[32m-> \u001b[39m\u001b[32m2257\u001b[39m \u001b[38;5;28;43msuper\u001b[39;49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m\u001b[43m.\u001b[49m\u001b[34;43m__init__\u001b[39;49m\u001b[43m(\u001b[49m\n\u001b[32m   2258\u001b[39m \u001b[43m    \u001b[49m\u001b[43ms3\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m   2259\u001b[39m \u001b[43m    \u001b[49m\u001b[43mpath\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m   2260\u001b[39m \u001b[43m    \u001b[49m\u001b[43mmode\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m   2261\u001b[39m \u001b[43m    \u001b[49m\u001b[43mblock_size\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m   2262\u001b[39m \u001b[43m    \u001b[49m\u001b[43mautocommit\u001b[49m\u001b[43m=\u001b[49m\u001b[43mautocommit\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m   2263\u001b[39m \u001b[43m    \u001b[49m\u001b[43mcache_type\u001b[49m\u001b[43m=\u001b[49m\u001b[43mcache_type\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m   2264\u001b[39m \u001b[43m    \u001b[49m\u001b[43mcache_options\u001b[49m\u001b[43m=\u001b[49m\u001b[43mcache_options\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m   2265\u001b[39m \u001b[43m    \u001b[49m\u001b[43msize\u001b[49m\u001b[43m=\u001b[49m\u001b[43msize\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m   2266\u001b[39m \u001b[43m\u001b[49m\u001b[43m)\u001b[49m\n\u001b[32m   2267\u001b[39m \u001b[38;5;28mself\u001b[39m.s3 = \u001b[38;5;28mself\u001b[39m.fs  
\u001b[38;5;66;03m# compatibility\u001b[39;00m\n\u001b[32m   2269\u001b[39m \u001b[38;5;66;03m# when not using autocommit we want to have transactional state to manage\u001b[39;00m\n",
      "\u001b[36mFile \u001b[39m\u001b[32m/opt/python/lib/python3.13/site-packages/fsspec/spec.py:1912\u001b[39m, in \u001b[36mAbstractBufferedFile.__init__\u001b[39m\u001b[34m(self, fs, path, mode, block_size, autocommit, cache_type, cache_options, size, **kwargs)\u001b[39m\n\u001b[32m   1910\u001b[39m         \u001b[38;5;28mself\u001b[39m.size = size\n\u001b[32m   1911\u001b[39m     \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[32m-> \u001b[39m\u001b[32m1912\u001b[39m         \u001b[38;5;28mself\u001b[39m.size = \u001b[38;5;28;43mself\u001b[39;49m\u001b[43m.\u001b[49m\u001b[43mdetails\u001b[49m[\u001b[33m\"\u001b[39m\u001b[33msize\u001b[39m\u001b[33m\"\u001b[39m]\n\u001b[32m   1913\u001b[39m     \u001b[38;5;28mself\u001b[39m.cache = caches[cache_type](\n\u001b[32m   1914\u001b[39m         \u001b[38;5;28mself\u001b[39m.blocksize, \u001b[38;5;28mself\u001b[39m._fetch_range, \u001b[38;5;28mself\u001b[39m.size, **cache_options\n\u001b[32m   1915\u001b[39m     )\n\u001b[32m   1916\u001b[39m \u001b[38;5;28;01melse\u001b[39;00m:\n",
      "\u001b[36mFile \u001b[39m\u001b[32m/opt/python/lib/python3.13/site-packages/fsspec/spec.py:1925\u001b[39m, in \u001b[36mAbstractBufferedFile.details\u001b[39m\u001b[34m(self)\u001b[39m\n\u001b[32m   1922\u001b[39m \u001b[38;5;129m@property\u001b[39m\n\u001b[32m   1923\u001b[39m \u001b[38;5;28;01mdef\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[34mdetails\u001b[39m(\u001b[38;5;28mself\u001b[39m):\n\u001b[32m   1924\u001b[39m     \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28mself\u001b[39m._details \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m:\n\u001b[32m-> \u001b[39m\u001b[32m1925\u001b[39m         \u001b[38;5;28mself\u001b[39m._details = \u001b[38;5;28;43mself\u001b[39;49m\u001b[43m.\u001b[49m\u001b[43mfs\u001b[49m\u001b[43m.\u001b[49m\u001b[43minfo\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[43m.\u001b[49m\u001b[43mpath\u001b[49m\u001b[43m)\u001b[49m\n\u001b[32m   1926\u001b[39m     \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28mself\u001b[39m._details\n",
      "\u001b[36mFile \u001b[39m\u001b[32m/opt/python/lib/python3.13/site-packages/fsspec/asyn.py:118\u001b[39m, in \u001b[36msync_wrapper.<locals>.wrapper\u001b[39m\u001b[34m(*args, **kwargs)\u001b[39m\n\u001b[32m    115\u001b[39m \u001b[38;5;129m@functools\u001b[39m.wraps(func)\n\u001b[32m    116\u001b[39m \u001b[38;5;28;01mdef\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[34mwrapper\u001b[39m(*args, **kwargs):\n\u001b[32m    117\u001b[39m     \u001b[38;5;28mself\u001b[39m = obj \u001b[38;5;129;01mor\u001b[39;00m args[\u001b[32m0\u001b[39m]\n\u001b[32m--> \u001b[39m\u001b[32m118\u001b[39m     \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43msync\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[43m.\u001b[49m\u001b[43mloop\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mfunc\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43m*\u001b[49m\u001b[43margs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43m*\u001b[49m\u001b[43m*\u001b[49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n",
      "\u001b[36mFile \u001b[39m\u001b[32m/opt/python/lib/python3.13/site-packages/fsspec/asyn.py:103\u001b[39m, in \u001b[36msync\u001b[39m\u001b[34m(loop, func, timeout, *args, **kwargs)\u001b[39m\n\u001b[32m    101\u001b[39m     \u001b[38;5;28;01mraise\u001b[39;00m FSTimeoutError \u001b[38;5;28;01mfrom\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[34;01mreturn_result\u001b[39;00m\n\u001b[32m    102\u001b[39m \u001b[38;5;28;01melif\u001b[39;00m \u001b[38;5;28misinstance\u001b[39m(return_result, \u001b[38;5;167;01mBaseException\u001b[39;00m):\n\u001b[32m--> \u001b[39m\u001b[32m103\u001b[39m     \u001b[38;5;28;01mraise\u001b[39;00m return_result\n\u001b[32m    104\u001b[39m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[32m    105\u001b[39m     \u001b[38;5;28;01mreturn\u001b[39;00m return_result\n",
      "\u001b[36mFile \u001b[39m\u001b[32m/opt/python/lib/python3.13/site-packages/fsspec/asyn.py:56\u001b[39m, in \u001b[36m_runner\u001b[39m\u001b[34m(event, coro, result, timeout)\u001b[39m\n\u001b[32m     54\u001b[39m     coro = asyncio.wait_for(coro, timeout=timeout)\n\u001b[32m     55\u001b[39m \u001b[38;5;28;01mtry\u001b[39;00m:\n\u001b[32m---> \u001b[39m\u001b[32m56\u001b[39m     result[\u001b[32m0\u001b[39m] = \u001b[38;5;28;01mawait\u001b[39;00m coro\n\u001b[32m     57\u001b[39m \u001b[38;5;28;01mexcept\u001b[39;00m \u001b[38;5;167;01mException\u001b[39;00m \u001b[38;5;28;01mas\u001b[39;00m ex:\n\u001b[32m     58\u001b[39m     result[\u001b[32m0\u001b[39m] = ex\n",
      "\u001b[36mFile \u001b[39m\u001b[32m/opt/python/lib/python3.13/site-packages/s3fs/core.py:1445\u001b[39m, in \u001b[36mS3FileSystem._info\u001b[39m\u001b[34m(self, path, bucket, key, refresh, version_id)\u001b[39m\n\u001b[32m   1443\u001b[39m \u001b[38;5;28;01mif\u001b[39;00m key:\n\u001b[32m   1444\u001b[39m     \u001b[38;5;28;01mtry\u001b[39;00m:\n\u001b[32m-> \u001b[39m\u001b[32m1445\u001b[39m         out = \u001b[38;5;28;01mawait\u001b[39;00m \u001b[38;5;28mself\u001b[39m._call_s3(\n\u001b[32m   1446\u001b[39m             \u001b[33m\"\u001b[39m\u001b[33mhead_object\u001b[39m\u001b[33m\"\u001b[39m,\n\u001b[32m   1447\u001b[39m             \u001b[38;5;28mself\u001b[39m.kwargs,\n\u001b[32m   1448\u001b[39m             Bucket=bucket,\n\u001b[32m   1449\u001b[39m             Key=key,\n\u001b[32m   1450\u001b[39m             **version_id_kw(version_id),\n\u001b[32m   1451\u001b[39m             **\u001b[38;5;28mself\u001b[39m.req_kw,\n\u001b[32m   1452\u001b[39m         )\n\u001b[32m   1453\u001b[39m         \u001b[38;5;28;01mreturn\u001b[39;00m {\n\u001b[32m   1454\u001b[39m             \u001b[33m\"\u001b[39m\u001b[33mETag\u001b[39m\u001b[33m\"\u001b[39m: out.get(\u001b[33m\"\u001b[39m\u001b[33mETag\u001b[39m\u001b[33m\"\u001b[39m, \u001b[33m\"\u001b[39m\u001b[33m\"\u001b[39m),\n\u001b[32m   1455\u001b[39m             \u001b[33m\"\u001b[39m\u001b[33mLastModified\u001b[39m\u001b[33m\"\u001b[39m: out.get(\u001b[33m\"\u001b[39m\u001b[33mLastModified\u001b[39m\u001b[33m\"\u001b[39m, \u001b[33m\"\u001b[39m\u001b[33m\"\u001b[39m),\n\u001b[32m   (...)\u001b[39m\u001b[32m   1461\u001b[39m             \u001b[33m\"\u001b[39m\u001b[33mContentType\u001b[39m\u001b[33m\"\u001b[39m: out.get(\u001b[33m\"\u001b[39m\u001b[33mContentType\u001b[39m\u001b[33m\"\u001b[39m),\n\u001b[32m   1462\u001b[39m         }\n\u001b[32m   1463\u001b[39m     \u001b[38;5;28;01mexcept\u001b[39;00m \u001b[38;5;167;01mFileNotFoundError\u001b[39;00m:\n",
      "\u001b[36mFile \u001b[39m\u001b[32m/opt/python/lib/python3.13/site-packages/s3fs/core.py:371\u001b[39m, in \u001b[36mS3FileSystem._call_s3\u001b[39m\u001b[34m(self, method, *akwarglist, **kwargs)\u001b[39m\n\u001b[32m    369\u001b[39m logger.debug(\u001b[33m\"\u001b[39m\u001b[33mCALL: \u001b[39m\u001b[38;5;132;01m%s\u001b[39;00m\u001b[33m - \u001b[39m\u001b[38;5;132;01m%s\u001b[39;00m\u001b[33m - \u001b[39m\u001b[38;5;132;01m%s\u001b[39;00m\u001b[33m\"\u001b[39m, method.\u001b[34m__name__\u001b[39m, akwarglist, kw2)\n\u001b[32m    370\u001b[39m additional_kwargs = \u001b[38;5;28mself\u001b[39m._get_s3_method_kwargs(method, *akwarglist, **kwargs)\n\u001b[32m--> \u001b[39m\u001b[32m371\u001b[39m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28;01mawait\u001b[39;00m _error_wrapper(\n\u001b[32m    372\u001b[39m     method, kwargs=additional_kwargs, retries=\u001b[38;5;28mself\u001b[39m.retries\n\u001b[32m    373\u001b[39m )\n",
      "\u001b[36mFile \u001b[39m\u001b[32m/opt/python/lib/python3.13/site-packages/s3fs/core.py:146\u001b[39m, in \u001b[36m_error_wrapper\u001b[39m\u001b[34m(func, args, kwargs, retries)\u001b[39m\n\u001b[32m    144\u001b[39m         err = e\n\u001b[32m    145\u001b[39m err = translate_boto_error(err)\n\u001b[32m--> \u001b[39m\u001b[32m146\u001b[39m \u001b[38;5;28;01mraise\u001b[39;00m err\n",
      "\u001b[31mPermissionError\u001b[39m: Forbidden"
     ]
    }
   ],
   "source": [
    "# Flows file. This read used to be disabled by wrapping it in a string literal, which left\n",
    "# `flows` undefined on a fresh kernel even though it is used right below.\n",
    "with fs.open(\"projet-bdc-data/carmignac/Flows ENSAE V2 -20251105.csv\", \"rb\") as f:\n",
    "    flows = pd.read_csv(f, sep=\";\")\n",
    "\n",
    "# NAV workbook: read without a header row, then split the first column on commas.\n",
    "# The first resulting row supplies the column names and is then dropped from the data.\n",
    "with fs.open('projet-bdc-data/carmignac/Monthly AUM and NAV since 2010.xlsx', 'rb') as f:\n",
    "    nav_raw = pd.read_excel(f, header=None, engine=\"openpyxl\")\n",
    "nav = nav_raw[0].str.split(\",\", expand=True)\n",
    "nav.columns = nav.iloc[0]\n",
    "nav = nav[1:].reset_index(drop=True)\n",
    "\n",
    "# AUM file, referred to as `stocks` throughout the notebook.\n",
    "with fs.open('projet-bdc-data/carmignac/AUM ENSAE V2 -20251105.csv', 'rb') as f:\n",
    "    stocks = pd.read_csv(f, sep=\";\")\n",
    "\n",
    "quick_info(stocks, \"STOCKS\")\n",
    "quick_info(flows,  \"FLOWS\")\n",
    "quick_info(nav,    \"NAV/PRICES\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "9bc92c9f-216c-475e-bfb8-edc1a4e839f6",
   "metadata": {},
   "outputs": [],
   "source": [
    "# 1. CLEAN DATES (the files use different formats)\n",
    "\n",
    "# NOTE(review): no explicit format is given for the two CSV date columns, so pandas infers it;\n",
    "# values that cannot be parsed become NaT because of errors=\"coerce\". Confirm the inferred\n",
    "# format against the files.\n",
    "stocks[\"Centralisation Date\"] = pd.to_datetime(stocks[\"Centralisation Date\"], errors=\"coerce\")\n",
    "flows[\"Centralisation Date\"]  = pd.to_datetime(flows[\"Centralisation Date\"], errors=\"coerce\")\n",
    "nav[\"NavDate\"] = pd.to_datetime(nav[\"NavDate\"], format=\"%d/%m/%Y\", errors=\"coerce\")\n",
    "\n",
    "print(\"Date conversion done.\")\n",
    "\n",
    "# 2. CLEAN NUMERIC COLUMNS FOR NAV FILE\n",
    "\n",
    "num_cols = [\"PortfolioAum_Eur\",\"ShareClassPrice\",\"NumberOfShares\",\n",
    "            \"ShareClassAumLocalCur\",\"ShareClassAum_EUR\"]\n",
    "\n",
    "# Values arrive as text: use a dot as decimal separator and remove spaces before casting.\n",
    "for col in num_cols:\n",
    "    nav[col] = (\n",
    "        nav[col]\n",
    "        .astype(str)\n",
    "        .str.replace(\",\", \".\", regex=False)\n",
    "        .str.replace(\" \", \"\", regex=False)\n",
    "        .astype(float)\n",
    "    )\n",
    "\n",
    "print(\"NAV numeric conversion done.\")\n",
    "\n",
    "# 3. STANDARDIZE STRINGS FOR JOIN KEYS\n",
    "\n",
    "def norm(df):\n",
    "    \"\"\"\n",
    "    Strip and upper-case the values of every object-dtype column of `df`, in place.\n",
    "\n",
    "    Missing values stay missing: a plain `astype(str)` would turn them into the literal\n",
    "    text `NAN` / `NONE`, which would then behave like a genuine key in the set\n",
    "    comparisons and group-bys below. Returns the same `df` object.\n",
    "    \"\"\"\n",
    "    for col in df.columns:\n",
    "        if df[col].dtype == \"object\":\n",
    "            cleaned = df[col].astype(str).str.strip().str.upper()\n",
    "            # Keep the cleaned text only where the original value was present.\n",
    "            df[col] = cleaned.where(df[col].notna())\n",
    "    return df\n",
    "\n",
    "stocks = norm(stocks)\n",
    "flows = norm(flows)\n",
    "nav = norm(nav)\n",
    "\n",
    "print(\"String normalization done.\")\n",
    "\n",
    "\n",
    "# 4. ANALYSE RELATIONS ACROSS FILES\n",
    "\n",
    "# Unique sets (missing identifiers are dropped so they are not counted as an ISIN)\n",
    "isin_stocks = set(stocks[\"Product - Isin\"].dropna().unique())\n",
    "isin_flows  = set(flows[\"Product - Isin\"].dropna().unique())\n",
    "isin_nav    = set(nav[\"ShareClassIsin\"].dropna().unique())\n",
    "\n",
    "print(\"\\nISIN missing in FLOWS but present in STOCKS :\", len(isin_stocks - isin_flows))\n",
    "print(\"\\nISIN missing in STOCKS but present in FLOWS :\", len(isin_flows - isin_stocks))\n",
    "print(\"\\nISIN missing in NAV but present in FLOWS :\", len(isin_flows - isin_nav))\n",
    "print(\"\\nISIN missing in NAV but present in STOCKS :\", len(isin_stocks - isin_nav))\n",
    "\n",
    "\n",
    "# 5. CLIENTS: STOCKS VS FLOWS\n",
    "\n",
    "acc_stocks = set(stocks[\"Registrar Account - ID\"].dropna().unique())\n",
    "acc_flows  = set(flows[\"Registrar Account - ID\"].dropna().unique())\n",
    "\n",
    "print(\"\\nAccounts in STOCKS but NEVER in FLOWS :\", len(acc_stocks - acc_flows))\n",
    "print(\"\\nAccounts in FLOWS but NEVER in STOCKS :\", len(acc_flows - acc_stocks))\n",
    "\n",
    "\n",
    "# 6. CLIENT ACTIVITY METRICS (DETAILED)\n",
    "\n",
    "client_behavior = flows.groupby(\"Registrar Account - ID\").agg(\n",
    "    n_days=(\"Centralisation Date\", \"nunique\"),\n",
    "    n_transactions=(\"Quantity - NetFlows\", \"count\"),\n",
    "    total_netflows=(\"Quantity - NetFlows\", \"sum\"),\n",
    "    mean_flow=(\"Quantity - NetFlows\", \"mean\"),\n",
    "    std_flow=(\"Quantity - NetFlows\", \"std\"),\n",
    "    total_subscription=(\"Quantity - Subscription\", \"sum\"),\n",
    "    total_redemption=(\"Quantity - Redemption\", \"sum\")\n",
    ").reset_index()\n",
    "\n",
    "# Add churn metric (the small constant keeps the denominator non-zero)\n",
    "client_behavior[\"churn_ratio\"] = (\n",
    "    client_behavior[\"total_redemption\"] /\n",
    "    (client_behavior[\"total_subscription\"] + 1e-9)\n",
    ")\n",
    "\n",
    "print(\"\\nCLIENT BEHAVIOR (first 5 rows):\\n\", client_behavior.head())\n",
    "\n",
    "\n",
    "# 7. FUNDS ACTIVITY METRICS\n",
    "\n",
    "fund_behavior = flows.groupby(\"Product - Isin\").agg(\n",
    "    n_accounts=(\"Registrar Account - ID\", \"nunique\"),\n",
    "    n_days=(\"Centralisation Date\", \"nunique\"),\n",
    "    total_netflows=(\"Quantity - NetFlows\", \"sum\"),\n",
    "    vol_flows=(\"Quantity - NetFlows\", \"std\")\n",
    ").reset_index()\n",
    "\n",
    "print(\"\\nFUND BEHAVIOR (first 5 rows):\\n\", fund_behavior.head())\n",
    "\n",
    "\n",
    "# 8. SAVE INTERMEDIATE\n",
    "\n",
    "client_behavior.to_csv(\"client_behavior.csv\", index=False)\n",
    "fund_behavior.to_csv(\"fund_behavior.csv\", index=False)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "afb51598-3a7b-41f2-8d25-5b4b8bfb1c8a",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Partition the ISIN codes according to which of the three files they appear in.\n",
    "valid_full = isin_stocks & isin_flows & isin_nav\n",
    "stocks_only = isin_stocks - isin_flows\n",
    "flows_only = isin_flows - isin_stocks\n",
    "missing_nav = (isin_stocks | isin_flows) - isin_nav\n",
    "\n",
    "print(\"FULL usable ISIN :\", len(valid_full))\n",
    "print(\"Stocks only ISIN :\", len(stocks_only))\n",
    "print(\"Flows only ISIN :\", len(flows_only))\n",
    "print(\"Missing NAV :\", len(missing_nav))\n",
    "\n",
    "# Sets have no stable iteration order, so each group is sorted before being written:\n",
    "# re-running the notebook then produces identical files. key=str keeps the sort from\n",
    "# failing should a group ever contain a non-string value.\n",
    "isin_exports = {\n",
    "    \"isin_full.csv\": valid_full,\n",
    "    \"isin_stocks_only.csv\": stocks_only,\n",
    "    \"isin_flows_only.csv\": flows_only,\n",
    "    \"isin_missing_nav.csv\": missing_nav,\n",
    "}\n",
    "for filename, isin_group in isin_exports.items():\n",
    "    pd.DataFrame({\"isin\": sorted(isin_group, key=str)}).to_csv(filename, index=False)\n",
    "\n",
    "print(\"All ISIN groups saved into 4 separate files.\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "61e0c71a-a1c6-4ed8-ba15-b7a9badc4d4a",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Small constant added to the denominator so the ratio stays finite when\n",
    "# total_subscription is 0.\n",
    "eps = 1e-6\n",
    "\n",
    "redeemed = client_behavior[\"total_redemption\"]\n",
    "subscribed = client_behavior[\"total_subscription\"]\n",
    "\n",
    "# Redemptions relative to subscriptions.\n",
    "client_behavior[\"churn_ratio\"] = redeemed / (subscribed + eps)\n",
    "\n",
    "# 1 when the account redeemed more than it subscribed, else 0.\n",
    "client_behavior[\"churn_flag\"] = (redeemed > subscribed).astype(int)\n",
    "\n",
    "# log(1 + number of transactions).\n",
    "client_behavior[\"activity_score\"] = np.log1p(client_behavior[\"n_transactions\"])\n",
    "\n",
    "# Missing standard deviations are replaced by 0.\n",
    "client_behavior[\"flow_volatility\"] = client_behavior[\"std_flow\"].fillna(0)\n",
    "\n",
    "# 1 minus n_days divided by the number of distinct centralisation dates in flows.\n",
    "n_dates_total = flows[\"Centralisation Date\"].nunique()\n",
    "client_behavior[\"inertia_ratio\"] = 1 - client_behavior[\"n_days\"] / n_dates_total\n",
    "\n",
    "print(client_behavior.head())\n",
    "\n",
    "client_behavior.to_csv(\"client_behavior_clean.csv\", index=False)\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "8ee7e911-eb73-4846-b545-661140411c1b",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Diversification per account: distinct products held and AUM aggregates.\n",
    "# NOTE(review): total_aum sums every stocks row of the account; if stocks holds\n",
    "# one row per date this adds up AUM across dates - confirm that is intended.\n",
    "account_div = stocks.groupby(\"Registrar Account - ID\").agg(\n",
    "    n_isin_held=(\"Product - Isin\", \"nunique\"),\n",
    "    n_funds_held=(\"Product - Fund\", \"nunique\"),\n",
    "    n_asset_types=(\"Product - Asset Type\", \"nunique\"),\n",
    "    n_strategies=(\"Product - Strategy\", \"nunique\"),\n",
    "    total_aum=(\"Value - AUM €\", \"sum\"),\n",
    "    median_aum=(\"Value - AUM €\", \"median\")\n",
    ").reset_index()\n",
    "\n",
    "# AUM per (account, fund)\n",
    "aum_by_account_fund = stocks.groupby(\n",
    "    [\"Registrar Account - ID\", \"Product - Fund\"]\n",
    ")[\"Value - AUM €\"].sum().reset_index()\n",
    "\n",
    "# Concentration ratio per account = AUM of the largest fund / AUM of all funds.\n",
    "# Built-in max/sum aggregations replace groupby(...).apply(lambda ...): same\n",
    "# values, no Python call per group, and no pandas deprecation warning about\n",
    "# apply operating on the grouping columns.\n",
    "fund_aum_by_account = aum_by_account_fund.groupby(\"Registrar Account - ID\")[\"Value - AUM €\"]\n",
    "concentration = (\n",
    "    fund_aum_by_account.max() / fund_aum_by_account.sum()\n",
    ").reset_index(name=\"concentration_ratio\")\n",
    "\n",
    "# Merge diversification + concentration\n",
    "account_static = account_div.merge(concentration, on=\"Registrar Account - ID\", how=\"left\")\n",
    "\n",
    "print(account_static.head())\n",
    "print(account_static.describe())\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "76f6fa0d-9d7a-4145-af1c-986d83947f91",
   "metadata": {},
   "outputs": [],
   "source": [
    "def _most_frequent(values):\n",
    "    \"\"\"Return the most frequent non-null value of a Series, or NaN if there is none.\n",
    "\n",
    "    Series.mode() ignores missing values, so it is empty when every value of\n",
    "    the group is missing; indexing it with [0] would then raise.\n",
    "    \"\"\"\n",
    "    modes = values.mode()\n",
    "    return modes.iloc[0] if not modes.empty else np.nan\n",
    "\n",
    "\n",
    "# Geographic info per account: most frequent country and region in stocks.\n",
    "# NOTE(review): the country column is spelled \"RegistrarAccount\" (no space)\n",
    "# while the region column is \"Registrar Account\" - confirm against the data.\n",
    "geo = stocks.groupby(\"Registrar Account - ID\").agg(\n",
    "    country=(\"RegistrarAccount - Country\", _most_frequent),\n",
    "    region=(\"Registrar Account - Region\", _most_frequent)\n",
    ").reset_index()\n",
    "\n",
    "print(geo.head())\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "e9bb67ab-9029-4ace-b960-b3d6e0b8683c",
   "metadata": {},
   "outputs": [],
   "source": [
    "# 1-2. Behaviour (flows) enriched with diversification (stocks), then geography.\n",
    "# Left joins keep every account of client_behavior.\n",
    "client_master = (\n",
    "    client_behavior\n",
    "    .merge(account_static, on=\"Registrar Account - ID\", how=\"left\")\n",
    "    .merge(geo, on=\"Registrar Account - ID\", how=\"left\")\n",
    ")\n",
    "\n",
    "# 3. Log-scaled AUM features; negative values are floored at 0 before log1p.\n",
    "for aum_col in (\"total_aum\", \"median_aum\"):\n",
    "    client_master[f\"log_{aum_col}\"] = np.log1p(client_master[aum_col].clip(lower=0))\n",
    "\n",
    "# 4. Missing flow volatility becomes 0.\n",
    "client_master[\"flow_volatility\"] = client_master[\"flow_volatility\"].fillna(0)\n",
    "\n",
    "# 5. Accounts with no match in account_static get 0 for the diversification counts.\n",
    "held_cols = [\"n_isin_held\", \"n_funds_held\", \"n_asset_types\", \"n_strategies\"]\n",
    "client_master[held_cols] = client_master[held_cols].fillna(0)\n",
    "\n",
    "# 6. Missing geography is labelled \"UNKNOWN\".\n",
    "for geo_col in (\"country\", \"region\"):\n",
    "    client_master[geo_col] = client_master[geo_col].fillna(\"UNKNOWN\")\n",
    "\n",
    "# 7. Export\n",
    "client_master.to_csv(\"client_master.csv\", index=False)\n",
    "\n",
    "print(client_master.head())\n",
    "print(client_master.describe(include='all'))\n"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "fb1e98a5-6ab4-4371-ba45-6558ff38c839",
   "metadata": {},
   "source": [
    "Détection des ruptures"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "6bdd8077-c8e0-451d-a7b8-15a2705ad196",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Columns identifying one position, and one dated observation of that position.\n",
    "position_keys = [\"Registrar Account - ID\", \"Product - Isin\"]\n",
    "obs_keys = position_keys + [\"Centralisation Date\"]\n",
    "\n",
    "# --- 1. PREPARE STOCKS ---\n",
    "stocks_clean = stocks[obs_keys + [\"Quantity - AUM\"]].copy()\n",
    "stocks_clean[\"Centralisation Date\"] = pd.to_datetime(stocks_clean[\"Centralisation Date\"])\n",
    "# Chronological order inside each position is what the shift() calls below rely on.\n",
    "stocks_clean = stocks_clean.sort_values(obs_keys)\n",
    "\n",
    "# --- 2. PREPARE FLOWS ---\n",
    "flows_clean = flows[obs_keys + [\"Quantity - NetFlows\"]].copy()\n",
    "flows_clean[\"Centralisation Date\"] = pd.to_datetime(flows_clean[\"Centralisation Date\"])\n",
    "# One summed row per position and date, so the left merge cannot duplicate stock rows.\n",
    "flows_clean = flows_clean.groupby(obs_keys)[\"Quantity - NetFlows\"].sum().reset_index()\n",
    "\n",
    "# --- 3. MERGE STOCKS WITH FLOWS ---\n",
    "merged = stocks_clean.merge(flows_clean, on=obs_keys, how=\"left\")\n",
    "# A stock observation without a matching flow row is treated as a zero net flow.\n",
    "merged[\"Quantity - NetFlows\"] = merged[\"Quantity - NetFlows\"].fillna(0)\n",
    "\n",
    "# --- 4. PREVIOUS OBSERVATION OF EACH POSITION ---\n",
    "merged[\"prev_stock\"] = merged.groupby(position_keys)[\"Quantity - AUM\"].shift(1)\n",
    "merged[\"prev_netflows\"] = (\n",
    "    merged.groupby(position_keys)[\"Quantity - NetFlows\"].shift(1).fillna(0)\n",
    ")\n",
    "\n",
    "# Expected stock = previous stock + previous net flows\n",
    "merged[\"expected_stock\"] = merged[\"prev_stock\"] + merged[\"prev_netflows\"]\n",
    "\n",
    "# --- 5. COMPUTE GAP BETWEEN EXPECTED AND REAL ---\n",
    "merged[\"gap\"] = merged[\"Quantity - AUM\"] - merged[\"expected_stock\"]\n",
    "\n",
    "# Absolute tolerance for numerical noise\n",
    "TOL = 1e-6\n",
    "# The first observation of a position has no previous stock and is never flagged.\n",
    "merged[\"rupture_flag\"] = (merged[\"prev_stock\"].notna()) & (merged[\"gap\"].abs() > TOL)\n",
    "\n",
    "# --- 6. AGGREGATE BY CLIENT TO DETECT BIG ISSUES ---\n",
    "rupture_summary = merged.groupby(\"Registrar Account - ID\").agg(\n",
    "    n_ruptures=(\"rupture_flag\", \"sum\"),\n",
    "    total_obs=(\"rupture_flag\", \"count\"),\n",
    "    rupture_ratio=(\"rupture_flag\", \"mean\"),\n",
    "    max_gap=(\"gap\", lambda x: x.abs().max())\n",
    ").reset_index()\n",
    "\n",
    "# Sort by biggest anomalies\n",
    "rupture_summary = rupture_summary.sort_values(\"rupture_ratio\", ascending=False)\n",
    "\n",
    "# A bare expression in the middle of a cell is never rendered; display() shows it.\n",
    "display(rupture_summary.head(10))\n",
    "\n",
    "rupture_summary_asc = rupture_summary.sort_values(\"rupture_ratio\", ascending=True)\n",
    "rupture_summary_asc.to_csv('rupture.csv')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "9e32fd6b-4754-4196-9487-ffdc0bb4fc06",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Persist the merged stock/flow table (the index is written as the first column).\n",
    "merged.to_csv(path_or_buf=\"merged.csv\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "71cd67aa-f4b9-489e-b928-defeca459cb6",
   "metadata": {},
   "outputs": [],
   "source": [
    "rupture_summary_asc"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "72332a7e-0ab0-474b-aac7-b52ebbba7a8b",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Accounts inspected one by one; plot_account is defined in another cell and is\n",
    "# presumably responsible for rendering each figure itself.\n",
    "accounts_to_plot = [\n",
    "    '200001928',\n",
    "    '366351',\n",
    "    '365966',\n",
    "    '365568',\n",
    "    '200129601',\n",
    "    '402410',\n",
    "]\n",
    "for account_to_plot in accounts_to_plot:\n",
    "    plot_account(account_to_plot)\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "31407450-a833-4fce-8b0b-dba1b1de585f",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Columns identifying an (account, ISIN) position and a dated observation of it.\n",
    "isin_keys = [\"Registrar Account - ID\", \"Product - Isin\"]\n",
    "isin_obs_keys = isin_keys + [\"Centralisation Date\"]\n",
    "\n",
    "# 1. Stock quantities, ordered chronologically inside each position\n",
    "stocks_isin = stocks[isin_obs_keys + [\"Quantity - AUM\"]].copy()\n",
    "stocks_isin[\"Centralisation Date\"] = pd.to_datetime(stocks_isin[\"Centralisation Date\"])\n",
    "stocks_isin = stocks_isin.sort_values(isin_obs_keys)\n",
    "\n",
    "# 2. Net flows, summed to one row per position and date\n",
    "flows_isin = flows[isin_obs_keys + [\"Quantity - NetFlows\"]].copy()\n",
    "flows_isin[\"Centralisation Date\"] = pd.to_datetime(flows_isin[\"Centralisation Date\"])\n",
    "flows_isin = flows_isin.groupby(isin_obs_keys)[\"Quantity - NetFlows\"].sum().reset_index()\n",
    "\n",
    "# 3. Attach flows to stock observations; no matching flow row means a zero flow\n",
    "merged_isin = stocks_isin.merge(flows_isin, on=isin_obs_keys, how=\"left\")\n",
    "merged_isin[\"Quantity - NetFlows\"] = merged_isin[\"Quantity - NetFlows\"].fillna(0)\n",
    "\n",
    "# 4. Expected stock = previous stock + previous net flows of the same position\n",
    "merged_isin[\"prev_stock\"] = merged_isin.groupby(isin_keys)[\"Quantity - AUM\"].shift(1)\n",
    "merged_isin[\"prev_netflows\"] = (\n",
    "    merged_isin.groupby(isin_keys)[\"Quantity - NetFlows\"].shift(1).fillna(0)\n",
    ")\n",
    "merged_isin[\"expected_stock\"] = merged_isin[\"prev_stock\"] + merged_isin[\"prev_netflows\"]\n",
    "\n",
    "# 5. A rupture is a gap beyond the tolerance on a row that has a previous stock\n",
    "TOL = 1e-6\n",
    "merged_isin[\"gap\"] = merged_isin[\"Quantity - AUM\"] - merged_isin[\"expected_stock\"]\n",
    "has_previous_stock = merged_isin[\"prev_stock\"].notna()\n",
    "gap_beyond_tolerance = merged_isin[\"gap\"].abs() > TOL\n",
    "merged_isin[\"rupture_flag\"] = has_previous_stock & gap_beyond_tolerance\n",
    "\n",
    "# 6. Summary per (account, ISIN), highest rupture ratio first\n",
    "rupture_isin_summary = (\n",
    "    merged_isin.groupby(isin_keys)\n",
    "    .agg(\n",
    "        n_ruptures=(\"rupture_flag\", \"sum\"),\n",
    "        obs=(\"rupture_flag\", \"count\"),\n",
    "        rupture_ratio=(\"rupture_flag\", \"mean\"),\n",
    "        max_gap=(\"gap\", lambda x: x.abs().max()),\n",
    "    )\n",
    "    .reset_index()\n",
    "    .sort_values(\"rupture_ratio\", ascending=False)\n",
    ")\n",
    "\n",
    "rupture_isin_summary.head(20)\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "baa4b6cd-887d-45a6-af27-253a9aa8710f",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Strong ruptures (ratio > 0.5 OR huge max_gap)\n",
    "strong = rupture_isin_summary[\n",
    "    (rupture_isin_summary[\"rupture_ratio\"] > 0.5)\n",
    "    | (rupture_isin_summary[\"max_gap\"] > 50000)\n",
    "]\n",
    "\n",
    "def find_successors(account_id, isin, window_days=15):\n",
    "    \"\"\"List accounts showing a positive rupture on `isin` near a rupture of `account_id`.\n",
    "\n",
    "    For every date on which merged_isin flags a rupture for (account_id, isin),\n",
    "    rows of the same ISIN dated within `window_days` days before or after are\n",
    "    scanned, and the other accounts having a flagged rupture with a positive\n",
    "    gap there are collected.\n",
    "\n",
    "    Parameters\n",
    "    ----------\n",
    "    account_id : same type as the \"Registrar Account - ID\" column\n",
    "        Account whose ruptures are used as anchor dates.\n",
    "    isin : str\n",
    "        Value of \"Product - Isin\" to restrict the search to.\n",
    "    window_days : int, default 15\n",
    "        Half-width of the search window, in days (bounds included).\n",
    "\n",
    "    Returns\n",
    "    -------\n",
    "    list\n",
    "        Distinct candidate account IDs, sorted by their string form so the\n",
    "        result is stable between runs; empty when the account has no rupture.\n",
    "    \"\"\"\n",
    "    # Filter on the ISIN once instead of once per rupture date.\n",
    "    isin_rows = merged_isin[merged_isin[\"Product - Isin\"] == isin]\n",
    "\n",
    "    own_ruptures = (\n",
    "        (isin_rows[\"Registrar Account - ID\"] == account_id)\n",
    "        & isin_rows[\"rupture_flag\"]\n",
    "    )\n",
    "    rupture_dates = isin_rows.loc[own_ruptures, \"Centralisation Date\"].unique()\n",
    "    if len(rupture_dates) == 0:\n",
    "        return []\n",
    "\n",
    "    # Only flagged ruptures count as jumps: a bare `gap > 0` test would also\n",
    "    # accept floating-point noise below the detection tolerance.\n",
    "    jumps = isin_rows[\n",
    "        isin_rows[\"rupture_flag\"]\n",
    "        & (isin_rows[\"gap\"] > 0)\n",
    "        & (isin_rows[\"Registrar Account - ID\"] != account_id)\n",
    "    ]\n",
    "\n",
    "    half_window = pd.Timedelta(days=window_days)\n",
    "    candidates = set()\n",
    "    # pd.to_datetime guarantees Timestamp arithmetic whatever unique() returned.\n",
    "    for rupture_date in pd.to_datetime(rupture_dates):\n",
    "        in_window = jumps[\"Centralisation Date\"].between(\n",
    "            rupture_date - half_window, rupture_date + half_window\n",
    "        )\n",
    "        candidates.update(jumps.loc[in_window, \"Registrar Account - ID\"])\n",
    "\n",
    "    return sorted(candidates, key=str)\n",
    "\n",
    "find_successors(\"200129601\", \"FR0010135103\")\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "0b834da2-f781-476d-84a6-aebb38fb8dac",
   "metadata": {},
   "outputs": [],
   "source": [
    "import seaborn as sns\n",
    "import matplotlib.pyplot as plt\n",
    "\n",
    "df = merged_isin.copy()\n",
    "\n",
    "# Calendar year and month of every observation\n",
    "obs_dates = df[\"Centralisation Date\"].dt\n",
    "df[\"year\"] = obs_dates.year\n",
    "df[\"month\"] = obs_dates.month\n",
    "month_keys = [\"year\", \"month\"]\n",
    "\n",
    "# 1. Number of rows per month\n",
    "total = df.groupby(month_keys).size().reset_index(name=\"total_lines\")\n",
    "\n",
    "# 2. Number of flagged ruptures per month\n",
    "ruptures = (\n",
    "    df.loc[df[\"rupture_flag\"]]\n",
    "    .groupby(month_keys)\n",
    "    .size()\n",
    "    .reset_index(name=\"n_ruptures\")\n",
    ")\n",
    "\n",
    "# 3. Months without any rupture get a count of 0\n",
    "ratio = total.merge(ruptures, on=month_keys, how=\"left\")\n",
    "ratio[\"n_ruptures\"] = ratio[\"n_ruptures\"].fillna(0)\n",
    "\n",
    "# 4. Share of rows flagged as rupture (a fraction; rendered as % in the plot)\n",
    "ratio[\"rupture_ratio\"] = ratio[\"n_ruptures\"] / ratio[\"total_lines\"]\n",
    "\n",
    "# 5. Year x month grid; months absent from the data are shown as 0\n",
    "heatmap_ratio = ratio.pivot(index=\"year\", columns=\"month\", values=\"rupture_ratio\").fillna(0)\n",
    "\n",
    "# 6. Plot\n",
    "heatmap_style = dict(\n",
    "    cmap=\"Reds\",\n",
    "    linewidths=.3,\n",
    "    linecolor=\"grey\",\n",
    "    annot=True,\n",
    "    fmt=\".2%\",\n",
    "    cbar_kws={'label': 'Proportion de ruptures'},\n",
    ")\n",
    "plt.figure(figsize=(14, 7))\n",
    "sns.heatmap(heatmap_ratio, **heatmap_style)\n",
    "\n",
    "plt.title(\"Heatmap de la proportion de ruptures (par année et mois)\", fontsize=16)\n",
    "plt.xlabel(\"Mois\")\n",
    "plt.ylabel(\"Année\")\n",
    "plt.show()\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "aa5862ab-ec8e-47f8-8cb0-cd51503efed8",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Observations with year / month and the country of their account attached\n",
    "df = (\n",
    "    merged_isin\n",
    "    .assign(\n",
    "        year=lambda obs: obs[\"Centralisation Date\"].dt.year,\n",
    "        month=lambda obs: obs[\"Centralisation Date\"].dt.month,\n",
    "    )\n",
    "    .merge(\n",
    "        geo[[\"Registrar Account - ID\", \"country\"]],\n",
    "        on=\"Registrar Account - ID\",\n",
    "        how=\"left\",\n",
    "    )\n",
    ")\n",
    "\n",
    "# Accounts without a country in geo are grouped under \"UNKNOWN\"\n",
    "df[\"country\"] = df[\"country\"].fillna(\"UNKNOWN\")\n",
    "\n",
    "# Number of rows per country\n",
    "total_country = df.groupby(\"country\").size().reset_index(name=\"total_obs\")\n",
    "\n",
    "# Number of flagged ruptures per country\n",
    "rupt_country = (\n",
    "    df.loc[df[\"rupture_flag\"]]\n",
    "    .groupby(\"country\")\n",
    "    .size()\n",
    "    .reset_index(name=\"ruptures\")\n",
    ")\n",
    "\n",
    "# Counts, ratio, and ordering by decreasing rupture ratio\n",
    "country_stats = (\n",
    "    total_country.merge(rupt_country, on=\"country\", how=\"left\")\n",
    "    .assign(ruptures=lambda stats: stats[\"ruptures\"].fillna(0))\n",
    "    .assign(rupture_ratio=lambda stats: stats[\"ruptures\"] / stats[\"total_obs\"])\n",
    "    .sort_values(\"rupture_ratio\", ascending=False)\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "86d2a91c-d8d8-416c-8dc4-dc3f4ae7ca90",
   "metadata": {},
   "outputs": [],
   "source": [
    "# plotly.express was only imported in a later cell, so this cell raised a\n",
    "# NameError on a fresh kernel run from top to bottom.\n",
    "import plotly.express as px\n",
    "\n",
    "# Percentage column used for the hover display only\n",
    "country_stats_plot = country_stats.copy()\n",
    "country_stats_plot[\"rupture_pct\"] = country_stats_plot[\"rupture_ratio\"] * 100\n",
    "\n",
    "# Bars ordered by decreasing rupture ratio\n",
    "country_stats_plot = country_stats_plot.sort_values(\"rupture_ratio\", ascending=False)\n",
    "\n",
    "fig = px.bar(\n",
    "    country_stats_plot,\n",
    "    x=\"country\",\n",
    "    y=\"rupture_ratio\",\n",
    "    hover_data={\n",
    "        \"rupture_pct\": ':.2f',\n",
    "        \"ruptures\": True,\n",
    "        \"total_obs\": True,\n",
    "        \"rupture_ratio\": False,  # hide the decimal version in the hover box\n",
    "    },\n",
    "    labels={\n",
    "        \"country\": \"Pays\",\n",
    "        \"rupture_ratio\": \"Proportion de ruptures\",\n",
    "        \"rupture_pct\": \"% de ruptures\",\n",
    "        \"ruptures\": \"Nb de ruptures\",\n",
    "        \"total_obs\": \"Nb d'observations\"\n",
    "    },\n",
    "    title=\"Proportion de ruptures par pays (avec volumes au survol)\"\n",
    ")\n",
    "\n",
    "# Show the y axis as percentages\n",
    "fig.update_yaxes(tickformat=\".1%\")\n",
    "\n",
    "fig.update_layout(\n",
    "    xaxis_tickangle=-45,\n",
    "    bargap=0.2\n",
    ")\n",
    "\n",
    "fig.show()\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "e1c114db-5fbd-4cd3-a897-b9d4c96053fd",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Export the rows whose country is exactly \"JAPAN\" for manual inspection.\n",
    "df.loc[df[\"country\"] == \"JAPAN\"].to_csv(\"Japan.csv\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "9fec57f0-dd80-47bc-aacb-518c0ac0a4f6",
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "95bc353d-e883-4989-aaca-1b3c9b51ee5a",
   "metadata": {},
   "outputs": [],
   "source": [
    "rs = rupture_summary.copy()\n",
    "\n",
    "# 1. Standard numeric statistics\n",
    "print(\"\\n=== BASIC NUMERIC STATS ===\")\n",
    "print(rs[\"rupture_ratio\"].describe(percentiles=[0.01, 0.05, 0.10, 0.25, 0.5, 0.75, 0.90, 0.95, 0.99]))\n",
    "\n",
    "\n",
    "# 2. Distribution by class (bins)\n",
    "# The leading (-inf, 0] bin holds the accounts with exactly 0 rupture, so \"0%\"\n",
    "# is an ordinary category placed first, instead of one appended after\n",
    "# \"50–100%\" and patched in afterwards. Bins are right-closed: (0, 0.001], ...\n",
    "rs[\"rupture_bucket\"] = pd.cut(\n",
    "    rs[\"rupture_ratio\"],\n",
    "    bins=[-np.inf, 0, 0.001, 0.01, 0.05, 0.10, 0.25, 0.50, 1.01],\n",
    "    labels=[\n",
    "        \"0%\",\n",
    "        \"0–0.1%\",\n",
    "        \"0.1–1%\",\n",
    "        \"1–5%\",\n",
    "        \"5–10%\",\n",
    "        \"10–25%\",\n",
    "        \"25–50%\",\n",
    "        \"50–100%\"\n",
    "    ]\n",
    ")\n",
    "\n",
    "# sort_index() follows the category order, i.e. increasing rupture ratio\n",
    "bucket_counts = rs[\"rupture_bucket\"].value_counts().sort_index()\n",
    "print(bucket_counts)\n",
    "\n",
    "\n",
    "# 3. Percentages\n",
    "bucket_percent = (bucket_counts / len(rs) * 100).round(2)\n",
    "\n",
    "print(\"\\n=== DISTRIBUTION (PERCENT) ===\")\n",
    "print(bucket_percent)\n",
    "\n",
    "\n",
    "# 4. Number of accounts without any rupture\n",
    "no_rupture = (rs[\"n_ruptures\"] == 0).sum()\n",
    "print(f\"\\nComptes avec 0 rupture = {no_rupture} ({no_rupture/len(rs)*100:.2f}%)\")\n",
    "\n",
    "# 5. Accounts above the 75% and 10% rupture-ratio thresholds\n",
    "severe = (rs[\"rupture_ratio\"] > 0.75).sum()\n",
    "print(f\"Comptes avec rupture_ratio > 75% = {severe} ({severe/len(rs)*100:.2f}%)\")\n",
    "\n",
    "medium = (rs[\"rupture_ratio\"] > 0.10).sum()\n",
    "print(f\"Comptes avec rupture_ratio > 10% = {medium} ({medium/len(rs)*100:.2f}%)\")\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "86d8fe0e-fa6c-46df-bb4c-054d1a677b38",
   "metadata": {},
   "outputs": [],
   "source": [
    "import plotly.express as px\n",
    "\n",
    "# Histogram of the per-account rupture ratio held in rs\n",
    "histogram_options = dict(\n",
    "    x=\"rupture_ratio\",\n",
    "    nbins=50,\n",
    "    title=\"Distribution du rupture_ratio\",\n",
    "    labels={\"rupture_ratio\": \"Rupture Ratio\"},\n",
    ")\n",
    "fig = px.histogram(rs, **histogram_options).update_layout(bargap=0.05)\n",
    "fig.show()\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "425b36d0-c92a-4405-be28-35b1fc292fec",
   "metadata": {},
   "outputs": [],
   "source": [
    "# --- 1. Basic filters ---\n",
    "merged[\"year\"] = merged[\"Centralisation Date\"].dt.year\n",
    "\n",
    "# Keep only the rows of calendar year 2021 that are flagged as ruptures\n",
    "ruptures_2021 = merged.loc[\n",
    "    (merged[\"year\"] == 2021) & (merged[\"rupture_flag\"] == True)\n",
    "].copy()\n",
    "\n",
    "print(\"Nombre total de ruptures en 2021 :\", len(ruptures_2021))\n",
    "\n",
    "# --- 2. Sign of the gap ---\n",
    "ruptures_2021[\"gap_type\"] = np.where(ruptures_2021[\"gap\"] > 0, \"positive\", \"negative\")\n",
    "\n",
    "# --- 3. Global statistics: counts and shares of each sign ---\n",
    "gap_counts = ruptures_2021[\"gap_type\"].value_counts()\n",
    "gap_percent = ruptures_2021[\"gap_type\"].value_counts(normalize=True) * 100\n",
    "\n",
    "print(\"\\n=== RUPTURES 2021 — POSITIVES vs NEGATIVES ===\")\n",
    "print(gap_counts)\n",
    "print(\"\\n(%)\")\n",
    "print(gap_percent.map(\"{:.2f}%\".format))\n",
    "\n",
    "# --- 4. Size of the gaps, per sign ---\n",
    "intensity_stats = ruptures_2021.groupby(\"gap_type\")[\"gap\"].describe()\n",
    "print(\"\\n=== STATISTIQUES DES GAPS ===\")\n",
    "print(intensity_stats)\n",
    "\n",
    "# --- 5. Quick visualisation ---\n",
    "import seaborn as sns\n",
    "import matplotlib.pyplot as plt\n",
    "\n",
    "plt.figure(figsize=(10,5))\n",
    "sns.histplot(data=ruptures_2021, x=\"gap\", hue=\"gap_type\", bins=80, kde=True)\n",
    "# Symmetric x range taken from the largest absolute gap of the whole merged table\n",
    "max_abs_gap = merged[\"gap\"].abs().max()\n",
    "plt.xlim(-max_abs_gap, max_abs_gap)\n",
    "plt.title(\"Distribution des gaps de rupture en 2021\")\n",
    "plt.xlabel(\"Gap (AUM_{t} − Expected AUM_{t})\")\n",
    "plt.show()\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "df9e0005-93f2-4885-baef-2e54921a42f4",
   "metadata": {},
   "outputs": [],
   "source": [
    "# --- 1. ADD YEAR ---\n",
    "merged[\"year\"] = merged[\"Centralisation Date\"].dt.year\n",
    "\n",
    "# --- 2. DEFINE PERIODS ---\n",
    "conditions = [\n",
    "    merged[\"year\"] < 2021,\n",
    "    merged[\"year\"] == 2021,\n",
    "    merged[\"year\"] > 2021\n",
    "]\n",
    "\n",
    "period_labels = [\"before_2021\", \"during_2021\", \"after_2021\"]\n",
    "\n",
    "# Rows matching none of the conditions (missing date) fall under \"unknown\"\n",
    "merged[\"period\"] = np.select(\n",
    "    conditions,\n",
    "    period_labels,\n",
    "    default=\"unknown\"\n",
    ")\n",
    "\n",
    "# --- 3. CREATE GAP TYPE & FILTER ONLY RUPTURES ---\n",
    "merged[\"gap_type\"] = np.where(\n",
    "    merged[\"gap\"] > 0, \"positive\",\n",
    "    np.where(merged[\"gap\"] < 0, \"negative\", \"zero\")\n",
    ")\n",
    "\n",
    "ruptures = merged[merged[\"rupture_flag\"] == True].copy()\n",
    "\n",
    "# --- 4. TOTAL OBS PER PERIOD ---\n",
    "total_obs = merged.groupby(\"period\").size().rename(\"total_obs\")\n",
    "\n",
    "# --- 5. TOTAL RUPTURES PER PERIOD ---\n",
    "# Reindexed on every observed period so a period without rupture reports 0\n",
    "# (and a 0 ratio) instead of NaN.\n",
    "rupture_counts = (\n",
    "    ruptures.groupby(\"period\").size()\n",
    "    .reindex(total_obs.index, fill_value=0)\n",
    "    .rename(\"rupture_count\")\n",
    ")\n",
    "\n",
    "# --- 6. PROPORTION OF RUPTURES ---\n",
    "rupture_ratio = (rupture_counts / total_obs).rename(\"rupture_ratio\")\n",
    "\n",
    "# --- 7. POSITIVE / NEGATIVE GAPS (% among ruptures, per period) ---\n",
    "# Dividing by a transform(\"sum\") keeps the (period, gap_type) index as is;\n",
    "# groupby(level=0).apply(...) repeats the period level in the result index\n",
    "# on recent pandas versions.\n",
    "period_gap_counts = ruptures.groupby([\"period\", \"gap_type\"]).size()\n",
    "gap_dist = (\n",
    "    period_gap_counts\n",
    "    / period_gap_counts.groupby(level=\"period\").transform(\"sum\")\n",
    "    * 100\n",
    ")\n",
    "\n",
    "\n",
    "# --- 8. MERGE AND DISPLAY ---\n",
    "summary = pd.concat([total_obs, rupture_counts, rupture_ratio], axis=1)\n",
    "summary[\"rupture_ratio\"] = (summary[\"rupture_ratio\"] * 100).round(2)\n",
    "\n",
    "print(\"\\n=== RUPTURE SUMMARY (in %) ===\")\n",
    "print(summary)\n",
    "\n",
    "print(\"\\n=== GAP POSITIVE / NEGATIVE DISTRIBUTION (in %) ===\")\n",
    "print(gap_dist)\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "195205bd-d817-41f9-a0fd-18d8b804515f",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Imported here because no earlier cell in this part of the notebook provides\n",
    "# go or make_subplots.\n",
    "import plotly.graph_objects as go\n",
    "from plotly.subplots import make_subplots\n",
    "\n",
    "# --- 1. DEFINE PERIODS ---\n",
    "merged[\"period2\"] = np.where(\n",
    "    merged[\"Centralisation Date\"] < pd.Timestamp(\"2021-09-01\"),\n",
    "    \"Before Sep 2021\",\n",
    "    \"After Sep 2021\"\n",
    ")\n",
    "\n",
    "ruptures = merged[merged[\"rupture_flag\"] == True].copy()\n",
    "\n",
    "# --- 2. Sign of the gap ---\n",
    "# Derived from the gap itself so the cell does not depend on a gap_type column\n",
    "# created elsewhere. As before, anything that is not negative is \"positive\".\n",
    "ruptures[\"gap_type\"] = np.where(ruptures[\"gap\"] < 0, \"negative\", \"positive\")\n",
    "\n",
    "# --- 3. Compute gap counts ---\n",
    "# One ordering shared by values, labels and colours, so they cannot drift apart\n",
    "# (the labels used to be listed in the opposite order of the values).\n",
    "period_order = [\"Before Sep 2021\", \"After Sep 2021\"]\n",
    "gap_order = [\"positive\", \"negative\"]\n",
    "gap_labels = [\"Positive gaps\", \"Negative gaps\"]\n",
    "gap_colors = [\"#3498DB\", \"#E67E22\"]\n",
    "\n",
    "# reindex guarantees both periods and both signs exist, filled with 0 if absent\n",
    "gap_counts = (\n",
    "    ruptures.groupby([\"period2\", \"gap_type\"])\n",
    "    .size()\n",
    "    .unstack(fill_value=0)\n",
    "    .reindex(index=period_order, columns=gap_order, fill_value=0)\n",
    ")\n",
    "\n",
    "# --- 4. Extract values ---\n",
    "before_vals = gap_counts.loc[\"Before Sep 2021\"].values\n",
    "after_vals  = gap_counts.loc[\"After Sep 2021\"].values\n",
    "\n",
    "# --- 5. MAKE TWO DONUT CHARTS ---\n",
    "fig = make_subplots(\n",
    "    rows=1, cols=2,\n",
    "    specs=[[{\"type\": \"pie\"}, {\"type\": \"pie\"}]],\n",
    "    subplot_titles=tuple(period_order)\n",
    ")\n",
    "\n",
    "for col_idx, period_vals in enumerate([before_vals, after_vals], start=1):\n",
    "    fig.add_trace(\n",
    "        go.Pie(\n",
    "            labels=gap_labels,\n",
    "            values=period_vals,\n",
    "            marker_colors=gap_colors,\n",
    "            hole=0.45,\n",
    "            textinfo=\"label+percent\"\n",
    "        ),\n",
    "        row=1, col=col_idx\n",
    "    )\n",
    "\n",
    "# Plotly titles break lines on <br>, not on a newline character\n",
    "fig.update_layout(\n",
    "    title=\"Nature des ruptures (positive / negative)<br>Avant vs Après Septembre 2021\",\n",
    "    showlegend=True\n",
    ")\n",
    "\n",
    "fig.show()\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "9583f188-8601-425b-908f-61c2ee1f8da2",
   "metadata": {},
   "outputs": [],
   "source": [
    "import plotly.graph_objects as go\n",
    "from plotly.subplots import make_subplots\n",
    "\n",
    "# One ordering shared by values, labels and colours, so they cannot drift apart\n",
    "# (the labels used to be listed in the opposite order of the values).\n",
    "period_order = [\"Before Sep 2021\", \"After Sep 2021\"]\n",
    "gap_order = [\"positive\", \"negative\"]\n",
    "gap_labels = [\"Positive gaps\", \"Negative gaps\"]\n",
    "gap_colors = [\"#3498DB\", \"#E67E22\"]\n",
    "\n",
    "# --- 1. Compute gap counts by period ---\n",
    "# ruptures must already carry the period2 and gap_type columns.\n",
    "# reindex guarantees both periods and both signs exist, filled with 0 if absent.\n",
    "gap_counts = (\n",
    "    ruptures.groupby([\"period2\", \"gap_type\"])\n",
    "    .size()\n",
    "    .unstack(fill_value=0)\n",
    "    .reindex(index=period_order, columns=gap_order, fill_value=0)\n",
    ")\n",
    "\n",
    "# --- 2. Extract values ---\n",
    "before_vals = gap_counts.loc[\"Before Sep 2021\"].values\n",
    "after_vals  = gap_counts.loc[\"After Sep 2021\"].values\n",
    "\n",
    "# --- 3. Plot : TWO PIE CHARTS side by side ---\n",
    "# Titles name the actual split date (September 2021) used by period2.\n",
    "fig = make_subplots(\n",
    "    rows=1, cols=2,\n",
    "    specs=[[{\"type\": \"pie\"}, {\"type\": \"pie\"}]],\n",
    "    subplot_titles=tuple(period_order)\n",
    ")\n",
    "\n",
    "for col_idx, period_vals in enumerate([before_vals, after_vals], start=1):\n",
    "    fig.add_trace(\n",
    "        go.Pie(\n",
    "            labels=gap_labels,\n",
    "            values=period_vals,\n",
    "            marker_colors=gap_colors,\n",
    "            hole=0.35\n",
    "        ),\n",
    "        row=1, col=col_idx\n",
    "    )\n",
    "\n",
    "# Plotly titles break lines on <br>, not on a newline character\n",
    "fig.update_layout(\n",
    "    title=\"Répartition des ruptures (positive / negative)<br>Avant vs Après Septembre 2021\"\n",
    ")\n",
    "\n",
    "fig.show()\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "f4e29536-eeed-4b91-a59c-b373cf14a5fc",
   "metadata": {},
   "outputs": [],
   "source": [
    "import plotly.graph_objects as go\n",
    "\n",
    "# --- 1. Split observations at 1 September 2021 ---\n",
    "sept_2021 = pd.Timestamp(\"2021-09-01\")\n",
    "is_before_sept_2021 = merged[\"Centralisation Date\"] < sept_2021\n",
    "merged[\"period2\"] = np.where(is_before_sept_2021, \"Before Sep 2021\", \"After Sep 2021\")\n",
    "\n",
    "# --- 2. Keep only ruptures ---\n",
    "ruptures = merged.loc[merged[\"rupture_flag\"] == True].copy()\n",
    "\n",
    "# --- 3. Ruptures per period, in a fixed order; an absent period counts 0 ---\n",
    "period_order = [\"Before Sep 2021\", \"After Sep 2021\"]\n",
    "rupture_counts = ruptures[\"period2\"].value_counts().reindex(period_order).fillna(0)\n",
    "\n",
    "# --- 4. Donut chart ---\n",
    "period_pie = go.Pie(\n",
    "    labels=rupture_counts.index,\n",
    "    values=rupture_counts.values,\n",
    "    hole=0.45,\n",
    "    marker_colors=[\"#2ECC71\", \"#E74C3C\"],\n",
    "    textinfo=\"percent+value\",\n",
    ")\n",
    "fig = go.Figure(data=[period_pie])\n",
    "\n",
    "fig.update_layout(\n",
    "    title=\"Répartition des ruptures\"\n",
    ")\n",
    "\n",
    "fig.show()\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "553f91fa-5017-4685-ab31-afe2aa247e13",
   "metadata": {},
   "outputs": [],
   "source": [
    "import pandas as pd\n",
    "import numpy as np\n",
    "\n",
    "# Typology of the ruptures observed from September 2021 onwards.\n",
    "# Reads `merged` (defined outside this cell); produces `post_rupt`, `stats`,\n",
    "# `client_stats` and `isin_stats`.\n",
    "\n",
    "# 1. Restrict to the period starting in September 2021\n",
    "cutoff = pd.Timestamp(\"2021-09-01\")\n",
    "post = merged[merged[\"Centralisation Date\"] >= cutoff].copy()\n",
    "\n",
    "# 2. Keep only the rows flagged as ruptures\n",
    "post_rupt = post[post[\"rupture_flag\"] == True].copy()\n",
    "\n",
    "# 3. Absolute gap + relative gap (share of the stock); a zero stock is turned\n",
    "#    into NaN so the division never produces inf\n",
    "post_rupt[\"gap_abs\"] = post_rupt[\"gap\"].abs()\n",
    "post_rupt[\"gap_rel\"] = post_rupt[\"gap_abs\"] / post_rupt[\"Quantity - AUM\"].replace(0, np.nan)\n",
    "\n",
    "# 4. Global percentiles of the absolute gap\n",
    "p90 = post_rupt[\"gap_abs\"].quantile(0.90)\n",
    "p95 = post_rupt[\"gap_abs\"].quantile(0.95)\n",
    "p99 = post_rupt[\"gap_abs\"].quantile(0.99)\n",
    "\n",
    "# 5. Automatic classification\n",
    "def classify_gap(gap, gap_rel, acct):\n",
    "    \"\"\"Label a single gap as \"reset\" or \"spike\"; return None for anything smaller.\n",
    "\n",
    "    Not called in this cell: the `gap_type2` column below is built with\n",
    "    vectorised thresholds, which do not apply the relative-gap rule used here.\n",
    "\n",
    "    Parameters\n",
    "    ----------\n",
    "    gap : signed gap of the row.\n",
    "    gap_rel : absolute gap divided by the stock (may be NaN).\n",
    "    acct : account identifier; currently unused.\n",
    "\n",
    "    Returns\n",
    "    -------\n",
    "    \"reset\", \"spike\" or None.\n",
    "    \"\"\"\n",
    "    # `gap_abs` is not a parameter, so derive it from `gap` instead of\n",
    "    # relying on a global that does not exist.\n",
    "    gap_abs = abs(gap)\n",
    "\n",
    "    # RESET: at or above the 99th percentile, or at least 90 % of the stock\n",
    "    if gap_abs >= p99 or gap_rel >= 0.90:\n",
    "        return \"reset\"\n",
    "\n",
    "    # SPIKE: at or above the 95th percentile\n",
    "    if gap_abs >= p95:\n",
    "        return \"spike\"\n",
    "\n",
    "    # Shift vs. micro depends on the account-level mean gap, which is not\n",
    "    # available at row level.\n",
    "    return None\n",
    "\n",
    "# Mean signed gap per account (directional offset)\n",
    "shift_info = post_rupt.groupby(\"Registrar Account - ID\")[\"gap\"].mean().rename(\"avg_gap\")\n",
    "\n",
    "post_rupt = post_rupt.merge(shift_info, on=\"Registrar Account - ID\", how=\"left\")\n",
    "\n",
    "# Thresholds are tested from the most to the least extreme; rows matching none\n",
    "# of them fall back to \"micro\".\n",
    "median_gap_abs = post_rupt[\"gap_abs\"].median()\n",
    "post_rupt[\"gap_type2\"] = np.where(\n",
    "    post_rupt[\"gap_abs\"] >= p99, \"reset\",\n",
    "    np.where(\n",
    "        post_rupt[\"gap_abs\"] >= p95, \"spike\",\n",
    "        np.where(post_rupt[\"avg_gap\"].abs() > median_gap_abs, \"shift\", \"micro\"),\n",
    "    ),\n",
    ")\n",
    "\n",
    "# 6. Global statistics, in percent (scaled before rounding to avoid float artefacts)\n",
    "stats = post_rupt[\"gap_type2\"].value_counts(normalize=True).mul(100).round(1)\n",
    "print(\"\\n=== DISTRIBUTION DES TYPES DE GAPS POST-2021 ===\")\n",
    "print(stats)\n",
    "\n",
    "def _gap_type_share(frame, key):\n",
    "    \"\"\"Return the percentage of each `gap_type2` value within every group of `key`.\"\"\"\n",
    "    return (\n",
    "        frame.groupby(key)[\"gap_type2\"]\n",
    "        .value_counts(normalize=True)\n",
    "        .rename(\"ratio\")\n",
    "        .mul(100)\n",
    "        .reset_index()\n",
    "    )\n",
    "\n",
    "# 7. Shares per client account\n",
    "client_stats = _gap_type_share(post_rupt, \"Registrar Account - ID\")\n",
    "\n",
    "# 8. Shares per ISIN\n",
    "isin_stats = _gap_type_share(post_rupt, \"Product - Isin\")\n",
    "\n",
    "print(\"\\n=== TOP ISIN PAR RESET ===\")\n",
    "print(isin_stats[isin_stats[\"gap_type2\"]==\"reset\"].sort_values(\"ratio\", ascending=False).head(10))\n",
    "\n",
    "print(\"\\n=== TOP CLIENTS PAR RESET ===\")\n",
    "print(client_stats[client_stats[\"gap_type2\"]==\"reset\"].sort_values(\"ratio\", ascending=False).head(10))\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "f4396f2c-a8b6-4ea9-8292-093f900bf260",
   "metadata": {},
   "outputs": [],
   "source": [
    "import plotly.graph_objects as go\n",
    "\n",
    "# Shares are recomputed from `post_rupt[\"gap_type2\"]` (built in the previous\n",
    "# cell) rather than pasted from an earlier run, so the chart follows the data.\n",
    "# NOTE(review): the code -> label mapping is inferred from the shares that\n",
    "# were hard-coded here before (50.4 / 44.6 / 4.0 / 1.0) — confirm.\n",
    "type_labels = {\n",
    "    \"micro\": \"Micro-ruptures\",\n",
    "    \"shift\": \"Décalage\",\n",
    "    \"spike\": \"Anomalies ponctuelles\",\n",
    "    \"reset\": \"Remise à zéro\",\n",
    "}\n",
    "\n",
    "# Percentages in a fixed order; a type that never occurs is shown as 0.\n",
    "type_share = (\n",
    "    post_rupt[\"gap_type2\"]\n",
    "    .value_counts(normalize=True)\n",
    "    .mul(100)\n",
    "    .round(1)\n",
    "    .reindex(list(type_labels))\n",
    "    .fillna(0)\n",
    ")\n",
    "labels = list(type_labels.values())\n",
    "values = type_share.tolist()\n",
    "\n",
    "# --- Pie chart ---\n",
    "fig = go.Figure(\n",
    "    data=[go.Pie(\n",
    "        labels=labels,\n",
    "        values=values,\n",
    "        hole=0.35,                  # donut style (easier to read)\n",
    "        textinfo='percent',\n",
    "        marker=dict(colors=[\"#3498DB\", \"#E67E22\", \"#9B59B6\", \"#E74C3C\"])\n",
    "    )]\n",
    ")\n",
    "\n",
    "fig.update_layout(\n",
    "    title=\"Typologie des ruptures depuis Septembre 2021\",\n",
    "    legend_title=\"Type de gap\",\n",
    ")\n",
    "\n",
    "fig.show()\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "3df1f839-44d4-4894-bdfa-6851971d1983",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Yearly rupture rate. Reads `merged` (defined outside this cell) and adds a\n",
    "# `year` column to it.\n",
    "merged[\"year\"] = merged[\"Centralisation Date\"].dt.year\n",
    "\n",
    "# Per year: number of non-null gaps and sum of the rupture flag.\n",
    "yearly_stats = (\n",
    "    merged.groupby(\"year\")\n",
    "    .agg(total_obs=(\"gap\", \"count\"), ruptures=(\"rupture_flag\", \"sum\"))\n",
    "    .reset_index()\n",
    ")\n",
    "\n",
    "# Share of observations flagged as ruptures in each year.\n",
    "yearly_stats[\"rupture_rate\"] = yearly_stats[\"ruptures\"].div(yearly_stats[\"total_obs\"])\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "f312684d-0815-439c-a632-cadd1cbb779c",
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.13.8"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}