{ "cells": [ { "cell_type": "code", "execution_count": 1, "id": "2e8cf88b-cecf-409f-9c2d-c3762b233f05", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Requirement already satisfied: openpyxl in /opt/python/lib/python3.13/site-packages (3.1.5)\n", "Requirement already satisfied: et-xmlfile in /opt/python/lib/python3.13/site-packages (from openpyxl) (2.0.0)\n", "\n", "\u001b[1m[\u001b[0m\u001b[34;49mnotice\u001b[0m\u001b[1;39;49m]\u001b[0m\u001b[39;49m A new release of pip is available: \u001b[0m\u001b[31;49m25.2\u001b[0m\u001b[39;49m -> \u001b[0m\u001b[32;49m25.3\u001b[0m\n", "\u001b[1m[\u001b[0m\u001b[34;49mnotice\u001b[0m\u001b[1;39;49m]\u001b[0m\u001b[39;49m To update, run: \u001b[0m\u001b[32;49mpip install --upgrade pip\u001b[0m\n" ] } ], "source": [ "!pip install openpyxl" ] }, { "cell_type": "code", "execution_count": 2, "id": "126c8a80-d9ad-4816-84f0-0c3d580f62c8", "metadata": {}, "outputs": [], "source": [ "import pandas as pd\n", "import numpy as np\n", "import matplotlib.pyplot as plt" ] }, { "cell_type": "code", "execution_count": 3, "id": "ff2261fb-9516-4410-b42d-3acc8dc1a460", "metadata": {}, "outputs": [], "source": [ "import os\n", "import s3fs\n", "os.environ[\"AWS_ACCESS_KEY_ID\"] = 'N1DBJCHI7YTK9AVMG6XT'\n", "os.environ[\"AWS_SECRET_ACCESS_KEY\"] = 'SRCPMh8a1eQxX6Z09GeDxZoD55MBpnkJzyBctLII'\n", "os.environ[\"AWS_SESSION_TOKEN\"] = 'eyJhbGciOiJIUzUxMiIsInR5cCI6IkpXVCJ9.eyJhY2Nlc3NLZXkiOiJOMURCSkNISTdZVEs5QVZNRzZYVCIsImFjciI6IjAiLCJhbGxvd2VkLW9yaWdpbnMiOlsiKiJdLCJhdWQiOlsibWluaW8iLCJhY2NvdW50Il0sImF1dGhfdGltZSI6MTc2MzEzMTgzNiwiYXpwIjoib255eGlhLW1pbmlvIiwiZW1haWwiOiJzYXJhaC50aG91bXlyZUBlbnNhZS5mciIsImVtYWlsX3ZlcmlmaWVkIjp0cnVlLCJleHAiOjE3NjQzNDE0MzksImZhbWlseV9uYW1lIjoiVEhPVU1ZUkUiLCJnaXZlbl9uYW1lIjoiU2FyYWgiLCJncm91cHMiOlsiYmRjLWRhdGEiLCJiZGMtY2FybWlnbmFjLWczIl0sImlhdCI6MTc2MzEzMTgzOCwiaXNzIjoiaHR0cHM6Ly9hdXRoLmdyb3VwZS1nZW5lcy5mci9yZWFsbXMvZ2VuZXMiLCJqdGkiOiJkY2I2MWJiZi1lZjU4LTRhMTItOGYyZS1jYTI0ZmUyNTA2YzEiLCJuYW1lIjoiU2FyYWggVEhPVU1ZUkUiLCJwb2xpY3kiOiJzdHNvbmx5IiwicHJlZmVycmVkX3VzZXJuYW1lIjoic3Rob3VteXJlLWVuc2FlIiwicmVhbG1fYWNjZXNzIjp7InJvbGVzIjpbIm9mZmxpbmVfYWNjZXNzIiwiZGVmYXVsdC1yb2xlcy1nZW5lcyIsInVtYV9hdXRob3JpemF0aW9uIl19LCJyZXNvdXJjZV9hY2Nlc3MiOnsiYWNjb3VudCI6eyJyb2xlcyI6WyJtYW5hZ2UtYWNjb3VudCIsIm1hbmFnZS1hY2NvdW50LWxpbmtzIiwidmlldy1wcm9maWxlIl19fSwic2NvcGUiOiJvcGVuaWQgcHJvZmlsZSBlbWFpbCIsInNpZCI6ImQxMDI0NGVlLWE3ZDMtNDA5MC04ZDA3LWNlOWY3YjM5MDRkNCIsInN1YiI6ImVhYWVkN2QyLWM4MjYtNGIxNC05MzczLTYwYjNhODhlMWFiNiIsInR5cCI6IkJlYXJlciJ9.sLXOE8w930_dXU0yNWroaDvaTvcUUCONMcbgbKeMEduQebXQjOS7gEQxo-I7Q2oqLFb_dhg1zBlwx5VpNjyTMA'\n", "os.environ[\"AWS_DEFAULT_REGION\"] = 'us-east-1'\n", "fs = s3fs.S3FileSystem(\n", " client_kwargs={'endpoint_url': 'https://'+'minio-simple.lab.groupe-genes.fr'},\n", " key = os.environ[\"AWS_ACCESS_KEY_ID\"], \n", " secret = os.environ[\"AWS_SECRET_ACCESS_KEY\"], \n", " token = os.environ[\"AWS_SESSION_TOKEN\"])" ] }, { "cell_type": "markdown", "id": "3d36f3f0-bd40-4a83-96d1-b46d75f5a4c5", "metadata": {}, "source": [ "# data exploration" ] }, { "cell_type": "markdown", "id": "eaf5c5a0-eb1c-4242-b893-7600e6def109", "metadata": {}, "source": [ "Fonctions utiles" ] }, { "cell_type": "code", "execution_count": 4, "id": "60e2035c-c2f0-4c51-97df-102e67ba96ee", "metadata": {}, "outputs": [], "source": [ "def plot_account(account_id, isin=None):\n", " \"\"\"\n", " Plots the stock (Quantity - AUM) evolution for a given Registrar Account.\n", " Optionally, only for one ISIN.\n", " \"\"\"\n", "\n", " df = merged.copy()\n", "\n", " # Filter by account\n", " df = df[df[\"Registrar Account - ID\"] == account_id]\n", "\n", " if isin is not None:\n", " df = df[df[\"Product - Isin\"] == isin]\n", "\n", " if df.empty:\n", " print(f\"No data found for account {account_id}\")\n", " return\n", "\n", " df_plot = df.groupby(\"Centralisation Date\")[\"Quantity - AUM\"].sum().reset_index()\n", "\n", " df_plot = df_plot.sort_values(\"Centralisation Date\")\n", "\n", " # Plot\n", " plt.figure(figsize=(12, 4))\n", " plt.plot(df_plot[\"Centralisation Date\"], df_plot[\"Quantity - AUM\"], marker='o')\n", " plt.title(f\"Stock Evolution for Account {account_id}\", fontsize=14)\n", " plt.xlabel(\"Date\")\n", " plt.ylabel(\"Total AUM\")\n", " plt.grid(True)\n", " plt.show()\n" ] }, { "cell_type": "code", "execution_count": 5, "id": "37e008b1-32d4-44be-9d23-1b90a5a26f89", "metadata": {}, "outputs": [], "source": [ "# 2. BASIC INSPECTION\n", "\n", "def quick_info(df, name):\n", " print(\"\\n\" + \"=\"*80)\n", " print(f\"DATASET : {name}\")\n", " print(\"=\"*80)\n", " print(\"\\nShape :\", df.shape)\n", " print(\"\\nColumns :\", df.columns.tolist())\n", " print(\"\\nDtypes :\\n\", df.dtypes)\n", " print(\"\\nMissing values (%) :\\n\", df.isna().mean().sort_values(ascending=False)*100)\n", " print(\"\\nSample rows:\\n\", df.head(5))\n", " print(\"\\nUnique values per column:\\n\", df.nunique().sort_values(ascending=False))" ] }, { "cell_type": "code", "execution_count": 7, "id": "e104a416-4cfd-43b9-b9ec-6af1fce700da", "metadata": {}, "outputs": [], "source": [ "import os\n", "import s3fs\n", "os.environ[\"AWS_ACCESS_KEY_ID\"] = 'R9J6QLRZDYWLSWKBJ7IA'\n", "os.environ[\"AWS_SECRET_ACCESS_KEY\"] = 'sFi4J78xigTJEXdY02bQL2i5KNwyYL7VZpMe0XJs'\n", "os.environ[\"AWS_SESSION_TOKEN\"] = 'eyJhbGciOiJIUzUxMiIsInR5cCI6IkpXVCJ9.eyJhY2Nlc3NLZXkiOiJSOUo2UUxSWkRZV0xTV0tCSjdJQSIsImFjciI6IjAiLCJhbGxvd2VkLW9yaWdpbnMiOlsiKiJdLCJhdWQiOlsibWluaW8iLCJhY2NvdW50Il0sImF1dGhfdGltZSI6MTc2NDM0Mzc0MSwiYXpwIjoib255eGlhLW1pbmlvIiwiZW1haWwiOiJzYXJhaC50aG91bXlyZUBlbnNhZS5mciIsImVtYWlsX3ZlcmlmaWVkIjp0cnVlLCJleHAiOjE3NjU1NTM4NjYsImZhbWlseV9uYW1lIjoiVEhPVU1ZUkUiLCJnaXZlbl9uYW1lIjoiU2FyYWgiLCJncm91cHMiOlsiYmRjLWRhdGEiLCJiZGMtY2FybWlnbmFjLWczIl0sImlhdCI6MTc2NDM0NDI2NiwiaXNzIjoiaHR0cHM6Ly9hdXRoLmdyb3VwZS1nZW5lcy5mci9yZWFsbXMvZ2VuZXMiLCJqdGkiOiIxZWEzOTgwMi0zZGNiLTRlMzEtYTkwMS1jNGE4M2Q5ZjQyYzQiLCJuYW1lIjoiU2FyYWggVEhPVU1ZUkUiLCJwb2xpY3kiOiJzdHNvbmx5IiwicHJlZmVycmVkX3VzZXJuYW1lIjoic3Rob3VteXJlLWVuc2FlIiwicmVhbG1fYWNjZXNzIjp7InJvbGVzIjpbIm9mZmxpbmVfYWNjZXNzIiwiZGVmYXVsdC1yb2xlcy1nZW5lcyIsInVtYV9hdXRob3JpemF0aW9uIl19LCJyZXNvdXJjZV9hY2Nlc3MiOnsiYWNjb3VudCI6eyJyb2xlcyI6WyJtYW5hZ2UtYWNjb3VudCIsIm1hbmFnZS1hY2NvdW50LWxpbmtzIiwidmlldy1wcm9maWxlIl19fSwic2NvcGUiOiJvcGVuaWQgcHJvZmlsZSBlbWFpbCIsInNpZCI6IjZlMDBhN2QxLTMxYWQtNGE3Ny04ZmE3LTBlYTc1Y2VhZTQwMCIsInN1YiI6ImVhYWVkN2QyLWM4MjYtNGIxNC05MzczLTYwYjNhODhlMWFiNiIsInR5cCI6IkJlYXJlciJ9.gUxGEq4iWA5eJVWYj-JyDZGYzTnWmQH92iYU-2_5P4n3erGFFZvz8wJe5keTAhcKKTycZKiWltnZpCNsRQ0vOg'\n", "os.environ[\"AWS_DEFAULT_REGION\"] = 'us-east-1'\n", "fs = s3fs.S3FileSystem(\n", " client_kwargs={'endpoint_url': 'https://'+'minio-simple.lab.groupe-genes.fr'},\n", " key = os.environ[\"AWS_ACCESS_KEY_ID\"], \n", " secret = os.environ[\"AWS_SECRET_ACCESS_KEY\"], \n", " token = os.environ[\"AWS_SESSION_TOKEN\"])" ] }, { "cell_type": "code", "execution_count": 8, "id": "e67a99ea-ddf4-4627-8f48-ec183c671acb", "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "/tmp/ipykernel_1219/19230119.py:2: DtypeWarning: Columns (0,1,2,3) have mixed types. Specify dtype option on import or set low_memory=False.\n", " flows = pd.read_csv(f, sep=\";\")\n", "/tmp/ipykernel_1219/19230119.py:5: DtypeWarning: Columns (0,1,2,3) have mixed types. Specify dtype option on import or set low_memory=False.\n", " stocks = pd.read_csv(f, sep=\";\")\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "\n", "================================================================================\n", "DATASET : STOCKS\n", "================================================================================\n", "\n", "Shape : (4880297, 18)\n", "\n", "Columns : ['Agreement - Code', 'Company - Id', 'Company - Ultimate Parent Id', 'Registrar Account - ID', 'Registrar Account - Region', 'RegistrarAccount - Country', 'Product - Asset Type', 'Product - Strategy', 'Product - Legal Status', 'Product - Is Dedie ?', 'Product - Fund', 'Product - Shareclass Type', 'Product - Shareclass Currency', 'Product - Isin', 'Centralisation Date', 'Quantity - AUM', 'Value - AUM CCY', 'Value - AUM €']\n", "\n", "Dtypes :\n", " Agreement - Code object\n", "Company - Id object\n", "Company - Ultimate Parent Id object\n", "Registrar Account - ID object\n", "Registrar Account - Region object\n", "RegistrarAccount - Country object\n", "Product - Asset Type object\n", "Product - Strategy object\n", "Product - Legal Status object\n", "Product - Is Dedie ? object\n", "Product - Fund object\n", "Product - Shareclass Type object\n", "Product - Shareclass Currency object\n", "Product - Isin object\n", "Centralisation Date object\n", "Quantity - AUM float64\n", "Value - AUM CCY float64\n", "Value - AUM € float64\n", "dtype: object\n", "\n", "Missing values (%) :\n", " Product - Asset Type 6.471553\n", "Company - Id 2.330801\n", "Company - Ultimate Parent Id 2.330801\n", "Product - Strategy 0.001537\n", "Product - Shareclass Type 0.000717\n", "Agreement - Code 0.000000\n", "RegistrarAccount - Country 0.000000\n", "Registrar Account - Region 0.000000\n", "Product - Legal Status 0.000000\n", "Registrar Account - ID 0.000000\n", "Product - Is Dedie ? 0.000000\n", "Product - Fund 0.000000\n", "Product - Shareclass Currency 0.000000\n", "Product - Isin 0.000000\n", "Centralisation Date 0.000000\n", "Quantity - AUM 0.000000\n", "Value - AUM CCY 0.000000\n", "Value - AUM € 0.000000\n", "dtype: float64\n", "\n", "Sample rows:\n", " Agreement - Code Company - Id Company - Ultimate Parent Id \\\n", "0 3 166.0 166.0 \n", "1 3 166.0 166.0 \n", "2 3 166.0 166.0 \n", "3 3 166.0 166.0 \n", "4 3 166.0 166.0 \n", "\n", " Registrar Account - ID Registrar Account - Region \\\n", "0 200000647 France \n", "1 200000647 France \n", "2 200000647 France \n", "3 200000647 France \n", "4 200000647 France \n", "\n", " RegistrarAccount - Country Product - Asset Type Product - Strategy \\\n", "0 France Diversified Patrimoine \n", "1 France Diversified Patrimoine \n", "2 France Diversified Patrimoine \n", "3 France Diversified Patrimoine \n", "4 France Diversified Patrimoine \n", "\n", " Product - Legal Status Product - Is Dedie ? Product - Fund \\\n", "0 FCP NO Carmignac Patrimoine \n", "1 FCP NO Carmignac Patrimoine \n", "2 FCP NO Carmignac Patrimoine \n", "3 FCP NO Carmignac Patrimoine \n", "4 FCP NO Carmignac Patrimoine \n", "\n", " Product - Shareclass Type Product - Shareclass Currency Product - Isin \\\n", "0 A EUR FR0010135103 \n", "1 A EUR FR0010135103 \n", "2 A EUR FR0010135103 \n", "3 A EUR FR0010135103 \n", "4 A EUR FR0010135103 \n", "\n", " Centralisation Date Quantity - AUM Value - AUM CCY Value - AUM € \n", "0 2015-03-31 35.368 24648.6666 24648.6666 \n", "1 2015-11-30 35.368 22413.0553 22413.0553 \n", "2 2015-12-31 35.368 22051.2406 22051.2406 \n", "3 2016-03-31 35.368 21626.1173 21626.1173 \n", "4 2016-11-30 35.368 22489.4502 22489.4502 \n", "\n", "Unique values per column:\n", " Value - AUM € 1697923\n", "Value - AUM CCY 1689620\n", "Quantity - AUM 554404\n", "Registrar Account - ID 15532\n", "Agreement - Code 2521\n", "Company - Id 1970\n", "Company - Ultimate Parent Id 1392\n", "Product - Isin 491\n", "Centralisation Date 130\n", "Product - Fund 74\n", "Product - Strategy 52\n", "RegistrarAccount - Country 39\n", "Registrar Account - Region 15\n", "Product - Shareclass Type 11\n", "Product - Legal Status 6\n", "Product - Shareclass Currency 6\n", "Product - Asset Type 5\n", "Product - Is Dedie ? 2\n", "dtype: int64\n", "\n", "================================================================================\n", "DATASET : FLOWS\n", "================================================================================\n", "\n", "Shape : (2574461, 24)\n", "\n", "Columns : ['Agreement - Code', 'Company - Id', 'Company - Ultimate Parent Id', 'Registrar Account - ID', 'Registrar Account - Region', 'RegistrarAccount - Country', 'Product - Asset Type', 'Product - Strategy', 'Product - Legal Status', 'Product - Is Dedie ?', 'Product - Fund', 'Product - Shareclass Type', 'Product - Shareclass Currency', 'Product - Isin', 'Centralisation Date', 'Quantity - Subscription', 'Quantity - Redemption', 'Quantity - NetFlows', 'Value Ccy - Subscription', 'Value Ccy - Redemption', 'Value Ccy - NetFlows', 'Value € - Subscription', 'Value € - Redemption', 'Value € - NetFlows']\n", "\n", "Dtypes :\n", " Agreement - Code object\n", "Company - Id object\n", "Company - Ultimate Parent Id object\n", "Registrar Account - ID object\n", "Registrar Account - Region object\n", "RegistrarAccount - Country object\n", "Product - Asset Type object\n", "Product - Strategy object\n", "Product - Legal Status object\n", "Product - Is Dedie ? object\n", "Product - Fund object\n", "Product - Shareclass Type object\n", "Product - Shareclass Currency object\n", "Product - Isin object\n", "Centralisation Date object\n", "Quantity - Subscription float64\n", "Quantity - Redemption float64\n", "Quantity - NetFlows float64\n", "Value Ccy - Subscription float64\n", "Value Ccy - Redemption float64\n", "Value Ccy - NetFlows float64\n", "Value € - Subscription float64\n", "Value € - Redemption float64\n", "Value € - NetFlows float64\n", "dtype: object\n", "\n", "Missing values (%) :\n", " Product - Asset Type 0.079589\n", "Company - Id 0.059818\n", "Company - Ultimate Parent Id 0.059818\n", "Product - Strategy 0.000233\n", "Product - Shareclass Type 0.000078\n", "Registrar Account - ID 0.000000\n", "RegistrarAccount - Country 0.000000\n", "Agreement - Code 0.000000\n", "Registrar Account - Region 0.000000\n", "Product - Legal Status 0.000000\n", "Product - Is Dedie ? 0.000000\n", "Product - Fund 0.000000\n", "Product - Shareclass Currency 0.000000\n", "Product - Isin 0.000000\n", "Centralisation Date 0.000000\n", "Quantity - Subscription 0.000000\n", "Quantity - Redemption 0.000000\n", "Quantity - NetFlows 0.000000\n", "Value Ccy - Subscription 0.000000\n", "Value Ccy - Redemption 0.000000\n", "Value Ccy - NetFlows 0.000000\n", "Value € - Subscription 0.000000\n", "Value € - Redemption 0.000000\n", "Value € - NetFlows 0.000000\n", "dtype: float64\n", "\n", "Sample rows:\n", " Agreement - Code Company - Id Company - Ultimate Parent Id \\\n", "0 003 166 166 \n", "1 003 166 166 \n", "2 003 166 166 \n", "3 003 166 166 \n", "4 003 166 166 \n", "\n", " Registrar Account - ID Registrar Account - Region \\\n", "0 200127202 France \n", "1 406533 France \n", "2 406533 France \n", "3 406533 France \n", "4 406533 France \n", "\n", " RegistrarAccount - Country Product - Asset Type Product - Strategy \\\n", "0 France Equity Investissement \n", "1 France Diversified Patrimoine \n", "2 France Equity Investissement \n", "3 France Equity Investissement \n", "4 France Equity Investissement \n", "\n", " Product - Legal Status Product - Is Dedie ? ... Centralisation Date \\\n", "0 SICAV NO ... 2020-11-05 \n", "1 FCP NO ... 2015-03-09 \n", "2 FCP NO ... 2016-10-26 \n", "3 FCP NO ... 2018-10-18 \n", "4 FCP NO ... 2019-04-08 \n", "\n", " Quantity - Subscription Quantity - Redemption Quantity - NetFlows \\\n", "0 1636.00 0.000 1636.000 \n", "1 144.69 0.000 144.690 \n", "2 0.00 -8.321 -8.321 \n", "3 0.00 -22.083 -22.083 \n", "4 0.00 -465.992 -465.992 \n", "\n", " Value Ccy - Subscription Value Ccy - Redemption Value Ccy - NetFlows \\\n", "0 280983.00 0.00 280983.00 \n", "1 99985.13 0.00 99985.13 \n", "2 0.00 -9384.76 -9384.76 \n", "3 0.00 -25227.40 -25227.40 \n", "4 0.00 -563775.76 -563775.76 \n", "\n", " Value € - Subscription Value € - Redemption Value € - NetFlows \n", "0 280983.00 0.00 280983.00 \n", "1 99985.13 0.00 99985.13 \n", "2 0.00 -9384.76 -9384.76 \n", "3 0.00 -25227.40 -25227.40 \n", "4 0.00 -563775.76 -563775.76 \n", "\n", "[5 rows x 24 columns]\n", "\n", "Unique values per column:\n", " Value € - NetFlows 2018916\n", "Value Ccy - NetFlows 1972319\n", "Value € - Redemption 1323531\n", "Value Ccy - Redemption 1296468\n", "Value € - Subscription 955890\n", "Value Ccy - Subscription 926633\n", "Quantity - NetFlows 667586\n", "Quantity - Redemption 374378\n", "Quantity - Subscription 359661\n", "Registrar Account - ID 9805\n", "Centralisation Date 2780\n", "Company - Id 1929\n", "Agreement - Code 1626\n", "Company - Ultimate Parent Id 1283\n", "Product - Isin 474\n", "Product - Fund 70\n", "Product - Strategy 49\n", "RegistrarAccount - Country 34\n", "Registrar Account - Region 15\n", "Product - Shareclass Type 10\n", "Product - Shareclass Currency 6\n", "Product - Legal Status 6\n", "Product - Asset Type 5\n", "Product - Is Dedie ? 2\n", "dtype: int64\n", "\n", "================================================================================\n", "DATASET : NAV/PRICES\n", "================================================================================\n", "\n", "Shape : (30333, 13)\n", "\n", "Columns : ['NavDate', 'LegalForm', 'Cod', 'PortfolioName', 'PTFCurrency', 'PortfolioAum_Eur', 'ShareClassIsin', 'ShareClassName', 'ShareClassCurrency', 'ShareClassPrice', 'NumberOfShares', 'ShareClassAumLocalCur', 'ShareClassAum_EUR']\n", "\n", "Dtypes :\n", " 0\n", "NavDate object\n", "LegalForm object\n", "Cod object\n", "PortfolioName object\n", "PTFCurrency object\n", "PortfolioAum_Eur object\n", "ShareClassIsin object\n", "ShareClassName object\n", "ShareClassCurrency object\n", "ShareClassPrice object\n", "NumberOfShares object\n", "ShareClassAumLocalCur object\n", "ShareClassAum_EUR object\n", "dtype: object\n", "\n", "Missing values (%) :\n", " 0\n", "NavDate 0.0\n", "LegalForm 0.0\n", "Cod 0.0\n", "PortfolioName 0.0\n", "PTFCurrency 0.0\n", "PortfolioAum_Eur 0.0\n", "ShareClassIsin 0.0\n", "ShareClassName 0.0\n", "ShareClassCurrency 0.0\n", "ShareClassPrice 0.0\n", "NumberOfShares 0.0\n", "ShareClassAumLocalCur 0.0\n", "ShareClassAum_EUR 0.0\n", "dtype: float64\n", "\n", "Sample rows:\n", " 0 NavDate LegalForm Cod PortfolioName \\\n", "0 31/12/2009 SICAV CC Carmignac Portfolio Climate Transition \n", "1 31/12/2009 SICAV CFB Carmignac Portfolio Flexible Bond \n", "2 31/12/2009 FCP CCT Carmignac Court Terme \n", "3 31/12/2009 FCP CE Carmignac Emergents \n", "4 31/12/2009 SICAV CAD Carmignac Portfolio Asia Discovery \n", "\n", "0 PTFCurrency PortfolioAum_Eur ShareClassIsin ShareClassName \\\n", "0 EUR 941059600 LU0164455502 A EUR ACC \n", "1 EUR 57063272.31 LU0336084032 A EUR ACC \n", "2 EUR 788828666.5 FR0010149161 A EUR ACC \n", "3 EUR 1508087050 FR0010149302 A EUR ACC \n", "4 EUR 149490224.2 LU0336083810 A EUR ACC \n", "\n", "0 ShareClassCurrency ShareClassPrice NumberOfShares ShareClassAumLocalCur \\\n", "0 EUR 287.21 3276555.83 941059600 \n", "1 EUR 1016.833 56118.62745 57063272.31 \n", "2 EUR 3687.84 213899.9161 788828666.5 \n", "3 EUR 559.82 2693878.478 1508087050 \n", "4 EUR 884.9 168934.5962 149490224.2 \n", "\n", "0 ShareClassAum_EUR \n", "0 941059600 \n", "1 57063272.31 \n", "2 788828666.5 \n", "3 1508087050 \n", "4 149490224.2 \n", "\n", "Unique values per column:\n", " 0\n", "ShareClassAum_EUR 30211\n", "ShareClassAumLocalCur 30032\n", "NumberOfShares 28910\n", "ShareClassPrice 14747\n", "PortfolioAum_Eur 5505\n", "ShareClassIsin 416\n", "NavDate 210\n", "ShareClassName 90\n", "Cod 55\n", "PortfolioName 55\n", "LegalForm 6\n", "ShareClassCurrency 6\n", "PTFCurrency 2\n", "dtype: int64\n" ] } ], "source": [ "with fs.open('projet-bdc-data//carmignac/Flows ENSAE V2 -20251105.csv', 'rb') as f:\n", " flows = pd.read_csv(f, sep=\";\")\n", "\n", "with fs.open('projet-bdc-data//carmignac/AUM ENSAE V2 -20251105.csv', 'rb') as f:\n", " stocks = pd.read_csv(f, sep=\";\")\n", "\n", "with fs.open('projet-bdc-data/carmignac/Monthly AUM and NAV since 2010.xlsx', 'rb') as f:\n", " nav_raw = pd.read_excel(f, header=None, engine=\"openpyxl\")\n", "nav = nav_raw[0].str.split(\",\", expand=True)\n", "nav.columns = nav.iloc[0]\n", "nav = nav[1:].reset_index(drop=True)\n", "\n", "quick_info(stocks, \"STOCKS\")\n", "quick_info(flows, \"FLOWS\")\n", "quick_info(nav, \"NAV/PRICES\")" ] }, { "cell_type": "code", "execution_count": 9, "id": "9bc92c9f-216c-475e-bfb8-edc1a4e839f6", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Date conversion done.\n", "NAV numeric conversion done.\n", "String normalization done.\n", "\n", "ISIN missing in FLOWS but present in STOCKS : 17\n", "\n", "ISIN missing in STOCKS but present in FLOWS : 0\n", "\n", "ISIN missing in NAV but present in FLOWS : 67\n", "\n", "ISIN missing in NAV but present in STOCKS : 76\n", "\n", "Accounts in STOCKS but NEVER in FLOWS : 5777\n", "\n", "Accounts in FLOWS but NEVER in STOCKS : 118\n", "\n", "CLIENT BEHAVIOR (first 5 rows):\n", " Registrar Account - ID n_days n_transactions total_netflows mean_flow \\\n", "0 100000028 3 3 -109.238 -36.412667 \n", "1 100000042 1 1 -660.115 -660.115000 \n", "2 100000065 1 1 -174.646 -174.646000 \n", "3 100000069 65 73 -7479.755 -102.462397 \n", "4 100000073 1 1 -133.402 -133.402000 \n", "\n", " std_flow total_subscription total_redemption churn_ratio \n", "0 49.280511 0.000 -109.238 -1.092380e+11 \n", "1 NaN 0.000 -660.115 -6.601150e+11 \n", "2 NaN 0.000 -174.646 -1.746460e+11 \n", "3 2168.971331 33320.402 -40800.157 -1.224480e+00 \n", "4 NaN 0.000 -133.402 -1.334020e+11 \n", "\n", "FUND BEHAVIOR (first 5 rows):\n", " Product - Isin n_accounts n_days total_netflows vol_flows\n", "0 FR0010135103 2690 2723 -2.571327e+07 2622.609244\n", "1 FR0010147603 733 2719 -2.562187e+06 1206.248205\n", "2 FR0010148981 1841 2722 -3.609440e+06 1051.069183\n", "3 FR0010148999 454 2306 -7.130297e+05 1265.364138\n", "4 FR0010149112 934 2000 -9.438901e+05 1834.961721\n" ] } ], "source": [ "# 1. CLEAN DATES (formats différents)\n", "\n", "stocks[\"Centralisation Date\"] = pd.to_datetime(stocks[\"Centralisation Date\"], errors=\"coerce\")\n", "flows[\"Centralisation Date\"] = pd.to_datetime(flows[\"Centralisation Date\"], errors=\"coerce\")\n", "nav[\"NavDate\"] = pd.to_datetime(nav[\"NavDate\"], format=\"%d/%m/%Y\", errors=\"coerce\")\n", "\n", "print(\"Date conversion done.\")\n", "\n", "# 2. CLEAN NUMERIC COLUMNS FOR NAV FILE\n", "\n", "num_cols = [\"PortfolioAum_Eur\",\"ShareClassPrice\",\"NumberOfShares\",\n", " \"ShareClassAumLocalCur\",\"ShareClassAum_EUR\"]\n", "\n", "for col in num_cols:\n", " nav[col] = (\n", " nav[col]\n", " .astype(str)\n", " .str.replace(\",\", \".\", regex=False)\n", " .str.replace(\" \", \"\")\n", " .astype(float)\n", " )\n", "\n", "print(\"NAV numeric conversion done.\")\n", "\n", "# 3. STANDARDIZE STRINGS FOR JOIN KEYS\n", "\n", "def norm(df):\n", " for col in df.columns:\n", " if df[col].dtype == \"object\":\n", " df[col] = df[col].astype(str).str.strip().str.upper()\n", " return df\n", "\n", "stocks = norm(stocks)\n", "flows = norm(flows)\n", "nav = norm(nav)\n", "\n", "print(\"String normalization done.\")\n", "\n", "\n", "# 4. ANALYSE RELATIONS ACROSS FILES\n", "\n", "# Unique sets\n", "isin_stocks = set(stocks[\"Product - Isin\"].unique())\n", "isin_flows = set(flows[\"Product - Isin\"].unique())\n", "isin_nav = set(nav[\"ShareClassIsin\"].unique())\n", "\n", "print(\"\\nISIN missing in FLOWS but present in STOCKS :\", len(isin_stocks - isin_flows))\n", "print(\"\\nISIN missing in STOCKS but present in FLOWS :\", len(isin_flows - isin_stocks))\n", "print(\"\\nISIN missing in NAV but present in FLOWS :\", len(isin_flows - isin_nav))\n", "print(\"\\nISIN missing in NAV but present in STOCKS :\", len(isin_stocks - isin_nav))\n", "\n", "\n", "# 5. CLIENTS: STOCKS VS FLOWS\n", "\n", "acc_stocks = set(stocks[\"Registrar Account - ID\"].unique())\n", "acc_flows = set(flows[\"Registrar Account - ID\"].unique())\n", "\n", "print(\"\\nAccounts in STOCKS but NEVER in FLOWS :\", len(acc_stocks - acc_flows))\n", "print(\"\\nAccounts in FLOWS but NEVER in STOCKS :\", len(acc_flows - acc_stocks))\n", "\n", "\n", "# 6. CLIENT ACTIVITY METRICS (DETAILED)\n", "\n", "client_behavior = flows.groupby(\"Registrar Account - ID\").agg(\n", " n_days=(\"Centralisation Date\", lambda x: x.nunique()),\n", " n_transactions=(\"Quantity - NetFlows\", \"count\"),\n", " total_netflows=(\"Quantity - NetFlows\", \"sum\"),\n", " mean_flow=(\"Quantity - NetFlows\", \"mean\"),\n", " std_flow=(\"Quantity - NetFlows\", \"std\"),\n", " total_subscription=(\"Quantity - Subscription\", \"sum\"),\n", " total_redemption=(\"Quantity - Redemption\", \"sum\")\n", ").reset_index()\n", "\n", "# Add churn metric\n", "client_behavior[\"churn_ratio\"] = (\n", " client_behavior[\"total_redemption\"] /\n", " (client_behavior[\"total_subscription\"] + 1e-9)\n", ")\n", "\n", "print(\"\\nCLIENT BEHAVIOR (first 5 rows):\\n\", client_behavior.head())\n", "\n", "\n", "# 7. FUNDS ACTIVITY METRICS\n", "\n", "fund_behavior = flows.groupby(\"Product - Isin\").agg(\n", " n_accounts=(\"Registrar Account - ID\", \"nunique\"),\n", " n_days=(\"Centralisation Date\", lambda x: x.nunique()),\n", " total_netflows=(\"Quantity - NetFlows\", \"sum\"),\n", " vol_flows=(\"Quantity - NetFlows\", \"std\")\n", ").reset_index()\n", "\n", "print(\"\\nFUND BEHAVIOR (first 5 rows):\\n\", fund_behavior.head())\n", "\n", "\n", "# 8. SAVE INTERMEDIATE\n", "\n", "client_behavior.to_csv(\"client_behavior.csv\", index=False)\n", "fund_behavior.to_csv(\"fund_behavior.csv\", index=False)" ] }, { "cell_type": "code", "execution_count": 10, "id": "afb51598-3a7b-41f2-8d25-5b4b8bfb1c8a", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "FULL usable ISIN : 407\n", "Stocks only ISIN : 17\n", "Flows only ISIN : 0\n", "Missing NAV : 76\n", "All ISIN groups saved into 4 separate files.\n" ] } ], "source": [ "valid_full = isin_stocks & isin_flows & isin_nav\n", "stocks_only = isin_stocks - isin_flows\n", "flows_only = isin_flows - isin_stocks\n", "missing_nav = (isin_stocks | isin_flows) - isin_nav\n", "\n", "print(\"FULL usable ISIN :\", len(valid_full))\n", "print(\"Stocks only ISIN :\", len(stocks_only))\n", "print(\"Flows only ISIN :\", len(flows_only))\n", "print(\"Missing NAV :\", len(missing_nav))\n", "\n", "pd.DataFrame({\"isin\": list(valid_full)}).to_csv(\"isin_full.csv\", index=False)\n", "pd.DataFrame({\"isin\": list(stocks_only)}).to_csv(\"isin_stocks_only.csv\", index=False)\n", "pd.DataFrame({\"isin\": list(flows_only)}).to_csv(\"isin_flows_only.csv\", index=False)\n", "pd.DataFrame({\"isin\": list(missing_nav)}).to_csv(\"isin_missing_nav.csv\", index=False)\n", "\n", "print(\"All ISIN groups saved into 4 separate files.\")\n", "\n" ] }, { "cell_type": "code", "execution_count": 11, "id": "61e0c71a-a1c6-4ed8-ba15-b7a9badc4d4a", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ " Registrar Account - ID n_days n_transactions total_netflows mean_flow \\\n", "0 100000028 3 3 -109.238 -36.412667 \n", "1 100000042 1 1 -660.115 -660.115000 \n", "2 100000065 1 1 -174.646 -174.646000 \n", "3 100000069 65 73 -7479.755 -102.462397 \n", "4 100000073 1 1 -133.402 -133.402000 \n", "\n", " std_flow total_subscription total_redemption churn_ratio \\\n", "0 49.280511 0.000 -109.238 -1.092380e+08 \n", "1 NaN 0.000 -660.115 -6.601150e+08 \n", "2 NaN 0.000 -174.646 -1.746460e+08 \n", "3 2168.971331 33320.402 -40800.157 -1.224480e+00 \n", "4 NaN 0.000 -133.402 -1.334020e+08 \n", "\n", " churn_flag activity_score flow_volatility inertia_ratio \n", "0 0 1.386294 49.280511 0.998921 \n", "1 0 0.693147 0.000000 0.999640 \n", "2 0 0.693147 0.000000 0.999640 \n", "3 0 4.304065 2168.971331 0.976619 \n", "4 0 0.693147 0.000000 0.999640 \n" ] } ], "source": [ "eps = 1e-6\n", "\n", "client_behavior[\"churn_ratio\"] = (\n", " client_behavior[\"total_redemption\"] /\n", " (client_behavior[\"total_subscription\"] + eps)\n", ")\n", "\n", "client_behavior[\"churn_flag\"] = (\n", " client_behavior[\"total_redemption\"] > client_behavior[\"total_subscription\"]\n", ").astype(int)\n", "\n", "client_behavior[\"activity_score\"] = np.log1p(client_behavior[\"n_transactions\"])\n", "\n", "client_behavior[\"flow_volatility\"] = client_behavior[\"std_flow\"].fillna(0)\n", "\n", "client_behavior[\"inertia_ratio\"] = (\n", " 1 - client_behavior[\"n_days\"] / flows[\"Centralisation Date\"].nunique()\n", ")\n", "\n", "print(client_behavior.head())\n", "\n", "client_behavior.to_csv(\"client_behavior_clean.csv\", index=False)\n" ] }, { "cell_type": "code", "execution_count": 12, "id": "8ee7e911-eb73-4846-b545-661140411c1b", "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "/tmp/ipykernel_1219/1645623303.py:17: RuntimeWarning: invalid value encountered in scalar divide\n", " .apply(lambda x: x[\"Value - AUM €\"].max() / x[\"Value - AUM €\"].sum()) \\\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ " Registrar Account - ID n_isin_held n_funds_held n_asset_types \\\n", "0 100000014 1 1 1 \n", "1 100000016 2 2 2 \n", "2 100000028 1 1 1 \n", "3 100000038 3 3 2 \n", "4 100000042 1 1 1 \n", "\n", " n_strategies total_aum median_aum concentration_ratio \n", "0 1 0.0000 0.0 NaN \n", "1 2 0.0000 0.0 NaN \n", "2 1 126236.2184 0.0 1.0 \n", "3 3 0.0000 0.0 NaN \n", "4 1 446362.9015 0.0 1.0 \n", " n_isin_held n_funds_held n_asset_types n_strategies total_aum \\\n", "count 12501.000000 12501.000000 12501.000000 12501.000000 1.250100e+04 \n", "mean 5.514759 4.408367 2.082473 4.109271 4.218474e+08 \n", "std 10.434698 5.472756 1.254048 4.714800 5.618341e+09 \n", "min 1.000000 1.000000 1.000000 1.000000 -2.586805e+08 \n", "25% 1.000000 1.000000 1.000000 1.000000 0.000000e+00 \n", "50% 2.000000 2.000000 2.000000 2.000000 2.587605e+05 \n", "75% 6.000000 5.000000 3.000000 5.000000 8.817014e+06 \n", "max 469.000000 67.000000 6.000000 48.000000 4.780234e+11 \n", "\n", " median_aum concentration_ratio \n", "count 1.250100e+04 7708.000000 \n", "mean 2.573991e+05 0.790503 \n", "std 3.487976e+06 0.261535 \n", "min -2.317333e+06 -2.591840 \n", "25% 0.000000e+00 0.576503 \n", "50% 0.000000e+00 0.972159 \n", "75% 1.474502e+02 1.000000 \n", "max 2.215373e+08 2.983529 \n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "/tmp/ipykernel_1219/1645623303.py:17: FutureWarning: DataFrameGroupBy.apply operated on the grouping columns. This behavior is deprecated, and in a future version of pandas the grouping columns will be excluded from the operation. Either pass `include_groups=False` to exclude the groupings or explicitly select the grouping columns after groupby to silence this warning.\n", " .apply(lambda x: x[\"Value - AUM €\"].max() / x[\"Value - AUM €\"].sum()) \\\n" ] } ], "source": [ "# Diversification per account\n", "account_div = stocks.groupby(\"Registrar Account - ID\").agg(\n", " n_isin_held=(\"Product - Isin\", \"nunique\"),\n", " n_funds_held=(\"Product - Fund\", \"nunique\"),\n", " n_asset_types=(\"Product - Asset Type\", \"nunique\"),\n", " n_strategies=(\"Product - Strategy\", \"nunique\"),\n", " total_aum=(\"Value - AUM €\", \"sum\"),\n", " median_aum=(\"Value - AUM €\", \"median\")\n", ").reset_index()\n", "\n", "# Concentration ratio per account\n", "aum_by_account_fund = stocks.groupby(\n", " [\"Registrar Account - ID\", \"Product - Fund\"]\n", ")[\"Value - AUM €\"].sum().reset_index()\n", "\n", "concentration = aum_by_account_fund.groupby(\"Registrar Account - ID\") \\\n", " .apply(lambda x: x[\"Value - AUM €\"].max() / x[\"Value - AUM €\"].sum()) \\\n", " .reset_index(name=\"concentration_ratio\")\n", "\n", "# Merge diversification + concentration\n", "account_static = account_div.merge(concentration, on=\"Registrar Account - ID\", how=\"left\")\n", "\n", "print(account_static.head())\n", "print(account_static.describe())\n" ] }, { "cell_type": "code", "execution_count": 13, "id": "76f6fa0d-9d7a-4145-af1c-986d83947f91", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ " Registrar Account - ID country region\n", "0 100000014 SWITZERLAND SWITZERLAND\n", "1 100000016 UNITED KINGDOM UNITED KINGDOM\n", "2 100000028 UNITED KINGDOM UNITED KINGDOM\n", "3 100000038 SWITZERLAND SWITZERLAND\n", "4 100000042 UNITED KINGDOM UNITED KINGDOM\n" ] } ], "source": [ "# Geographic info per account\n", "geo = stocks.groupby(\"Registrar Account - ID\").agg(\n", " country=(\"RegistrarAccount - Country\", lambda x: x.mode()[0]),\n", " region=(\"Registrar Account - Region\", lambda x: x.mode()[0])\n", ").reset_index()\n", "\n", "print(geo.head())\n" ] }, { "cell_type": "code", "execution_count": 14, "id": "e9bb67ab-9029-4ace-b960-b3d6e0b8683c", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ " Registrar Account - ID n_days n_transactions total_netflows mean_flow \\\n", "0 100000028 3 3 -109.238 -36.412667 \n", "1 100000042 1 1 -660.115 -660.115000 \n", "2 100000065 1 1 -174.646 -174.646000 \n", "3 100000069 65 73 -7479.755 -102.462397 \n", "4 100000073 1 1 -133.402 -133.402000 \n", "\n", " std_flow total_subscription total_redemption churn_ratio \\\n", "0 49.280511 0.000 -109.238 -1.092380e+08 \n", "1 NaN 0.000 -660.115 -6.601150e+08 \n", "2 NaN 0.000 -174.646 -1.746460e+08 \n", "3 2168.971331 33320.402 -40800.157 -1.224480e+00 \n", "4 NaN 0.000 -133.402 -1.334020e+08 \n", "\n", " churn_flag ... n_funds_held n_asset_types n_strategies total_aum \\\n", "0 0 ... 1.0 1.0 1.0 126236.2184 \n", "1 0 ... 1.0 1.0 1.0 446362.9015 \n", "2 0 ... 1.0 1.0 1.0 488743.4240 \n", "3 0 ... 0.0 0.0 0.0 NaN \n", "4 0 ... 2.0 2.0 2.0 373322.8948 \n", "\n", " median_aum concentration_ratio country region \\\n", "0 0.0 1.0 UNITED KINGDOM UNITED KINGDOM \n", "1 0.0 1.0 UNITED KINGDOM UNITED KINGDOM \n", "2 0.0 1.0 UNITED KINGDOM UNITED KINGDOM \n", "3 NaN NaN UNKNOWN UNKNOWN \n", "4 0.0 1.0 UNITED KINGDOM UNITED KINGDOM \n", "\n", " log_total_aum log_median_aum \n", "0 11.745918 0.0 \n", "1 13.008890 0.0 \n", "2 13.099595 0.0 \n", "3 NaN NaN \n", "4 12.830202 0.0 \n", "\n", "[5 rows x 24 columns]\n", " Registrar Account - ID n_days n_transactions total_netflows \\\n", "count 6842 6842.000000 6842.000000 6.842000e+03 \n", "unique 6842 NaN NaN NaN \n", "top 100000028 NaN NaN NaN \n", "freq 1 NaN NaN NaN \n", "mean NaN 122.731804 376.273166 3.426362e+04 \n", "std NaN 363.726141 1972.817028 1.644676e+06 \n", "min NaN 1.000000 1.000000 -9.982301e+06 \n", "25% NaN 2.000000 2.000000 -3.448661e+02 \n", "50% NaN 5.000000 6.000000 -1.116000e+00 \n", "75% NaN 27.000000 42.000000 4.220087e+01 \n", "max NaN 2715.000000 53314.000000 1.319043e+08 \n", "\n", " mean_flow std_flow total_subscription total_redemption \\\n", "count 6842.000000 5.696000e+03 6.842000e+03 6.842000e+03 \n", "unique NaN NaN NaN NaN \n", "top NaN NaN NaN NaN \n", "freq NaN NaN NaN NaN \n", "mean 420.503483 6.035653e+03 1.565831e+05 -1.223195e+05 \n", "std 15548.555778 7.652710e+04 4.204641e+06 2.600731e+06 \n", "min -333474.890000 0.000000e+00 -3.931320e+02 -2.069900e+08 \n", "25% -35.658375 5.643245e+01 0.000000e+00 -6.968600e+03 \n", "50% -0.067287 2.479222e+02 3.393685e+02 -4.493603e+02 \n", "75% 2.235534 9.869604e+02 6.000000e+03 -7.275400e+01 \n", "max 871531.706418 4.697263e+06 3.388942e+08 0.000000e+00 \n", "\n", " churn_ratio churn_flag ... n_funds_held n_asset_types \\\n", "count 6.842000e+03 6842.000000 ... 6842.000000 6842.000000 \n", "unique NaN NaN ... NaN NaN \n", "top NaN NaN ... NaN NaN \n", "freq NaN NaN ... NaN NaN \n", "mean -7.445486e+08 0.000146 ... 5.054224 2.114294 \n", "std 1.163193e+10 0.012090 ... 6.457703 1.342230 \n", "min -5.212597e+11 0.000000 ... 0.000000 0.000000 \n", "25% -2.279500e+06 0.000000 ... 1.000000 1.000000 \n", "50% -1.048786e+00 0.000000 ... 2.000000 2.000000 \n", "75% -9.333542e-01 0.000000 ... 7.000000 3.000000 \n", "max 3.225589e+01 1.000000 ... 67.000000 6.000000 \n", "\n", " n_strategies total_aum median_aum concentration_ratio \\\n", "count 6842.000000 6.724000e+03 6.724000e+03 6586.000000 \n", "unique NaN NaN NaN NaN \n", "top NaN NaN NaN NaN \n", "freq NaN NaN NaN NaN \n", "mean 4.631102 7.136776e+08 4.051506e+05 0.782903 \n", "std 5.493014 7.438256e+09 4.121668e+06 0.267426 \n", "min 0.000000 -9.151116e+07 -2.317333e+06 -2.591840 \n", "25% 1.000000 5.107611e+05 0.000000e+00 0.561855 \n", "50% 2.000000 4.221523e+06 0.000000e+00 0.965881 \n", "75% 6.000000 3.987106e+07 2.531545e+04 1.000000 \n", "max 48.000000 4.780234e+11 2.215373e+08 2.983529 \n", "\n", " country region log_total_aum log_median_aum \n", "count 6842 6842 6724.000000 6724.000000 \n", "unique 34 16 NaN NaN \n", "top FRANCE FRANCE NaN NaN \n", "freq 2631 2643 NaN NaN \n", "mean NaN NaN 15.046065 4.392450 \n", "std NaN NaN 4.320148 5.462132 \n", "min NaN NaN 0.000000 0.000000 \n", "25% NaN NaN 13.143657 0.000000 \n", "50% NaN NaN 15.255707 0.000000 \n", "75% NaN NaN 17.501160 10.139210 \n", "max NaN NaN 26.892926 19.216101 \n", "\n", "[11 rows x 24 columns]\n" ] } ], "source": [ "# 1. Merge behavior (flows) with static diversification (stocks)\n", "client_master = client_behavior.merge(\n", " account_static,\n", " on=\"Registrar Account - ID\",\n", " how=\"left\"\n", ")\n", "\n", "# 2. Add geographic info\n", "client_master = client_master.merge(\n", " geo,\n", " on=\"Registrar Account - ID\",\n", " how=\"left\"\n", ")\n", "\n", "# 3. Create additional engineered features\n", "client_master[\"log_total_aum\"] = np.log1p(client_master[\"total_aum\"].clip(lower=0))\n", "client_master[\"log_median_aum\"] = np.log1p(client_master[\"median_aum\"].clip(lower=0))\n", "\n", "\n", "# 4. Replace NaN flow volatility with 0 (inactive accounts)\n", "client_master[\"flow_volatility\"] = client_master[\"flow_volatility\"].fillna(0)\n", "\n", "# 5. Fill missing diversification metrics with 0 (for accounts without stocks)\n", "client_master[[\"n_isin_held\",\"n_funds_held\",\"n_asset_types\",\"n_strategies\"]] = \\\n", " client_master[[\"n_isin_held\",\"n_funds_held\",\"n_asset_types\",\"n_strategies\"]].fillna(0)\n", "\n", "# 6. Fill missing geography as “UNKNOWN”\n", "client_master[\"country\"] = client_master[\"country\"].fillna(\"UNKNOWN\")\n", "client_master[\"region\"] = client_master[\"region\"].fillna(\"UNKNOWN\")\n", "\n", "# 7. Export\n", "client_master.to_csv(\"client_master.csv\", index=False)\n", "\n", "print(client_master.head())\n", "print(client_master.describe(include='all'))\n" ] }, { "cell_type": "markdown", "id": "fb1e98a5-6ab4-4371-ba45-6558ff38c839", "metadata": {}, "source": [ "Détection des ruptures" ] }, { "cell_type": "code", "execution_count": 15, "id": "6bdd8077-c8e0-451d-a7b8-15a2705ad196", "metadata": {}, "outputs": [], "source": [ "# --- 1. PREPARE STOCKS ---\n", "stocks_clean = stocks[[\n", " \"Registrar Account - ID\", \"Product - Isin\", \n", " \"Centralisation Date\", \"Quantity - AUM\"\n", "]].copy()\n", "\n", "stocks_clean[\"Centralisation Date\"] = pd.to_datetime(stocks_clean[\"Centralisation Date\"])\n", "stocks_clean = stocks_clean.sort_values([\"Registrar Account - ID\", \"Product - Isin\", \"Centralisation Date\"])\n", "\n", "# --- 2. PREPARE FLOWS ---\n", "flows_clean = flows[[\n", " \"Registrar Account - ID\", \"Product - Isin\", \n", " \"Centralisation Date\", \"Quantity - NetFlows\"\n", "]].copy()\n", "\n", "flows_clean[\"Centralisation Date\"] = pd.to_datetime(flows_clean[\"Centralisation Date\"])\n", "\n", "# Aggregate flows per day to avoid duplicates\n", "flows_clean = flows_clean.groupby(\n", " [\"Registrar Account - ID\", \"Product - Isin\", \"Centralisation Date\"]\n", ")[\"Quantity - NetFlows\"].sum().reset_index()\n", "\n", "# --- 3. MERGE STOCKS WITH FLOWS ---\n", "merged = stocks_clean.merge(\n", " flows_clean,\n", " on=[\"Registrar Account - ID\", \"Product - Isin\", \"Centralisation Date\"],\n", " how=\"left\"\n", ")\n", "\n", "merged[\"Quantity - NetFlows\"] = merged[\"Quantity - NetFlows\"].fillna(0)\n", "\n", "# --- 4. SHIFT STOCKS TO COMPARE t vs t+1 ---\n", "merged[\"prev_stock\"] = merged.groupby(\n", " [\"Registrar Account - ID\", \"Product - Isin\"]\n", ")[\"Quantity - AUM\"].shift(1)\n", "\n", "# SHIFT NET FLOWS FROM PREVIOUS DATE\n", "merged[\"prev_netflows\"] = merged.groupby(\n", " [\"Registrar Account - ID\", \"Product - Isin\"]\n", ")[\"Quantity - NetFlows\"].shift(1).fillna(0)\n", "\n", "# Expected stock\n", "merged[\"expected_stock\"] = merged[\"prev_stock\"] + merged[\"prev_netflows\"]\n", "\n", "# --- 5. COMPUTE GAP BETWEEN EXPECTED AND REAL ---\n", "merged[\"gap\"] = merged[\"Quantity - AUM\"] - merged[\"expected_stock\"]\n", "\n", "# tolerance for numerical noise\n", "TOL = 1e-6\n", "merged[\"rupture_flag\"] = (merged[\"prev_stock\"].notna()) & (merged[\"gap\"].abs() > TOL)\n", "\n", "# --- 6. AGGREGATE BY CLIENT TO DETECT BIG ISSUES ---\n", "rupture_summary = merged.groupby(\"Registrar Account - ID\").agg(\n", " n_ruptures=(\"rupture_flag\", \"sum\"),\n", " total_obs=(\"rupture_flag\", \"count\"),\n", " rupture_ratio=(\"rupture_flag\", \"mean\"),\n", " max_gap=(\"gap\", lambda x: x.abs().max())\n", ").reset_index()\n", "\n", "# Sort by biggest anomalies\n", "rupture_summary = rupture_summary.sort_values(\"rupture_ratio\", ascending=False)\n", "\n", "rupture_summary.head(10)\n", "\n", "rupture_summary_asc = rupture_summary.sort_values(\"rupture_ratio\", ascending=True)\n", "rupture_summary_asc.to_csv('rupture.csv')" ] }, { "cell_type": "code", "execution_count": 19, "id": "9e32fd6b-4754-4196-9487-ffdc0bb4fc06", "metadata": {}, "outputs": [], "source": [ "merged.to_csv('merged.csv')" ] }, { "cell_type": "code", "execution_count": 17, "id": "71cd67aa-f4b9-489e-b928-defeca459cb6", "metadata": {}, "outputs": [ { "data": { "text/html": [ "
| \n", " | Registrar Account - ID | \n", "n_ruptures | \n", "total_obs | \n", "rupture_ratio | \n", "max_gap | \n", "
|---|---|---|---|---|---|
| 165 | \n", "200000331 | \n", "0 | \n", "160 | \n", "0.000000 | \n", "0.000 | \n", "
| 182 | \n", "200000361 | \n", "0 | \n", "80 | \n", "0.000000 | \n", "0.000 | \n", "
| 12198 | \n", "422302 | \n", "0 | \n", "240 | \n", "0.000000 | \n", "0.000 | \n", "
| 12197 | \n", "422299 | \n", "0 | \n", "80 | \n", "0.000000 | \n", "0.000 | \n", "
| 12191 | \n", "422288 | \n", "0 | \n", "1200 | \n", "0.000000 | \n", "0.000 | \n", "
| ... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "
| 6522 | \n", "365568 | \n", "237 | \n", "240 | \n", "0.987500 | \n", "16596.971 | \n", "
| 197 | \n", "200000407 | \n", "79 | \n", "80 | \n", "0.987500 | \n", "63893.601 | \n", "
| 6884 | \n", "365966 | \n", "79 | \n", "80 | \n", "0.987500 | \n", "2673.873 | \n", "
| 7039 | \n", "366351 | \n", "258 | \n", "260 | \n", "0.992308 | \n", "1998.948 | \n", "
| 603 | \n", "200001928 | \n", "645 | \n", "650 | \n", "0.992308 | \n", "110779.418 | \n", "
12501 rows × 5 columns
\n", "| \n", " | Registrar Account - ID | \n", "Product - Isin | \n", "n_ruptures | \n", "obs | \n", "rupture_ratio | \n", "max_gap | \n", "
|---|---|---|---|---|---|---|
| 17027 | \n", "200127410 | \n", "FR0010135103 | \n", "434 | \n", "436 | \n", "0.995413 | \n", "295985.420 | \n", "
| 17029 | \n", "200127410 | \n", "FR0010148981 | \n", "317 | \n", "319 | \n", "0.993730 | \n", "67134.706 | \n", "
| 68901 | \n", "PRIVATE CLIENT | \n", "LU0992630599 | \n", "154 | \n", "155 | \n", "0.993548 | \n", "529752.634 | \n", "
| 39099 | \n", "366441 | \n", "FR0010135103 | \n", "142 | \n", "143 | \n", "0.993007 | \n", "439160.588 | \n", "
| 39101 | \n", "366441 | \n", "FR0010148981 | \n", "142 | \n", "143 | \n", "0.993007 | \n", "86246.897 | \n", "
| 3083 | \n", "200001928 | \n", "LU0992624949 | \n", "129 | \n", "130 | \n", "0.992308 | \n", "110779.418 | \n", "
| 2080 | \n", "200001349 | \n", "FR0010149120 | \n", "129 | \n", "130 | \n", "0.992308 | \n", "23881.992 | \n", "
| 3084 | \n", "200001928 | \n", "LU0992625839 | \n", "129 | \n", "130 | \n", "0.992308 | \n", "12675.630 | \n", "
| 31744 | \n", "365095 | \n", "FR0010149120 | \n", "129 | \n", "130 | \n", "0.992308 | \n", "22990.942 | \n", "
| 2085 | \n", "200001349 | \n", "FR0010149302 | \n", "129 | \n", "130 | \n", "0.992308 | \n", "412.499 | \n", "
| 65735 | \n", "422329 | \n", "FR0010306142 | \n", "129 | \n", "130 | \n", "0.992308 | \n", "18289.694 | \n", "
| 3129 | \n", "200001939 | \n", "LU0592698954 | \n", "129 | \n", "130 | \n", "0.992308 | \n", "57179.957 | \n", "
| 2090 | \n", "200001349 | \n", "FR0011269083 | \n", "129 | \n", "130 | \n", "0.992308 | \n", "29403.491 | \n", "
| 2086 | \n", "200001349 | \n", "FR0010306142 | \n", "129 | \n", "130 | \n", "0.992308 | \n", "9417.847 | \n", "
| 65730 | \n", "422329 | \n", "FR0010149120 | \n", "129 | \n", "130 | \n", "0.992308 | \n", "20342.726 | \n", "
| 2108 | \n", "200001349 | \n", "LU0336083497 | \n", "129 | \n", "130 | \n", "0.992308 | \n", "592.246 | \n", "
| 65618 | \n", "422310 | \n", "FR0010149120 | \n", "129 | \n", "130 | \n", "0.992308 | \n", "6006.071 | \n", "
| 65613 | \n", "422310 | \n", "FR0010135103 | \n", "129 | \n", "130 | \n", "0.992308 | \n", "4258.656 | \n", "
| 31804 | \n", "365096 | \n", "FR0010149120 | \n", "129 | \n", "130 | \n", "0.992308 | \n", "844.519 | \n", "
| 31800 | \n", "365096 | \n", "FR0010148981 | \n", "129 | \n", "130 | \n", "0.992308 | \n", "962.783 | \n", "