{ "cells": [ { "cell_type": "code", "execution_count": 4, "id": "126c8a80-d9ad-4816-84f0-0c3d580f62c8", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Requirement already satisfied: openpyxl in /opt/python/lib/python3.13/site-packages (3.1.5)\n", "Requirement already satisfied: et-xmlfile in /opt/python/lib/python3.13/site-packages (from openpyxl) (2.0.0)\n" ] } ], "source": [ "import pandas as pd\n", "import numpy as np\n", "import matplotlib.pyplot as plt\n", "!pip install openpyxl" ] }, { "cell_type": "code", "execution_count": 3, "id": "ff2261fb-9516-4410-b42d-3acc8dc1a460", "metadata": {}, "outputs": [], "source": [ "import os\n", "import s3fs\n", "fs = s3fs.S3FileSystem(\n", " client_kwargs={'endpoint_url': 'https://'+'minio-simple.lab.groupe-genes.fr'},\n", " key = os.environ[\"AWS_ACCESS_KEY_ID\"], \n", " secret = os.environ[\"AWS_SECRET_ACCESS_KEY\"], \n", " token = os.environ[\"AWS_SESSION_TOKEN\"])" ] }, { "cell_type": "markdown", "id": "3d36f3f0-bd40-4a83-96d1-b46d75f5a4c5", "metadata": {}, "source": [ "# Data exploration" ] }, { "cell_type": "markdown", "id": "eaf5c5a0-eb1c-4242-b893-7600e6def109", "metadata": {}, "source": [ "Fonctions utiles" ] }, { "cell_type": "code", "execution_count": 5, "id": "60e2035c-c2f0-4c51-97df-102e67ba96ee", "metadata": {}, "outputs": [], "source": [ "def plot_account(account_id, isin=None):\n", " \"\"\"\n", " Plots the stock (Quantity - AUM) evolution for a given Registrar Account.\n", " Optionally, only for one ISIN.\n", " \"\"\"\n", "\n", " df = merged.copy()\n", "\n", " # Filter by account\n", " df = df[df[\"Registrar Account - ID\"] == account_id]\n", "\n", " if isin is not None:\n", " df = df[df[\"Product - Isin\"] == isin]\n", "\n", " if df.empty:\n", " print(f\"No data found for account {account_id}\")\n", " return\n", "\n", " df_plot = df.groupby(\"Centralisation Date\")[\"Quantity - AUM\"].sum().reset_index()\n", "\n", " df_plot = df_plot.sort_values(\"Centralisation Date\")\n", "\n", " # Plot\n", " plt.figure(figsize=(12, 4))\n", " plt.plot(df_plot[\"Centralisation Date\"], df_plot[\"Quantity - AUM\"], marker='o')\n", " plt.title(f\"Stock Evolution for Account {account_id}\", fontsize=14)\n", " plt.xlabel(\"Date\")\n", " plt.ylabel(\"Total AUM\")\n", " plt.grid(True)\n", " plt.show()\n" ] }, { "cell_type": "code", "execution_count": 6, "id": "37e008b1-32d4-44be-9d23-1b90a5a26f89", "metadata": {}, "outputs": [], "source": [ "# 2. BASIC INSPECTION\n", "\n", "def quick_info(df, name):\n", " print(\"\\n\" + \"=\"*80)\n", " print(f\"DATASET : {name}\")\n", " print(\"=\"*80)\n", " print(\"\\nShape :\", df.shape)\n", " print(\"\\nColumns :\", df.columns.tolist())\n", " print(\"\\nDtypes :\\n\", df.dtypes)\n", " print(\"\\nMissing values (%) :\\n\", df.isna().mean().sort_values(ascending=False)*100)\n", " print(\"\\nSample rows:\\n\", df.head(5))\n", " print(\"\\nUnique values per column:\\n\", df.nunique().sort_values(ascending=False))" ] }, { "cell_type": "code", "execution_count": null, "id": "e67a99ea-ddf4-4627-8f48-ec183c671acb", "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "/tmp/ipykernel_11151/19230119.py:2: DtypeWarning: Columns (0,1,2,3) have mixed types. Specify dtype option on import or set low_memory=False.\n", " flows = pd.read_csv(f, sep=\";\")\n" ] } ], "source": [ "with fs.open('projet-bdc-data//carmignac/Flows ENSAE V2 -20251105.csv', 'rb') as f:\n", " flows = pd.read_csv(f, sep=\";\")\n", "\n", "with fs.open('projet-bdc-data//carmignac/AUM ENSAE V2 -20251105.csv', 'rb') as f:\n", " stocks = pd.read_csv(f, sep=\";\")\n", "\n", "with fs.open('projet-bdc-data/carmignac/Monthly AUM and NAV since 2010.xlsx', 'rb') as f:\n", " nav_raw = pd.read_excel(f, header=None, engine=\"openpyxl\")\n", "nav = nav_raw[0].str.split(\",\", expand=True)\n", "nav.columns = nav.iloc[0]\n", "nav = nav[1:].reset_index(drop=True)\n", "\n", "quick_info(stocks, \"STOCKS\")\n", "quick_info(flows, \"FLOWS\")\n", "quick_info(nav, \"NAV/PRICES\")" ] }, { "cell_type": "code", "execution_count": 9, "id": "9bc92c9f-216c-475e-bfb8-edc1a4e839f6", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Date conversion done.\n", "NAV numeric conversion done.\n", "String normalization done.\n", "\n", "ISIN missing in FLOWS but present in STOCKS : 17\n", "\n", "ISIN missing in STOCKS but present in FLOWS : 0\n", "\n", "ISIN missing in NAV but present in FLOWS : 67\n", "\n", "ISIN missing in NAV but present in STOCKS : 76\n", "\n", "Accounts in STOCKS but NEVER in FLOWS : 5777\n", "\n", "Accounts in FLOWS but NEVER in STOCKS : 118\n", "\n", "CLIENT BEHAVIOR (first 5 rows):\n", " Registrar Account - ID n_days n_transactions total_netflows mean_flow \\\n", "0 100000028 3 3 -109.238 -36.412667 \n", "1 100000042 1 1 -660.115 -660.115000 \n", "2 100000065 1 1 -174.646 -174.646000 \n", "3 100000069 65 73 -7479.755 -102.462397 \n", "4 100000073 1 1 -133.402 -133.402000 \n", "\n", " std_flow total_subscription total_redemption churn_ratio \n", "0 49.280511 0.000 -109.238 -1.092380e+11 \n", "1 NaN 0.000 -660.115 -6.601150e+11 \n", "2 NaN 0.000 -174.646 -1.746460e+11 \n", "3 2168.971331 33320.402 -40800.157 -1.224480e+00 \n", "4 NaN 0.000 -133.402 -1.334020e+11 \n", "\n", "FUND BEHAVIOR (first 5 rows):\n", " Product - Isin n_accounts n_days total_netflows vol_flows\n", "0 FR0010135103 2690 2723 -2.571327e+07 2622.609244\n", "1 FR0010147603 733 2719 -2.562187e+06 1206.248205\n", "2 FR0010148981 1841 2722 -3.609440e+06 1051.069183\n", "3 FR0010148999 454 2306 -7.130297e+05 1265.364138\n", "4 FR0010149112 934 2000 -9.438901e+05 1834.961721\n" ] } ], "source": [ "# 1. CLEAN DATES (formats différents)\n", "\n", "stocks[\"Centralisation Date\"] = pd.to_datetime(stocks[\"Centralisation Date\"], errors=\"coerce\")\n", "flows[\"Centralisation Date\"] = pd.to_datetime(flows[\"Centralisation Date\"], errors=\"coerce\")\n", "nav[\"NavDate\"] = pd.to_datetime(nav[\"NavDate\"], format=\"%d/%m/%Y\", errors=\"coerce\")\n", "\n", "print(\"Date conversion done.\")\n", "\n", "# 2. CLEAN NUMERIC COLUMNS FOR NAV FILE\n", "\n", "num_cols = [\"PortfolioAum_Eur\",\"ShareClassPrice\",\"NumberOfShares\",\n", " \"ShareClassAumLocalCur\",\"ShareClassAum_EUR\"]\n", "\n", "for col in num_cols:\n", " nav[col] = (\n", " nav[col]\n", " .astype(str)\n", " .str.replace(\",\", \".\", regex=False)\n", " .str.replace(\" \", \"\")\n", " .astype(float)\n", " )\n", "\n", "print(\"NAV numeric conversion done.\")\n", "\n", "# 3. STANDARDIZE STRINGS FOR JOIN KEYS\n", "\n", "def norm(df):\n", " for col in df.columns:\n", " if df[col].dtype == \"object\":\n", " df[col] = df[col].astype(str).str.strip().str.upper()\n", " return df\n", "\n", "stocks = norm(stocks)\n", "flows = norm(flows)\n", "nav = norm(nav)\n", "\n", "print(\"String normalization done.\")\n", "\n", "\n", "# 4. ANALYSE RELATIONS ACROSS FILES\n", "\n", "# Unique sets\n", "isin_stocks = set(stocks[\"Product - Isin\"].unique())\n", "isin_flows = set(flows[\"Product - Isin\"].unique())\n", "isin_nav = set(nav[\"ShareClassIsin\"].unique())\n", "\n", "print(\"\\nISIN missing in FLOWS but present in STOCKS :\", len(isin_stocks - isin_flows))\n", "print(\"\\nISIN missing in STOCKS but present in FLOWS :\", len(isin_flows - isin_stocks))\n", "print(\"\\nISIN missing in NAV but present in FLOWS :\", len(isin_flows - isin_nav))\n", "print(\"\\nISIN missing in NAV but present in STOCKS :\", len(isin_stocks - isin_nav))\n", "\n", "\n", "# 5. CLIENTS: STOCKS VS FLOWS\n", "\n", "acc_stocks = set(stocks[\"Registrar Account - ID\"].unique())\n", "acc_flows = set(flows[\"Registrar Account - ID\"].unique())\n", "\n", "print(\"\\nAccounts in STOCKS but NEVER in FLOWS :\", len(acc_stocks - acc_flows))\n", "print(\"\\nAccounts in FLOWS but NEVER in STOCKS :\", len(acc_flows - acc_stocks))\n", "\n", "\n", "# 6. CLIENT ACTIVITY METRICS (DETAILED)\n", "\n", "client_behavior = flows.groupby(\"Registrar Account - ID\").agg(\n", " n_days=(\"Centralisation Date\", lambda x: x.nunique()),\n", " n_transactions=(\"Quantity - NetFlows\", \"count\"),\n", " total_netflows=(\"Quantity - NetFlows\", \"sum\"),\n", " mean_flow=(\"Quantity - NetFlows\", \"mean\"),\n", " std_flow=(\"Quantity - NetFlows\", \"std\"),\n", " total_subscription=(\"Quantity - Subscription\", \"sum\"),\n", " total_redemption=(\"Quantity - Redemption\", \"sum\")\n", ").reset_index()\n", "\n", "# Add churn metric\n", "client_behavior[\"churn_ratio\"] = (\n", " client_behavior[\"total_redemption\"] /\n", " (client_behavior[\"total_subscription\"] + 1e-9)\n", ")\n", "\n", "print(\"\\nCLIENT BEHAVIOR (first 5 rows):\\n\", client_behavior.head())\n", "\n", "\n", "# 7. FUNDS ACTIVITY METRICS\n", "\n", "fund_behavior = flows.groupby(\"Product - Isin\").agg(\n", " n_accounts=(\"Registrar Account - ID\", \"nunique\"),\n", " n_days=(\"Centralisation Date\", lambda x: x.nunique()),\n", " total_netflows=(\"Quantity - NetFlows\", \"sum\"),\n", " vol_flows=(\"Quantity - NetFlows\", \"std\")\n", ").reset_index()\n", "\n", "print(\"\\nFUND BEHAVIOR (first 5 rows):\\n\", fund_behavior.head())\n", "\n", "\n", "# 8. SAVE INTERMEDIATE\n", "\n", "client_behavior.to_csv(\"client_behavior.csv\", index=False)\n", "fund_behavior.to_csv(\"fund_behavior.csv\", index=False)" ] }, { "cell_type": "code", "execution_count": 10, "id": "afb51598-3a7b-41f2-8d25-5b4b8bfb1c8a", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "FULL usable ISIN : 407\n", "Stocks only ISIN : 17\n", "Flows only ISIN : 0\n", "Missing NAV : 76\n", "All ISIN groups saved into 4 separate files.\n" ] } ], "source": [ "valid_full = isin_stocks & isin_flows & isin_nav\n", "stocks_only = isin_stocks - isin_flows\n", "flows_only = isin_flows - isin_stocks\n", "missing_nav = (isin_stocks | isin_flows) - isin_nav\n", "\n", "print(\"FULL usable ISIN :\", len(valid_full))\n", "print(\"Stocks only ISIN :\", len(stocks_only))\n", "print(\"Flows only ISIN :\", len(flows_only))\n", "print(\"Missing NAV :\", len(missing_nav))\n", "\n", "pd.DataFrame({\"isin\": list(valid_full)}).to_csv(\"isin_full.csv\", index=False)\n", "pd.DataFrame({\"isin\": list(stocks_only)}).to_csv(\"isin_stocks_only.csv\", index=False)\n", "pd.DataFrame({\"isin\": list(flows_only)}).to_csv(\"isin_flows_only.csv\", index=False)\n", "pd.DataFrame({\"isin\": list(missing_nav)}).to_csv(\"isin_missing_nav.csv\", index=False)\n", "\n", "print(\"All ISIN groups saved into 4 separate files.\")\n", "\n" ] }, { "cell_type": "code", "execution_count": 11, "id": "61e0c71a-a1c6-4ed8-ba15-b7a9badc4d4a", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ " Registrar Account - ID n_days n_transactions total_netflows mean_flow \\\n", "0 100000028 3 3 -109.238 -36.412667 \n", "1 100000042 1 1 -660.115 -660.115000 \n", "2 100000065 1 1 -174.646 -174.646000 \n", "3 100000069 65 73 -7479.755 -102.462397 \n", "4 100000073 1 1 -133.402 -133.402000 \n", "\n", " std_flow total_subscription total_redemption churn_ratio \\\n", "0 49.280511 0.000 -109.238 -1.092380e+08 \n", "1 NaN 0.000 -660.115 -6.601150e+08 \n", "2 NaN 0.000 -174.646 -1.746460e+08 \n", "3 2168.971331 33320.402 -40800.157 -1.224480e+00 \n", "4 NaN 0.000 -133.402 -1.334020e+08 \n", "\n", " churn_flag activity_score flow_volatility inertia_ratio \n", "0 0 1.386294 49.280511 0.998921 \n", "1 0 0.693147 0.000000 0.999640 \n", "2 0 0.693147 0.000000 0.999640 \n", "3 0 4.304065 2168.971331 0.976619 \n", "4 0 0.693147 0.000000 0.999640 \n" ] } ], "source": [ "eps = 1e-6\n", "\n", "client_behavior[\"churn_ratio\"] = (\n", " client_behavior[\"total_redemption\"] /\n", " (client_behavior[\"total_subscription\"] + eps)\n", ")\n", "\n", "client_behavior[\"churn_flag\"] = (\n", " client_behavior[\"total_redemption\"] > client_behavior[\"total_subscription\"]\n", ").astype(int)\n", "\n", "client_behavior[\"activity_score\"] = np.log1p(client_behavior[\"n_transactions\"])\n", "\n", "client_behavior[\"flow_volatility\"] = client_behavior[\"std_flow\"].fillna(0)\n", "\n", "client_behavior[\"inertia_ratio\"] = (\n", " 1 - client_behavior[\"n_days\"] / flows[\"Centralisation Date\"].nunique()\n", ")\n", "\n", "print(client_behavior.head())\n", "\n", "client_behavior.to_csv(\"client_behavior_clean.csv\", index=False)\n" ] }, { "cell_type": "code", "execution_count": 12, "id": "8ee7e911-eb73-4846-b545-661140411c1b", "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "/tmp/ipykernel_1219/1645623303.py:17: RuntimeWarning: invalid value encountered in scalar divide\n", " .apply(lambda x: x[\"Value - AUM €\"].max() / x[\"Value - AUM €\"].sum()) \\\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ " Registrar Account - ID n_isin_held n_funds_held n_asset_types \\\n", "0 100000014 1 1 1 \n", "1 100000016 2 2 2 \n", "2 100000028 1 1 1 \n", "3 100000038 3 3 2 \n", "4 100000042 1 1 1 \n", "\n", " n_strategies total_aum median_aum concentration_ratio \n", "0 1 0.0000 0.0 NaN \n", "1 2 0.0000 0.0 NaN \n", "2 1 126236.2184 0.0 1.0 \n", "3 3 0.0000 0.0 NaN \n", "4 1 446362.9015 0.0 1.0 \n", " n_isin_held n_funds_held n_asset_types n_strategies total_aum \\\n", "count 12501.000000 12501.000000 12501.000000 12501.000000 1.250100e+04 \n", "mean 5.514759 4.408367 2.082473 4.109271 4.218474e+08 \n", "std 10.434698 5.472756 1.254048 4.714800 5.618341e+09 \n", "min 1.000000 1.000000 1.000000 1.000000 -2.586805e+08 \n", "25% 1.000000 1.000000 1.000000 1.000000 0.000000e+00 \n", "50% 2.000000 2.000000 2.000000 2.000000 2.587605e+05 \n", "75% 6.000000 5.000000 3.000000 5.000000 8.817014e+06 \n", "max 469.000000 67.000000 6.000000 48.000000 4.780234e+11 \n", "\n", " median_aum concentration_ratio \n", "count 1.250100e+04 7708.000000 \n", "mean 2.573991e+05 0.790503 \n", "std 3.487976e+06 0.261535 \n", "min -2.317333e+06 -2.591840 \n", "25% 0.000000e+00 0.576503 \n", "50% 0.000000e+00 0.972159 \n", "75% 1.474502e+02 1.000000 \n", "max 2.215373e+08 2.983529 \n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "/tmp/ipykernel_1219/1645623303.py:17: FutureWarning: DataFrameGroupBy.apply operated on the grouping columns. This behavior is deprecated, and in a future version of pandas the grouping columns will be excluded from the operation. Either pass `include_groups=False` to exclude the groupings or explicitly select the grouping columns after groupby to silence this warning.\n", " .apply(lambda x: x[\"Value - AUM €\"].max() / x[\"Value - AUM €\"].sum()) \\\n" ] } ], "source": [ "# Diversification per account\n", "account_div = stocks.groupby(\"Registrar Account - ID\").agg(\n", " n_isin_held=(\"Product - Isin\", \"nunique\"),\n", " n_funds_held=(\"Product - Fund\", \"nunique\"),\n", " n_asset_types=(\"Product - Asset Type\", \"nunique\"),\n", " n_strategies=(\"Product - Strategy\", \"nunique\"),\n", " total_aum=(\"Value - AUM €\", \"sum\"),\n", " median_aum=(\"Value - AUM €\", \"median\")\n", ").reset_index()\n", "\n", "# Concentration ratio per account\n", "aum_by_account_fund = stocks.groupby(\n", " [\"Registrar Account - ID\", \"Product - Fund\"]\n", ")[\"Value - AUM €\"].sum().reset_index()\n", "\n", "concentration = aum_by_account_fund.groupby(\"Registrar Account - ID\") \\\n", " .apply(lambda x: x[\"Value - AUM €\"].max() / x[\"Value - AUM €\"].sum()) \\\n", " .reset_index(name=\"concentration_ratio\")\n", "\n", "# Merge diversification + concentration\n", "account_static = account_div.merge(concentration, on=\"Registrar Account - ID\", how=\"left\")\n", "\n", "print(account_static.head())\n", "print(account_static.describe())\n" ] }, { "cell_type": "code", "execution_count": 13, "id": "76f6fa0d-9d7a-4145-af1c-986d83947f91", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ " Registrar Account - ID country region\n", "0 100000014 SWITZERLAND SWITZERLAND\n", "1 100000016 UNITED KINGDOM UNITED KINGDOM\n", "2 100000028 UNITED KINGDOM UNITED KINGDOM\n", "3 100000038 SWITZERLAND SWITZERLAND\n", "4 100000042 UNITED KINGDOM UNITED KINGDOM\n" ] } ], "source": [ "# Geographic info per account\n", "geo = stocks.groupby(\"Registrar Account - ID\").agg(\n", " country=(\"RegistrarAccount - Country\", lambda x: x.mode()[0]),\n", " region=(\"Registrar Account - Region\", lambda x: x.mode()[0])\n", ").reset_index()\n", "\n", "print(geo.head())\n" ] }, { "cell_type": "code", "execution_count": 14, "id": "e9bb67ab-9029-4ace-b960-b3d6e0b8683c", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ " Registrar Account - ID n_days n_transactions total_netflows mean_flow \\\n", "0 100000028 3 3 -109.238 -36.412667 \n", "1 100000042 1 1 -660.115 -660.115000 \n", "2 100000065 1 1 -174.646 -174.646000 \n", "3 100000069 65 73 -7479.755 -102.462397 \n", "4 100000073 1 1 -133.402 -133.402000 \n", "\n", " std_flow total_subscription total_redemption churn_ratio \\\n", "0 49.280511 0.000 -109.238 -1.092380e+08 \n", "1 NaN 0.000 -660.115 -6.601150e+08 \n", "2 NaN 0.000 -174.646 -1.746460e+08 \n", "3 2168.971331 33320.402 -40800.157 -1.224480e+00 \n", "4 NaN 0.000 -133.402 -1.334020e+08 \n", "\n", " churn_flag ... n_funds_held n_asset_types n_strategies total_aum \\\n", "0 0 ... 1.0 1.0 1.0 126236.2184 \n", "1 0 ... 1.0 1.0 1.0 446362.9015 \n", "2 0 ... 1.0 1.0 1.0 488743.4240 \n", "3 0 ... 0.0 0.0 0.0 NaN \n", "4 0 ... 2.0 2.0 2.0 373322.8948 \n", "\n", " median_aum concentration_ratio country region \\\n", "0 0.0 1.0 UNITED KINGDOM UNITED KINGDOM \n", "1 0.0 1.0 UNITED KINGDOM UNITED KINGDOM \n", "2 0.0 1.0 UNITED KINGDOM UNITED KINGDOM \n", "3 NaN NaN UNKNOWN UNKNOWN \n", "4 0.0 1.0 UNITED KINGDOM UNITED KINGDOM \n", "\n", " log_total_aum log_median_aum \n", "0 11.745918 0.0 \n", "1 13.008890 0.0 \n", "2 13.099595 0.0 \n", "3 NaN NaN \n", "4 12.830202 0.0 \n", "\n", "[5 rows x 24 columns]\n", " Registrar Account - ID n_days n_transactions total_netflows \\\n", "count 6842 6842.000000 6842.000000 6.842000e+03 \n", "unique 6842 NaN NaN NaN \n", "top 100000028 NaN NaN NaN \n", "freq 1 NaN NaN NaN \n", "mean NaN 122.731804 376.273166 3.426362e+04 \n", "std NaN 363.726141 1972.817028 1.644676e+06 \n", "min NaN 1.000000 1.000000 -9.982301e+06 \n", "25% NaN 2.000000 2.000000 -3.448661e+02 \n", "50% NaN 5.000000 6.000000 -1.116000e+00 \n", "75% NaN 27.000000 42.000000 4.220087e+01 \n", "max NaN 2715.000000 53314.000000 1.319043e+08 \n", "\n", " mean_flow std_flow total_subscription total_redemption \\\n", "count 6842.000000 5.696000e+03 6.842000e+03 6.842000e+03 \n", "unique NaN NaN NaN NaN \n", "top NaN NaN NaN NaN \n", "freq NaN NaN NaN NaN \n", "mean 420.503483 6.035653e+03 1.565831e+05 -1.223195e+05 \n", "std 15548.555778 7.652710e+04 4.204641e+06 2.600731e+06 \n", "min -333474.890000 0.000000e+00 -3.931320e+02 -2.069900e+08 \n", "25% -35.658375 5.643245e+01 0.000000e+00 -6.968600e+03 \n", "50% -0.067287 2.479222e+02 3.393685e+02 -4.493603e+02 \n", "75% 2.235534 9.869604e+02 6.000000e+03 -7.275400e+01 \n", "max 871531.706418 4.697263e+06 3.388942e+08 0.000000e+00 \n", "\n", " churn_ratio churn_flag ... n_funds_held n_asset_types \\\n", "count 6.842000e+03 6842.000000 ... 6842.000000 6842.000000 \n", "unique NaN NaN ... NaN NaN \n", "top NaN NaN ... NaN NaN \n", "freq NaN NaN ... NaN NaN \n", "mean -7.445486e+08 0.000146 ... 5.054224 2.114294 \n", "std 1.163193e+10 0.012090 ... 6.457703 1.342230 \n", "min -5.212597e+11 0.000000 ... 0.000000 0.000000 \n", "25% -2.279500e+06 0.000000 ... 1.000000 1.000000 \n", "50% -1.048786e+00 0.000000 ... 2.000000 2.000000 \n", "75% -9.333542e-01 0.000000 ... 7.000000 3.000000 \n", "max 3.225589e+01 1.000000 ... 67.000000 6.000000 \n", "\n", " n_strategies total_aum median_aum concentration_ratio \\\n", "count 6842.000000 6.724000e+03 6.724000e+03 6586.000000 \n", "unique NaN NaN NaN NaN \n", "top NaN NaN NaN NaN \n", "freq NaN NaN NaN NaN \n", "mean 4.631102 7.136776e+08 4.051506e+05 0.782903 \n", "std 5.493014 7.438256e+09 4.121668e+06 0.267426 \n", "min 0.000000 -9.151116e+07 -2.317333e+06 -2.591840 \n", "25% 1.000000 5.107611e+05 0.000000e+00 0.561855 \n", "50% 2.000000 4.221523e+06 0.000000e+00 0.965881 \n", "75% 6.000000 3.987106e+07 2.531545e+04 1.000000 \n", "max 48.000000 4.780234e+11 2.215373e+08 2.983529 \n", "\n", " country region log_total_aum log_median_aum \n", "count 6842 6842 6724.000000 6724.000000 \n", "unique 34 16 NaN NaN \n", "top FRANCE FRANCE NaN NaN \n", "freq 2631 2643 NaN NaN \n", "mean NaN NaN 15.046065 4.392450 \n", "std NaN NaN 4.320148 5.462132 \n", "min NaN NaN 0.000000 0.000000 \n", "25% NaN NaN 13.143657 0.000000 \n", "50% NaN NaN 15.255707 0.000000 \n", "75% NaN NaN 17.501160 10.139210 \n", "max NaN NaN 26.892926 19.216101 \n", "\n", "[11 rows x 24 columns]\n" ] } ], "source": [ "# 1. Merge behavior (flows) with static diversification (stocks)\n", "client_master = client_behavior.merge(\n", " account_static,\n", " on=\"Registrar Account - ID\",\n", " how=\"left\"\n", ")\n", "\n", "# 2. Add geographic info\n", "client_master = client_master.merge(\n", " geo,\n", " on=\"Registrar Account - ID\",\n", " how=\"left\"\n", ")\n", "\n", "# 3. Create additional engineered features\n", "client_master[\"log_total_aum\"] = np.log1p(client_master[\"total_aum\"].clip(lower=0))\n", "client_master[\"log_median_aum\"] = np.log1p(client_master[\"median_aum\"].clip(lower=0))\n", "\n", "\n", "# 4. Replace NaN flow volatility with 0 (inactive accounts)\n", "client_master[\"flow_volatility\"] = client_master[\"flow_volatility\"].fillna(0)\n", "\n", "# 5. Fill missing diversification metrics with 0 (for accounts without stocks)\n", "client_master[[\"n_isin_held\",\"n_funds_held\",\"n_asset_types\",\"n_strategies\"]] = \\\n", " client_master[[\"n_isin_held\",\"n_funds_held\",\"n_asset_types\",\"n_strategies\"]].fillna(0)\n", "\n", "# 6. Fill missing geography as “UNKNOWN”\n", "client_master[\"country\"] = client_master[\"country\"].fillna(\"UNKNOWN\")\n", "client_master[\"region\"] = client_master[\"region\"].fillna(\"UNKNOWN\")\n", "\n", "# 7. Export\n", "client_master.to_csv(\"client_master.csv\", index=False)\n", "\n", "print(client_master.head())\n", "print(client_master.describe(include='all'))\n" ] }, { "cell_type": "markdown", "id": "fb1e98a5-6ab4-4371-ba45-6558ff38c839", "metadata": {}, "source": [ "Détection des ruptures" ] }, { "cell_type": "code", "execution_count": 15, "id": "6bdd8077-c8e0-451d-a7b8-15a2705ad196", "metadata": {}, "outputs": [], "source": [ "# --- 1. PREPARE STOCKS ---\n", "stocks_clean = stocks[[\n", " \"Registrar Account - ID\", \"Product - Isin\", \n", " \"Centralisation Date\", \"Quantity - AUM\"\n", "]].copy()\n", "\n", "stocks_clean[\"Centralisation Date\"] = pd.to_datetime(stocks_clean[\"Centralisation Date\"])\n", "stocks_clean = stocks_clean.sort_values([\"Registrar Account - ID\", \"Product - Isin\", \"Centralisation Date\"])\n", "\n", "# --- 2. PREPARE FLOWS ---\n", "flows_clean = flows[[\n", " \"Registrar Account - ID\", \"Product - Isin\", \n", " \"Centralisation Date\", \"Quantity - NetFlows\"\n", "]].copy()\n", "\n", "flows_clean[\"Centralisation Date\"] = pd.to_datetime(flows_clean[\"Centralisation Date\"])\n", "\n", "# Aggregate flows per day to avoid duplicates\n", "flows_clean = flows_clean.groupby(\n", " [\"Registrar Account - ID\", \"Product - Isin\", \"Centralisation Date\"]\n", ")[\"Quantity - NetFlows\"].sum().reset_index()\n", "\n", "# --- 3. MERGE STOCKS WITH FLOWS ---\n", "merged = stocks_clean.merge(\n", " flows_clean,\n", " on=[\"Registrar Account - ID\", \"Product - Isin\", \"Centralisation Date\"],\n", " how=\"left\"\n", ")\n", "\n", "merged[\"Quantity - NetFlows\"] = merged[\"Quantity - NetFlows\"].fillna(0)\n", "\n", "# --- 4. SHIFT STOCKS TO COMPARE t vs t+1 ---\n", "merged[\"prev_stock\"] = merged.groupby(\n", " [\"Registrar Account - ID\", \"Product - Isin\"]\n", ")[\"Quantity - AUM\"].shift(1)\n", "\n", "# SHIFT NET FLOWS FROM PREVIOUS DATE\n", "merged[\"prev_netflows\"] = merged.groupby(\n", " [\"Registrar Account - ID\", \"Product - Isin\"]\n", ")[\"Quantity - NetFlows\"].shift(1).fillna(0)\n", "\n", "# Expected stock\n", "merged[\"expected_stock\"] = merged[\"prev_stock\"] + merged[\"prev_netflows\"]\n", "\n", "# --- 5. COMPUTE GAP BETWEEN EXPECTED AND REAL ---\n", "merged[\"gap\"] = merged[\"Quantity - AUM\"] - merged[\"expected_stock\"]\n", "\n", "# tolerance for numerical noise\n", "TOL = 1e-6\n", "merged[\"rupture_flag\"] = (merged[\"prev_stock\"].notna()) & (merged[\"gap\"].abs() > TOL)\n", "\n", "# --- 6. AGGREGATE BY CLIENT TO DETECT BIG ISSUES ---\n", "rupture_summary = merged.groupby(\"Registrar Account - ID\").agg(\n", " n_ruptures=(\"rupture_flag\", \"sum\"),\n", " total_obs=(\"rupture_flag\", \"count\"),\n", " rupture_ratio=(\"rupture_flag\", \"mean\"),\n", " max_gap=(\"gap\", lambda x: x.abs().max())\n", ").reset_index()\n", "\n", "# Sort by biggest anomalies\n", "rupture_summary = rupture_summary.sort_values(\"rupture_ratio\", ascending=False)\n", "\n", "rupture_summary.head(10)\n", "\n", "rupture_summary_asc = rupture_summary.sort_values(\"rupture_ratio\", ascending=True)\n", "rupture_summary_asc.to_csv('rupture.csv')" ] }, { "cell_type": "code", "execution_count": 19, "id": "9e32fd6b-4754-4196-9487-ffdc0bb4fc06", "metadata": {}, "outputs": [], "source": [ "merged.to_csv('merged.csv')" ] }, { "cell_type": "code", "execution_count": 17, "id": "71cd67aa-f4b9-489e-b928-defeca459cb6", "metadata": {}, "outputs": [ { "data": { "text/html": [ "
| \n", " | Registrar Account - ID | \n", "n_ruptures | \n", "total_obs | \n", "rupture_ratio | \n", "max_gap | \n", "
|---|---|---|---|---|---|
| 165 | \n", "200000331 | \n", "0 | \n", "160 | \n", "0.000000 | \n", "0.000 | \n", "
| 182 | \n", "200000361 | \n", "0 | \n", "80 | \n", "0.000000 | \n", "0.000 | \n", "
| 12198 | \n", "422302 | \n", "0 | \n", "240 | \n", "0.000000 | \n", "0.000 | \n", "
| 12197 | \n", "422299 | \n", "0 | \n", "80 | \n", "0.000000 | \n", "0.000 | \n", "
| 12191 | \n", "422288 | \n", "0 | \n", "1200 | \n", "0.000000 | \n", "0.000 | \n", "
| ... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "
| 6522 | \n", "365568 | \n", "237 | \n", "240 | \n", "0.987500 | \n", "16596.971 | \n", "
| 197 | \n", "200000407 | \n", "79 | \n", "80 | \n", "0.987500 | \n", "63893.601 | \n", "
| 6884 | \n", "365966 | \n", "79 | \n", "80 | \n", "0.987500 | \n", "2673.873 | \n", "
| 7039 | \n", "366351 | \n", "258 | \n", "260 | \n", "0.992308 | \n", "1998.948 | \n", "
| 603 | \n", "200001928 | \n", "645 | \n", "650 | \n", "0.992308 | \n", "110779.418 | \n", "
12501 rows × 5 columns
\n", "| \n", " | Registrar Account - ID | \n", "Product - Isin | \n", "n_ruptures | \n", "obs | \n", "rupture_ratio | \n", "max_gap | \n", "
|---|---|---|---|---|---|---|
| 17027 | \n", "200127410 | \n", "FR0010135103 | \n", "434 | \n", "436 | \n", "0.995413 | \n", "295985.420 | \n", "
| 17029 | \n", "200127410 | \n", "FR0010148981 | \n", "317 | \n", "319 | \n", "0.993730 | \n", "67134.706 | \n", "
| 68901 | \n", "PRIVATE CLIENT | \n", "LU0992630599 | \n", "154 | \n", "155 | \n", "0.993548 | \n", "529752.634 | \n", "
| 39099 | \n", "366441 | \n", "FR0010135103 | \n", "142 | \n", "143 | \n", "0.993007 | \n", "439160.588 | \n", "
| 39101 | \n", "366441 | \n", "FR0010148981 | \n", "142 | \n", "143 | \n", "0.993007 | \n", "86246.897 | \n", "
| 3083 | \n", "200001928 | \n", "LU0992624949 | \n", "129 | \n", "130 | \n", "0.992308 | \n", "110779.418 | \n", "
| 2080 | \n", "200001349 | \n", "FR0010149120 | \n", "129 | \n", "130 | \n", "0.992308 | \n", "23881.992 | \n", "
| 3084 | \n", "200001928 | \n", "LU0992625839 | \n", "129 | \n", "130 | \n", "0.992308 | \n", "12675.630 | \n", "
| 31744 | \n", "365095 | \n", "FR0010149120 | \n", "129 | \n", "130 | \n", "0.992308 | \n", "22990.942 | \n", "
| 2085 | \n", "200001349 | \n", "FR0010149302 | \n", "129 | \n", "130 | \n", "0.992308 | \n", "412.499 | \n", "
| 65735 | \n", "422329 | \n", "FR0010306142 | \n", "129 | \n", "130 | \n", "0.992308 | \n", "18289.694 | \n", "
| 3129 | \n", "200001939 | \n", "LU0592698954 | \n", "129 | \n", "130 | \n", "0.992308 | \n", "57179.957 | \n", "
| 2090 | \n", "200001349 | \n", "FR0011269083 | \n", "129 | \n", "130 | \n", "0.992308 | \n", "29403.491 | \n", "
| 2086 | \n", "200001349 | \n", "FR0010306142 | \n", "129 | \n", "130 | \n", "0.992308 | \n", "9417.847 | \n", "
| 65730 | \n", "422329 | \n", "FR0010149120 | \n", "129 | \n", "130 | \n", "0.992308 | \n", "20342.726 | \n", "
| 2108 | \n", "200001349 | \n", "LU0336083497 | \n", "129 | \n", "130 | \n", "0.992308 | \n", "592.246 | \n", "
| 65618 | \n", "422310 | \n", "FR0010149120 | \n", "129 | \n", "130 | \n", "0.992308 | \n", "6006.071 | \n", "
| 65613 | \n", "422310 | \n", "FR0010135103 | \n", "129 | \n", "130 | \n", "0.992308 | \n", "4258.656 | \n", "
| 31804 | \n", "365096 | \n", "FR0010149120 | \n", "129 | \n", "130 | \n", "0.992308 | \n", "844.519 | \n", "
| 31800 | \n", "365096 | \n", "FR0010148981 | \n", "129 | \n", "130 | \n", "0.992308 | \n", "962.783 | \n", "