{
 "cells": [
  {
   "cell_type": "markdown",
   "id": "e637deae-9168-4fb2-b95f-4e42d8d72d9e",
   "metadata": {},
   "source": [
    "# DATA COLLECTION"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "9f99615b-5a9d-434a-baa0-dca55edf7699",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Every import used by the notebook lives here so that a fresh kernel can run\n",
    "# all cells top to bottom.\n",
    "import matplotlib.pyplot as plt\n",
    "import numpy as np\n",
    "import pandas as pd\n",
    "\n",
    "# Quantile of the daily transfer count above which a day is flagged as anomalous.\n",
    "ANOMALY_QUANTILE = 0.95\n",
    "# First calendar year kept in the seasonality study.\n",
    "SEASONALITY_START_YEAR = 2021"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "f8508d94-74a7-4bb0-8b81-c2e06850c25f",
   "metadata": {},
   "outputs": [],
   "source": [
    "# AUM export: semicolon-separated CSV read directly from S3.\n",
    "chemin_fichier_aum = \"s3://projet-bdc-data/carmignac/AUM ENSAE V2 -20251105.csv\"\n",
    "df_aum2 = pd.read_csv(chemin_fichier_aum, sep=';', engine='python')\n",
    "df_aum2"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "4644da13-5aea-4ca0-9fcf-947324766292",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Flows export: same layout and location as the AUM file.\n",
    "chemin_fichier_flows = \"s3://projet-bdc-data/carmignac/Flows ENSAE V2 -20251105.csv\"\n",
    "df_flows2 = pd.read_csv(chemin_fichier_flows, sep=';', engine='python')\n",
    "df_flows2"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "59d31eaf-c06c-4ebe-9f8c-cb9158a50976",
   "metadata": {},
   "source": [
    "## DATA ANALYSIS"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "5773b911-6b84-448d-962f-8228eeac0250",
   "metadata": {},
   "outputs": [],
   "source": [
    "df_aum2.shape"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "6f571810-c373-4d30-8ca5-c3a074b95b08",
   "metadata": {},
   "outputs": [],
   "source": [
    "df_aum2.columns"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "af25fd07-a613-4adc-b88b-93a8d300379c",
   "metadata": {},
   "outputs": [],
   "source": [
    "df_flows2.shape"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "c6d0fe83-2957-430b-89cf-cd30833b7cab",
   "metadata": {},
   "outputs": [],
   "source": [
    "df_flows2.columns"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "5fac74b0-662f-48d0-a234-7edc3c3e86ad",
   "metadata": {},
   "outputs": [],
   "source": [
    "def summarize_columns(df, n_examples=5):\n",
    "    \"\"\"Build a one-row-per-column profile of ``df``.\n",
    "\n",
    "    Parameters\n",
    "    ----------\n",
    "    df : pandas.DataFrame\n",
    "        Frame to profile.\n",
    "    n_examples : int, default 5\n",
    "        Number of distinct values kept in the \"Exemples de valeurs\" column.\n",
    "\n",
    "    Returns\n",
    "    -------\n",
    "    pandas.DataFrame\n",
    "        Columns: \"Colonne\", \"Nbr Lignes\" (row count of ``df``),\n",
    "        \"Nb valeurs uniques\" (length of ``Series.unique()``, so a missing\n",
    "        value counts as one distinct value), \"Exemples de valeurs\" and\n",
    "        \"Nan Values\" (number of missing entries).\n",
    "    \"\"\"\n",
    "    profile_cols = [\n",
    "        \"Colonne\",\n",
    "        \"Nbr Lignes\",\n",
    "        \"Nb valeurs uniques\",\n",
    "        \"Exemples de valeurs\",\n",
    "        \"Nan Values\",\n",
    "    ]\n",
    "    records = []\n",
    "    for col in df.columns:\n",
    "        uniques = df[col].unique()\n",
    "        records.append({\n",
    "            \"Colonne\": col,\n",
    "            \"Nbr Lignes\": len(df),\n",
    "            \"Nb valeurs uniques\": len(uniques),\n",
    "            # Keep only a handful of examples: storing every distinct value of\n",
    "            # a large column inside one table cell is costly and unreadable.\n",
    "            \"Exemples de valeurs\": uniques[:n_examples],\n",
    "            \"Nan Values\": df[col].isna().sum(),\n",
    "        })\n",
    "    # Passing the column list keeps the schema stable even when ``df`` has no columns.\n",
    "    return pd.DataFrame(records, columns=profile_cols)\n",
    "\n",
    "\n",
    "# Per-column profile of the AUM file\n",
    "df_uniques = summarize_columns(df_aum2)\n",
    "df_uniques"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "5de53ba3-b3db-4935-acac-435b05b909e2",
   "metadata": {
    "scrolled": true
   },
   "outputs": [],
   "source": [
    "# Per-column profile of the flows file\n",
    "df_unique_flows = summarize_columns(df_flows2)\n",
    "df_unique_flows"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "e07a2b28-13f7-49f6-a55b-7d09a506407b",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Side-by-side comparison of the two profiles for the columns present in both files.\n",
    "summary_cols = ['Colonne', 'Nbr Lignes', 'Nb valeurs uniques']\n",
    "df1_aum = df_uniques[summary_cols]\n",
    "df2_flows = df_unique_flows[summary_cols]\n",
    "\n",
    "# Deliberately not called df_merged: that name is used for the AUM/flows merge\n",
    "# further down and must not be shadowed by this small comparison table.\n",
    "df_cols_compare = df1_aum.merge(df2_flows, on='Colonne', suffixes=('_aum', '_flows'))\n",
    "df_cols_compare"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "4ce2ad22-08e6-4e63-96b2-c2301172516e",
   "metadata": {},
   "source": [
    "# ETUDE ET ANALYSE DES ANOMALIES"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "c883dfc2-b9b9-4d3e-80d3-0140cd222492",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Parse the centralisation dates once, up front (re-running this cell is harmless).\n",
    "# NOTE(review): no explicit format is passed, so pandas infers it. If the export\n",
    "# stores dates as DD/MM/YYYY, pass format=... or dayfirst=True - confirm against the raw file.\n",
    "df_aum2['Centralisation Date'] = pd.to_datetime(df_aum2['Centralisation Date'])\n",
    "df_flows2['Centralisation Date'] = pd.to_datetime(df_flows2['Centralisation Date'])\n",
    "# Account/product key; not referenced by the later cells of this notebook.\n",
    "key_cols = ['Registrar Account - ID', 'Product - Isin']"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "f47b276d-cce6-433c-87c5-860810d71d34",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Identifier columns that, together with the date, should single out one row.\n",
    "cols = ['Company - Id', 'Company - Ultimate Parent Id',\n",
    "        'Registrar Account - ID', 'Registrar Account - Region', 'Product - Isin']\n",
    "dedup_key = cols + ['Centralisation Date']\n",
    "\n",
    "# keep=False flags every member of a duplicated group, not only the repeats.\n",
    "doublons_aum2 = df_aum2[df_aum2.duplicated(subset=dedup_key, keep=False)]\n",
    "doublons_flows2 = df_flows2[df_flows2.duplicated(subset=dedup_key, keep=False)]\n",
    "\n",
    "print(\" Cols: \", cols)\n",
    "print(\"Doublons AUM:\", doublons_aum2.shape[0])\n",
    "print(\"Doublons Flows:\", doublons_flows2.shape[0])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "d2f355e6-30c5-420e-a3db-9095dd5e0147",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Note: accounts with the same flow and the same product ISIN but different IDs\n",
    "# ---> pseudo-client candidates (FALSE)\n",
    "# i.e. several different accounts post EXACTLY the same flow on the same product on the same day.\n"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "4b9173e1-1c01-4ef3-adcd-9c587a97dd5d",
   "metadata": {},
   "source": [
    "## Statistiques descriptives - FLOWS"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "8df98c34-b1f7-4fb9-bbc7-c9bbe762022a",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Candidate account-to-account transfers: on the same ISIN and the same day, an\n",
    "# outflow of -M on one row paired with an inflow of exactly +M on another row.\n",
    "# Every (outflow row, inflow row) combination of equal magnitude is reported, so\n",
    "# an amount that appears several times yields several pairs.\n",
    "flow_cols = [\n",
    "    \"Product - Isin\",\n",
    "    \"Centralisation Date\",\n",
    "    \"Registrar Account - ID\",\n",
    "    \"Value € - NetFlows\",\n",
    "]\n",
    "flows_for_pairs = (\n",
    "    df_flows2[flow_cols]\n",
    "    .assign(Date=lambda d: pd.to_datetime(d[\"Centralisation Date\"]))\n",
    "    .drop(columns=\"Centralisation Date\")\n",
    "    .rename(columns={\"Product - Isin\": \"ISIN\"})\n",
    "    # merge() pairs missing keys with each other; rows without an ISIN or a\n",
    "    # date cannot form a meaningful pair, so they are removed first.\n",
    "    .dropna(subset=[\"ISIN\", \"Date\"])\n",
    ")\n",
    "\n",
    "# Split negative (outgoing) and positive (incoming) flows; zero flows are ignored.\n",
    "is_outflow = flows_for_pairs[\"Value € - NetFlows\"] < 0\n",
    "is_inflow = flows_for_pairs[\"Value € - NetFlows\"] > 0\n",
    "\n",
    "sortants = flows_for_pairs[is_outflow].rename(columns={\n",
    "    \"Registrar Account - ID\": \"Compte sortant\",\n",
    "    \"Value € - NetFlows\": \"Montant sortie\",\n",
    "})\n",
    "entrants = flows_for_pairs[is_inflow].rename(columns={\n",
    "    \"Registrar Account - ID\": \"Compte entrant\",\n",
    "    \"Value € - NetFlows\": \"Montant entrée\",\n",
    "})\n",
    "\n",
    "# Join on the magnitude so that -M meets +M (exact equality, no tolerance).\n",
    "sortants = sortants.assign(montant_abs=-sortants[\"Montant sortie\"])\n",
    "entrants = entrants.assign(montant_abs=entrants[\"Montant entrée\"])\n",
    "\n",
    "transfer_cols = [\n",
    "    \"ISIN\",\n",
    "    \"Date\",\n",
    "    \"Compte sortant\",\n",
    "    \"Montant sortie\",\n",
    "    \"Compte entrant\",\n",
    "    \"Montant entrée\",\n",
    "]\n",
    "# The column selection guarantees the same schema even when no pair is found.\n",
    "transf_compte = (\n",
    "    sortants.merge(entrants, on=[\"ISIN\", \"Date\", \"montant_abs\"], how=\"inner\")\n",
    "    .sort_values([\"ISIN\", \"Date\"], kind=\"stable\")\n",
    "    .reset_index(drop=True)[transfer_cols]\n",
    ")\n",
    "transf_compte\n",
    "\n",
    "# Figures noted on an earlier run: 2 574 461 rows in the initial frame,\n",
    "# 27 880 rows in the result."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "df0c0bbb-4cff-4205-86e0-0393f83a4cc7",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Stack the outgoing and incoming account ids of every detected transfer\n",
    "all_accounts = pd.concat([\n",
    "    transf_compte[\"Compte sortant\"],\n",
    "    transf_compte[\"Compte entrant\"]\n",
    "])\n",
    "\n",
    "# Distinct accounts involved in at least one transfer\n",
    "unique_accounts = all_accounts.unique()\n",
    "print(f\"Nombre de comptes uniques impliqués dans les transferts : {len(unique_accounts)}\")\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "6dacddcb-74f1-441f-adbe-83275c8f9216",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Number of detected transfers per day (all years)\n",
    "transfers_per_day = transf_compte.groupby(\"Date\").size()\n",
    "\n",
    "fig, ax = plt.subplots(figsize=(14, 5))\n",
    "transfers_per_day.plot(kind=\"line\", ax=ax)\n",
    "ax.set_title(\"Nombre de transferts détectés par jour\")\n",
    "ax.set_xlabel(\"Date\")\n",
    "ax.set_ylabel(\"Nombre de transferts\")\n",
    "plt.show()\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "be0708c4-95df-4915-82f9-a6347187bd70",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Flag unusually busy days: daily count strictly above the ANOMALY_QUANTILE quantile\n",
    "threshold = transfers_per_day.quantile(ANOMALY_QUANTILE)\n",
    "anomalous_days = transfers_per_day[transfers_per_day > threshold]\n",
    "print(\"Jours anormaux (avec beaucoup de transferts) :\")\n",
    "display(anomalous_days)"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "9416dd81-8f73-4e87-a5e8-b640882fbba4",
   "metadata": {},
   "source": [
    "## Etude de la saisonnalité"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "f7da5d09-7c97-4fa2-921d-886928fdf80f",
   "metadata": {
    "scrolled": true
   },
   "outputs": [],
   "source": [
    "# Same anomaly detection, restricted to SEASONALITY_START_YEAR onwards. The\n",
    "# results get their own names so the all-years series computed earlier stay intact.\n",
    "transf_comptee = transf_compte[transf_compte[\"Date\"].dt.year >= SEASONALITY_START_YEAR]\n",
    "\n",
    "# Number of transfers per day\n",
    "transfers_per_day_recent = transf_comptee.groupby(\"Date\").size().rename(\"n_transfers\")\n",
    "\n",
    "# Days strictly above the ANOMALY_QUANTILE quantile\n",
    "threshold_recent = transfers_per_day_recent.quantile(ANOMALY_QUANTILE)\n",
    "anomalous_days_recent = transfers_per_day_recent[transfers_per_day_recent > threshold_recent]\n",
    "\n",
    "# Add weekday and month names\n",
    "anomalous_table = anomalous_days_recent.reset_index().assign(\n",
    "    weekday=lambda d: d[\"Date\"].dt.day_name(),\n",
    "    month=lambda d: d[\"Date\"].dt.month_name(),\n",
    ")\n",
    "\n",
    "print(\"Jours anormaux (weekday + month) :\")\n",
    "# Lift the row limit for this display only: changing it globally would make\n",
    "# every later bare DataFrame display render all of its rows.\n",
    "with pd.option_context('display.max_rows', None):\n",
    "    display(anomalous_table.sort_values(\"n_transfers\").tail(20))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "1fa564f5-5844-4a1b-bdca-b392b829d734",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Total number of accounts involved\n",
    "comptes_impliques = pd.concat([\n",
    "    transf_compte[\"Compte sortant\"],\n",
    "    transf_compte[\"Compte entrant\"]\n",
    "]).unique()\n",
    "\n",
    "print(f\"Nombre total de comptes impliqués dans au moins un transfert : {len(comptes_impliques)}\")\n",
    "\n",
    "# Number of accounts involved per day: put outgoing and incoming accounts in one\n",
    "# long (Date, compte) table, then collect the distinct accounts of each day in a\n",
    "# single groupby pass.\n",
    "accounts_long = pd.concat([\n",
    "    transf_compte[[\"Date\", \"Compte sortant\"]].rename(columns={\"Compte sortant\": \"compte\"}),\n",
    "    transf_compte[[\"Date\", \"Compte entrant\"]].rename(columns={\"Compte entrant\": \"compte\"}),\n",
    "])\n",
    "\n",
    "accounts_per_day = (\n",
    "    accounts_long.groupby(\"Date\")[\"compte\"]\n",
    "    .agg(lambda s: set(s))\n",
    "    .to_frame(\"comptes_uniques\")\n",
    ")\n",
    "accounts_per_day[\"n_comptes\"] = accounts_per_day[\"comptes_uniques\"].map(len)\n",
    "\n",
    "# Plot\n",
    "fig, ax = plt.subplots(figsize=(14, 5))\n",
    "ax.plot(accounts_per_day.index, accounts_per_day[\"n_comptes\"], marker=\"o\")\n",
    "ax.set_title(\"Nombre de comptes impliqués dans des transferts par jour\")\n",
    "ax.set_xlabel(\"Date\")\n",
    "ax.set_ylabel(\"Nombre de comptes uniques\")\n",
    "ax.grid(True)\n",
    "plt.show()\n",
    "\n",
    "print(\"Aperçu :\")\n",
    "accounts_per_day.head()\n"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "c898b0c5-0a8e-4640-bc52-9490ee80e53d",
   "metadata": {},
   "source": [
    "# MERGE AUM & FLOWS"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "ce33dbf8-1c59-416a-adc4-6eb7c1ea9d8e",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Short names for the three key columns shared by both files.\n",
    "KEY_RENAMES = {\n",
    "    \"Registrar Account - ID\": \"Account_ID\",\n",
    "    \"Product - Isin\": \"ISIN\",\n",
    "    \"Centralisation Date\": \"Date\",\n",
    "}\n",
    "\n",
    "\n",
    "def standardize_keys(df, value_col, value_name):\n",
    "    \"\"\"Return a new 4-column frame: Account_ID, ISIN, Date and ``value_name``.\n",
    "\n",
    "    ``df`` itself is left untouched so the cells above can still be re-run\n",
    "    with the original column names. Keys are normalised so that both files\n",
    "    join cleanly: Date parsed as datetime, Account_ID cast to ``str`` (a\n",
    "    missing id therefore becomes the text \"nan\"), ISIN upper-cased.\n",
    "\n",
    "    Raises ``KeyError`` if one of the key columns or ``value_col`` is absent.\n",
    "    \"\"\"\n",
    "    return (\n",
    "        df[list(KEY_RENAMES) + [value_col]]\n",
    "        .rename(columns={**KEY_RENAMES, value_col: value_name})\n",
    "        .assign(\n",
    "            Date=lambda d: pd.to_datetime(d[\"Date\"]),\n",
    "            Account_ID=lambda d: d[\"Account_ID\"].astype(str),\n",
    "            ISIN=lambda d: d[\"ISIN\"].str.upper(),\n",
    "        )\n",
    "    )\n",
    "\n",
    "\n",
    "aum_std = standardize_keys(df_aum2, \"Value - AUM €\", \"AUM_EUR\")\n",
    "flows_std = standardize_keys(df_flows2, \"Value € - NetFlows\", \"Flow_EUR\")\n",
    "\n",
    "# Outer join: keep AUM rows without a flow and flow rows without an AUM.\n",
    "# NOTE(review): if (Account_ID, ISIN, Date) is not unique in one of the files,\n",
    "# the matching rows of the other file are repeated - check the duplicate counts above.\n",
    "df_merged = pd.merge(\n",
    "    aum_std,\n",
    "    flows_std,\n",
    "    on=[\"Account_ID\", \"ISIN\", \"Date\"],\n",
    "    how=\"outer\"\n",
    ").sort_values([\"Account_ID\", \"ISIN\", \"Date\"])\n",
    "\n",
    "print(\"Merged dataset:\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "7e5d642e-5c16-4c78-8d83-075094902670",
   "metadata": {},
   "outputs": [],
   "source": [
    "df_merged"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "ea14866a-1ce6-4b19-9225-d725304af8ec",
   "metadata": {},
   "outputs": [],
   "source": [
    "# 2. AUM HISTOGRAM (ISIN IGNORED)\n",
    "\n",
    "# One value per account: the mean of its AUM_EUR rows across all ISINs and dates.\n",
    "# Accounts that have no AUM value at all are dropped.\n",
    "aum_by_account = df_merged.groupby(\"Account_ID\")[\"AUM_EUR\"].mean().dropna()\n",
    "\n",
    "fig, ax = plt.subplots(figsize=(10, 6))\n",
    "ax.hist(aum_by_account, bins=50)\n",
    "ax.set_xlabel(\"Mean AUM value (€)\")\n",
    "ax.set_ylabel(\"Number of client accounts\")\n",
    "ax.set_title(\"Distribution of client average AUM (one value per Account_ID)\")\n",
    "ax.grid(True)\n",
    "plt.show()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "f13c213e-7f72-494a-bcf0-b4cd9feee55d",
   "metadata": {},
   "outputs": [],
   "source": [
    "# 3. FLOW ANALYSIS FOR ONE ACCOUNT IN ONE FUND\n",
    "\n",
    "# Replace both placeholders. Account_ID is stored as text and ISIN is\n",
    "# upper-cased in df_merged, so the values must be given in that form.\n",
    "account_to_plot = \"YOUR_ACCOUNT_ID_HERE\"\n",
    "isin_to_plot = \"YOUR_ISIN_HERE\"\n",
    "\n",
    "client_flows = df_merged[\n",
    "    (df_merged[\"Account_ID\"] == account_to_plot) &\n",
    "    (df_merged[\"ISIN\"] == isin_to_plot)\n",
    "].sort_values(\"Date\")\n",
    "\n",
    "if client_flows.empty:\n",
    "    # Say so explicitly instead of drawing an empty figure.\n",
    "    print(f\"No rows found for Account {account_to_plot}, ISIN {isin_to_plot}\")\n",
    "else:\n",
    "    fig, ax = plt.subplots(figsize=(12, 5))\n",
    "    ax.plot(client_flows[\"Date\"], client_flows[\"Flow_EUR\"], marker=\"o\")\n",
    "    ax.axhline(0, color=\"black\", linewidth=1)\n",
    "    ax.set_xlabel(\"Date\")\n",
    "    ax.set_ylabel(\"Flow (€)\")\n",
    "    ax.set_title(f\"Flow movements for Account {account_to_plot}, ISIN {isin_to_plot}\")\n",
    "    ax.grid(True)\n",
    "    plt.show()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "ded9b4f6-df92-479e-bc7b-aaa489dad228",
   "metadata": {},
   "outputs": [],
   "source": [
    "# 4. MONTHLY FLOW ANALYSIS (INFLOWS / OUTFLOWS)\n",
    "\n",
    "# Group on a derived month key without adding a column to df_merged.\n",
    "year_month = df_merged[\"Date\"].dt.to_period(\"M\").rename(\"YearMonth\")\n",
    "\n",
    "# Row counts per month: each row of df_merged with a strictly positive\n",
    "# (resp. negative) Flow_EUR counts once. These are flow records, not distinct accounts.\n",
    "flows_monthly = df_merged.groupby(year_month).agg(\n",
    "    n_positive_flows=(\"Flow_EUR\", lambda x: (x > 0).sum()),\n",
    "    n_negative_flows=(\"Flow_EUR\", lambda x: (x < 0).sum()),\n",
    ")\n",
    "\n",
    "print(\"Monthly flow summary:\")\n",
    "display(flows_monthly.head())\n",
    "\n",
    "# ---- Bar chart: the two series are drawn side by side so neither hides the other ----\n",
    "month_labels = flows_monthly.index.astype(str)\n",
    "x_pos = np.arange(len(month_labels))\n",
    "bar_width = 0.4\n",
    "\n",
    "fig, ax = plt.subplots(figsize=(14, 6))\n",
    "ax.bar(x_pos - bar_width / 2, flows_monthly[\"n_positive_flows\"], width=bar_width,\n",
    "       label=\"Positive flows (inflows)\", alpha=0.7)\n",
    "ax.bar(x_pos + bar_width / 2, flows_monthly[\"n_negative_flows\"], width=bar_width,\n",
    "       label=\"Negative flows (outflows)\", alpha=0.7)\n",
    "ax.set_xticks(x_pos)\n",
    "ax.set_xticklabels(month_labels, rotation=90)\n",
    "ax.set_xlabel(\"Year-Month\")\n",
    "ax.set_ylabel(\"Number of flow records\")\n",
    "ax.set_title(\"Monthly number of inflow vs outflow records\")\n",
    "ax.legend()\n",
    "fig.tight_layout()\n",
    "plt.show()\n"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.13.11"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}