# %% [markdown]
# # DATA COLLECTION

# %%
# All imports live in this single cell so that a fresh kernel resolves every
# dependency before any data is touched.
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

# %%
# NOTE(review): engine='python' is kept from the original; the default C parser
# also accepts a single-character separator and is usually faster — confirm
# whether the Python engine is required for these files.
chemin_fichier = "s3://projet-bdc-data/carmignac/AUM ENSAE V2 -20251105.csv"
df_aum2 = pd.read_csv(chemin_fichier, sep=';', engine='python')
df_aum2

# %%
chemin_fichier = "s3://projet-bdc-data/carmignac/Flows ENSAE V2 -20251105.csv"
df_flows2 = pd.read_csv(chemin_fichier, sep=';', engine='python')
df_flows2

# %% [markdown]
# ## DATA ANALYSIS

# %%
df_aum2.shape

# %%
df_aum2.columns

# %%
df_flows2.shape

# %%
df_flows2.columns

# %% [markdown]
# # ETUDE ET ANALYSE DES ANOMALIES

# %%
# NOTE(review): no explicit format/dayfirst is given, so pandas infers the date
# layout — confirm the raw files are not day-first (dd/mm/yyyy).
df_aum2['Centralisation Date'] = pd.to_datetime(df_aum2['Centralisation Date'])
df_flows2['Centralisation Date'] = pd.to_datetime(df_flows2['Centralisation Date'])


key_cols = ['Registrar Account - ID', 'Product - Isin']

# %%
def rows_sharing_key(frame, subset):
    """Return every row of ``frame`` whose ``subset`` values occur more than once.

    Parameters
    ----------
    frame : pandas.DataFrame
        Table to inspect.
    subset : list of str
        Column names that together form the key to test for repeats.

    Returns
    -------
    pandas.DataFrame
        All rows belonging to a repeated key. ``keep=False`` flags every member
        of a duplicated group, including its first occurrence.
    """
    return frame[frame.duplicated(subset=subset, keep=False)]

# %%
# Check for exact duplicates on (account, ISIN, date).
doublons_aum = rows_sharing_key(df_aum2, key_cols + ['Centralisation Date'])
doublons_flows = rows_sharing_key(df_flows2, key_cols + ['Centralisation Date'])

print("Doublons AUM:", doublons_aum.shape[0])
print("Doublons Flows:", doublons_flows.shape[0])

# Same date, product ISIN and account ==> review the other characteristics.

# %%
# Same check with the company and region identifiers added to the key.
cols = ['Company - Id', 'Company - Ultimate Parent Id',
        'Registrar Account - ID', 'Registrar Account - Region', 'Product - Isin']

doublons_aum2 = rows_sharing_key(df_aum2, cols + ['Centralisation Date'])
doublons_flows2 = rows_sharing_key(df_flows2, cols + ['Centralisation Date'])

print(" Cols: ", cols)
print("Doublons AUM:", doublons_aum2.shape[0])
print("Doublons Flows:", doublons_flows2.shape[0])

# %%
# Column layout of the transfer table; fixed here so that an empty result still
# exposes the expected columns.
TRANSFER_COLUMNS = [
    "ISIN", "Date",
    "Compte sortant", "Montant sortie",
    "Compte entrant", "Montant entrée",
]


def find_offsetting_flows(flows):
    """Pair each outflow with every same-ISIN, same-date inflow of opposite amount.

    A pair is produced when an inflow (+M) equals exactly the negated outflow
    (-M) for the same ``"Product - Isin"`` and ``"Date"``. If several inflows
    match one outflow, one output row is produced per combination.

    Parameters
    ----------
    flows : pandas.DataFrame
        Must contain the columns ``"Product - Isin"``, ``"Date"``,
        ``"Registrar Account - ID"`` and ``"Value € - NetFlows"``.

    Returns
    -------
    pandas.DataFrame
        One row per matched pair with the columns in ``TRANSFER_COLUMNS``,
        sorted by ISIN then date. Empty (but with those columns) when nothing
        matches.

    Notes
    -----
    A single inner merge replaces nested ``iterrows`` loops, which were
    quadratic within each (ISIN, date) group and do not preserve dtypes across
    rows. Pairs whose outgoing and incoming account are identical are kept.
    """
    isin, date = "Product - Isin", "Date"
    account, amount = "Registrar Account - ID", "Value € - NetFlows"

    # A groupby on (ISIN, date) ignores rows whose key is missing, whereas a
    # merge would match missing keys with each other: drop them up front.
    keyed = flows.dropna(subset=[isin, date])[[isin, date, account, amount]]
    keyed = keyed.rename(columns={isin: "ISIN"})

    # Split negative and positive flows; zero and missing amounts fall in neither.
    sortants = keyed[keyed[amount] < 0].rename(
        columns={account: "Compte sortant", amount: "Montant sortie"}
    )
    entrants = keyed[keyed[amount] > 0].rename(
        columns={account: "Compte entrant", amount: "Montant entrée"}
    )

    # Join on the positive amount: -(-M) for outflows, +M for inflows.
    pairs = sortants.assign(_montant=-sortants["Montant sortie"]).merge(
        entrants.assign(_montant=entrants["Montant entrée"]),
        on=["ISIN", "Date", "_montant"],
        how="inner",
    )
    return (
        pairs.sort_values(["ISIN", "Date"])
        .reset_index(drop=True)[TRANSFER_COLUMNS]
    )

# %%
df = df_flows2.copy()
df["Date"] = pd.to_datetime(df["Centralisation Date"])

# Look for +M / -M pairs within each (ISIN, date).
transf_compte = find_offsetting_flows(df)
transf_compte

# %% [markdown]
# # MERGE AND ANALYSIS

# %%
# The frames are rebound to their renamed versions: after this cell the original
# column names ('Centralisation Date', 'Registrar Account - ID', ...) no longer
# exist on df_aum2 / df_flows2, so the cells above need a reload to be re-run.
df_aum2 = df_aum2.rename(columns={
    "Registrar Account - ID": "Account_ID",
    "Product - Isin": "ISIN",
    "Centralisation Date": "Date",
    "Value - AUM €": "AUM_EUR"
})

df_flows2 = df_flows2.rename(columns={
    "Registrar Account - ID": "Account_ID",
    "Product - Isin": "ISIN",
    "Centralisation Date": "Date",
    "Value € - NetFlows": "Flow_EUR"
})


# Align the dtypes of the three join keys on both sides before merging.
df_aum2["Date"] = pd.to_datetime(df_aum2["Date"])
df_flows2["Date"] = pd.to_datetime(df_flows2["Date"])

# NOTE(review): astype(str) turns a missing ID into the literal string 'nan' —
# confirm that no missing account IDs are expected.
df_aum2["Account_ID"] = df_aum2["Account_ID"].astype(str)
df_flows2["Account_ID"] = df_flows2["Account_ID"].astype(str)

df_aum2["ISIN"] = df_aum2["ISIN"].str.upper()
df_flows2["ISIN"] = df_flows2["ISIN"].str.upper()


# Outer join: keep (account, ISIN, date) keys present in only one of the tables;
# the value coming from the other table is then NaN.
df_merged = pd.merge(
    df_aum2[["Account_ID", "ISIN", "Date", "AUM_EUR"]],
    df_flows2[["Account_ID", "ISIN", "Date", "Flow_EUR"]],
    on=["Account_ID", "ISIN", "Date"],
    how="outer"
).sort_values(["Account_ID", "ISIN", "Date"])

print("Merged dataset:")
| \n", " | Account_ID | \n", "ISIN | \n", "Date | \n", "AUM_EUR | \n", "Flow_EUR | \n", "
|---|---|---|---|---|---|
| 0 | \n", "100000014 | \n", "LU0553415323 | \n", "2015-01-31 | \n", "0.000000e+00 | \n", "NaN | \n", "
| 1 | \n", "100000014 | \n", "LU0553415323 | \n", "2015-02-28 | \n", "0.000000e+00 | \n", "NaN | \n", "
| 2 | \n", "100000014 | \n", "LU0553415323 | \n", "2015-03-31 | \n", "0.000000e+00 | \n", "NaN | \n", "
| 3 | \n", "100000014 | \n", "LU0553415323 | \n", "2015-04-30 | \n", "0.000000e+00 | \n", "NaN | \n", "
| 4 | \n", "100000014 | \n", "LU0553415323 | \n", "2015-05-31 | \n", "0.000000e+00 | \n", "NaN | \n", "
| ... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "
| 7369446 | \n", "Private Client | \n", "LU2809794220 | \n", "2025-10-30 | \n", "NaN | \n", "-1623.71 | \n", "
| 7369447 | \n", "Private Client | \n", "LU2809794220 | \n", "2025-10-31 | \n", "4.438147e+06 | \n", "4946.23 | \n", "
| 7369448 | \n", "Private Client | \n", "LU2809794576 | \n", "2025-09-23 | \n", "NaN | \n", "71660.14 | \n", "
| 7369449 | \n", "Private Client | \n", "LU2809794576 | \n", "2025-09-30 | \n", "7.094499e+04 | \n", "NaN | \n", "
| 7369450 | \n", "Private Client | \n", "LU2809794576 | \n", "2025-10-31 | \n", "7.871629e+04 | \n", "NaN | \n", "
7369451 rows × 5 columns
\n", "