Project_Carmignac/Stat_Desc1.ipynb

573 lines
17 KiB
Plaintext

{
"cells": [
{
"cell_type": "markdown",
"id": "e637deae-9168-4fb2-b95f-4e42d8d72d9e",
"metadata": {},
"source": [
"# DATA COLLECTION "
]
},
{
"cell_type": "code",
"execution_count": 1,
"id": "9f99615b-5a9d-434a-baa0-dca55edf7699",
"metadata": {},
"outputs": [],
"source": [
"import pandas as pd"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "f8508d94-74a7-4bb0-8b81-c2e06850c25f",
"metadata": {},
"outputs": [],
"source": [
"import pandas as pd \n",
"chemin_fichier = \"s3://projet-bdc-data/carmignac/AUM ENSAE V2 -20251105.csv\"\n",
"df_aum2 = pd.read_csv(chemin_fichier, sep=';', engine='python')\n",
"df_aum2"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "4644da13-5aea-4ca0-9fcf-947324766292",
"metadata": {},
"outputs": [],
"source": [
"chemin_fichier = \"s3://projet-bdc-data/carmignac/Flows ENSAE V2 -20251105.csv\"\n",
"df_flows2 = pd.read_csv(chemin_fichier, sep=';', engine='python')\n",
"df_flows2"
]
},
{
"cell_type": "markdown",
"id": "59d31eaf-c06c-4ebe-9f8c-cb9158a50976",
"metadata": {},
"source": [
"## DATA ANALYSIS"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "5773b911-6b84-448d-962f-8228eeac0250",
"metadata": {},
"outputs": [],
"source": [
"df_aum2.shape"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "6f571810-c373-4d30-8ca5-c3a074b95b08",
"metadata": {},
"outputs": [],
"source": [
"df_aum2.columns"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "af25fd07-a613-4adc-b88b-93a8d300379c",
"metadata": {},
"outputs": [],
"source": [
"df_flows2.shape"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "c6d0fe83-2957-430b-89cf-cd30833b7cab",
"metadata": {},
"outputs": [],
"source": [
"df_flows2.columns"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "5fac74b0-662f-48d0-a234-7edc3c3e86ad",
"metadata": {},
"outputs": [],
"source": [
"#dict avec valeurs unique de chaque col \n",
"rows = []\n",
"\n",
"for col in df_aum2.columns:\n",
" uniques = df_aum2[col].unique()\n",
" rows.append({\n",
" \"Colonne\": col,\n",
" \"Nbr Lignes\": df_aum2.shape[0], #4.8millions\n",
" \"Nb valeurs uniques\": len(uniques),\n",
" \"Exemples de valeurs\": uniques,\n",
" \"Nan Values\" : df_aum2[col].isna().sum()\n",
" })\n",
"\n",
"df_uniques = pd.DataFrame(rows)\n",
"df_uniques"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "5de53ba3-b3db-4935-acac-435b05b909e2",
"metadata": {
"scrolled": true
},
"outputs": [],
"source": [
"#dict avec valeurs unique de chaque col \n",
"rowsf = []\n",
"\n",
"for col in df_flows2.columns:\n",
" uniques = df_flows2[col].unique()\n",
" rowsf.append({\n",
" \"Colonne\": col,\n",
" \"Nbr Lignes\": df_flows2.shape[0], #4.8millions\n",
" \"Nb valeurs uniques\": len(uniques),\n",
" \"Exemples de valeurs\": uniques,\n",
" \"Nan Values\" : df_flows2[col].isna().sum()\n",
" })\n",
"\n",
"df_unique_flows = pd.DataFrame(rowsf)\n",
"df_unique_flows"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "e07a2b28-13f7-49f6-a55b-7d09a506407b",
"metadata": {},
"outputs": [],
"source": [
"df1_aum = df_uniques[['Colonne', 'Nbr Lignes', 'Nb valeurs uniques']]\n",
"df2_flows = df_unique_flows[['Colonne', 'Nbr Lignes', 'Nb valeurs uniques']]\n",
"\n",
"df_merged = df1_aum.merge(df2_flows, on='Colonne', suffixes=('_aum', '_flows'))\n",
"df_merged"
]
},
{
"cell_type": "markdown",
"id": "4ce2ad22-08e6-4e63-96b2-c2301172516e",
"metadata": {},
"source": [
"# ETUDE ET ANALYSE DES ANOMALIES"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "c883dfc2-b9b9-4d3e-80d3-0140cd222492",
"metadata": {},
"outputs": [],
"source": [
"df_aum2['Centralisation Date'] = pd.to_datetime(df_aum2['Centralisation Date'])\n",
"df_flows2['Centralisation Date'] = pd.to_datetime(df_flows2['Centralisation Date'])\n",
"key_cols = ['Registrar Account - ID', 'Product - Isin']"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "f47b276d-cce6-433c-87c5-860810d71d34",
"metadata": {},
"outputs": [],
"source": [
"cols= ['Company - Id', 'Company - Ultimate Parent Id',\n",
" 'Registrar Account - ID', 'Registrar Account - Region','Product - Isin']\n",
"\n",
"doublons_aum2 = df_aum2[df_aum2.duplicated(subset=cols + ['Centralisation Date'], keep=False)]\n",
"doublons_flows2 = df_flows2[df_flows2.duplicated(subset=cols + ['Centralisation Date'], keep=False)]\n",
"\n",
"print(\" Cols: \", cols)\n",
"print(\"Doublons AUM:\", doublons_aum2.shape[0])\n",
"print(\"Doublons Flows:\", doublons_flows2.shape[0])"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "d2f355e6-30c5-420e-a3db-9095dd5e0147",
"metadata": {},
"outputs": [],
"source": [
"# # Comptes avec same flux et same product ISIN mais IDs différents ---> candidats pseudo-client (FAUX)\n",
"# #plusieurs comptes diff réalisent EXACTEMENT le même flux sur le même produit le même jour.\n"
]
},
{
"cell_type": "markdown",
"id": "4b9173e1-1c01-4ef3-adcd-9c587a97dd5d",
"metadata": {},
"source": [
"## Stat Descrp FLOWS "
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "8df98c34-b1f7-4fb9-bbc7-c9bbe762022a",
"metadata": {},
"outputs": [],
"source": [
"df = df_flows2.copy()\n",
"df[\"Date\"] = pd.to_datetime(df[\"Centralisation Date\"])\n",
"\n",
"# Groupby par ISIN et Date\n",
"grouped = df.groupby([\"Product - Isin\", \"Date\"])\n",
"\n",
"transfers = []\n",
"\n",
"for (isin, date), group in grouped:\n",
" # Sépare flux positifs et négatifs\n",
" entrants = group[group[\"Value € - NetFlows\"] > 0][[\"Registrar Account - ID\", \"Value € - NetFlows\"]]\n",
" sortants = group[group[\"Value € - NetFlows\"] < 0][[\"Registrar Account - ID\", \"Value € - NetFlows\"]]\n",
"\n",
" # On cherche des paires +M / -M\n",
" for _, row_sortie in sortants.iterrows():\n",
" montant_sortie = row_sortie[\"Value € - NetFlows\"]\n",
" compte_sortant = row_sortie[\"Registrar Account - ID\"]\n",
"\n",
" # Chercher un +M qui matche exactement le -M\n",
" match = entrants[entrants[\"Value € - NetFlows\"] == -montant_sortie]\n",
"\n",
" if len(match) > 0:\n",
" for _, row_entree in match.iterrows():\n",
" transfers.append({\n",
" \"ISIN\": isin,\n",
" \"Date\": date,\n",
" \"Compte sortant\": compte_sortant,\n",
" \"Montant sortie\": montant_sortie,\n",
" \"Compte entrant\": row_entree[\"Registrar Account - ID\"],\n",
" \"Montant entrée\": row_entree[\"Value € - NetFlows\"]\n",
" })\n",
"\n",
"\n",
"transf_compte = pd.DataFrame(transfers)\n",
"transf_compte\n",
"\n",
"#df initiale : 2 574 461 \n",
"# 27 880 rows "
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "df0c0bbb-4cff-4205-86e0-0393f83a4cc7",
"metadata": {},
"outputs": [],
"source": [
"# Extraire tous les comptes sortants et entrants\n",
"all_accounts = pd.concat([\n",
" transf_compte[\"Compte sortant\"],\n",
" transf_compte[\"Compte entrant\"]\n",
"])\n",
"\n",
"# Comptes uniques impliqués dans au moins un transfert\n",
"unique_accounts = all_accounts.unique()\n",
"print(f\"Nombre de comptes uniques impliqués dans les transferts : {len(unique_accounts)}\")\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "6dacddcb-74f1-441f-adbe-83275c8f9216",
"metadata": {},
"outputs": [],
"source": [
"import matplotlib.pyplot as plt\n",
"transf_compte[\"Date\"] = pd.to_datetime(transf_compte[\"Date\"])\n",
"\n",
"# Nombre de transferts par jour\n",
"transfers_per_day = transf_compte.groupby(\"Date\").size()\n",
"\n",
"plt.figure(figsize=(14,5))\n",
"transfers_per_day.plot(kind=\"line\")\n",
"plt.title(\"Nombre de transferts détectés par jour\")\n",
"plt.xlabel(\"Date\")\n",
"plt.ylabel(\"Nombre de transferts\")\n",
"plt.show()\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "be0708c4-95df-4915-82f9-a6347187bd70",
"metadata": {},
"outputs": [],
"source": [
"# Détection de jours anormaux (ex: > 95e percentile)\n",
"threshold = transfers_per_day.quantile(0.95)\n",
"anomalous_days = transfers_per_day[transfers_per_day > threshold]\n",
"print(\"Jours anormaux (avec beaucoup de transferts) :\")\n",
"display(anomalous_days)"
]
},
{
"cell_type": "markdown",
"id": "9416dd81-8f73-4e87-a5e8-b640882fbba4",
"metadata": {},
"source": [
"## Etude de la saisonalite"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "f7da5d09-7c97-4fa2-921d-886928fdf80f",
"metadata": {
"scrolled": true
},
"outputs": [],
"source": [
"import pandas as pd\n",
"\n",
"transf_compte[\"Date\"] = pd.to_datetime(transf_compte[\"Date\"])\n",
"transf_comptee = transf_compte[transf_compte[\"Date\"].dt.year >= 2021]\n",
"# Nombre de transferts par jour\n",
"transfers_per_day = transf_comptee.groupby(\"Date\").size().rename(\"n_transfers\")\n",
"\n",
"# Détection des jours anormaux (au-dessus du 95e percentile)\n",
"threshold = transfers_per_day.quantile(0.95)\n",
"anomalous_days = transfers_per_day[transfers_per_day > threshold]\n",
"\n",
"# Ajouter weekday et month\n",
"anomalous_table = anomalous_days.reset_index()\n",
"anomalous_table[\"weekday\"] = anomalous_table[\"Date\"].dt.day_name()\n",
"anomalous_table[\"month\"] = anomalous_table[\"Date\"].dt.month_name()\n",
"\n",
"pd.set_option('display.max_rows', None)\n",
"print(\"Jours anormaux (weekday + month) :\")\n",
"display(anomalous_table.sort_values(\"n_transfers\").tail(20))"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "1fa564f5-5844-4a1b-bdca-b392b829d734",
"metadata": {},
"outputs": [],
"source": [
"#Nombre total de comptes impliqués \n",
"\n",
"all_accounts = pd.concat([\n",
" transf_compte[\"Compte sortant\"],\n",
" transf_compte[\"Compte entrant\"]\n",
"]).unique()\n",
"\n",
"print(f\"Nombre total de comptes impliqués dans au moins un transfert : {len(all_accounts)}\")\n",
"\n",
"# Nombre de comptes impliqués par jour \n",
"\n",
"accounts_per_day = transf_compte.groupby(\"Date\").agg(\n",
" comptes_uniques=(\"Compte sortant\", lambda x: set(x)) # temp\n",
")\n",
"\n",
"# On ajoute aussi les comptes entrants\n",
"accounts_per_day[\"comptes_uniques\"] = accounts_per_day.index.map(\n",
" lambda d: set(transf_compte[transf_compte[\"Date\"] == d][\"Compte sortant\"]) |\n",
" set(transf_compte[transf_compte[\"Date\"] == d][\"Compte entrant\"])\n",
")\n",
"\n",
"accounts_per_day[\"n_comptes\"] = accounts_per_day[\"comptes_uniques\"].apply(len)\n",
"\n",
"# Plot\n",
"plt.figure(figsize=(14,5))\n",
"plt.plot(accounts_per_day.index, accounts_per_day[\"n_comptes\"], marker=\"o\")\n",
"plt.title(\"Nombre de comptes impliqués dans des transferts par jour\")\n",
"plt.xlabel(\"Date\")\n",
"plt.ylabel(\"Nombre de comptes uniques\")\n",
"plt.grid(True)\n",
"plt.show()\n",
"\n",
"print(\"Aperçu :\")\n",
"accounts_per_day.head()\n"
]
},
{
"cell_type": "markdown",
"id": "c898b0c5-0a8e-4640-bc52-9490ee80e53d",
"metadata": {},
"source": [
"# MERGE AUM & FLOWS "
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "ce33dbf8-1c59-416a-adc4-6eb7c1ea9d8e",
"metadata": {},
"outputs": [],
"source": [
"import pandas as pd\n",
"import numpy as np\n",
"import matplotlib.pyplot as plt\n",
"\n",
"\n",
"df_aum2 = df_aum2.rename(columns={\n",
" \"Registrar Account - ID\": \"Account_ID\",\n",
" \"Product - Isin\": \"ISIN\",\n",
" \"Centralisation Date\": \"Date\",\n",
" \"Value - AUM €\": \"AUM_EUR\"\n",
"})\n",
"\n",
"df_flows2 = df_flows2.rename(columns={\n",
" \"Registrar Account - ID\": \"Account_ID\",\n",
" \"Product - Isin\": \"ISIN\",\n",
" \"Centralisation Date\": \"Date\",\n",
" \"Value € - NetFlows\": \"Flow_EUR\"\n",
"})\n",
"\n",
"\n",
"df_aum2[\"Date\"] = pd.to_datetime(df_aum2[\"Date\"])\n",
"df_flows2[\"Date\"] = pd.to_datetime(df_flows2[\"Date\"])\n",
"\n",
"df_aum2[\"Account_ID\"] = df_aum2[\"Account_ID\"].astype(str)\n",
"df_flows2[\"Account_ID\"] = df_flows2[\"Account_ID\"].astype(str)\n",
"\n",
"df_aum2[\"ISIN\"] = df_aum2[\"ISIN\"].str.upper()\n",
"df_flows2[\"ISIN\"] = df_flows2[\"ISIN\"].str.upper()\n",
"\n",
"\n",
"df_merged = pd.merge(\n",
" df_aum2[[\"Account_ID\", \"ISIN\", \"Date\", \"AUM_EUR\"]],\n",
" df_flows2[[\"Account_ID\", \"ISIN\", \"Date\", \"Flow_EUR\"]],\n",
" on=[\"Account_ID\", \"ISIN\", \"Date\"],\n",
" how=\"outer\"\n",
").sort_values([\"Account_ID\", \"ISIN\", \"Date\"])\n",
"\n",
"print(\"Merged dataset:\")"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "7e5d642e-5c16-4c78-8d83-075094902670",
"metadata": {},
"outputs": [],
"source": [
"df_merged"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "ea14866a-1ce6-4b19-9225-d725304af8ec",
"metadata": {},
"outputs": [],
"source": [
"# 2. HISTOGRAMME DES AUM (SANS ISIN)\n",
"\n",
"# We keep the mean AUM per Account \n",
"aum_by_account = (\n",
" df_merged.groupby(\"Account_ID\")[\"AUM_EUR\"]\n",
" .mean()\n",
" .dropna()\n",
")\n",
"\n",
"plt.figure(figsize=(10,6))\n",
"plt.hist(aum_by_account, bins=50)\n",
"plt.xlabel(\"Mean AUM value (€)\")\n",
"plt.ylabel(\"Number of client accounts\")\n",
"plt.title(\"Distribution of client average AUM (one value per Account_ID)\")\n",
"plt.grid(True)\n",
"plt.show()"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "f13c213e-7f72-494a-bcf0-b4cd9feee55d",
"metadata": {},
"outputs": [],
"source": [
"\n",
"# 3. ANALYSE DES FLOWS POUR UN COMPTE DANS UN FONDS\n",
"\n",
"account_to_plot = \"YOUR_ACCOUNT_ID_HERE\"\n",
"isin_to_plot = \"YOUR_ISIN_HERE\"\n",
"\n",
"client_flows = df_merged[\n",
" (df_merged[\"Account_ID\"] == account_to_plot) &\n",
" (df_merged[\"ISIN\"] == isin_to_plot)\n",
"].sort_values(\"Date\")\n",
"\n",
"plt.figure(figsize=(12,5))\n",
"plt.plot(client_flows[\"Date\"], client_flows[\"Flow_EUR\"], marker=\"o\")\n",
"plt.axhline(0, color=\"black\", linewidth=1)\n",
"plt.xlabel(\"Date\")\n",
"plt.ylabel(\"Flow (€)\")\n",
"plt.title(f\"Flow movements for Account {account_to_plot}, ISIN {isin_to_plot}\")\n",
"plt.grid(True)\n",
"plt.show()\n",
"\n",
"###############################################################################\n",
"# 4. ANALYSE MENSUELLE DES FLOWS (ENTRANTS / SORTANTS)\n",
"###############################################################################\n",
"\n",
"df_merged[\"YearMonth\"] = df_merged[\"Date\"].dt.to_period(\"M\")\n",
"\n",
"flows_monthly = df_merged.groupby(\"YearMonth\").agg(\n",
" n_positive_flows=(\"Flow_EUR\", lambda x: (x > 0).sum()),\n",
" n_negative_flows=(\"Flow_EUR\", lambda x: (x < 0).sum()),\n",
")\n",
"\n",
"print(\"Monthly flow summary:\")\n",
"print(flows_monthly.head())\n",
"\n",
"# ---- Plot bar chart ----\n",
"flows_monthly.index = flows_monthly.index.astype(str)\n",
"\n",
"plt.figure(figsize=(14,6))\n",
"plt.bar(flows_monthly.index, flows_monthly[\"n_positive_flows\"], label=\"Positive flows (inflows)\", alpha=0.7)\n",
"plt.bar(flows_monthly.index, flows_monthly[\"n_negative_flows\"], label=\"Negative flows (outflows)\", alpha=0.7)\n",
"plt.xticks(rotation=90)\n",
"plt.xlabel(\"Year-Month\")\n",
"plt.ylabel(\"Number of accounts with flows\")\n",
"plt.title(\"Monthly number of accounts with inflows vs outflows\")\n",
"plt.legend()\n",
"plt.tight_layout()\n",
"plt.show()\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "ded9b4f6-df92-479e-bc7b-aaa489dad228",
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.13.11"
}
},
"nbformat": 4,
"nbformat_minor": 5
}