Compare commits

...

11 Commits

12 changed files with 58902 additions and 0 deletions

File diff suppressed because one or more lines are too long

3782
Clustering_2Feb (1).ipynb Normal file

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

572
Stat_Desc1.ipynb Normal file
View File

@ -0,0 +1,572 @@
{
"cells": [
{
"cell_type": "markdown",
"id": "e637deae-9168-4fb2-b95f-4e42d8d72d9e",
"metadata": {},
"source": [
"# DATA COLLECTION "
]
},
{
"cell_type": "code",
"execution_count": 1,
"id": "9f99615b-5a9d-434a-baa0-dca55edf7699",
"metadata": {},
"outputs": [],
"source": [
"import pandas as pd"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "f8508d94-74a7-4bb0-8b81-c2e06850c25f",
"metadata": {},
"outputs": [],
"source": [
"import pandas as pd \n",
"chemin_fichier = \"s3://projet-bdc-data/carmignac/AUM ENSAE V2 -20251105.csv\"\n",
"df_aum2 = pd.read_csv(chemin_fichier, sep=';', engine='python')\n",
"df_aum2"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "4644da13-5aea-4ca0-9fcf-947324766292",
"metadata": {},
"outputs": [],
"source": [
"chemin_fichier = \"s3://projet-bdc-data/carmignac/Flows ENSAE V2 -20251105.csv\"\n",
"df_flows2 = pd.read_csv(chemin_fichier, sep=';', engine='python')\n",
"df_flows2"
]
},
{
"cell_type": "markdown",
"id": "59d31eaf-c06c-4ebe-9f8c-cb9158a50976",
"metadata": {},
"source": [
"## DATA ANALYSIS"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "5773b911-6b84-448d-962f-8228eeac0250",
"metadata": {},
"outputs": [],
"source": [
"df_aum2.shape"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "6f571810-c373-4d30-8ca5-c3a074b95b08",
"metadata": {},
"outputs": [],
"source": [
"df_aum2.columns"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "af25fd07-a613-4adc-b88b-93a8d300379c",
"metadata": {},
"outputs": [],
"source": [
"df_flows2.shape"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "c6d0fe83-2957-430b-89cf-cd30833b7cab",
"metadata": {},
"outputs": [],
"source": [
"df_flows2.columns"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "5fac74b0-662f-48d0-a234-7edc3c3e86ad",
"metadata": {},
"outputs": [],
"source": [
"# Profile every AUM column: total row count, number of distinct values, the distinct\n",
"# values themselves, and the number of missing entries.\n",
"distinct_by_col = {col: df_aum2[col].unique() for col in df_aum2.columns}\n",
"nan_by_col = df_aum2.isna().sum()\n",
"\n",
"rows = [\n",
"    {\n",
"        \"Colonne\": col,\n",
"        \"Nbr Lignes\": df_aum2.shape[0],\n",
"        \"Nb valeurs uniques\": len(values),\n",
"        \"Exemples de valeurs\": values,\n",
"        \"Nan Values\": nan_by_col[col],\n",
"    }\n",
"    for col, values in distinct_by_col.items()\n",
"]\n",
"\n",
"df_uniques = pd.DataFrame(rows)\n",
"df_uniques"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "5de53ba3-b3db-4935-acac-435b05b909e2",
"metadata": {
"scrolled": true
},
"outputs": [],
"source": [
"# Profile every Flows column: total row count, number of distinct values, the distinct\n",
"# values themselves, and the number of missing entries.\n",
"distinct_by_col_flows = {col: df_flows2[col].unique() for col in df_flows2.columns}\n",
"nan_by_col_flows = df_flows2.isna().sum()\n",
"\n",
"rowsf = [\n",
"    {\n",
"        \"Colonne\": col,\n",
"        \"Nbr Lignes\": df_flows2.shape[0],\n",
"        \"Nb valeurs uniques\": len(values),\n",
"        \"Exemples de valeurs\": values,\n",
"        \"Nan Values\": nan_by_col_flows[col],\n",
"    }\n",
"    for col, values in distinct_by_col_flows.items()\n",
"]\n",
"\n",
"df_unique_flows = pd.DataFrame(rowsf)\n",
"df_unique_flows"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "e07a2b28-13f7-49f6-a55b-7d09a506407b",
"metadata": {},
"outputs": [],
"source": [
"df1_aum = df_uniques[['Colonne', 'Nbr Lignes', 'Nb valeurs uniques']]\n",
"df2_flows = df_unique_flows[['Colonne', 'Nbr Lignes', 'Nb valeurs uniques']]\n",
"\n",
"df_merged = df1_aum.merge(df2_flows, on='Colonne', suffixes=('_aum', '_flows'))\n",
"df_merged"
]
},
{
"cell_type": "markdown",
"id": "4ce2ad22-08e6-4e63-96b2-c2301172516e",
"metadata": {},
"source": [
"# ETUDE ET ANALYSE DES ANOMALIES"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "c883dfc2-b9b9-4d3e-80d3-0140cd222492",
"metadata": {},
"outputs": [],
"source": [
"df_aum2['Centralisation Date'] = pd.to_datetime(df_aum2['Centralisation Date'])\n",
"df_flows2['Centralisation Date'] = pd.to_datetime(df_flows2['Centralisation Date'])\n",
"key_cols = ['Registrar Account - ID', 'Product - Isin']"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "f47b276d-cce6-433c-87c5-860810d71d34",
"metadata": {},
"outputs": [],
"source": [
"# Rows that share the same company / account / product identifiers on the same\n",
"# centralisation date.\n",
"cols = [\n",
"    'Company - Id',\n",
"    'Company - Ultimate Parent Id',\n",
"    'Registrar Account - ID',\n",
"    'Registrar Account - Region',\n",
"    'Product - Isin',\n",
"]\n",
"duplicate_key = cols + ['Centralisation Date']\n",
"\n",
"# keep=False flags every member of a duplicated group, not only the repeats.\n",
"is_dup_aum = df_aum2.duplicated(subset=duplicate_key, keep=False)\n",
"is_dup_flows = df_flows2.duplicated(subset=duplicate_key, keep=False)\n",
"doublons_aum2 = df_aum2.loc[is_dup_aum]\n",
"doublons_flows2 = df_flows2.loc[is_dup_flows]\n",
"\n",
"print(\" Cols: \", cols)\n",
"print(\"Doublons AUM:\", len(doublons_aum2))\n",
"print(\"Doublons Flows:\", len(doublons_flows2))"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "d2f355e6-30c5-420e-a3db-9095dd5e0147",
"metadata": {},
"outputs": [],
"source": [
"# # Comptes avec same flux et same product ISIN mais IDs différents ---> candidats pseudo-client (FAUX)\n",
"# #plusieurs comptes diff réalisent EXACTEMENT le même flux sur le même produit le même jour.\n"
]
},
{
"cell_type": "markdown",
"id": "4b9173e1-1c01-4ef3-adcd-9c587a97dd5d",
"metadata": {},
"source": [
"## Statistiques descriptives des FLOWS"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "8df98c34-b1f7-4fb9-bbc7-c9bbe762022a",
"metadata": {},
"outputs": [],
"source": [
"# Detect offsetting same-day movements: within each (ISIN, date), pair every outflow of -M\n",
"# with every inflow of exactly +M. A single merge does the pairing, which avoids nested\n",
"# iterrows() loops (slow on millions of rows, and iterrows() coerces each row to one dtype).\n",
"value_col = \"Value € - NetFlows\"\n",
"account_col = \"Registrar Account - ID\"\n",
"\n",
"flows_keyed = (\n",
"    df_flows2[[\"Product - Isin\", \"Centralisation Date\", account_col, value_col]]\n",
"    .rename(columns={\"Product - Isin\": \"ISIN\"})\n",
"    .assign(Date=lambda d: pd.to_datetime(d[\"Centralisation Date\"]))\n",
"    .drop(columns=\"Centralisation Date\")\n",
"    # merge() matches NaN keys with each other, so rows without ISIN or date are excluded.\n",
"    .dropna(subset=[\"ISIN\", \"Date\"])\n",
")\n",
"\n",
"# Outflows are strictly negative, inflows strictly positive; zero and NaN flows are ignored.\n",
"outflows = (\n",
"    flows_keyed.loc[flows_keyed[value_col] < 0]\n",
"    .rename(columns={account_col: \"Compte sortant\", value_col: \"Montant sortie\"})\n",
"    .assign(match_amount=lambda d: -d[\"Montant sortie\"])\n",
")\n",
"inflows = (\n",
"    flows_keyed.loc[flows_keyed[value_col] > 0]\n",
"    .rename(columns={account_col: \"Compte entrant\", value_col: \"Montant entrée\"})\n",
"    .assign(match_amount=lambda d: d[\"Montant entrée\"])\n",
")\n",
"\n",
"# Many-to-many on purpose: one outflow is paired with every inflow of the same amount.\n",
"# Amounts must be exactly equal (no tolerance).\n",
"transf_compte = (\n",
"    outflows.merge(inflows, on=[\"ISIN\", \"Date\", \"match_amount\"], how=\"inner\")\n",
"    [[\"ISIN\", \"Date\", \"Compte sortant\", \"Montant sortie\", \"Compte entrant\", \"Montant entrée\"]]\n",
"    .sort_values([\"ISIN\", \"Date\"], kind=\"stable\")\n",
"    .reset_index(drop=True)\n",
")\n",
"transf_compte\n",
"\n",
"# Author's note from an earlier run: 2 574 461 input rows, 27 880 matched pairs."
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "df0c0bbb-4cff-4205-86e0-0393f83a4cc7",
"metadata": {},
"outputs": [],
"source": [
"# Stack the outgoing and incoming account columns of the detected transfers.\n",
"all_accounts = pd.concat(\n",
"    [transf_compte[side] for side in (\"Compte sortant\", \"Compte entrant\")]\n",
")\n",
"\n",
"# Distinct accounts that appear on either side of at least one transfer.\n",
"unique_accounts = all_accounts.unique()\n",
"print(f\"Nombre de comptes uniques impliqués dans les transferts : {len(unique_accounts)}\")\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "6dacddcb-74f1-441f-adbe-83275c8f9216",
"metadata": {},
"outputs": [],
"source": [
"import matplotlib.pyplot as plt\n",
"\n",
"# Daily count of detected transfer pairs, drawn as a time series.\n",
"transf_compte[\"Date\"] = pd.to_datetime(transf_compte[\"Date\"])\n",
"transfers_per_day = transf_compte.groupby(\"Date\").size()\n",
"\n",
"fig, ax = plt.subplots(figsize=(14, 5))\n",
"transfers_per_day.plot(kind=\"line\", ax=ax)\n",
"ax.set_title(\"Nombre de transferts détectés par jour\")\n",
"ax.set_xlabel(\"Date\")\n",
"ax.set_ylabel(\"Nombre de transferts\")\n",
"plt.show()\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "be0708c4-95df-4915-82f9-a6347187bd70",
"metadata": {},
"outputs": [],
"source": [
"# Flag unusually busy days: strictly above the 95th percentile of the daily transfer count.\n",
"threshold = transfers_per_day.quantile(0.95)\n",
"is_anomalous = transfers_per_day.gt(threshold)\n",
"anomalous_days = transfers_per_day.loc[is_anomalous]\n",
"\n",
"print(\"Jours anormaux (avec beaucoup de transferts) :\")\n",
"display(anomalous_days)"
]
},
{
"cell_type": "markdown",
"id": "9416dd81-8f73-4e87-a5e8-b640882fbba4",
"metadata": {},
"source": [
"## Étude de la saisonnalité"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "f7da5d09-7c97-4fa2-921d-886928fdf80f",
"metadata": {
"scrolled": true
},
"outputs": [],
"source": [
"import pandas as pd\n",
"\n",
"# Restrict the seasonality study to transfers dated 2021 or later.\n",
"transf_compte[\"Date\"] = pd.to_datetime(transf_compte[\"Date\"])\n",
"transf_comptee = transf_compte[transf_compte[\"Date\"].dt.year >= 2021]\n",
"\n",
"# Daily transfer counts and the days strictly above their 95th percentile.\n",
"# Note: this rebinds transfers_per_day / threshold / anomalous_days to the filtered data.\n",
"transfers_per_day = transf_comptee.groupby(\"Date\").size().rename(\"n_transfers\")\n",
"threshold = transfers_per_day.quantile(0.95)\n",
"anomalous_days = transfers_per_day[transfers_per_day > threshold]\n",
"\n",
"# Attach weekday and month names so calendar patterns are visible at a glance.\n",
"anomalous_table = anomalous_days.reset_index().assign(\n",
"    weekday=lambda t: t[\"Date\"].dt.day_name(),\n",
"    month=lambda t: t[\"Date\"].dt.month_name(),\n",
")\n",
"\n",
"print(\"Jours anormaux (weekday + month) :\")\n",
"# Lift the row limit for this display only. pd.set_option would change it for the whole\n",
"# kernel, so every later bare DataFrame output would be rendered without truncation.\n",
"with pd.option_context(\"display.max_rows\", None):\n",
"    display(anomalous_table.sort_values(\"n_transfers\").tail(20))"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "1fa564f5-5844-4a1b-bdca-b392b829d734",
"metadata": {},
"outputs": [],
"source": [
"# Total number of accounts involved in at least one transfer (either side).\n",
"all_accounts = pd.concat([\n",
"    transf_compte[\"Compte sortant\"],\n",
"    transf_compte[\"Compte entrant\"]\n",
"]).unique()\n",
"\n",
"print(f\"Nombre total de comptes impliqués dans au moins un transfert : {len(all_accounts)}\")\n",
"\n",
"# Accounts involved per day: stack the outgoing and incoming account columns into one long\n",
"# column, then collect the distinct accounts of each date in a single groupby pass\n",
"# (rather than filtering the whole table once per date).\n",
"accounts_long = transf_compte.melt(\n",
"    id_vars=[\"Date\"],\n",
"    value_vars=[\"Compte sortant\", \"Compte entrant\"],\n",
"    value_name=\"Compte\",\n",
")\n",
"accounts_per_day = (\n",
"    accounts_long.groupby(\"Date\")[\"Compte\"]\n",
"    .agg(lambda comptes: set(comptes))\n",
"    .to_frame(\"comptes_uniques\")\n",
")\n",
"accounts_per_day[\"n_comptes\"] = accounts_per_day[\"comptes_uniques\"].apply(len)\n",
"\n",
"# Plot\n",
"fig, ax = plt.subplots(figsize=(14, 5))\n",
"ax.plot(accounts_per_day.index, accounts_per_day[\"n_comptes\"], marker=\"o\")\n",
"ax.set_title(\"Nombre de comptes impliqués dans des transferts par jour\")\n",
"ax.set_xlabel(\"Date\")\n",
"ax.set_ylabel(\"Nombre de comptes uniques\")\n",
"ax.grid(True)\n",
"plt.show()\n",
"\n",
"print(\"Aperçu :\")\n",
"accounts_per_day.head()\n"
]
},
{
"cell_type": "markdown",
"id": "c898b0c5-0a8e-4640-bc52-9490ee80e53d",
"metadata": {},
"source": [
"# MERGE AUM & FLOWS "
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "ce33dbf8-1c59-416a-adc4-6eb7c1ea9d8e",
"metadata": {},
"outputs": [],
"source": [
"import pandas as pd\n",
"import numpy as np\n",
"import matplotlib.pyplot as plt\n",
"\n",
"\n",
"def standardise_keys(table, value_column, value_name):\n",
"    \"\"\"Return a copy of `table` reduced to Account_ID, ISIN, Date and one value column.\n",
"\n",
"    `value_column` is the source column holding the amount and `value_name` its new name.\n",
"    Account_ID is cast to str, ISIN is upper-cased and Date is parsed as datetime so that\n",
"    both sources share identical merge keys. `table` itself is not modified.\n",
"    \"\"\"\n",
"    renamed = table.rename(columns={\n",
"        \"Registrar Account - ID\": \"Account_ID\",\n",
"        \"Product - Isin\": \"ISIN\",\n",
"        \"Centralisation Date\": \"Date\",\n",
"        value_column: value_name,\n",
"    })\n",
"    # NOTE(review): astype(str) turns a missing account ID into the literal string 'nan';\n",
"    # confirm the key columns contain no missing values.\n",
"    return renamed[[\"Account_ID\", \"ISIN\", \"Date\", value_name]].assign(\n",
"        Date=lambda t: pd.to_datetime(t[\"Date\"]),\n",
"        Account_ID=lambda t: t[\"Account_ID\"].astype(str),\n",
"        ISIN=lambda t: t[\"ISIN\"].str.upper(),\n",
"    )\n",
"\n",
"\n",
"# The standardised tables get their own names so that df_aum2 / df_flows2 keep their\n",
"# original column names and the cells above stay re-runnable.\n",
"aum_std = standardise_keys(df_aum2, \"Value - AUM €\", \"AUM_EUR\")\n",
"flows_std = standardise_keys(df_flows2, \"Value € - NetFlows\", \"Flow_EUR\")\n",
"\n",
"# Outer join: keep AUM observations without a flow and flows without an AUM observation.\n",
"# This rebinds df_merged, which earlier held the column-profile comparison.\n",
"df_merged = pd.merge(\n",
"    aum_std,\n",
"    flows_std,\n",
"    on=[\"Account_ID\", \"ISIN\", \"Date\"],\n",
"    how=\"outer\"\n",
").sort_values([\"Account_ID\", \"ISIN\", \"Date\"])\n",
"\n",
"print(\"Merged dataset:\")"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "7e5d642e-5c16-4c78-8d83-075094902670",
"metadata": {},
"outputs": [],
"source": [
"df_merged"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "ea14866a-1ce6-4b19-9225-d725304af8ec",
"metadata": {},
"outputs": [],
"source": [
"# 2. HISTOGRAM OF AUM (ALL ISINs POOLED)\n",
"\n",
"# One value per account: the mean of its AUM rows in df_merged. Accounts whose mean is\n",
"# NaN (no AUM value at all) are dropped.\n",
"mean_aum = df_merged.groupby(\"Account_ID\")[\"AUM_EUR\"].mean()\n",
"aum_by_account = mean_aum.dropna()\n",
"\n",
"fig, ax = plt.subplots(figsize=(10, 6))\n",
"ax.hist(aum_by_account, bins=50)\n",
"ax.set_xlabel(\"Mean AUM value (€)\")\n",
"ax.set_ylabel(\"Number of client accounts\")\n",
"ax.set_title(\"Distribution of client average AUM (one value per Account_ID)\")\n",
"ax.grid(True)\n",
"plt.show()"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "f13c213e-7f72-494a-bcf0-b4cd9feee55d",
"metadata": {},
"outputs": [],
"source": [
"\n",
"# 3. FLOWS OF ONE ACCOUNT IN ONE FUND\n",
"\n",
"# Replace both placeholders with an existing account / ISIN pair before running.\n",
"account_to_plot = \"YOUR_ACCOUNT_ID_HERE\"\n",
"isin_to_plot = \"YOUR_ISIN_HERE\"\n",
"\n",
"client_flows = df_merged[\n",
"    (df_merged[\"Account_ID\"] == account_to_plot) &\n",
"    (df_merged[\"ISIN\"] == isin_to_plot)\n",
"].sort_values(\"Date\")\n",
"\n",
"if client_flows.empty:\n",
"    # Say so explicitly instead of rendering a blank chart.\n",
"    print(f\"No rows for Account {account_to_plot}, ISIN {isin_to_plot}: set account_to_plot / isin_to_plot to existing values.\")\n",
"else:\n",
"    fig, ax = plt.subplots(figsize=(12, 5))\n",
"    ax.plot(client_flows[\"Date\"], client_flows[\"Flow_EUR\"], marker=\"o\")\n",
"    ax.axhline(0, color=\"black\", linewidth=1)\n",
"    ax.set_xlabel(\"Date\")\n",
"    ax.set_ylabel(\"Flow (€)\")\n",
"    ax.set_title(f\"Flow movements for Account {account_to_plot}, ISIN {isin_to_plot}\")\n",
"    ax.grid(True)\n",
"    plt.show()\n",
"\n",
"###############################################################################\n",
"# 4. MONTHLY FLOW ANALYSIS (INFLOWS / OUTFLOWS)\n",
"###############################################################################\n",
"\n",
"df_merged[\"YearMonth\"] = df_merged[\"Date\"].dt.to_period(\"M\")\n",
"\n",
"# Counts flow ROWS per month by sign (not distinct accounts). Rows whose Flow_EUR is NaN or\n",
"# zero fall in neither count.\n",
"flows_monthly = df_merged.groupby(\"YearMonth\").agg(\n",
"    n_positive_flows=(\"Flow_EUR\", lambda x: (x > 0).sum()),\n",
"    n_negative_flows=(\"Flow_EUR\", lambda x: (x < 0).sum()),\n",
")\n",
"\n",
"print(\"Monthly flow summary:\")\n",
"print(flows_monthly.head())\n",
"\n",
"# ---- Plot bar chart ----\n",
"flows_monthly.index = flows_monthly.index.astype(str)\n",
"\n",
"# Draw the two series side by side; bars at the same x position would hide each other.\n",
"positions = list(range(len(flows_monthly)))\n",
"bar_width = 0.4\n",
"\n",
"fig, ax = plt.subplots(figsize=(14, 6))\n",
"ax.bar([p - bar_width / 2 for p in positions], flows_monthly[\"n_positive_flows\"],\n",
"       width=bar_width, label=\"Positive flows (inflows)\", alpha=0.7)\n",
"ax.bar([p + bar_width / 2 for p in positions], flows_monthly[\"n_negative_flows\"],\n",
"       width=bar_width, label=\"Negative flows (outflows)\", alpha=0.7)\n",
"ax.set_xticks(positions)\n",
"ax.set_xticklabels(flows_monthly.index, rotation=90)\n",
"ax.set_xlabel(\"Year-Month\")\n",
"# Labels describe what is actually counted: flow rows, not accounts.\n",
"ax.set_ylabel(\"Number of flows\")\n",
"ax.set_title(\"Monthly number of inflows vs outflows\")\n",
"ax.legend()\n",
"fig.tight_layout()\n",
"plt.show()\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "ded9b4f6-df92-479e-bc7b-aaa489dad228",
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.13.11"
}
},
"nbformat": 4,
"nbformat_minor": 5
}

756
Stat_Desc2.ipynb Normal file

File diff suppressed because one or more lines are too long

6776
clus11mars-Copy1 (5).ipynb Normal file

File diff suppressed because one or more lines are too long

1724
data_1-Copy1.ipynb Normal file

File diff suppressed because one or more lines are too long

3658
data_1.ipynb Normal file

File diff suppressed because one or more lines are too long