early clustering results
This commit is contained in:
parent
2bb874f58a
commit
c0add79e19
Binary file not shown.
Binary file not shown.
17239
clustering/client_clusters.csv
Normal file
17239
clustering/client_clusters.csv
Normal file
File diff suppressed because it is too large
Load Diff
|
|
@ -1,14 +1,23 @@
|
|||
import pandas as pd
|
||||
import os
|
||||
import s3fs
|
||||
fs = s3fs.S3FileSystem(
|
||||
client_kwargs={'endpoint_url': 'https://'+'minio-simple.lab.groupe-genes.fr'},
|
||||
key = os.environ["AWS_ACCESS_KEY_ID"],
|
||||
secret = os.environ["AWS_SECRET_ACCESS_KEY"],
|
||||
token = os.environ["AWS_SESSION_TOKEN"])
|
||||
|
||||
def load_and_clean_data(flows_path, aum_path, rates_path, gov_path):
|
||||
def load_and_clean_data(rates_path, gov_path):
|
||||
"""
|
||||
Loads raw CSVs and parses dates for consistent time-series analysis.
|
||||
"""
|
||||
with fs.open('s3://projet-bdc-carmignac-g3/AUM_repaired.csv', 'rb') as f:
|
||||
aum = pd.read_csv(f, sep =",")
|
||||
|
||||
with fs.open('s3://projet-bdc-carmignac-g3/flows.csv', 'rb') as f:
|
||||
flows = pd.read_csv(f, sep =",")
|
||||
|
||||
flows = pd.read_csv(flows_path)
|
||||
flows['Centralisation Date'] = pd.to_datetime(flows['Centralisation Date'])
|
||||
|
||||
aum = pd.read_csv(aum_path)
|
||||
aum['Centralisation Date'] = pd.to_datetime(aum['Centralisation Date'])
|
||||
|
||||
rates = pd.read_csv(rates_path)
|
||||
|
|
|
|||
|
|
@ -1,5 +1,6 @@
|
|||
import pandas as pd
|
||||
import statsmodels.api as sm
|
||||
import numpy as np
|
||||
|
||||
def compute_static_features(flows_df, aum_df):
|
||||
"""Generates descriptive features from Flows and AUM."""
|
||||
|
|
@ -29,16 +30,17 @@ def compute_static_features(flows_df, aum_df):
|
|||
aum_volatility=('Value - AUM €', 'std')
|
||||
)
|
||||
|
||||
# Merge
|
||||
# Merge all static features
|
||||
features = flow_stats.join(asset_pct).join(aum_stats, how='outer').fillna(0)
|
||||
return features
|
||||
|
||||
def compute_market_sensitivities(flows_df, rates_df, gov_df, freq='M'):
|
||||
def compute_market_sensitivities(flows_df, aum_df, rates_df, gov_df, freq='M'):
|
||||
"""
|
||||
Computes Beta sensitivity to Rates and Gov Bonds.
|
||||
Freq: 'M' (Monthly) recommended for long history.
|
||||
Filters clients based on Activity (>5 txns) and Wealth (>0 AUM).
|
||||
"""
|
||||
# 1. Prepare Market Factors
|
||||
|
||||
# --- 1. Prepare Market Factors ---
|
||||
# Resample Rates (Take last value of period)
|
||||
rates_res = rates_df.set_index('Date').resample(freq)['Yld to Maturity'].last()
|
||||
delta_rates = rates_res.diff().rename('Delta_Rate')
|
||||
|
|
@ -52,23 +54,39 @@ def compute_market_sensitivities(flows_df, rates_df, gov_df, freq='M'):
|
|||
|
||||
market_factors = pd.concat([delta_rates, gov_res], axis=1).dropna()
|
||||
|
||||
# 2. Prepare Client Flows (Aggregated by same frequency)
|
||||
flows_df['Period'] = flows_df['Centralisation Date'].dt.to_period(freq).dt.to_timestamp()
|
||||
# --- 2. Identify Eligible Clients (Funneling) ---
|
||||
# Criterion A: Wealthy enough (Mean AUM > 1000 EUR to avoid division by zero)
|
||||
mean_aum = aum_df.groupby('Registrar Account - ID')['Value - AUM €'].mean()
|
||||
valid_aum_clients = mean_aum[mean_aum > 1000].index
|
||||
|
||||
# Criterion B: Active enough (Transactions >= 6)
|
||||
txn_counts = flows_df['Registrar Account - ID'].value_counts()
|
||||
active_clients = txn_counts[txn_counts >= 6].index
|
||||
|
||||
# Intersection: Clients who are BOTH wealthy enough and active enough
|
||||
eligible_clients = list(set(valid_aum_clients) & set(active_clients))
|
||||
|
||||
print(f"Regression Funnel: {len(eligible_clients)} clients eligible out of {len(mean_aum)} total accounts.")
|
||||
|
||||
# --- 3. Run Regression on Eligible Clients ---
|
||||
flows_df['Period'] = flows_df['Centralisation Date'].dt.to_period(freq).dt.to_timestamp()
|
||||
client_betas = []
|
||||
|
||||
# Only analyze clients with sufficient activity (>5 transactions)
|
||||
active_clients = flows_df['Registrar Account - ID'].value_counts()
|
||||
active_clients = active_clients[active_clients >= 5].index
|
||||
|
||||
for client in active_clients:
|
||||
for client in eligible_clients:
|
||||
# Get Client Flows
|
||||
c_flows = flows_df[flows_df['Registrar Account - ID'] == client]
|
||||
c_ts = c_flows.groupby('Period')['Quantity - NetFlows'].sum()
|
||||
|
||||
# Merge with market data
|
||||
merged = pd.concat([c_ts, market_factors], axis=1, join='inner')
|
||||
|
||||
if len(merged) >= 5:
|
||||
Y = merged['Quantity - NetFlows']
|
||||
# Ensure we have enough data points for a valid regression
|
||||
if len(merged) >= 6:
|
||||
# Normalize Y by Client's Average AUM (Approximation of AUM_{t-1})
|
||||
# Y = NetFlow_t / Mean_AUM_i
|
||||
client_avg_wealth = mean_aum.loc[client]
|
||||
Y = merged['Quantity - NetFlows'] / client_avg_wealth
|
||||
|
||||
X = merged[['Delta_Rate', 'Bond_Return']]
|
||||
X = sm.add_constant(X)
|
||||
|
||||
|
|
@ -76,6 +94,7 @@ def compute_market_sensitivities(flows_df, rates_df, gov_df, freq='M'):
|
|||
model = sm.OLS(Y, X).fit()
|
||||
client_betas.append({
|
||||
'Registrar Account - ID': client,
|
||||
'alpha': model.params.get('const', 0), # Intercept (Autonomous Trend)
|
||||
'beta_rate': model.params.get('Delta_Rate', 0),
|
||||
'beta_bond': model.params.get('Bond_Return', 0),
|
||||
'r_squared': model.rsquared
|
||||
|
|
@ -84,6 +103,6 @@ def compute_market_sensitivities(flows_df, rates_df, gov_df, freq='M'):
|
|||
continue
|
||||
|
||||
if not client_betas:
|
||||
return pd.DataFrame(columns=['Registrar Account - ID', 'beta_rate', 'beta_bond', 'r_squared'])
|
||||
return pd.DataFrame(columns=['Registrar Account - ID', 'alpha', 'beta_rate', 'beta_bond', 'r_squared'])
|
||||
|
||||
return pd.DataFrame(client_betas).set_index('Registrar Account - ID')
|
||||
|
|
@ -8,10 +8,8 @@ def main():
|
|||
|
||||
print("Loading data...")
|
||||
flows, aum, rates, gov = load_and_clean_data(
|
||||
'data/flows_sample.csv',
|
||||
'data/aum_sample.csv',
|
||||
'data/str_rates.csv',
|
||||
'data/eur_gov_indices.csv'
|
||||
rates_path='data/str_rates.csv',
|
||||
gov_path='data/eur_gov_indices.csv'
|
||||
)
|
||||
|
||||
print("Computing static features...")
|
||||
|
|
@ -20,7 +18,7 @@ def main():
|
|||
print("Computing market sensitivities (Betas)...")
|
||||
# Use 'W' (Weekly) to maximize points for the sample.
|
||||
# Use 'M' (Monthly) for the full dataset.
|
||||
sensitivity_feats = compute_market_sensitivities(flows, rates, gov, freq='W')
|
||||
sensitivity_feats = compute_market_sensitivities(flows, aum, rates, gov, freq='W')
|
||||
|
||||
full_features = static_feats.join(sensitivity_feats, how='left')
|
||||
|
||||
|
|
|
|||
1346
data/explore.ipynb
Normal file
1346
data/explore.ipynb
Normal file
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
Loading…
Reference in New Issue
Block a user