Project_Carmignac/clustering/clustering.py

29 lines
994 B
Python
Raw Normal View History

2026-02-02 11:37:16 +01:00
from sklearn.preprocessing import RobustScaler
from sklearn.cluster import KMeans
def run_clustering_pipeline(feature_df, n_clusters=4):
    """Scale client features and assign each client to a K-Means cluster.

    Args:
        feature_df: DataFrame of per-client features; missing values are
            allowed and are replaced by 0 before scaling.
        n_clusters: Number of clusters to fit (defaults to 4).

    Returns:
        A ``(results, centers)`` tuple. ``results`` is a copy of the
        zero-filled features (unscaled) with an added ``'Cluster'`` column
        holding the label of each row. ``centers`` is the fitted model's
        ``cluster_centers_`` array, expressed in the *scaled* feature space
        because the model is fitted on the scaled data.
    """
    # Clients with insufficient history have missing sensitivities; treat
    # those as 0 (neutral) rather than dropping the rows.
    filled = feature_df.fillna(0)

    # RobustScaler is chosen over StandardScaler for financial data because
    # it is less influenced by outlying 'Whale' clients.
    scaled = RobustScaler().fit_transform(filled)

    # Fixed seed and explicit n_init keep the clustering reproducible.
    model = KMeans(n_clusters=n_clusters, random_state=42, n_init=10)
    model.fit(scaled)

    # Labels go onto a copy so the filled frame itself is not mutated.
    labelled = filled.copy()
    labelled['Cluster'] = model.labels_
    return labelled, model.cluster_centers_
def get_cluster_profiles(results_df):
    """Compute the mean feature values of every cluster.

    Args:
        results_df: DataFrame containing a ``'Cluster'`` column alongside
            the feature columns to be averaged.

    Returns:
        A DataFrame indexed by cluster label, with one column per remaining
        column of ``results_df`` holding that column's per-cluster mean.
    """
    per_cluster = results_df.groupby('Cluster')
    return per_cluster.mean()