Project_Carmignac/clustering/clustering.py

28 lines
943 B
Python
Raw Normal View History

2026-02-02 11:37:16 +01:00
from sklearn.preprocessing import RobustScaler
from sklearn.cluster import KMeans
def run_clustering_pipeline(feature_df, n_clusters=4):
"""
Scales features and clusters clients.
"""
# 1. Preprocessing
# Fill missing sensitivities with 0 (neutral) for clients with insufficient history
df_clean = feature_df.fillna(0)
2026-02-02 12:31:08 +01:00
# RobustScaler over StandardScaler for financial data bc less influenced by 'Whale' clients.
2026-02-02 11:37:16 +01:00
scaler = RobustScaler()
scaled_data = scaler.fit_transform(df_clean)
# 2. Clustering
kmeans = KMeans(n_clusters=n_clusters, random_state=42, n_init=10)
labels = kmeans.fit_predict(scaled_data)
# 3. Attach labels
results = df_clean.copy()
results['Cluster'] = labels
return results, kmeans.cluster_centers_
def get_cluster_profiles(results_df):
"""Returns the average profile of each cluster."""
return results_df.groupby('Cluster').mean()