29 lines
994 B
Python
29 lines
994 B
Python
|
|
from sklearn.preprocessing import RobustScaler
|
||
|
|
from sklearn.cluster import KMeans
|
||
|
|
|
||
|
|
def run_clustering_pipeline(feature_df, n_clusters=4):
    """
    Impute, robust-scale, and K-Means-cluster a client feature table.

    Args:
        feature_df: DataFrame of client features. Missing values are
            replaced with 0 before scaling.
        n_clusters: Number of clusters to fit with K-Means (default 4).

    Returns:
        A ``(results, centers)`` tuple. ``results`` is a copy of the
        zero-filled (unscaled) feature table with an added ``'Cluster'``
        column holding each row's label. ``centers`` is the fitted model's
        ``cluster_centers_``; because the model is fit on the scaled data,
        the centers are expressed in scaled feature space.
    """
    # Clients with insufficient history have missing sensitivities;
    # treat those as 0 (neutral) rather than dropping the rows.
    filled = feature_df.fillna(0)

    # RobustScaler is chosen over StandardScaler for financial data because
    # it is less influenced by outlier ('Whale') clients.
    features_scaled = RobustScaler().fit_transform(filled)

    # Fixed seed and explicit n_init keep repeated runs reproducible.
    model = KMeans(n_clusters=n_clusters, random_state=42, n_init=10)
    cluster_ids = model.fit_predict(features_scaled)

    # Attach labels to a copy so the caller's frame is never mutated.
    labelled = filled.copy()
    labelled['Cluster'] = cluster_ids

    centers = model.cluster_centers_
    return labelled, centers
|
||
|
|
|
||
|
|
def get_cluster_profiles(results_df):
    """
    Compute the mean value of every column within each cluster.

    Args:
        results_df: DataFrame containing a ``'Cluster'`` column alongside
            the columns to be averaged.

    Returns:
        DataFrame indexed by ``'Cluster'`` with one row per cluster and the
        per-cluster mean of each remaining column.
    """
    by_cluster = results_df.groupby('Cluster')
    profiles = by_cluster.mean()
    return profiles
|