28 lines
943 B
Python
28 lines
943 B
Python
from sklearn.preprocessing import RobustScaler
|
|
from sklearn.cluster import KMeans
|
|
|
|
def run_clustering_pipeline(feature_df, n_clusters=4):
    """
    Impute, robust-scale and K-Means-cluster a table of client features.

    Parameters
    ----------
    feature_df
        Per-client feature table. Presumably a pandas DataFrame whose
        columns are all numeric -- TODO confirm against the caller.
    n_clusters
        Number of clusters requested from K-Means (default 4).

    Returns
    -------
    tuple
        ``(results, centers)`` where ``results`` is a zero-filled copy of
        ``feature_df`` with an added ``'Cluster'`` label column, and
        ``centers`` is the fitted model's ``cluster_centers_``. The model
        is fitted on the scaled matrix, so the centers are expressed in
        scaled feature space, not in the original units.
    """
    # Step 1 -- preprocessing.
    # Clients with insufficient history have missing sensitivities; treat
    # those as 0 (neutral) rather than dropping the client.
    imputed = feature_df.fillna(0)

    # RobustScaler is preferred over StandardScaler for financial data
    # because it is less influenced by outsized 'Whale' clients.
    scaled_matrix = RobustScaler().fit_transform(imputed)

    # Step 2 -- clustering, with a fixed seed so reruns give the same labels.
    model = KMeans(n_clusters=n_clusters, random_state=42, n_init=10)
    cluster_ids = model.fit_predict(scaled_matrix)

    # Step 3 -- attach the labels to a new frame; the zero-filled frame
    # itself is left without the extra column.
    labelled = imputed.assign(Cluster=cluster_ids)

    return labelled, model.cluster_centers_
|
|
|
|
def get_cluster_profiles(results_df):
    """
    Return the average profile of each cluster.

    Parameters
    ----------
    results_df
        Table containing a ``'Cluster'`` column to group on.

    Returns
    -------
    The per-cluster mean of every other column, indexed by the
    ``'Cluster'`` value.
    """
    by_cluster = results_df.groupby('Cluster')
    return by_cluster.mean()