from sklearn.cluster import KMeans
from sklearn.preprocessing import RobustScaler


def run_clustering_pipeline(feature_df, n_clusters=4):
    """Scale client features and assign each client to a K-Means cluster.

    Args:
        feature_df: DataFrame with one row per client and one column per
            feature. Every column is passed to the scaler, so all columns
            must be numeric. Missing values are allowed; they are replaced
            with 0 before scaling.
        n_clusters: Number of clusters to fit. Defaults to 4.

    Returns:
        A ``(results, centers)`` tuple:

        * ``results`` -- a copy of the NaN-filled (but *unscaled*) input with
          an extra ``'Cluster'`` column holding each row's cluster label.
        * ``centers`` -- the fitted ``KMeans.cluster_centers_`` array. The
          model is fitted on the scaled data, so these coordinates are in
          scaled feature space, not in the units of ``feature_df``.
    """
    # 1. Preprocessing
    # Fill missing sensitivities with 0 (neutral) for clients with
    # insufficient history.
    df_clean = feature_df.fillna(0)

    # Scaling: RobustScaler is preferred over StandardScaler for financial
    # data because it is less influenced by 'Whale' clients (outliers).
    scaler = RobustScaler()
    scaled_data = scaler.fit_transform(df_clean)

    # 2. Clustering
    # A fixed random_state keeps cluster assignments reproducible across runs.
    kmeans = KMeans(n_clusters=n_clusters, random_state=42, n_init=10)
    labels = kmeans.fit_predict(scaled_data)

    # 3. Attach labels
    # Work on a copy so the label column is never written into an object
    # shared with the caller.
    results = df_clean.copy()
    results['Cluster'] = labels

    return results, kmeans.cluster_centers_


def get_cluster_profiles(results_df):
    """Return the average profile of each cluster.

    Args:
        results_df: DataFrame containing a ``'Cluster'`` column, e.g. the
            first element returned by ``run_clustering_pipeline``.

    Returns:
        DataFrame indexed by cluster label whose columns are the per-cluster
        mean of every other column in ``results_df``.
    """
    return results_df.groupby('Cluster').mean()