Project_Carmignac/clustering/clustering.py

from sklearn.preprocessing import RobustScaler
from sklearn.cluster import KMeans

def run_clustering_pipeline(feature_df, n_clusters=4):
    """
    Segment clients with K-Means on robust-scaled features.

    Steps: fill missing values with 0, scale with ``RobustScaler``, fit
    ``KMeans`` (``random_state=42``, ``n_init=10``) and attach the labels.

    Args:
        feature_df: Feature table to cluster (one row per client is
            assumed). The input object itself is not modified.
        n_clusters: Number of clusters passed to ``KMeans``.

    Returns:
        A ``(results, centers)`` tuple. ``results`` is a copy of the
        zero-filled (unscaled) features with an extra ``'Cluster'`` column;
        ``centers`` is the fitted model's ``cluster_centers_``, which live
        in the scaled feature space because the model is fit on scaled data.
    """
    # --- Preprocessing ---
    # A missing sensitivity is treated as 0 (neutral); this covers clients
    # whose history is too short to estimate one.
    filled = feature_df.fillna(0)

    # RobustScaler instead of StandardScaler: financial data contains
    # 'Whale' clients (outliers), and RobustScaler is less influenced by them.
    scaled_features = RobustScaler().fit_transform(filled)

    # --- Clustering ---
    # Fixed random_state keeps cluster assignments reproducible across runs.
    model = KMeans(n_clusters=n_clusters, random_state=42, n_init=10)
    cluster_ids = model.fit_predict(scaled_features)

    # --- Attach labels ---
    # Labels go onto a copy of the filled (not the scaled) frame.
    labelled = filled.copy()
    labelled['Cluster'] = cluster_ids

    return labelled, model.cluster_centers_

def get_cluster_profiles(results_df):
    """
    Return the average profile of each cluster.

    Args:
        results_df: Table containing a ``'Cluster'`` column alongside the
            feature columns to average.

    Returns:
        The per-cluster mean of every other column, indexed by ``'Cluster'``.
    """
    per_cluster = results_df.groupby('Cluster')
    return per_cluster.mean()
Added clustering step 2026-02-02 11:37:16 +01:00			`from sklearn.preprocessing import RobustScaler`
			`from sklearn.cluster import KMeans`

			`def run_clustering_pipeline(feature_df, n_clusters=4):`
			`"""`
			`Scales features and clusters clients.`
			`"""`
			`# 1. Preprocessing`
			`# Fill missing sensitivities with 0 (neutral) for clients with insufficient history`
			`df_clean = feature_df.fillna(0)`

			`# Scaling: RobustScaler is preferred over StandardScaler for financial data`
			`# because it is less influenced by 'Whale' clients (outliers).`
			`scaler = RobustScaler()`
			`scaled_data = scaler.fit_transform(df_clean)`

			`# 2. Clustering`
			`kmeans = KMeans(n_clusters=n_clusters, random_state=42, n_init=10)`
			`labels = kmeans.fit_predict(scaled_data)`

			`# 3. Attach labels`
			`results = df_clean.copy()`
			`results['Cluster'] = labels`

			`return results, kmeans.cluster_centers_`

			`def get_cluster_profiles(results_df):`
			`"""Returns the average profile of each cluster."""`
			`return results_df.groupby('Cluster').mean()`