# Project_Carmignac/clustering/clustering.py

from sklearn.preprocessing import RobustScaler
from sklearn.cluster import KMeans

def run_clustering_pipeline(feature_df, n_clusters=4):
    """
    Segment clients with K-Means on robust-scaled features.

    Args:
        feature_df: pandas DataFrame of features (presumably one row per
            client). Missing values are allowed; they are replaced by 0
            before scaling.
        n_clusters: Number of K-Means clusters to fit.

    Returns:
        A ``(results, centers)`` tuple. ``results`` is the zero-filled copy
        of ``feature_df`` with a ``'Cluster'`` column holding each row's
        label. ``centers`` is the fitted model's ``cluster_centers_``; the
        model is fitted on the scaled data, so the centers are expressed in
        scaled feature space, not in the original units.
    """
    # Missing sensitivities are treated as neutral (0); they occur for
    # clients without enough history.
    filled = feature_df.fillna(0)

    # RobustScaler is preferred over StandardScaler for financial data
    # because it is less influenced by outlier ("Whale") clients.
    scaled = RobustScaler().fit_transform(filled)

    # Fixed seed and n_init keep the segmentation reproducible across runs.
    model = KMeans(n_clusters=n_clusters, random_state=42, n_init=10)
    cluster_ids = model.fit_predict(scaled)

    # assign() returns a new frame, so neither the caller's input nor the
    # intermediate zero-filled frame is modified.
    labelled = filled.assign(Cluster=cluster_ids)
    return labelled, model.cluster_centers_

def get_cluster_profiles(results_df):
    """
    Summarise each cluster by the mean of its columns.

    Args:
        results_df: DataFrame containing a ``'Cluster'`` column that
            identifies the group each row belongs to.

    Returns:
        DataFrame indexed by the distinct ``'Cluster'`` values, with one
        column per remaining column of ``results_df`` holding that
        column's mean within the cluster.
    """
    per_cluster = results_df.groupby('Cluster')
    profiles = per_cluster.mean()
    return profiles