paco-dev #2
12620
client_clusters.csv
Normal file
12620
client_clusters.csv
Normal file
File diff suppressed because it is too large
Load Diff
BIN
cluster_map.png
Normal file
BIN
cluster_map.png
Normal file
Binary file not shown.
|
After Width: | Height: | Size: 278 KiB |
Binary file not shown.
Binary file not shown.
Binary file not shown.
|
|
@ -21,7 +21,7 @@ def run_clustering_pipeline(feature_df, n_clusters=4):
|
|||
results = df_clean.copy()
|
||||
results['Cluster'] = labels
|
||||
|
||||
return results, kmeans.cluster_centers_
|
||||
return results, kmeans.cluster_centers_, scaler # Returns 3 items
|
||||
|
||||
def get_cluster_profiles(results_df):
|
||||
"""Returns the average profile of each cluster."""
|
||||
|
|
|
|||
21
clustering/clustering_results.txt
Normal file
21
clustering/clustering_results.txt
Normal file
|
|
@ -0,0 +1,21 @@
|
|||
--- Cluster Profiles (Mean Values), original regression ---
|
||||
Cluster 0 1 2
|
||||
total_subs 8.848696e+06 1.866568e+07 0.000000e+00
|
||||
total_reds -9.907930e+06 -1.866568e+07 -7.255456e+03
|
||||
net_flow_vol -1.059233e+06 0.000000e+00 -7.255456e+03
|
||||
txn_count 1.493652e+02 1.000000e+00 2.000000e+00
|
||||
tenure_days 5.199713e+02 0.000000e+00 0.000000e+00
|
||||
buy_sell_ratio 1.508150e+08 3.733136e+13 -1.000000e+00
|
||||
pct_flow_ALTERNATIVE 1.857841e-02 0.000000e+00 0.000000e+00
|
||||
pct_flow_DIVERSIFIED 1.845385e+02 0.000000e+00 1.443736e+10
|
||||
pct_flow_EQUITY 8.731666e-02 0.000000e+00 0.000000e+00
|
||||
pct_flow_FIXED INCOME -1.842562e+02 1.000000e+00 -1.443736e+10
|
||||
pct_flow_NAN 3.976915e-04 0.000000e+00 0.000000e+00
|
||||
pct_flow_PRIVATE ASSETS 9.285151e-04 0.000000e+00 0.000000e+00
|
||||
avg_aum 8.342624e+05 1.482901e+07 2.185214e+04
|
||||
aum_volatility 8.135300e+05 8.274060e+06 2.058599e+02
|
||||
Registrar Account - ID 0.000000e+00 0.000000e+00 0.000000e+00
|
||||
alpha 0.000000e+00 0.000000e+00 0.000000e+00
|
||||
beta_rate 0.000000e+00 0.000000e+00 0.000000e+00
|
||||
beta_bond 0.000000e+00 0.000000e+00 0.000000e+00
|
||||
r_squared 0.000000e+00 0.000000e+00 0.000000e+00
|
||||
|
|
@ -8,18 +8,24 @@ fs = s3fs.S3FileSystem(
|
|||
token = os.environ["AWS_SESSION_TOKEN"])
|
||||
|
||||
def load_and_clean_data(rates_path, gov_path):
|
||||
"""
|
||||
Loads raw CSVs and parses dates for consistent time-series analysis.
|
||||
"""
|
||||
# Enforce string types for IDs to prevent 'Mixed Type' warnings
|
||||
dtype_spec = {
|
||||
'Registrar Account - ID': str,
|
||||
'Company - Id': str,
|
||||
'Company - Ultimate Parent Id': str,
|
||||
'Agreement - Code': str
|
||||
}
|
||||
with fs.open('s3://projet-bdc-carmignac-g3/AUM_repaired.csv', 'rb') as f:
|
||||
aum = pd.read_csv(f, sep =",")
|
||||
aum = pd.read_csv(f, sep=",", dtype=dtype_spec)
|
||||
|
||||
with fs.open('s3://projet-bdc-carmignac-g3/flows.csv', 'rb') as f:
|
||||
flows = pd.read_csv(f, sep =",")
|
||||
flows = pd.read_csv(f, sep=",", dtype=dtype_spec)
|
||||
|
||||
flows['Centralisation Date'] = pd.to_datetime(flows['Centralisation Date'])
|
||||
aum['Centralisation Date'] = pd.to_datetime(aum['Centralisation Date'])
|
||||
|
||||
# Market data loading (Standardizing dates)
|
||||
print("Loading Market Data...")
|
||||
rates = pd.read_csv(rates_path)
|
||||
try:
|
||||
rates['Date'] = pd.to_datetime(rates['Date'], dayfirst=True)
|
||||
|
|
@ -27,6 +33,9 @@ def load_and_clean_data(rates_path, gov_path):
|
|||
rates['Date'] = pd.to_datetime(rates['Date'])
|
||||
|
||||
gov = pd.read_csv(gov_path)
|
||||
gov['Date'] = pd.to_datetime(gov['Date'])
|
||||
try:
|
||||
gov['Date'] = pd.to_datetime(gov['Date'], dayfirst=True)
|
||||
except:
|
||||
gov['Date'] = pd.to_datetime(gov['Date'])
|
||||
|
||||
return flows, aum, rates, gov
|
||||
|
|
@ -11,18 +11,20 @@ def compute_static_features(flows_df, aum_df):
|
|||
total_reds=('Value € - Redemption', 'sum'),
|
||||
net_flow_vol=('Value € - NetFlows', 'sum'),
|
||||
txn_count=('Agreement - Code', 'count'),
|
||||
# Tenure: Days between first and last activity
|
||||
tenure_days=('Centralisation Date', lambda x: (x.max() - x.min()).days)
|
||||
)
|
||||
|
||||
# Flow Ratio: -1 (Pure Seller) to +1 (Pure Buyer)
|
||||
flow_stats['buy_sell_ratio'] = (flow_stats['total_subs'] - flow_stats['total_reds']) / \
|
||||
(flow_stats['total_subs'] + flow_stats['total_reds'] + 1e-6)
|
||||
# Robust Buy/Sell Ratio
|
||||
total_vol = flow_stats['total_subs'].abs() + flow_stats['total_reds'].abs()
|
||||
flow_stats['buy_sell_ratio'] = (flow_stats['total_subs'] - flow_stats['total_reds']) / (total_vol + 1.0)
|
||||
flow_stats['buy_sell_ratio'] = flow_stats['buy_sell_ratio'].clip(-1, 1)
|
||||
|
||||
# --- 2. Product Preferences ---
|
||||
# Calculate % of flows going to each Asset Type
|
||||
asset_pivot = flows_df.groupby(['Registrar Account - ID', 'Product - Asset Type'])['Value € - Subscription'].sum().unstack(fill_value=0)
|
||||
asset_pct = asset_pivot.div(asset_pivot.sum(axis=1) + 1e-6, axis=0).add_prefix('pct_flow_')
|
||||
pos_flows = flows_df[flows_df['Value € - Subscription'] > 0]
|
||||
asset_pivot = pos_flows.groupby(['Registrar Account - ID', 'Product - Asset Type'])['Value € - Subscription'].sum().unstack(fill_value=0)
|
||||
|
||||
row_sums = asset_pivot.sum(axis=1)
|
||||
asset_pct = asset_pivot.div(row_sums + 1.0, axis=0).add_prefix('pct_flow_')
|
||||
|
||||
# --- 3. AUM Stats ---
|
||||
aum_stats = aum_df.groupby('Registrar Account - ID').agg(
|
||||
|
|
@ -30,63 +32,178 @@ def compute_static_features(flows_df, aum_df):
|
|||
aum_volatility=('Value - AUM €', 'std')
|
||||
)
|
||||
|
||||
# Merge all static features
|
||||
features = flow_stats.join(asset_pct).join(aum_stats, how='outer').fillna(0)
|
||||
return features
|
||||
|
||||
def compute_market_sensitivities(flows_df, aum_df, rates_df, gov_df, freq='M'):
|
||||
def compute_shock_sensitivities(flows_df, aum_df, rates_df, gov_df, freq='ME'):
|
||||
"""
|
||||
Computes Beta sensitivity to Rates and Gov Bonds.
|
||||
Filters clients based on Activity (>5 txns) and Wealth (>0 AUM).
|
||||
Computes sensitivity using Robust OLS + Dynamic Feature Selection.
|
||||
Only targets HIGHLY ACTIVE clients (>= 250 transactions).
|
||||
"""
|
||||
print(f"DEBUG: Computing Sensitivities (Threshold=250)...")
|
||||
|
||||
# --- 1. Prepare Market Factors ---
|
||||
# Resample Rates (Take last value of period)
|
||||
# Force Numeric Types
|
||||
rates_df['Yld to Maturity'] = pd.to_numeric(rates_df['Yld to Maturity'], errors='coerce')
|
||||
gov_df['Total Return % 1-wk-LOC'] = pd.to_numeric(gov_df['Total Return % 1-wk-LOC'], errors='coerce')
|
||||
|
||||
rates_res = rates_df.set_index('Date').resample(freq)['Yld to Maturity'].last()
|
||||
delta_rates = rates_res.diff().rename('Delta_Rate')
|
||||
delta_rates = rates_res.diff()
|
||||
|
||||
# Resample Gov Bonds (Using 'EG04' 7-10Y Euro Gov as proxy)
|
||||
gov_target = gov_df[gov_df['Bond/Index'] == 'EG04'].set_index('Date')
|
||||
gov_target = gov_target[~gov_target.index.duplicated(keep='first')] # Dedup
|
||||
# Calculate return over period
|
||||
gov_target = gov_target[~gov_target.index.duplicated(keep='first')]
|
||||
gov_res = gov_target['Total Return % 1-wk-LOC'].resample(freq).apply(lambda x: (1 + x/100).prod() - 1)
|
||||
gov_res = gov_res.rename('Bond_Return')
|
||||
|
||||
market_factors = pd.concat([delta_rates, gov_res], axis=1).dropna()
|
||||
market_df = pd.concat([delta_rates.rename('Delta_Rate'), gov_res.rename('Bond_Return')], axis=1).dropna()
|
||||
|
||||
# --- 2. Identify Eligible Clients (Funneling) ---
|
||||
# Criterion A: Wealthy enough (Mean AUM > 1000 EUR to avoid division by zero)
|
||||
# String Period Index for Robust Merging
|
||||
market_df['Period_Str'] = market_df.index.to_period(freq).astype(str)
|
||||
market_df = market_df.set_index('Period_Str')
|
||||
|
||||
# --- 2. Define Shocks ---
|
||||
rate_q1 = market_df['Delta_Rate'].quantile(0.25)
|
||||
rate_q3 = market_df['Delta_Rate'].quantile(0.75)
|
||||
bond_q1 = market_df['Bond_Return'].quantile(0.25)
|
||||
bond_q3 = market_df['Bond_Return'].quantile(0.75)
|
||||
|
||||
market_df['Rate_Spike'] = (market_df['Delta_Rate'] > rate_q3).astype(int)
|
||||
market_df['Rate_Drop'] = (market_df['Delta_Rate'] < rate_q1).astype(int)
|
||||
market_df['Bond_Rally'] = (market_df['Bond_Return'] > bond_q3).astype(int)
|
||||
market_df['Bond_Crash'] = (market_df['Bond_Return'] < bond_q1).astype(int)
|
||||
|
||||
all_shock_cols = ['Rate_Spike', 'Rate_Drop', 'Bond_Rally', 'Bond_Crash']
|
||||
|
||||
# --- 3. Funneling ---
|
||||
aum_df['Value - AUM €'] = pd.to_numeric(aum_df['Value - AUM €'], errors='coerce')
|
||||
mean_aum = aum_df.groupby('Registrar Account - ID')['Value - AUM €'].mean()
|
||||
valid_aum_clients = mean_aum[mean_aum > 1000].index
|
||||
|
||||
# Criterion B: Active enough (Transactions >= 6)
|
||||
# --- UPDATED THRESHOLD HERE ---
|
||||
txn_counts = flows_df['Registrar Account - ID'].value_counts()
|
||||
active_clients = txn_counts[txn_counts >= 6].index
|
||||
active_clients = txn_counts[txn_counts >= 250].index
|
||||
|
||||
# Intersection: Clients who are BOTH wealthy enough and active enough
|
||||
eligible_clients = list(set(valid_aum_clients) & set(active_clients))
|
||||
|
||||
print(f"Regression Funnel: {len(eligible_clients)} clients eligible out of {len(mean_aum)} total accounts.")
|
||||
print(f"Shock Model Funnel: {len(eligible_clients)} clients eligible (Active >= 250 txns).")
|
||||
|
||||
# --- 4. Regression ---
|
||||
flows_df['Period_Str'] = flows_df['Centralisation Date'].dt.to_period(freq).astype(str)
|
||||
flows_df['Quantity - NetFlows'] = pd.to_numeric(flows_df['Quantity - NetFlows'], errors='coerce')
|
||||
|
||||
client_betas = []
|
||||
success_count = 0
|
||||
failure_printed = False
|
||||
|
||||
for client in eligible_clients:
|
||||
c_flows = flows_df[flows_df['Registrar Account - ID'] == client]
|
||||
c_ts = c_flows.groupby('Period_Str')['Quantity - NetFlows'].sum()
|
||||
|
||||
merged = pd.merge(c_ts, market_df, left_index=True, right_index=True, how='inner')
|
||||
|
||||
if len(merged) >= 6:
|
||||
client_avg_wealth = mean_aum.loc[client]
|
||||
|
||||
# Skip invalid AUM
|
||||
if not np.isfinite(client_avg_wealth) or client_avg_wealth == 0:
|
||||
continue
|
||||
|
||||
Y = merged['Quantity - NetFlows'] / client_avg_wealth
|
||||
|
||||
# --- Dynamic Feature Selection ---
|
||||
# Drop shock columns that are all zeros (event never happened for this client)
|
||||
valid_cols = []
|
||||
for col in all_shock_cols:
|
||||
if merged[col].sum() > 0:
|
||||
valid_cols.append(col)
|
||||
|
||||
X = merged[valid_cols]
|
||||
X = sm.add_constant(X)
|
||||
|
||||
# Check data validity
|
||||
if Y.isna().any() or X.isna().any().any():
|
||||
if not failure_printed:
|
||||
print(f"DEBUG CRASH: Client {client} has NaNs.")
|
||||
failure_printed = True
|
||||
continue
|
||||
|
||||
try:
|
||||
model = sm.OLS(Y, X).fit()
|
||||
|
||||
result_dict = {
|
||||
'Registrar Account - ID': client,
|
||||
'alpha_normal': model.params.get('const', 0),
|
||||
'shock_r_squared': model.rsquared
|
||||
}
|
||||
# Fill missing betas with 0
|
||||
for col in all_shock_cols:
|
||||
result_dict[f'beta_{col.lower()}'] = model.params.get(col, 0)
|
||||
|
||||
client_betas.append(result_dict)
|
||||
success_count += 1
|
||||
except Exception as e:
|
||||
if not failure_printed:
|
||||
print(f"DEBUG CRASH: {e}")
|
||||
failure_printed = True
|
||||
continue
|
||||
|
||||
print(f"DEBUG: Successfully modeled {success_count} clients.")
|
||||
|
||||
if not client_betas:
|
||||
return pd.DataFrame(columns=['Registrar Account - ID', 'alpha_normal',
|
||||
'beta_rate_spike', 'beta_rate_drop',
|
||||
'beta_bond_rally', 'beta_bond_crash', 'shock_r_squared'])
|
||||
|
||||
return pd.DataFrame(client_betas).set_index('Registrar Account - ID')
|
||||
|
||||
def compute_linear_sensitivities(flows_df, aum_df, rates_df, gov_df, freq='M'):
|
||||
"""
|
||||
Computes standard linear sensitivity: Flow ~ Alpha + Beta_Rate * dRate + Beta_Bond * BondRet
|
||||
"""
|
||||
print(f"DEBUG: Computing Sensitivities (Linear Model)...")
|
||||
|
||||
# 1. Prepare Market Data
|
||||
rates_df['Yld to Maturity'] = pd.to_numeric(rates_df['Yld to Maturity'], errors='coerce')
|
||||
gov_df['Total Return % 1-wk-LOC'] = pd.to_numeric(gov_df['Total Return % 1-wk-LOC'], errors='coerce')
|
||||
|
||||
rates_res = rates_df.set_index('Date').resample(freq)['Yld to Maturity'].last()
|
||||
delta_rates = rates_res.diff()
|
||||
|
||||
gov_target = gov_df[gov_df['Bond/Index'] == 'EG04'].set_index('Date')
|
||||
gov_target = gov_target[~gov_target.index.duplicated(keep='first')]
|
||||
gov_res = gov_target['Total Return % 1-wk-LOC'].resample(freq).apply(lambda x: (1 + x/100).prod() - 1)
|
||||
|
||||
market_df = pd.concat([delta_rates.rename('Delta_Rate'), gov_res.rename('Bond_Return')], axis=1).dropna()
|
||||
market_df['Period_Str'] = market_df.index.to_period(freq).astype(str)
|
||||
market_df = market_df.set_index('Period_Str')
|
||||
|
||||
# 2. Funneling
|
||||
aum_df['Value - AUM €'] = pd.to_numeric(aum_df['Value - AUM €'], errors='coerce')
|
||||
mean_aum = aum_df.groupby('Registrar Account - ID')['Value - AUM €'].mean()
|
||||
valid_aum_clients = mean_aum[mean_aum > 1000].index
|
||||
|
||||
txn_counts = flows_df['Registrar Account - ID'].value_counts()
|
||||
active_clients = txn_counts[txn_counts >= 250].index
|
||||
eligible_clients = list(set(valid_aum_clients) & set(active_clients))
|
||||
|
||||
print(f"Linear Model Funnel: {len(eligible_clients)} clients eligible.")
|
||||
|
||||
# 3. Regression
|
||||
flows_df['Period_Str'] = flows_df['Centralisation Date'].dt.to_period(freq).astype(str)
|
||||
flows_df['Quantity - NetFlows'] = pd.to_numeric(flows_df['Quantity - NetFlows'], errors='coerce')
|
||||
|
||||
# --- 3. Run Regression on Eligible Clients ---
|
||||
flows_df['Period'] = flows_df['Centralisation Date'].dt.to_period(freq).dt.to_timestamp()
|
||||
client_betas = []
|
||||
|
||||
for client in eligible_clients:
|
||||
# Get Client Flows
|
||||
c_flows = flows_df[flows_df['Registrar Account - ID'] == client]
|
||||
c_ts = c_flows.groupby('Period')['Quantity - NetFlows'].sum()
|
||||
c_ts = c_flows.groupby('Period_Str')['Quantity - NetFlows'].sum()
|
||||
|
||||
# Merge with market data
|
||||
merged = pd.concat([c_ts, market_factors], axis=1, join='inner')
|
||||
merged = pd.merge(c_ts, market_df, left_index=True, right_index=True, how='inner')
|
||||
|
||||
# Ensure we have enough data points for a valid regression
|
||||
if len(merged) >= 6:
|
||||
# Normalize Y by Client's Average AUM (Approximation of AUM_{t-1})
|
||||
# Y = NetFlow_t / Mean_AUM_i
|
||||
client_avg_wealth = mean_aum.loc[client]
|
||||
if not np.isfinite(client_avg_wealth) or client_avg_wealth == 0: continue
|
||||
|
||||
Y = merged['Quantity - NetFlows'] / client_avg_wealth
|
||||
|
||||
X = merged[['Delta_Rate', 'Bond_Return']]
|
||||
X = sm.add_constant(X)
|
||||
|
||||
|
|
@ -94,15 +211,15 @@ def compute_market_sensitivities(flows_df, aum_df, rates_df, gov_df, freq='M'):
|
|||
model = sm.OLS(Y, X).fit()
|
||||
client_betas.append({
|
||||
'Registrar Account - ID': client,
|
||||
'alpha': model.params.get('const', 0), # Intercept (Autonomous Trend)
|
||||
'beta_rate': model.params.get('Delta_Rate', 0),
|
||||
'beta_bond': model.params.get('Bond_Return', 0),
|
||||
'r_squared': model.rsquared
|
||||
'alpha_linear': model.params.get('const', 0),
|
||||
'beta_rate_linear': model.params.get('Delta_Rate', 0),
|
||||
'beta_bond_linear': model.params.get('Bond_Return', 0),
|
||||
'linear_r_squared': model.rsquared
|
||||
})
|
||||
except:
|
||||
continue
|
||||
|
||||
if not client_betas:
|
||||
return pd.DataFrame(columns=['Registrar Account - ID', 'alpha', 'beta_rate', 'beta_bond', 'r_squared'])
|
||||
return pd.DataFrame(columns=['Registrar Account - ID', 'alpha_linear', 'beta_rate_linear', 'beta_bond_linear', 'linear_r_squared'])
|
||||
|
||||
return pd.DataFrame(client_betas).set_index('Registrar Account - ID')
|
||||
|
|
@ -1,6 +1,6 @@
|
|||
import pandas as pd
|
||||
from data_loader import load_and_clean_data
|
||||
from features import compute_static_features, compute_market_sensitivities
|
||||
from features import compute_static_features, compute_shock_sensitivities
|
||||
from clustering import run_clustering_pipeline, get_cluster_profiles
|
||||
|
||||
def main():
|
||||
|
|
@ -12,26 +12,37 @@ def main():
|
|||
gov_path='data/eur_gov_indices.csv'
|
||||
)
|
||||
|
||||
# 2. Feature Engineering
|
||||
print("Computing static features...")
|
||||
static_feats = compute_static_features(flows, aum)
|
||||
|
||||
print("Computing market sensitivities (Betas)...")
|
||||
# Use 'W' (Weekly) to maximize points for the sample.
|
||||
# Use 'M' (Monthly) for the full dataset.
|
||||
sensitivity_feats = compute_market_sensitivities(flows, aum, rates, gov, freq='W')
|
||||
# Option 1: Run Shock Model (Default)
|
||||
sensitivity_feats = compute_shock_sensitivities(flows, aum, rates, gov, freq='ME')
|
||||
|
||||
# Option 2: Run Linear Model (Uncomment to use)
|
||||
# sensitivity_feats = compute_linear_sensitivities(flows, aum, rates, gov, freq='ME')
|
||||
|
||||
# Merge features
|
||||
full_features = static_feats.join(sensitivity_feats, how='left')
|
||||
|
||||
# Clustering
|
||||
print(f"Running Clustering on {len(full_features)} clients...")
|
||||
clustered_df, centers = run_clustering_pipeline(full_features, n_clusters=3)
|
||||
# Fill missing sensitivities with 0 (Passive clients)
|
||||
shock_cols = ['alpha_normal', 'beta_rate_spike', 'beta_rate_drop',
|
||||
'beta_bond_rally', 'beta_bond_crash', 'shock_r_squared']
|
||||
full_features[shock_cols] = full_features[shock_cols].fillna(0)
|
||||
|
||||
print(f"Final Feature Matrix: {full_features.shape}")
|
||||
|
||||
# 3. Clustering
|
||||
print("Running Clustering...")
|
||||
clustered_df, centers, scaler = run_clustering_pipeline(full_features, n_clusters=3)
|
||||
|
||||
# 4. Results
|
||||
print("\n--- Cluster Profiles (Mean Values) ---")
|
||||
profiles = get_cluster_profiles(clustered_df)
|
||||
print(profiles.T)
|
||||
print(profiles.T)
|
||||
|
||||
clustered_df.to_csv('clustering/client_clusters.csv')
|
||||
print("\nResults saved to 'clustering/client_clusters.csv'")
|
||||
clustered_df.to_csv('client_clusters.csv')
|
||||
print("\nResults saved to 'client_clusters.csv'")
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
57
clustering/visualize.py
Normal file
57
clustering/visualize.py
Normal file
|
|
@ -0,0 +1,57 @@
|
|||
import pandas as pd
|
||||
import matplotlib.pyplot as plt
|
||||
import seaborn as sns
|
||||
from sklearn.decomposition import PCA
|
||||
from sklearn.preprocessing import RobustScaler
|
||||
|
||||
def plot_clusters():
|
||||
print("--- Generating Cluster Visualization ---")
|
||||
|
||||
# 1. Load the results from main.py
|
||||
try:
|
||||
df = pd.read_csv('client_clusters.csv', index_col=0)
|
||||
except FileNotFoundError:
|
||||
print("Error: Run main.py first to generate 'client_clusters.csv'")
|
||||
return
|
||||
|
||||
# 2. Prepare Data for PCA
|
||||
# Drop non-numeric or ID columns if any linger (though index handled it)
|
||||
X = df.drop(columns=['Cluster'])
|
||||
|
||||
# Scale (Critical for PCA)
|
||||
scaler = RobustScaler()
|
||||
X_scaled = scaler.fit_transform(X)
|
||||
|
||||
# 3. Run PCA (Reduce to 2 Dimensions)
|
||||
pca = PCA(n_components=2)
|
||||
components = pca.fit_transform(X_scaled)
|
||||
|
||||
# Create plotting DataFrame
|
||||
plot_df = pd.DataFrame(data=components, columns=['PC1', 'PC2'], index=X.index)
|
||||
plot_df['Cluster'] = df['Cluster'].astype(str) # Convert to string for discrete colors
|
||||
|
||||
# 4. Plot
|
||||
plt.figure(figsize=(12, 8))
|
||||
sns.scatterplot(
|
||||
data=plot_df,
|
||||
x='PC1',
|
||||
y='PC2',
|
||||
hue='Cluster',
|
||||
style='Cluster',
|
||||
palette='viridis',
|
||||
s=60,
|
||||
alpha=0.8
|
||||
)
|
||||
|
||||
plt.title('Client Segmentation Map (PCA Projection)', fontsize=16)
|
||||
plt.xlabel(f'Principal Component 1 ({pca.explained_variance_ratio_[0]:.1%} Variance)', fontsize=12)
|
||||
plt.ylabel(f'Principal Component 2 ({pca.explained_variance_ratio_[1]:.1%} Variance)', fontsize=12)
|
||||
plt.legend(title='Cluster ID', bbox_to_anchor=(1.05, 1), loc='upper left')
|
||||
plt.grid(True, linestyle='--', alpha=0.3)
|
||||
|
||||
plt.tight_layout()
|
||||
plt.savefig('cluster_map.png', dpi=300)
|
||||
print("Visualization saved to 'cluster_map.png'")
|
||||
|
||||
if __name__ == "__main__":
|
||||
plot_clusters()
|
||||
Loading…
Reference in New Issue
Block a user