481 lines
20 KiB
Python
481 lines
20 KiB
Python
"""
|
||
feature_engineering.py
|
||
───────────────────────
|
||
Construction du dataset de features pour la modélisation prédictive.
|
||
|
||
Ce module assemble trois familles de features :
|
||
|
||
[A] Features comportementales client (depuis stocks/AUM)
|
||
- Encours actuel et lags (1m, 3m, 6m)
|
||
- Croissance de l'AUM sur différentes fenêtres
|
||
- Concentration du portefeuille client (part du fonds dans son total)
|
||
|
||
[B] Features de performance absolue (depuis weekly_perf)
|
||
- Rendements 6Mo et 1Yr du fonds détenu
|
||
- Percentile Morningstar (rang brut dans la catégorie)
|
||
|
||
[C] Features de performance relative (depuis relative_performance.py)
|
||
- Spread vs médiane des vrais peers
|
||
- Rang dans le groupe de peers restreint
|
||
- Momentum de rang, ratio d'outperformance
|
||
- Dummies top/bottom quartile (lien avec la relation convexe de Sirri & Tufano)
|
||
|
||
Variable cible :
|
||
flux_net_proxy = ΔAum(t → t+1)
|
||
→ À remplacer par les flux transactionnels bruts dès que disponibles.
|
||
|
||
Usage :
|
||
from peers_loader import PeersLoader
|
||
from relative_performance import RelativePerformanceCalculator
|
||
from feature_engineering import FeatureBuilder
|
||
|
||
loader = PeersLoader("peers/").load()
|
||
calc = RelativePerformanceCalculator(loader)
|
||
builder = FeatureBuilder(loader, calc)
|
||
|
||
dataset = builder.build(
|
||
stocks_df = stocks,
|
||
perf_df = weekly_perf,
|
||
target_lag = 1 # prédire les flux à t+1 mois
|
||
)
|
||
"""
|
||
|
||
import numpy as np
|
||
import pandas as pd
|
||
from typing import Optional
|
||
|
||
|
||
# ── Constantes ────────────────────────────────────────────────────────────────
|
||
|
||
# Périodes de performance à inclure comme features
|
||
PERF_PERIODS_TO_USE = ["6MoRet", "1YrRet"]
|
||
|
||
# Lags AUM (en mois)
|
||
AUM_LAGS = [1, 3, 6]
|
||
|
||
# Fenêtre de tolérance pour le merge_asof (en jours)
|
||
MERGE_ASOF_TOLERANCE_DAYS = 35
|
||
|
||
|
||
# ── Classe principale ─────────────────────────────────────────────────────────
|
||
|
||
class FeatureBuilder:
|
||
"""
|
||
Construit le dataset de modélisation en assemblant les trois familles
|
||
de features.
|
||
|
||
Paramètres
|
||
----------
|
||
loader : PeersLoader (déjà chargé)
|
||
rel_calc : RelativePerformanceCalculator
|
||
"""
|
||
|
||
def __init__(self, loader, rel_calc):
|
||
self.loader = loader
|
||
self.rel_calc = rel_calc
|
||
|
||
# ── Point d'entrée principal ──────────────────────────────────────────────
|
||
|
||
def build(self,
|
||
stocks_df: pd.DataFrame,
|
||
perf_df: pd.DataFrame,
|
||
target_lag: int = 1,
|
||
perf_periods: list[str] | None = None,
|
||
verbose: bool = True) -> pd.DataFrame:
|
||
"""
|
||
Construit le dataset final avec features et variable cible.
|
||
|
||
Paramètres
|
||
----------
|
||
stocks_df : AUM mensuels (equity_stocks_full.csv ou similaire)
|
||
perf_df : performances hebdomadaires (weekly_perf_full.csv)
|
||
target_lag : horizon de prédiction en mois (1 = flux du mois suivant)
|
||
perf_periods: périodes de perf à utiliser (défaut : PERF_PERIODS_TO_USE)
|
||
|
||
Retourne
|
||
--------
|
||
DataFrame avec une ligne par (compte × fonds × mois)
|
||
contenant toutes les features et la variable cible.
|
||
"""
|
||
if perf_periods is None:
|
||
perf_periods = [p for p in PERF_PERIODS_TO_USE
|
||
if p in perf_df["perfPeriod"].unique()]
|
||
if not perf_periods:
|
||
# Fallback : utiliser toutes les périodes disponibles
|
||
perf_periods = perf_df["perfPeriod"].unique().tolist()
|
||
|
||
if verbose:
|
||
print("── FeatureBuilder ──────────────────────────────────────")
|
||
print(f"Périodes de performance utilisées : {perf_periods}")
|
||
|
||
# Étape 1 : résolution ISIN dans perf via peers
|
||
perf_df = self._resolve_perf_isin(perf_df)
|
||
|
||
# Étape 2 : calcul des métriques relatives
|
||
if verbose:
|
||
print("\nCalcul des performances relatives...")
|
||
rel_df = self.rel_calc.compute(perf_df, perf_periods=perf_periods,
|
||
verbose=verbose)
|
||
|
||
# Étape 3 : features client (AUM)
|
||
if verbose:
|
||
print("\nConstruction des features AUM...")
|
||
stocks_feat = self._build_aum_features(stocks_df)
|
||
|
||
# Étape 4 : features de performance absolue
|
||
if verbose:
|
||
print("\nJointure des performances absolues...")
|
||
perf_abs = self._build_absolute_perf_features(perf_df, perf_periods)
|
||
|
||
# Étape 5 : jointure AUM × perf absolue
|
||
dataset = self._merge_aum_and_perf(stocks_feat, perf_abs, verbose)
|
||
|
||
# Étape 6 : jointure avec les métriques relatives
|
||
if not rel_df.empty:
|
||
if verbose:
|
||
print("\nJointure des métriques relatives...")
|
||
dataset = self._merge_relative_perf(dataset, rel_df, verbose)
|
||
|
||
# Étape 7 : variable cible
|
||
if verbose:
|
||
print(f"\nConstruction de la variable cible (lag = {target_lag} mois)...")
|
||
dataset = self._build_target(dataset, lag=target_lag)
|
||
|
||
# Étape 8 : nettoyage final
|
||
dataset = self._final_cleanup(dataset, verbose)
|
||
|
||
if verbose:
|
||
self._print_dataset_summary(dataset)
|
||
|
||
return dataset
|
||
|
||
# ── Étape 1 : résolution ISIN dans perf ──────────────────────────────────
|
||
|
||
def _resolve_perf_isin(self, perf_df: pd.DataFrame) -> pd.DataFrame:
|
||
"""Ajoute la colonne 'isin' dans perf_df via PeersLoader."""
|
||
perf_df = perf_df.copy()
|
||
perf_df["Date"] = pd.to_datetime(perf_df["Date"])
|
||
perf_df["isin"] = perf_df["shareClass_name"].apply(
|
||
self.loader.resolve_shareclass_name
|
||
)
|
||
|
||
# Ajouter la stratégie Carmignac si disponible
|
||
isin_to_strategy = dict(
|
||
zip(self.loader.carmignac_df["ISIN"],
|
||
self.loader.carmignac_df["carmignac_strategy"])
|
||
)
|
||
perf_df["carmignac_strategy"] = perf_df["isin"].map(isin_to_strategy)
|
||
return perf_df
|
||
|
||
# ── Étape 2 : features AUM ────────────────────────────────────────────────
|
||
|
||
def _build_aum_features(self, stocks_df: pd.DataFrame) -> pd.DataFrame:
|
||
"""
|
||
Construit les features comportementales depuis les snapshots AUM mensuels.
|
||
|
||
Features produites (par compte × fonds × date) :
|
||
aum_t : encours à t
|
||
aum_lag{n} : encours à t-n mois (n ∈ AUM_LAGS)
|
||
aum_growth_{n}m : croissance relative sur n mois
|
||
aum_share_wallet : part du fonds dans le portefeuille total du compte
|
||
"""
|
||
df = stocks_df.copy()
|
||
df["Centralisation Date"] = pd.to_datetime(df["Centralisation Date"])
|
||
|
||
# Tri pour les lags
|
||
df = df.sort_values(["Registrar Account - ID", "Product - Isin",
|
||
"Centralisation Date"])
|
||
|
||
grp = df.groupby(["Registrar Account - ID", "Product - Isin"])
|
||
|
||
# Lags AUM
|
||
for lag in AUM_LAGS:
|
||
df[f"aum_lag{lag}"] = grp["Value - AUM €"].shift(lag)
|
||
|
||
# Croissances
|
||
for lag in AUM_LAGS:
|
||
df[f"aum_growth_{lag}m"] = (
|
||
(df["Value - AUM €"] - df[f"aum_lag{lag}"])
|
||
/ (df[f"aum_lag{lag}"].abs() + 1.0)
|
||
)
|
||
|
||
# Part dans le portefeuille total du compte (concentration)
|
||
total_aum_by_account = (
|
||
df.groupby(["Registrar Account - ID", "Centralisation Date"])["Value - AUM €"]
|
||
.transform("sum")
|
||
)
|
||
df["aum_share_wallet"] = df["Value - AUM €"] / (total_aum_by_account + 1.0)
|
||
|
||
# Renommage pour clarté
|
||
df = df.rename(columns={"Value - AUM €": "aum_t"})
|
||
|
||
# Colonnes à conserver
|
||
keep = (
|
||
["Registrar Account - ID", "Product - Isin", "Centralisation Date",
|
||
"Registrar Account - Region", "RegistrarAccount - Country",
|
||
"Product - Asset Type", "Product - Strategy", "Product - Fund",
|
||
"aum_t", "aum_share_wallet"]
|
||
+ [f"aum_lag{lag}" for lag in AUM_LAGS]
|
||
+ [f"aum_growth_{lag}m" for lag in AUM_LAGS]
|
||
)
|
||
keep = [c for c in keep if c in df.columns]
|
||
return df[keep]
|
||
|
||
# ── Étape 3 : features de performance absolue ─────────────────────────────
|
||
|
||
def _build_absolute_perf_features(self, perf_df: pd.DataFrame,
|
||
perf_periods: list[str]) -> pd.DataFrame:
|
||
"""
|
||
Pivote weekly_perf pour obtenir une ligne par (isin, date)
|
||
avec une colonne par (période × métrique).
|
||
|
||
Colonnes produites : perf_return_6MoRet, perf_pct_1YrRet, etc.
|
||
"""
|
||
relevant = perf_df[perf_df["perfPeriod"].isin(perf_periods)].copy()
|
||
if relevant.empty:
|
||
return pd.DataFrame(columns=["isin", "Date"])
|
||
|
||
pivoted = relevant.pivot_table(
|
||
index=["isin", "Date"],
|
||
columns="perfPeriod",
|
||
values=["return", "percentile"],
|
||
aggfunc="mean"
|
||
)
|
||
# Aplatir les colonnes multi-index
|
||
pivoted.columns = [
|
||
f"perf_{metric}_{period}"
|
||
for metric, period in pivoted.columns
|
||
]
|
||
return pivoted.reset_index()
|
||
|
||
# ── Étape 4 : jointure AUM × perf absolue ────────────────────────────────
|
||
|
||
def _merge_aum_and_perf(self, stocks_feat: pd.DataFrame,
|
||
perf_abs: pd.DataFrame,
|
||
verbose: bool) -> pd.DataFrame:
|
||
"""
|
||
merge_asof temporel : pour chaque snapshot mensuel AUM,
|
||
trouve la performance hebdomadaire la plus récente ≤ date snapshot.
|
||
"""
|
||
if perf_abs.empty:
|
||
if verbose:
|
||
print(" ⚠ Aucune performance absolue à joindre.")
|
||
return stocks_feat
|
||
|
||
merged_parts = []
|
||
for isin_val in stocks_feat["Product - Isin"].unique():
|
||
s = stocks_feat[stocks_feat["Product - Isin"] == isin_val].sort_values(
|
||
"Centralisation Date")
|
||
p = perf_abs[perf_abs["isin"] == isin_val].sort_values("Date")
|
||
|
||
if p.empty:
|
||
merged_parts.append(s)
|
||
continue
|
||
|
||
m = pd.merge_asof(
|
||
s, p,
|
||
left_on="Centralisation Date",
|
||
right_on="Date",
|
||
direction="backward",
|
||
tolerance=pd.Timedelta(f"{MERGE_ASOF_TOLERANCE_DAYS}d")
|
||
)
|
||
merged_parts.append(m)
|
||
|
||
result = pd.concat(merged_parts, ignore_index=True)
|
||
perf_cols_joined = [c for c in result.columns if c.startswith("perf_")]
|
||
|
||
if verbose:
|
||
n_matched = result[perf_cols_joined[0]].notna().sum() if perf_cols_joined else 0
|
||
print(f" {n_matched}/{len(result)} lignes avec performance jointe "
|
||
f"({len(perf_cols_joined)} colonnes)")
|
||
|
||
return result
|
||
|
||
# ── Étape 5 : jointure métriques relatives ────────────────────────────────
|
||
|
||
def _merge_relative_perf(self, dataset: pd.DataFrame,
|
||
rel_df: pd.DataFrame,
|
||
verbose: bool) -> pd.DataFrame:
|
||
"""
|
||
Joint les métriques relatives sur (carmignac_strategy, date).
|
||
|
||
Stratégie de jointure :
|
||
- La stratégie Carmignac d'un compte est déduite depuis
|
||
Product - Strategy (nom court) ou ISIN via peers_loader.
|
||
- merge_asof temporel avec tolérance ±35j.
|
||
"""
|
||
# Récupérer la stratégie Carmignac depuis l'ISIN
|
||
isin_to_strategy = dict(
|
||
zip(self.loader.carmignac_df["ISIN"],
|
||
self.loader.carmignac_df["carmignac_strategy"])
|
||
)
|
||
dataset["carmignac_strategy"] = dataset["Product - Isin"].map(isin_to_strategy)
|
||
|
||
# Agréger rel_df sur toutes périodes (moyenne par stratégie × date)
|
||
rel_cols = [c for c in rel_df.columns if c.startswith("rel_")]
|
||
if not rel_cols:
|
||
return dataset
|
||
|
||
rel_agg = (rel_df
|
||
.groupby(["carmignac_strategy", "Date"])[rel_cols]
|
||
.mean()
|
||
.reset_index()
|
||
.sort_values(["carmignac_strategy", "Date"]))
|
||
|
||
merged_parts = []
|
||
for strat in dataset["carmignac_strategy"].dropna().unique():
|
||
d_strat = dataset[dataset["carmignac_strategy"] == strat].sort_values(
|
||
"Centralisation Date")
|
||
r_strat = rel_agg[rel_agg["carmignac_strategy"] == strat].sort_values("Date")
|
||
|
||
if r_strat.empty:
|
||
merged_parts.append(d_strat)
|
||
continue
|
||
|
||
m = pd.merge_asof(
|
||
d_strat, r_strat,
|
||
left_on="Centralisation Date",
|
||
right_on="Date",
|
||
direction="backward",
|
||
tolerance=pd.Timedelta(f"{MERGE_ASOF_TOLERANCE_DAYS}d")
|
||
)
|
||
merged_parts.append(m)
|
||
|
||
# Ajouter les comptes sans stratégie identifiée
|
||
no_strat = dataset[dataset["carmignac_strategy"].isna()]
|
||
if not no_strat.empty:
|
||
merged_parts.append(no_strat)
|
||
|
||
result = pd.concat(merged_parts, ignore_index=True)
|
||
|
||
if verbose:
|
||
n_rel = result[rel_cols[0]].notna().sum() if rel_cols else 0
|
||
print(f" {n_rel}/{len(result)} lignes avec métriques relatives jointes "
|
||
f"({len(rel_cols)} colonnes)")
|
||
|
||
return result
|
||
|
||
# ── Étape 6 : variable cible ──────────────────────────────────────────────
|
||
|
||
def _build_target(self, dataset: pd.DataFrame, lag: int) -> pd.DataFrame:
|
||
"""
|
||
Construit la variable cible : ΔAum(t → t+lag).
|
||
|
||
flux_net_proxy = aum(t+lag) - aum(t)
|
||
|
||
Note : Dans un contexte de production, remplacer par :
|
||
flux_net = sum(souscriptions[t:t+lag]) - sum(rachats[t:t+lag])
|
||
depuis le fichier de transactions quotidiennes.
|
||
"""
|
||
dataset = dataset.sort_values(
|
||
["Registrar Account - ID", "Product - Isin", "Centralisation Date"]
|
||
)
|
||
grp = dataset.groupby(["Registrar Account - ID", "Product - Isin"])
|
||
|
||
dataset["aum_next"] = grp["aum_t"].shift(-lag)
|
||
dataset["flux_net_proxy"] = dataset["aum_next"] - dataset["aum_t"]
|
||
|
||
# Feature supplémentaire : flux relatif (normalisé par l'AUM)
|
||
dataset["flux_net_relative"] = (
|
||
dataset["flux_net_proxy"] / (dataset["aum_t"].abs() + 1.0)
|
||
)
|
||
|
||
return dataset
|
||
|
||
# ── Étape 7 : nettoyage final ─────────────────────────────────────────────
|
||
|
||
def _final_cleanup(self, dataset: pd.DataFrame,
|
||
verbose: bool) -> pd.DataFrame:
|
||
"""
|
||
Supprime les doublons de colonnes, retire les lignes sans cible,
|
||
log les taux de remplissage.
|
||
"""
|
||
# Supprimer les colonnes en double (artefacts du merge)
|
||
dataset = dataset.loc[:, ~dataset.columns.duplicated()]
|
||
|
||
# Retirer les lignes sans variable cible
|
||
n_before = len(dataset)
|
||
dataset = dataset.dropna(subset=["flux_net_proxy"])
|
||
n_after = len(dataset)
|
||
|
||
if verbose and n_before > n_after:
|
||
print(f" Lignes supprimées (cible manquante) : {n_before - n_after}")
|
||
|
||
return dataset.reset_index(drop=True)
|
||
|
||
# ── Résumé ────────────────────────────────────────────────────────────────
|
||
|
||
def _print_dataset_summary(self, dataset: pd.DataFrame):
|
||
feature_cols = self.get_feature_columns(dataset)
|
||
print("\n── Dataset final ───────────────────────────────────────")
|
||
print(f"Lignes : {len(dataset):,}")
|
||
print(f"Colonnes totales : {len(dataset.columns)}")
|
||
print(f"Features : {len(feature_cols)}")
|
||
print(f"\nTaux de remplissage des features :")
|
||
|
||
families = {
|
||
"AUM": [c for c in feature_cols if c.startswith("aum_")],
|
||
"Perf abs": [c for c in feature_cols if c.startswith("perf_")],
|
||
"Perf rel": [c for c in feature_cols if c.startswith("rel_")],
|
||
}
|
||
for family, cols in families.items():
|
||
if cols:
|
||
fill_rates = dataset[cols].notna().mean()
|
||
avg_fill = fill_rates.mean()
|
||
print(f" {family:<12} ({len(cols):2d} cols) : {avg_fill:.1%} en moyenne")
|
||
|
||
print(f"\nVariable cible 'flux_net_proxy' :")
|
||
t = dataset["flux_net_proxy"]
|
||
print(f" Médiane : {t.median():+,.0f} €")
|
||
print(f" Std : {t.std():,.0f} €")
|
||
print(f" % positif (souscription nette) : {(t > 0).mean():.1%}")
|
||
print(f" % négatif (rachat net) : {(t < 0).mean():.1%}")
|
||
print("─────────────────────────────────────────────────────────")
|
||
|
||
# ── API publique ──────────────────────────────────────────────────────────
|
||
|
||
@staticmethod
|
||
def get_feature_columns(dataset: pd.DataFrame) -> list[str]:
|
||
"""Retourne la liste des colonnes de features (exclut les métadonnées et la cible)."""
|
||
exclude = {
|
||
"Registrar Account - ID", "Product - Isin", "Centralisation Date",
|
||
"Product - Fund", "Product - Strategy", "Product - Asset Type",
|
||
"Registrar Account - Region", "RegistrarAccount - Country",
|
||
"carmignac_strategy", "isin", "Date",
|
||
"aum_next", "flux_net_proxy", "flux_net_relative", "shareClass_name",
|
||
}
|
||
return [c for c in dataset.columns if c not in exclude
|
||
and dataset[c].dtype in [np.float64, np.int64, float, int]]
|
||
|
||
@staticmethod
|
||
def get_target_column() -> str:
|
||
return "flux_net_proxy"
|
||
|
||
|
||
# ── Usage autonome ────────────────────────────────────────────────────────────
|
||
|
||
if __name__ == "__main__":
|
||
import sys
|
||
from peers_loader import PeersLoader
|
||
from relative_performance import RelativePerformanceCalculator
|
||
|
||
peers_dir = sys.argv[1] if len(sys.argv) > 1 else "."
|
||
stocks_path = sys.argv[2] if len(sys.argv) > 2 else "equity_stocks_head.csv"
|
||
perf_path = sys.argv[3] if len(sys.argv) > 3 else "weekly_perf_head.csv"
|
||
|
||
loader = PeersLoader(peers_dir=peers_dir).load()
|
||
calc = RelativePerformanceCalculator(loader)
|
||
builder = FeatureBuilder(loader, calc)
|
||
|
||
stocks_df = pd.read_csv(stocks_path)
|
||
perf_df = pd.read_csv(perf_path)
|
||
|
||
dataset = builder.build(stocks_df, perf_df)
|
||
|
||
print("\nAperçu du dataset :")
|
||
feature_cols = FeatureBuilder.get_feature_columns(dataset)
|
||
print(dataset[feature_cols].describe().round(3).to_string())
|
||
|
||
dataset.to_csv("dataset_features.csv", index=False)
|
||
print("\nDataset sauvegardé dans dataset_features.csv")
|